6. Limited-memory quasi-Newton (L-BFGS)
Newton's method requires computing the Hessian matrix and its inverse. To simplify the computation and reduce memory usage, we optimize with the L-BFGS algorithm instead.
The maximum entropy model:
\[ \begin{align} p(y|x) &= \frac{\exp \left[ \sum_{i=1}^{n} w_i f_i(x,y) \right] }{ Z_w(x) } \\
&= \frac{ \exp \left[ \sum_{i=1}^{n} w_i f_i(x,y) \right] }{ \sum_y \exp \left[ \sum_{i=1}^{n} w_i f_i(x,y) \right] }
\end{align} \]
The objective function to minimize (the negative log-likelihood); the second line follows by substituting \( \log p(y|x) = \sum_{i=1}^{n} w_i f_i(x,y) - \log Z_w(x) \) and noting that \( \sum_y \tilde{p}(x,y) = \tilde{p}(x) \):
\[ \begin{align}
\min_w f(w) &= -\sum_{x,y} \tilde{p}(x,y) \log p(y|x) \\
&= \sum_{x} \tilde{p}(x) \log Z_w(x) - \sum_{x,y} \tilde{p}(x,y) \sum_{i=1}^{n} w_i f_i(x,y)
\end{align} \]
The gradient:
\[ \nabla f(w) = \left[\frac{\partial f(w)}{\partial w_1},\frac{\partial f(w)}{\partial w_2},\frac{\partial f(w)}{\partial w_3},\cdots \right]^T \]
\[ \frac{\partial f(w)}{\partial w_i} = \sum_{x,y} \tilde{p}(x) p_w(y|x) f_i(x,y) - \sum_{x,y} \tilde{p}(x,y) f_i(x,y) \]
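For reference, the heart of L-BFGS is the two-loop recursion (Algorithm 7.4 in [2]), which applies the inverse-Hessian approximation built from the \( m \) most recent pairs \( s_k = w_{k+1} - w_k \), \( y_k = \nabla f(w_{k+1}) - \nabla f(w_k) \) without ever forming a matrix. Below is a minimal sketch; the training code later in this post uses scipy's fmin_l_bfgs_b instead, and the name lbfgs_direction is illustrative.
import numpy as np
def lbfgs_direction(grad, s_list, y_list):
    # Two-loop recursion: computes -H_k * grad, where H_k is the implicit
    # inverse-Hessian approximation from the stored (s, y) pairs
    # (oldest first in s_list / y_list).
    q = grad.copy()
    rhos = [1.0 / np.dot(y, s) for s, y in zip(s_list, y_list)]
    alphas = []
    # First loop: newest pair to oldest.
    for (s, y), rho in zip(reversed(list(zip(s_list, y_list))), reversed(rhos)):
        alpha = rho * np.dot(s, q)
        alphas.append(alpha)
        q = q - alpha * y
    # Initial approximation H0 = gamma * I, scaled from the newest pair.
    gamma = np.dot(s_list[-1], y_list[-1]) / np.dot(y_list[-1], y_list[-1])
    r = gamma * q
    # Second loop: oldest pair to newest.
    for (s, y), rho, alpha in zip(zip(s_list, y_list), rhos, reversed(alphas)):
        beta = rho * np.dot(y, r)
        r = r + (alpha - beta) * s
    return -r  # search direction
Only the last \( m \) pairs are kept (typically \( m \le 20 \)), so memory is \( O(mn) \) rather than the \( O(n^2) \) a dense Hessian would need.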
7. Implementation and Results
7.1 Implementation
First, convert the PKU corpus format; a blank line marks a line break in the original text. This representation makes the subsequent statistics easier to compute.
# -*- coding: utf-8 -*-
import re
import numpy as np

def readfileBylines(filename):
    with open(filename, 'rt', encoding='utf8') as f:
        lines = f.readlines()
    return lines

lines = readfileBylines('./training/pku_training.utf8')
for line in lines:
    # Drop the word separators to recover the raw sentence.
    sent, label = line.replace(' ', ''), []
    # Words in the PKU corpus are separated by two spaces.
    for word in re.split(r'\s{2}', line):
        I = len(word)
        for i, c in enumerate(word):
            if I == 1: a = 'S'            # single-character word
            else:
                if i == 0: a = 'B'        # word beginning
                elif i == I - 1: a = 'E'  # word end
                else: a = 'M'             # word middle
            label.append(a)
    # [:-1] strips the trailing newline character and its label.
    sent, label = np.asarray(list(sent))[:-1], np.asarray(label)[:-1]
    for s, l in zip(sent, label):
        print('%s\t%s' % (s, l))
    print('')
The converted format is shown below: each input character and its label are separated by \t, and a blank line corresponds to a line break in the original input.
character/word\tlabel
以 B
及 E
近 S
千 S
名 S
Next, use this corpus together with the feature templates to generate and train the model. This step takes a long time, roughly 8-12 hours on an i7-4790K. This is experimental code (Python's GIL prevents true multithreading, so it is very slow); for production use, rewriting it in C++ with OpenMP/OpenMPI would improve performance considerably.
# -*- coding: utf-8 -*-
import time, math, pickle
import numpy as np
from scipy.optimize import fmin_l_bfgs_b

def readfileBylines(filename):
    with open(filename, 'rt', encoding='utf8') as f:
        lines = f.readlines()
    return lines

def getTextSequences(lines):
    # Each sequence becomes {index: (character, label)}; blank lines split sequences.
    sequences, result = [[[], []]], []
    for line in lines:
        if len(line.split('\t')) == 2:
            word, label = line.split('\t')
            sequences[-1][0].append(word)
            sequences[-1][1].append(label.strip())
        else:
            sequences.append([[], []])
    for i in range(len(sequences)):
        if len(sequences[i][0]) == len(sequences[i][1]) > 0:
            result.append( dict(enumerate(zip(*sequences[i]))) )
    return result

def statistics_transition_matrix(labelTypes, sequences):
    # Estimate p(label | prev_label) by counting; None marks the sequence start.
    matrix = {}
    for sequence in sequences:
        prev_label = None
        for i in range(len(sequence)):
            if prev_label not in matrix:
                matrix[prev_label] = {}
            if sequence[i][1] not in matrix[prev_label]:
                matrix[prev_label][sequence[i][1]] = 0
            matrix[prev_label][sequence[i][1]] += 1
            prev_label = sequence[i][1]
    for row in [None] + labelTypes:
        total = sum(matrix[row].values())
        for col in labelTypes:
            matrix[row][col] = matrix[row].get(col, 0.0) / total
    return matrix

def generate_features(sequences):
    # Feature scan templates: each element lists the offsets, relative to the
    # current character index, whose characters form the input feature.
    templates = [[-2], [-1], [0], [1], [2],
                 [-2, -1], [-1, 0], [0, 1], [1, 2]]
    X, XY, features, total = {}, {}, {}, 0
    for sequence in sequences:
        for i in range(len(sequence)):
            for template in templates:
                value = [sequence.get(i + pos, (None, None))[0] for pos in template]
                x = (template, value)
                xy = (template, value, sequence[i][1])
                feature = (template, value, sequence[i][1])
                key_x = str(x)
                key_xy = str(xy)
                key_f = str(feature)
                if key_x not in X: X[key_x] = [x, 0]
                if key_xy not in XY: XY[key_xy] = [xy, 0]
                if key_f not in features: features[key_f] = feature
                X[key_x][1] += 1
                XY[key_xy][1] += 1
                total += 1
    features = list(features.values())
    weights = np.zeros((len(features)))
    featureHashTable = {}
    for i, feature in enumerate(features):
        featureHashTable[str(feature)] = (feature, i)
    # Normalize the counts into the empirical distributions p~(x) and p~(x,y).
    X = dict([(k, (x, c / total)) for k, (x, c) in X.items()])
    XY = dict([(k, (xy, c / total)) for k, (xy, c) in XY.items()])
    return templates, features, featureHashTable, weights, X, XY

#@nb.jit
def model_probability(xy, featureHashTable, weights, labelTypes):
    # p_w(y|x): softmax over all labels sharing the same input features.
    allyx = {}
    for _y in labelTypes:
        key_f = str((xy[0], xy[1], _y))
        allyx[_y] = math.exp(weights[featureHashTable[key_f][1]]) if key_f in featureHashTable else 1.0
    return allyx[xy[2]] / sum(allyx.values())

def opt_fun(weights, features, featureHashTable, labelTypes, data_x, data_xy):
    # The negative log-likelihood f(w) from section 6.
    return -1.0 * sum([p * math.log( model_probability(xy, featureHashTable, weights, labelTypes) ) for xy, p in data_xy.values()])

def grads_fun(weights, features, featureHashTable, labelTypes, data_x, data_xy):
    # df/dw_i = sum_x p~(x) p_w(y|x) f_i(x,y) - sum_{x,y} p~(x,y) f_i(x,y)
    return np.asarray( [data_x[str((features[i][0], features[i][1]))][1] * model_probability(features[i], featureHashTable, weights, labelTypes) - data_xy[str(features[i])][1] for i in range( len(features) )] )

_startTime = time.time()
filename_ = './memm_train_text.txt'
defineLabelTypes_ = ['B', 'M', 'E', 'S']
sequences_ = getTextSequences(readfileBylines(filename_))
transition_matrix_ = statistics_transition_matrix(defineLabelTypes_, sequences_)
templates_, features_, featureHashTable_, weights_, data_X_, data_XY_ = generate_features(sequences_)
#opt_fun(weights_, features_, featureHashTable_, defineLabelTypes_, data_X_, data_XY_)
#grads_fun(weights_, features_, featureHashTable_, defineLabelTypes_, data_X_, data_XY_)
print( 'templates:', len(templates_))
print( 'features:', len(features_))
iter_n = 0
def callback(xk):
    global iter_n
    v = opt_fun(xk, features_, featureHashTable_, defineLabelTypes_, data_X_, data_XY_)
    iter_n += 1
    print('iter:', iter_n, 'f:', v)
x, _, _ = fmin_l_bfgs_b(func=opt_fun, x0=weights_, fprime=grads_fun,
                        args=(features_, featureHashTable_, defineLabelTypes_, data_X_, data_XY_),
                        callback=callback, disp=1)
features = [(features_[i], x[i]) for i in range(x.shape[0])]
with open('memm_model.pkl', 'wb') as f:
    pickle.dump((transition_matrix_, features), f)
_endTime = time.time()
print( 'execute time: %fs' % (_endTime - _startTime) )
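Before committing to an 8-12 hour run, it is worth sanity-checking grads_fun against a finite-difference estimate on a small slice of the corpus (e.g., rebuild the data from a few hundred sequences first, since approx_fprime evaluates opt_fun once per weight). A quick sketch, assuming the definitions above are in scope; approx_fprime is from scipy.optimize:
from scipy.optimize import approx_fprime
# Compare the analytic gradient with a numerical one at the starting point.
args_ = (features_, featureHashTable_, defineLabelTypes_, data_X_, data_XY_)
numeric_ = approx_fprime(weights_, lambda w: opt_fun(w, *args_), 1e-6)
analytic_ = grads_fun(weights_, *args_)
print('max abs gradient diff:', np.abs(numeric_ - analytic_).max())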
Now segment the PKU test corpus. This code is very similar to the HMM segmenter. Note that the probability here is computed with the factorization \( p(q_i \mid q_{i-1}, o) \approx p(q_i \mid q_{i-1}) \cdot p(q_i \mid o) \), and, as in the HMM case, products of probabilities are replaced by sums of log probabilities; since the logarithm is monotonically increasing, this does not change which path maximizes the score.
# -*- coding: utf-8 -*-
import sys, re, os, pickle, math
import numpy as np

sys.argv.append('./gold/pku_training_words.utf8')
sys.argv.append('./training/pku_training.utf8')
sys.argv.append('./testing/pku_test.utf8')
sys.argv.append('train')
assert len(sys.argv) >= 4
test_filename = sys.argv[3]
with open(test_filename, 'rt', encoding='utf8') as f:
    test = f.readlines()
with open('memm_model.pkl', 'rb') as f:
    matrix, features = pickle.load(f)

# Index the trained model: templates, feature weights, and the label set.
templates = {}
feature2weight = {}
labels = set()
for (template, value, label), proba in features:
    key = str(template)
    if key not in templates:
        templates[key] = template
    key = str((template, value, label))
    if key not in feature2weight:
        feature2weight[key] = proba
    labels.add(label)
templates = templates.values()

cache = {}
def memm_log_proba(x, y):
    # log p_w(y|x) for x = (observation, index); results are memoized.
    key = str((x, y))
    if key in cache: return cache[key]
    sequence, index = dict(enumerate(x[0])), x[1]
    values = [(template, [sequence.get(index + pos, None) for pos in template]) for template in templates]
    ally = dict([ (_label, math.exp( sum([feature2weight.get(str((value[0], value[1], _label)), 0.0) for value in values]))) for _label in labels])
    log_proba = math.log( ally[y] / sum(ally.values()) )
    cache[key] = log_proba
    return log_proba

def model_log_proba(label, prev_label, word):
    # log p(q_i|o) + log p(q_i|q_{i-1}); clamp the transition to avoid log(0).
    proba = memm_log_proba(word, label) + math.log(max(sys.float_info.min, matrix[prev_label][label]))
    return proba

def viterbi(observation):
    T = len(observation)
    delta = [None] * (T + 1)
    psi = [None] * (T + 1)
    delta[0] = dict([(i, model_log_proba(i, None, (observation, 0))) for i in labels])
    psi[0] = dict( [ (i, None) for i in labels ] )
    for t in range(1, len(observation)):
        Ot = observation, t
        delta[t] = dict([(j, max([delta[t-1][i] + model_log_proba(j, i, Ot) for i in labels])) for j in labels])
        psi[t] = dict([(j, max([(delta[t-1][i] + model_log_proba(j, i, Ot), i) for i in labels])[1]) for j in labels ])
    delta[T] = max( [ delta[T-1][i] for i in labels ] )
    psi[T] = max( [(delta[T-1][i], i) for i in labels ] )[1]
    # Backtrack: psi[t] maps each label at step t to the best label at step t-1.
    q = [None] * (T + 1)
    q[T] = psi[T]
    for t in range(T-1, -1, -1):
        q[t] = psi[t][q[t+1]]
    return q[1:]

for sent in test:
    sequence = viterbi( list(sent) )
    segment = []
    for char, tag in zip(sent, sequence):
        if tag == 'B':
            segment.append(char)       # start a new word
        elif tag == 'M':
            segment[-1] += char        # extend the current word
        elif tag == 'E':
            segment[-1] += char        # close the current word
        elif tag == 'S':
            segment.append(char)       # single-character word
        else:
            raise Exception()
    print(' '.join(segment), sep='', end='')
    #break
7.2 Segmentation performance comparison on the PKU test set
Better performance could likely be obtained by tuning the feature templates, but since each training run is so slow and this is only an experiment, we leave it as is.
algorithm | P | R | F | OOV | OOV Recall | IV Recall |
---|---|---|---|---|---|---|
maximum matching | 0.836 | 0.904 | 0.869 | 0.058 | 0.059 | 0.956 |
maximum probability | 0.859 | 0.919 | 0.888 | 0.058 | 0.085 | 0.970 |
HMM | 0.804 | 0.789 | 0.796 | 0.058 | 0.364 | 0.815 |
Full Second Order HMM | 0.824 | 0.799 | 0.811 | 0.058 | 0.360 | 0.825 |
MEMM | 0.909 | 0.891 | 0.900 | 0.058 | 0.383 | 0.923 |
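For reference, the P, R, and F columns follow the standard span-matching evaluation for segmentation: each segmentation is converted into character-offset intervals, and a predicted word counts as correct only if the identical interval appears in the gold standard. A minimal sketch with illustrative helper names (to_spans, prf), not the official SIGHAN score script:
def to_spans(words):
    # Convert a word list into a set of (start, end) character intervals.
    spans, start = set(), 0
    for w in words:
        spans.add((start, start + len(w)))
        start += len(w)
    return spans

def prf(gold_words, pred_words):
    gold, pred = to_spans(gold_words), to_spans(pred_words)
    correct = len(gold & pred)
    p = correct / len(pred)   # precision
    r = correct / len(gold)   # recall
    return p, r, 2 * p * r / (p + r)

# Toy example using the sample from 7.1:
print(prf(['以及', '近', '千', '名'], ['以', '及', '近', '千', '名']))
# -> (0.6, 0.75, 0.666...)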
References:
[1] 李航. 统计学习方法.
[2] Jorge Nocedal, Stephen J. Wright. Numerical Optimization, 2nd ed., §7.2 "Limited-Memory Quasi-Newton Methods".