6. Limited-memory quasi-Newton (L-BFGS)
牛顿方法需要计算\( Hessian \)矩阵及其逆,为方便计算和减少内存使用,使用L-BFGS算法优化.
最大熵模型:
\[ \begin{align} p(y|x) &= \frac{\exp \left[ \sum_{i=1} w_i f_i(x,y) \right] }{ Z_w(x) } \\
&= \frac{ \exp \left[ \sum_{i=1} w_i f_i(x,y) \right] }{ \sum_y \exp \left[ \sum_{i=1} w_i f_i(x,y) \right] }
\end{align} \]
目标优化函数:
\[ \begin{align}
\min_w f(w) &= -\sum_{x,y} \tilde{p}(x,y) \log p(y|x) \\
&= \sum_{x} \tilde{p}(x) \log Z_w(x) - \sum_{x,y} \tilde{p}(x,y) \sum_{i=1} w_i f_i(x,y)
\end{align} \]
梯度:
\[ \nabla f(w) = \left[\frac{\partial f(w)}{\partial w_1},\frac{\partial f(w)}{\partial w_2},\frac{\partial f(w)}{\partial w_3},\cdots \right]^T \]
\[ \frac{\partial f(w)}{\partial w_i} = \sum_{x,y} \tilde{p}(x) p_w(y|x) f_i(x,y) - \sum_{x,y} \tilde{p}(x,y) f_i(x,y) \]
7. 代码实现和效果
7.1 代码实现
转换pku语料格式,空行表示换行,这么处理的目的是为了方便后续统计
# -*- coding: utf-8 -*-
"""Convert the PKU segmentation corpus into character-per-line BMES format.

The input file separates words with two spaces.  For every input line we
print one ``char\tlabel`` pair per character and an empty line to mark the
original line break, which makes later per-character statistics easy.
"""
import re

import numpy as np


def readfileBylines(filename):
    """Return all lines of a UTF-8 text file."""
    with open(filename, 'rt', encoding='utf8') as f:
        return f.readlines()


def bmes_tags(word):
    """Return the BMES tag sequence for one segmented word.

    A single character is tagged 'S'; longer words get 'B' on the first
    character, 'E' on the last and 'M' in between.  An empty token (which
    re.split can produce) yields no tags.
    """
    n = len(word)
    if n == 0:
        return []
    if n == 1:
        return ['S']
    return ['B'] + ['M'] * (n - 2) + ['E']


def main(filename='./training/pku_training.utf8'):
    """Convert *filename* and print the tagged corpus to stdout."""
    for line in readfileBylines(filename):
        # sentence characters with the word separators removed
        sent = line.replace(' ', '')
        label = []
        # words are separated by exactly two spaces in the PKU corpus
        for word in re.split(r'\s{2}', line):
            label.extend(bmes_tags(word))
        # [:-1] drops the trailing newline character and its spurious tag
        sent, label = np.asarray(list(sent))[:-1], np.asarray(label)[:-1]
        for s, l in zip(sent, label):
            print('%s\t%s' % (s, l))
        print('')


if __name__ == '__main__':
    main()
转换后格式如下, 输入序列和标签用\t分隔,空行为原输入的换行.
字符/词\t标签 以 B 及 E 近 S 千 S 名 S |
接着用此语料和模板生成模型并训练.这个过程时间很长,需要大约8-12小时(i7 4790k),这是试验性质(CPython受GIL限制,多线程无法并行计算,实在太慢),如生产环境使用更改为c++代码并使用openmp/openmpi性能可提高很多.
# -*- coding: utf-8 -*-
"""Train a MEMM for Chinese word segmentation.

Reads the BMES-tagged corpus produced by the conversion script, extracts
features from character-offset templates, fits maximum-entropy weights
with L-BFGS (scipy.optimize.fmin_l_bfgs_b) and pickles the transition
matrix together with the weighted features to 'memm_model.pkl'.
"""
import math
import pickle
import time

import numpy as np
from scipy.optimize import fmin_l_bfgs_b


def readfileBylines(filename):
    """Return all lines of a UTF-8 text file."""
    with open(filename, 'rt', encoding='utf8') as f:
        return f.readlines()


def getTextSequences(lines):
    """Parse 'char<TAB>label' lines into sequences.

    Sequences are separated by lines without a tab.  Each sequence is
    returned as {index: (char, label)} so templates can look up relative
    offsets with dict.get and an out-of-range default.
    """
    sequences, result = [[[], []]], []
    for line in lines:
        parts = line.split('\t')
        if len(parts) == 2:
            word, label = parts
            sequences[-1][0].append(word)
            sequences[-1][1].append(label.strip())
        else:
            # blank (or malformed) line: start a new sequence
            sequences.append([[], []])
    for chars, labels in sequences:
        if len(chars) == len(labels) > 0:
            result.append(dict(enumerate(zip(chars, labels))))
    return result


def statistics_transition_matrix(labelTypes, sequences):
    """Estimate p(label | previous label); row None is the initial state.

    Robustness fix: rows never observed in the data are filled with 0.0
    instead of raising KeyError during normalization.
    """
    matrix = {}
    for sequence in sequences:
        prev_label = None
        for i in range(len(sequence)):
            label = sequence[i][1]
            row = matrix.setdefault(prev_label, {})
            row[label] = row.get(label, 0) + 1
            prev_label = label
    # normalize every row (including the initial-state row None)
    for row_label in [None] + labelTypes:
        row = matrix.setdefault(row_label, {})
        total = sum(row.values())
        for col in labelTypes:
            row[col] = row.get(col, 0.0) / total if total else 0.0
    return matrix


def generate_features(sequences):
    """Scan every position with the offset templates.

    Returns:
        templates        -- list of relative-offset templates
        features         -- list of (template, value, label) triples
        featureHashTable -- str(feature) -> (feature, weight index)
        weights          -- zero-initialised weight vector, one per feature
        X                -- str((template, value)) -> ((template, value), empirical p(x))
        XY               -- str(feature) -> (feature, empirical p(x, y))
    """
    # each template lists character offsets relative to the current index
    templates = [[-2], [-1], [0], [1], [2],
                 [-2, -1], [-1, 0], [0, 1], [1, 2]]
    X, XY, features, total = {}, {}, {}, 0
    for sequence in sequences:
        for i in range(len(sequence)):
            for template in templates:
                value = [sequence.get(i + pos, (None, None))[0]
                         for pos in template]
                x = (template, value)
                # the feature triple doubles as the (x, y) event
                feature = (template, value, sequence[i][1])
                key_x, key_f = str(x), str(feature)
                if key_x not in X:
                    X[key_x] = [x, 0]
                if key_f not in XY:
                    XY[key_f] = [feature, 0]
                if key_f not in features:
                    features[key_f] = feature
                X[key_x][1] += 1
                XY[key_f][1] += 1
                total += 1
    features = list(features.values())
    weights = np.zeros((len(features)))
    featureHashTable = {}
    for i, feature in enumerate(features):
        featureHashTable[str(feature)] = (feature, i)
    # convert raw counts into empirical probabilities
    X = dict([(k, (x, c / total)) for k, (x, c) in X.items()])
    XY = dict([(k, (xy, c / total)) for k, (xy, c) in XY.items()])
    return templates, features, featureHashTable, weights, X, XY


def model_probability(xy, featureHashTable, weights, labelTypes):
    """p_w(y | x) for the event xy = (template, value, y) under *weights*."""
    allyx = {}
    for _y in labelTypes:
        key_f = str((xy[0], xy[1], _y))
        # unseen (x, y) combinations carry weight 0, i.e. exp(0) == 1
        allyx[_y] = (math.exp(weights[featureHashTable[key_f][1]])
                     if key_f in featureHashTable else 1.0)
    return allyx[xy[2]] / sum(allyx.values())


def opt_fun(weights, features, featureHashTable, labelTypes, data_x, data_xy):
    """Objective: negative log-likelihood -sum ~p(x,y) log p_w(y|x)."""
    return -1.0 * sum([p * math.log(model_probability(xy, featureHashTable,
                                                      weights, labelTypes))
                       for xy, p in data_xy.values()])


def grads_fun(weights, features, featureHashTable, labelTypes, data_x, data_xy):
    """Gradient: df/dw_i = sum ~p(x) p_w(y|x) f_i - sum ~p(x,y) f_i."""
    return np.asarray(
        [data_x[str((features[i][0], features[i][1]))][1]
         * model_probability(features[i], featureHashTable, weights, labelTypes)
         - data_xy[str(features[i])][1]
         for i in range(len(features))])


def main():
    """Train on './memm_train_text.txt' and pickle the model."""
    _starTime = time.time()
    filename_ = './memm_train_text.txt'
    defineLabelTypes_ = ['B', 'M', 'E', 'S']
    sequences_ = getTextSequences(readfileBylines(filename_))
    transition_matrix_ = statistics_transition_matrix(defineLabelTypes_,
                                                      sequences_)
    (templates_, features_, featureHashTable_,
     weights_, data_X_, data_XY_) = generate_features(sequences_)
    print('templates:', len(templates_))
    print('features:', len(features_))

    iter_state = {'n': 0}

    def callback(xk):
        # report the objective value after every L-BFGS iteration
        iter_state['n'] += 1
        v = opt_fun(xk, features_, featureHashTable_, defineLabelTypes_,
                    data_X_, data_XY_)
        print('iter:', iter_state['n'], 'f:', v)

    x, _, _ = fmin_l_bfgs_b(func=opt_fun, x0=weights_, fprime=grads_fun,
                            args=(features_, featureHashTable_,
                                  defineLabelTypes_, data_X_, data_XY_),
                            callback=callback, disp=1)
    features = [(features_[i], x[i]) for i in range(x.shape[0])]
    with open('memm_model.pkl', 'wb') as f:
        pickle.dump((transition_matrix_, features), f)
    _endTime = time.time()
    print('execute time: %fs' % (_endTime - _starTime))


if __name__ == '__main__':
    main()
对pku测试语料分词. 此处代码和hmm分词十分类似. 注意在这里用 \( p(q_i|q_{i-1}) \cdot p(q_i|o) \approx p(q_i|q_{i-1},o) \) 近似计算概率,并且和HMM的处理过程一样,用对数求和替代概率连乘。因对数是单调递增函数,不改变最大值的位置,并不影响结果.
# -*- coding: utf-8 -*-
"""Segment the PKU test corpus with the trained MEMM model.

Loads 'memm_model.pkl' (transition matrix + weighted features) and runs
Viterbi decoding over every input line.  Each step scores, in log space,
log p(q_i|q_{i-1}) + log p(q_i|o) — the product approximation of
p(q_i|q_{i-1}, o).
"""
import math
import os
import pickle
import re
import sys

# NOTE(review): 'numba' was imported here but never used (the only @nb.jit
# reference in the project is a commented-out decorator) — removed.

# default command-line arguments, kept from the original experiment setup
sys.argv.append('./gold/pku_training_words.utf8')
sys.argv.append('./training/pku_training.utf8')
sys.argv.append('./testing/pku_test.utf8')
sys.argv.append('train')
assert len(sys.argv) >= 4
test_filename = sys.argv[3]

with open(test_filename, 'rt', encoding='utf8') as f:
    test = f.readlines()
with open('memm_model.pkl', 'rb') as f:
    matrix, features = pickle.load(f)

# index the pickled model: template set, feature -> weight, label set
templates = {}
feature2weight = {}
labels = set()
for (template, value, label), proba in features:
    key = str(template)
    if key not in templates:
        templates[key] = template
    key = str((template, value, label))
    if key not in feature2weight:
        feature2weight[key] = proba
    labels.add(label)
templates = templates.values()

# memo for per-(position, label) emission log-probabilities
cache = {}


def memm_log_proba(x, y):
    """Return log p_w(y | x), where x = (character list, index); memoised."""
    key = str((x, y))
    if key in cache:
        return cache[key]
    sequence, index = dict(enumerate(x[0])), x[1]
    values = [(template, [sequence.get(index + pos, None) for pos in template])
              for template in templates]
    # unseen features contribute weight 0 (exp(0) == 1)
    ally = dict([(_label,
                  math.exp(sum([feature2weight.get(
                      str((value[0], value[1], _label)), 0.0)
                      for value in values])))
                 for _label in labels])
    log_proba = math.log(ally[y] / sum(ally.values()))
    cache[key] = log_proba
    return log_proba


def model_log_proba(label, prev_label, word):
    """log[p(label|prev_label) * p(label|obs)], with the transition
    probability floored at sys.float_info.min so log never sees zero."""
    return (memm_log_proba(word, label)
            + math.log(max(sys.float_info.min, matrix[prev_label][label])))


def viterbi(observation):
    """Return the most probable label sequence for *observation* (a list
    of characters) via standard Viterbi decoding with back-pointers."""
    T = len(observation)
    delta = [None] * (T + 1)   # best log-score per label at each position
    psi = [None] * (T + 1)     # back-pointers
    delta[0] = dict([(i, model_log_proba(i, None, (observation, 0)))
                     for i in labels])
    psi[0] = dict([(i, None) for i in labels])
    for t in range(1, T):
        Ot = observation, t
        delta[t] = dict([(j, max([delta[t - 1][i] + model_log_proba(j, i, Ot)
                                  for i in labels])) for j in labels])
        psi[t] = dict([(j, max([(delta[t - 1][i] + model_log_proba(j, i, Ot), i)
                                for i in labels])[1]) for j in labels])
    # termination: best final score and its label
    delta[T] = max([delta[T - 1][i] for i in labels])
    psi[T] = max([(delta[T - 1][i], i) for i in labels])[1]
    # follow the back-pointers to recover the path
    q = [None] * (T + 1)
    q[T] = psi[T]
    for t in range(T - 1, -1, -1):
        q[t] = psi[t][q[t + 1]]
    return q[1:]


for sent in test:
    sequence = viterbi(list(sent))
    segment = []
    for char, tag in zip(sent, sequence):
        if tag == 'B' or tag == 'S':
            # start a new word
            segment.append(char)
        elif tag == 'M' or tag == 'E':
            # continue the current word
            segment[-1] += char
        else:
            raise Exception()
    print(' '.join(segment), sep='', end='')
7.2 pku测试算法分词性能对比
这里可通过调整特征模板得到更好的性能,但训练一次太慢,试验性质,也就将就了.
algorithm | P | R | F | OOV | OOV Recall | IV Recall |
---|---|---|---|---|---|---|
maximum matching | 0.836 | 0.904 | 0.869 | 0.058 | 0.059 | 0.956 |
maximum probability | 0.859 | 0.919 | 0.888 | 0.058 | 0.085 | 0.970 |
HMM | 0.804 | 0.789 | 0.796 | 0.058 | 0.364 | 0.815 |
Full Second Order HMM | 0.824 | 0.799 | 0.811 | 0.058 | 0.360 | 0.825 |
MEMM | 0.909 | 0.891 | 0.900 | 0.058 | 0.383 | 0.923 |
参考:
[1] 李航; 统计学习方法
[2] Jorge Nocedal Stephen J. Wright; 《Numerical Optimization》 Second Edition 7.2 LIMITED-MEMORY QUASI-NEWTON METHODS
Leave a Reply