3 Finding the Most Probable State Sequence
Given the model parameters \(\lambda\) and an observation sequence \(O\), we want the state sequence \(Q\) that best explains the observations.
3.1 Viterbi
The objective is:
\[ \bar{Q} = \underset{Q}{\arg \max} P(Q, O | \lambda) \]
- Definition\( ^{[2]} \)
\[ \begin{align}
\delta_t(i) &= \underset{q_1,q_2,\cdots,q_{t-1}}{\max} P(q_1,q_2,\cdots,q_t=i, O_1, O_2, \cdots, O_t | \lambda) \\
\delta_{t+1}(j) &= \underset{i}{\max} [\delta_t(i) a_{ij}] b_j(O_{t+1})
\end{align} \]
At each step, the best probability for a state is the maximum over the previous step's probabilities multiplied by the corresponding transition probabilities; as long as we also record which previous state \( S_i \) produced that maximum, the most probable state sequence can be recovered at the end (see the worked example after this list).
- Initialization
\[ \begin{align}
\delta_1(i) &= \pi_i b_i(O_1) \\
\psi_1(i) &= \text{null}
\end{align} \]
The first time step has no predecessor state, so \( \psi_1 \) is empty.
- Recursion
\[ \begin{align}
\delta_t(j) &= \underset{i}{\max} [\delta_{t-1}(i) a_{ij}] b_j(O_t) \\
\psi_t(j) &= \underset{i}{\arg \max} [\delta_{t-1}(i) a_{ij}]
\end{align} \]
- Termination
\[ \begin{align}
\bar{P} &= \underset{i}{\max} \delta_T(i) \\
\bar{q}_T &= \underset{i}{\arg \max} \delta_T(i)
\end{align} \]
\( \bar{P} \) is the joint probability of the best state path and the observation sequence, i.e. \( \underset{Q}{\max} P(Q, O | \lambda) \).
- Backtracking to recover the most probable state path
\[ \bar{q}_t = \psi_{t+1}( \bar{q}_{t+1} ) \qquad \text{where} \qquad t=T-1,T-2,\cdots,2,1 \]
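As a minimal worked example (the two-state model and all numbers below are invented purely for illustration), take \( \pi = (0.6, 0.4) \), transitions \( a_{11}=0.7,\ a_{12}=0.3,\ a_{21}=0.4,\ a_{22}=0.6 \), and two observations with emission probabilities \( b_1(O_1)=0.5,\ b_2(O_1)=0.1,\ b_1(O_2)=0.4,\ b_2(O_2)=0.3 \). Then
\[ \begin{align}
\delta_1(1) &= 0.6 \times 0.5 = 0.30, & \delta_1(2) &= 0.4 \times 0.1 = 0.04 \\
\delta_2(1) &= \max(0.30 \times 0.7,\ 0.04 \times 0.4) \times 0.4 = 0.084, & \psi_2(1) &= 1 \\
\delta_2(2) &= \max(0.30 \times 0.3,\ 0.04 \times 0.6) \times 0.3 = 0.027, & \psi_2(2) &= 1 \\
\bar{P} &= \max(0.084,\ 0.027) = 0.084, & \bar{q}_2 &= 1, \quad \bar{q}_1 = \psi_2(\bar{q}_2) = 1
\end{align} \]
so the most probable state path is \( (1, 1) \).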
3.2 Numerical Stability
To avoid underflow, simply replace the chain of multiplications with a sum of logarithms\( ^{[1]} \):
\[ \log \delta_{t+1}(j) = \underset{i}{\max} [\log \delta_t(i) + \log a_{ij}] + \log b_j(O_{t+1}) \]
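To see why this matters, here is a quick standalone check (independent of the experiment in 3.3): the product of a few hundred small probabilities underflows double precision to exactly 0, while the corresponding sum of logarithms remains an ordinary number.
import numpy as np
p = np.full(500, 0.1)     # 500 probabilities of 0.1 each
print(np.prod(p))         # 0.0 -- 1e-500 underflows float64
print(np.sum(np.log(p)))  # about -1151.3, no underflow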
3.3 Code Experiment
import numpy as np
np.random.seed(42)
# T: length of observation
# N: number of states in the model
# M: number of distinct observation symbols
g_T, g_N, g_M = 30,10,20
# observation sequence
g_O = np.random.randint(low=0, high=g_M, size=g_T).flatten() # O = { O_1,O_2,\cdots,O_T }
# initial state probabilities
g_Pi = np.random.dirichlet(np.ones(g_N), size=1).flatten() # \Pi = [\pi_i]
# state transition probabilities
g_A = np.random.dirichlet(np.ones(g_N), size=g_N) # A = [a_{ij}]
# observation emission probabilities
g_B = np.random.dirichlet(np.ones(g_M), size=g_N) # B = [b_j(m)]
# the parameters of the model
g_Lambda = (g_A, g_B, g_Pi) # \lambda = (A,B,\Pi)
# the joint probability of a state sequence and an observation sequence given the model
def P(Q, O, Lambda):
    r''' P(Q,O | \lambda) = \prod^T_{t=1} p(q_t|q_{t-1}, \lambda) \, p(O_t|q_t, \lambda) '''
    A, B, Pi = Lambda
    T = O.shape[0]
    assert( Q.shape == O.shape )
    assert( A.shape[0] == A.shape[1] == B.shape[0] == Pi.shape[0] )
    assert( Q.min() >= 0 and Q.max() < A.shape[0] )
    assert( O.min() >= 0 and O.max() < B.shape[1] )
    return np.prod([(A[Q[t-1], Q[t]] if t > 0 else Pi[Q[t]]) * B[Q[t], O[t]] for t in range(T)])
# the joint log probability of a state sequence and an observation sequence given the model
def logP(Q, O, Lambda):
    r''' \log P(Q,O | \lambda) = \sum^T_{t=1} [\log p(q_t|q_{t-1}, \lambda) + \log p(O_t|q_t, \lambda)] '''
    A, B, Pi = Lambda
    T = O.shape[0]
    assert( Q.shape == O.shape )
    assert( A.shape[0] == A.shape[1] == B.shape[0] == Pi.shape[0] )
    assert( Q.min() >= 0 and Q.max() < A.shape[0] )
    assert( O.min() >= 0 and O.max() < B.shape[1] )
    logPi, logA, logB = np.log(Pi), np.log(A), np.log(B)
    return np.sum([(logA[Q[t-1], Q[t]] if t > 0 else logPi[Q[t]]) + logB[Q[t], O[t]] for t in range(T)])
# find the best state sequence given the parameters of the model and an observation sequence
def Viterbi(O, Lambda):
    A, B, Pi = Lambda
    T = O.shape[0]
    N = A.shape[0]
    assert( A.shape[0] == A.shape[1] == B.shape[0] == Pi.shape[0] )
    assert( O.min() >= 0 and O.max() < B.shape[1] )
    delta = np.zeros(shape=(T, N))
    psi = np.zeros(shape=(T, N), dtype=int)
    for t in range(T):
        # index 0 => t_1, 1 => t_2, ..., T-1 => t_T
        if t == 0:
            # \delta_1(i) = \pi_i b_i(O_1)
            delta[t] = Pi * B[:, O[t]]
            # \psi_1(i) = null: no predecessor, use 0 as a placeholder in the int array
            psi[t] = 0
        else:
            # \delta_t(j) = \max_i [\delta_{t-1}(i) a_{ij}] b_j(O_t)
            # \psi_t(j) = \arg\max_i [\delta_{t-1}(i) a_{ij}]
            for j in range(N):
                max_delta = max([(delta[t-1, i] * A[i, j] * B[j, O[t]], i) for i in range(N)], key=lambda x: x[0])
                delta[t, j] = max_delta[0]
                psi[t, j] = max_delta[1]
    Q = np.zeros(shape=T, dtype=int)
    # \bar{P} = \max_i \delta_T(i)
    # \bar{q}_T = \arg\max_i \delta_T(i)
    P = np.max(delta[T-1])
    Q[T-1] = np.argmax(delta[T-1])
    # backtracking: \bar{q}_t = \psi_{t+1}(\bar{q}_{t+1}), t = T-1, T-2, ..., 1
    for t in range(T-2, -1, -1):
        Q[t] = psi[t+1, Q[t+1]]
    return Q, P
viterbi_Q, viterbi_P = Viterbi(g_O, g_Lambda)
# find the best state sequence in the log domain given the parameters of the model and an observation sequence
def logViterbi(O, Lambda):
    logA, logB, logPi = np.log(Lambda[0]), np.log(Lambda[1]), np.log(Lambda[2])
    T = O.shape[0]
    N = logA.shape[0]
    assert( logA.shape[0] == logA.shape[1] == logB.shape[0] == logPi.shape[0] )
    assert( O.min() >= 0 and O.max() < logB.shape[1] )
    logdelta = np.zeros(shape=(T, N))
    logpsi = np.zeros(shape=(T, N), dtype=int)
    for t in range(T):
        # index 0 => t_1, 1 => t_2, ..., T-1 => t_T
        if t == 0:
            # \log\delta_1(i) = \log\pi_i + \log b_i(O_1)
            logdelta[t] = logPi + logB[:, O[t]]
            # \psi_1(i) = null: no predecessor, use 0 as a placeholder in the int array
            logpsi[t] = 0
        else:
            # \log\delta_t(j) = \max_i [\log\delta_{t-1}(i) + \log a_{ij}] + \log b_j(O_t)
            # \psi_t(j) = \arg\max_i [\log\delta_{t-1}(i) + \log a_{ij}]
            for j in range(N):
                max_logdelta = max([(logdelta[t-1, i] + logA[i, j] + logB[j, O[t]], i) for i in range(N)], key=lambda x: x[0])
                logdelta[t, j] = max_logdelta[0]
                logpsi[t, j] = max_logdelta[1]
    logQ = np.zeros(shape=T, dtype=int)
    # \log\bar{P} = \max_i \log\delta_T(i)
    # \bar{q}_T = \arg\max_i \log\delta_T(i)
    logP = np.max(logdelta[T-1])
    logQ[T-1] = np.argmax(logdelta[T-1])
    # backtracking: \bar{q}_t = \psi_{t+1}(\bar{q}_{t+1}), t = T-1, T-2, ..., 1
    for t in range(T-2, -1, -1):
        logQ[t] = logpsi[t+1, logQ[t+1]]
    return logQ, logP
logviterbi_Q, logviterbi_P = logViterbi(g_O, g_Lambda)
assert( np.array_equal(logviterbi_Q, viterbi_Q) )
print('\\bar{P} = ', viterbi_P)
print('P(Q, O) = ', P(viterbi_Q, g_O, g_Lambda))
print('Q = ', viterbi_Q)
print('\\log \\bar{P} = ', logviterbi_P)
print('\\log P(Q, O) = ', logP(logviterbi_Q, g_O, g_Lambda))
print('\\log Q = ', logviterbi_Q)
Output: \[ \begin{align} \bar{P} &= 9.511542024402762e-53 \\ P(Q, O) &= 9.511542024402753e-53 \\ Q &= [3 7 8 2 3 9 1 8 2 2 3 9 1 3 9 1 6 0 1 9 7 1 8 2 3 4 4 5 0 1] \\ \log \bar{P} &= -119.78450391759523 \\ \log P(Q, O) &= -119.78450391759522 \\ \log Q &= [3 7 8 2 3 9 1 8 2 2 3 9 1 3 9 1 6 0 1 9 7 1 8 2 3 4 4 5 0 1] \end{align} \]
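As a design note, the inner loop over the previous state \(i\) in Viterbi / logViterbi can be replaced by NumPy broadcasting along the state axis. The sketch below is only an illustrative variant (the name logViterbiVec is made up here); it reuses g_O and g_Lambda from the experiment above and should produce the same path and log probability.
def logViterbiVec(O, Lambda):
    ''' log-domain Viterbi with the maximization over previous states vectorized '''
    A, B, Pi = Lambda
    logA, logB, logPi = np.log(A), np.log(B), np.log(Pi)
    T, N = O.shape[0], A.shape[0]
    logdelta = np.zeros(shape=(T, N))
    psi = np.zeros(shape=(T, N), dtype=int)
    logdelta[0] = logPi + logB[:, O[0]]
    for t in range(1, T):
        # scores[i, j] = \log\delta_{t-1}(i) + \log a_{ij}
        scores = logdelta[t-1][:, None] + logA
        psi[t] = np.argmax(scores, axis=0)
        logdelta[t] = np.max(scores, axis=0) + logB[:, O[t]]
    Q = np.zeros(shape=T, dtype=int)
    Q[T-1] = np.argmax(logdelta[T-1])
    for t in range(T-2, -1, -1):
        Q[t] = psi[t+1, Q[t+1]]
    return Q, np.max(logdelta[T-1])
assert np.array_equal(logViterbiVec(g_O, g_Lambda)[0], logviterbi_Q)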
References:
[1] Rabiner, L. R. 1989. “A Tutorial on Hidden Markov Models and Selected Applications in Speech Recognition.” Proceedings of the IEEE 77:257–286.
[2] Jurafsky, D., and J. H. Martin. Speech and Language Processing, §15.3 Hidden Markov Models, p. 421.