HMM-3: Finding the Most Probable State Sequence (Viterbi)

3 Finding the Most Probable State Sequence

Given the model parameters \(\lambda\) and an observation sequence \(O\), find the state sequence \(Q\) that best explains the observations.

3.1 Viterbi

The objective is:

\[ \bar{Q} = \underset{Q}{\arg \max} P(Q, O | \lambda) \]

  • Definition\( ^{[2]} \)

\[ \begin{align}
\delta_t(i) &= \underset{q_1,q_2,\cdots,q_{t-1}}{\max} P(q_1,q_2,\cdots,q_t=i, O_1, O_2, \cdots, O_t | \lambda) \\
\delta_{t+1}(j) &= \underset{i}{\max} [\delta_t(i) a_{ij}] b_j(O_{t+1})
\end{align} \]

At each step, the new best probability for state \( j \) is the maximum, over all previous states \( i \), of the previous best probability \( \delta_t(i) \) times the transition probability \( a_{ij} \); this is the forward recursion with the sum replaced by a max. As long as we also record which state \( S_i \) achieved that maximum at each step, we can recover the most probable state sequence (a worked two-state example follows the derivation below).

  • Initialization

\[ \begin{align}
\delta_1(i) &= \pi_i b_i(O_1) \\
\psi_1(i) &= \text{null}
\end{align} \]

The first time step has no predecessor state, so \( \psi_1 \) is empty.

  • Recursion

\[ \begin{align}
\delta_t(j) &= \underset{i}{\max} [\delta_{t-1}(i) a_{ij}] b_j(O_t) \\
\psi_t(j) &= \underset{i}{\arg \max} [\delta_{t-1}(i) a_{ij}]
\end{align} \]

  • Termination

\[ \begin{align}
\bar{P} &= \underset{i}{\max} \delta_T(i) \\
\bar{q}_T &= \underset{i}{\arg \max} \delta_T(i)
\end{align} \]

\( \bar{P} \) is the joint probability of the best state path and the observation sequence, i.e. \( \underset{Q}{\max} P(Q, O | \lambda) \).

  • Backtrack to recover the most probable state sequence

\[ \bar{q}_t = \psi_{t+1}( \bar{q}_{t+1} ) \qquad \text{where} \qquad t=T-1,T-2,\cdots,2,1 \]
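As a concrete illustration (toy numbers of my own, not taken from the references), consider \( N = 2 \) states and two observations with \( \pi = (0.6, 0.4) \), \( a_{11}=0.7, a_{12}=0.3, a_{21}=0.4, a_{22}=0.6 \), and emissions \( b_1(O_1)=0.5, b_2(O_1)=0.1, b_1(O_2)=0.2, b_2(O_2)=0.8 \):

\[ \begin{align}
\delta_1(1) &= 0.6 \cdot 0.5 = 0.30 & \delta_1(2) &= 0.4 \cdot 0.1 = 0.04 \\
\delta_2(1) &= \max(0.30 \cdot 0.7,\ 0.04 \cdot 0.4) \cdot 0.2 = 0.042 & \psi_2(1) &= 1 \\
\delta_2(2) &= \max(0.30 \cdot 0.3,\ 0.04 \cdot 0.6) \cdot 0.8 = 0.072 & \psi_2(2) &= 1
\end{align} \]

Termination gives \( \bar{P} = 0.072 \) and \( \bar{q}_2 = 2 \); backtracking gives \( \bar{q}_1 = \psi_2(2) = 1 \), so the most probable path is \( 1 \to 2 \).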

3.2 Numerical Stability

Replace the long product with a sum of logarithms\( ^{[1]} \):

\[ \log \delta_{t+1}(j) = \underset{i}{\max} [\log \delta_t(i) + \log a_{ij}] + \log b_j(O_{t+1}) \]
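To see why this matters, here is a quick sketch (toy values, my own illustration, not part of the experiment below): in double precision, a product of a few hundred small probabilities underflows to zero, while the corresponding log-space sum stays finite.

import numpy as np

probs = np.full(500, 0.1)     # 500 factors of 0.1, i.e. 1e-500
print(np.prod(probs))         # 0.0 -- underflows float64 (smallest normal ~1e-308)
print(np.sum(np.log(probs)))  # -1151.29... -- finite and stable in log space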

3.3 Code Experiment

import numpy as np

np.random.seed(42)

# T: length of the observation sequence
# N: number of states in the model
# M: number of distinct observation symbols
g_T, g_N, g_M = 30, 10, 20
# observation sequence
g_O = np.random.randint(low=0, high=g_M, size=g_T).flatten() # O = { O_1,O_2,\cdots,O_T } 
# initial state probabilities
g_Pi = np.random.dirichlet(np.ones(g_N), size=1).flatten() # \Pi = [\pi_i]
# state transition probabilities
g_A = np.random.dirichlet(np.ones(g_N), size=g_N) # A = [a_{ij}]
# observation emission probabilities
g_B = np.random.dirichlet(np.ones(g_M), size=g_N) # B = [b_j(m)]
# the parameters of the model
g_Lambda = (g_A, g_B, g_Pi) # \lambda = (A,B,\Pi)

# the probability of the model given observation sequence and state sequence
def P(Q, O, Lambda):
  r''' P(Q,O | \lambda) = \prod^T_{t=1} p(q_t | q_{t-1}, \lambda) p(O_t | q_t, \lambda) '''
  A, B, Pi = Lambda
  T = O.shape[0]
  assert( Q.shape == O.shape )
  assert( A.shape[0] == A.shape[1] == B.shape[0] == Pi.shape[0] )
  assert( Q.min() >= 0 and Q.max() <  A.shape[0] )
  assert( O.min() >= 0 and O.max() <  B.shape[1] )
  return np.prod([(A[Q[t-1], Q[t]] if t>0 else Pi[Q[t]]) * B[Q[t],O[t]] for t in range(T)])

# the log probability of the model given observation sequence and state sequence
def logP(Q, O, Lambda):
  r''' \log P(Q,O | \lambda) = \sum^T_{t=1} [ \log p(q_t | q_{t-1}, \lambda) + \log p(O_t | q_t, \lambda) ] '''
  A, B, Pi = Lambda
  T = O.shape[0]
  assert( Q.shape == O.shape )
  assert( A.shape[0] == A.shape[1] == B.shape[0] == Pi.shape[0] )
  assert( Q.min() >= 0 and Q.max() <  A.shape[0] )
  assert( O.min() >= 0 and O.max() <  B.shape[1] )
  logPi, logA, logB = np.log(Pi), np.log(A), np.log(B)
  return np.sum([(logA[Q[t-1], Q[t]] if t>0 else logPi[Q[t]]) + logB[Q[t],O[t]] for t in range(T)])
 
# find the best state sequence in the model given the parameters of the model and the observation sequence
def Viterbi(O, Lambda):
  A, B, Pi = Lambda
  T = O.shape[0]
  N = A.shape[0]
  assert( A.shape[0] == A.shape[1] == B.shape[0] == Pi.shape[0] )
  assert( O.min() >= 0 and O.max() <  B.shape[1] )  
  delta = np.zeros(shape=(T, N))
  psi = np.zeros(shape=(T, N), dtype=int)  # np.int was removed in NumPy 1.24; use the builtin int
  for t in range(T):
    ''' 0=>t1, 1=>t2, ...,T-1=>T '''
    if t == 0:
      # \delta_1(i) = \pi_i b_i(O_1)
      delta[t] = Pi * B[:, O[t]]
      # \psi_1(i) = null; NaN cannot be stored in an integer array, so use -1 as a sentinel
      psi[t] = -1
    else:
      # \delta_t(j) &= \underset{i}{\max} [\delta_{t-1}(i) a_{ij}] b_j(O_t)
      # \psi_t(j) &= \underset{i}{\arg \max} [\delta_{t-1}(i) a_{ij}] 
      for j in range(N):
        max_delta = max([(delta[t-1, i] * A[i, j] * B[j, O[t]], i) for i in range(N)], key=lambda x:x[0])
        delta[t, j] = max_delta[0]
        psi[t, j] = max_delta[1]

  Q = np.zeros(shape=T, dtype=int)
  # \bar{P} &= \underset{i}{\max} \delta_T(i) 
  # \bar{q}_T &= \underset{i}{\arg \max} \delta_T(i) 
  P = np.max(delta[T-1])
  Q[T-1] = np.argmax(delta[T-1])
  # \bar{q}_t = \psi_{t+1}( \bar{q}_{t+1} ) \qquad \text{where} \qquad t=T-1,T-2,\cdots,2,1
  for t in range(T-2, -1, -1):
    Q[t] = psi[t+1, Q[t+1]]
    
  return Q, P


viterbi_Q, viterbi_P = Viterbi(g_O, g_Lambda)

# find the best state sequence using logs in the model given the parameters of the model and the observation sequence
def logViterbi(O, Lambda):
  logA, logB, logPi = np.log(Lambda[0]), np.log(Lambda[1]), np.log(Lambda[2])
  T = O.shape[0]
  N = logA.shape[0]
  assert( logA.shape[0] == logA.shape[1] == logB.shape[0] == logPi.shape[0] )
  assert( O.min() >= 0 and O.max() <  logB.shape[1] )  
  logdelta = np.zeros(shape=(T, N))
  logpsi = np.zeros(shape=(T, N), dtype=int)
  for t in range(T):
    ''' 0=>t1, 1=>t2, ...,T-1=>T '''
    if t == 0:
      # \log \delta_1(i) = \log \pi_i + \log b_i(O_1)
      logdelta[t] = logPi + logB[:, O[t]]
      # \psi_1(i) = null; -1 sentinel, as above
      logpsi[t] = -1
    else:
      # \log \delta_t(j) = \underset{i}{\max} [\log \delta_{t-1}(i) + \log a_{ij}] + \log b_j(O_t)
      # \psi_t(j) = \underset{i}{\arg \max} [\log \delta_{t-1}(i) + \log a_{ij}]
      for j in range(N):
        max_logdelta = max([(logdelta[t-1, i] + logA[i, j] + logB[j, O[t]], i) for i in range(N)], key=lambda x:x[0])
        logdelta[t, j] = max_logdelta[0]
        logpsi[t, j] = max_logdelta[1]

  logQ = np.zeros(shape=T, dtype=int)
  # \bar{P} &= \underset{i}{\max} \delta_T(i) 
  # \bar{q}_T &= \underset{i}{\arg \max} \delta_T(i) 
  logP = np.max(logdelta[T-1])
  logQ[T-1] = np.argmax(logdelta[T-1])
  # \bar{q}_t = \psi_{t+1}( \bar{q}_{t+1} ) \qquad \text{where} \qquad t=T-1,T-2,\cdots,2,1
  for t in range(T-2, -1, -1):
    logQ[t] = logpsi[t+1, logQ[t+1]]
    
  return logQ, logP

logviterbi_Q, logviterbi_P = logViterbi(g_O, g_Lambda)

assert( np.array_equal(logviterbi_Q, viterbi_Q) )

print('\\bar{P} = ', viterbi_P)
print('P(Q, O) = ', P(viterbi_Q, g_O, g_Lambda))
print('Q = ', viterbi_Q)
print('\\log \\bar{P} = ', logviterbi_P)
print('\\log P(Q, O) = ', logP(logviterbi_Q, g_O, g_Lambda))
print('\\log Q = ', logviterbi_Q)

output: \[ \begin{align} \bar{P} &= 9.511542024402762 \times 10^{-53} \\ P(Q, O) &= 9.511542024402753 \times 10^{-53} \\ Q &= [3\ 7\ 8\ 2\ 3\ 9\ 1\ 8\ 2\ 2\ 3\ 9\ 1\ 3\ 9\ 1\ 6\ 0\ 1\ 9\ 7\ 1\ 8\ 2\ 3\ 4\ 4\ 5\ 0\ 1] \\ \log \bar{P} &= -119.78450391759523 \\ \log P(Q, O) &= -119.78450391759522 \\ \log Q &= [3\ 7\ 8\ 2\ 3\ 9\ 1\ 8\ 2\ 2\ 3\ 9\ 1\ 3\ 9\ 1\ 6\ 0\ 1\ 9\ 7\ 1\ 8\ 2\ 3\ 4\ 4\ 5\ 0\ 1] \end{align} \]
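As an aside, the inner loop over j can be replaced by NumPy broadcasting; the following is a minimal sketch of that refactoring (my own rewrite, intended to produce the same output as logViterbi above):

def logViterbiVec(O, Lambda):
  logA, logB, logPi = np.log(Lambda[0]), np.log(Lambda[1]), np.log(Lambda[2])
  T, N = O.shape[0], logA.shape[0]
  logdelta = np.zeros(shape=(T, N))
  psi = np.zeros(shape=(T, N), dtype=int)
  logdelta[0] = logPi + logB[:, O[0]]
  for t in range(1, T):
    # scores[i, j] = \log \delta_{t-1}(i) + \log a_{ij}
    scores = logdelta[t-1][:, None] + logA
    psi[t] = np.argmax(scores, axis=0)
    logdelta[t] = np.max(scores, axis=0) + logB[:, O[t]]
  Q = np.zeros(shape=T, dtype=int)
  Q[T-1] = np.argmax(logdelta[T-1])
  for t in range(T-2, -1, -1):
    Q[t] = psi[t+1, Q[t+1]]
  return Q, np.max(logdelta[T-1])

assert( np.array_equal(logViterbiVec(g_O, g_Lambda)[0], logviterbi_Q) )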

References:
[1] Rabiner, L. R. 1989. “A Tutorial on Hidden Markov Models and Selected Applications in Speech Recognition.” Proceedings of the IEEE 77:257–286.
[2] Jurafsky, Daniel, and James H. Martin. Speech and Language Processing, §15.3 "Hidden Markov Models," p. 421.
