HMM-3: Finding the Most Probable State Sequence (Viterbi)

3 Finding the Most Probable State Sequence

Given the model parameters \(\lambda\) and the observation sequence \(O\):

3.1 Viterbi

The goal is:

\[ \bar{Q} = \underset{Q}{\arg \max} P(Q, O | \lambda) \]

  • Definition\( ^{[2]} \)

\[ \begin{align}
\delta_t(i) &= \underset{q_1,q_2,\cdots,q_{t-1}}{\max} P(q_1,q_2,\cdots,q_t=i, O_1, O_2, \cdots, O_t | \lambda) \\
\delta_{t+1}(j) &= \underset{i}{\max} [\delta_t(i) a_{ij}] b_j(O_{t+1})
\end{align} \]
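
The recursion is justified by the Markov factorization: extending any length-\(t\) prefix by \(q_{t+1}=j\) multiplies its probability by \(a_{q_t j} b_j(O_{t+1})\), so the maximization over the whole prefix splits into a maximization over the last state \(i\):

\[ \begin{align}
\delta_{t+1}(j) &= \underset{q_1,\cdots,q_t}{\max} P(q_1,\cdots,q_t, q_{t+1}=j, O_1,\cdots,O_{t+1} | \lambda) \\
&= \underset{i}{\max} \Big[ \big( \underset{q_1,\cdots,q_{t-1}}{\max} P(q_1,\cdots,q_t=i, O_1,\cdots,O_t | \lambda) \big)\, a_{ij} \Big] b_j(O_{t+1}) \\
&= \underset{i}{\max} [\delta_t(i)\, a_{ij}]\, b_j(O_{t+1})
\end{align} \]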

At each step, the best probability of being in state \(j\) is the maximum, over all previous states, of the previous best probability times the transition probability (and then times the emission probability). As long as we also record which previous state \(S_i\) achieved each maximum, the most probable state sequence can be recovered by backtracking at the end.

  • Initialization

\[ \begin{align}
\delta_1(i) &= \pi_i b_i(O_1) \\
\psi_1(i) &= \text{null}
\end{align} \]

At the first time step there is no previous state, so \(\psi_1(i)\) is null.

  • Recursion

\[ \begin{align}
\delta_t(j) &= \underset{i}{\max} [\delta_{t-1}(i) a_{ij}] b_j(O_t) \\
\psi_t(j) &= \underset{i}{\arg \max} [\delta_{t-1}(i) a_{ij}]
\end{align} \]

  • Termination

\[ \begin{align}
\bar{P} &= \underset{i}{\max} \delta_T(i) \\
\bar{q}_T &= \underset{i}{\arg \max} \delta_T(i)
\end{align} \]

\( \bar{P} \) is the joint probability of the best state path and the observation sequence, i.e. \( \underset{Q}{\max} P(Q, O | \lambda) \).

  • Backtrack to recover the most probable state path (a compact sketch of the full procedure follows the equation below)

\[ \bar{q}_t = \psi_{t+1}( \bar{q}_{t+1} ) \qquad \text{where} \qquad t=T-1,T-2,\cdots,2,1 \]
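
Putting the four steps together, here is a compact vectorized NumPy sketch; this is my own condensed version, not taken from the references, and it assumes a discrete-emission HMM with the same \(A\), \(B\), \(\Pi\) layout as the experiment in section 3.3:

import numpy as np

def viterbi_vectorized(O, A, B, Pi):
  ''' Most probable state path Q and its joint probability max_Q P(Q, O | lambda). '''
  T, N = len(O), A.shape[0]
  delta = np.zeros((T, N))           # delta[t, j]: best path probability ending in j at time t
  psi = np.zeros((T, N), dtype=int)  # psi[t, j]: best predecessor of state j at time t
  delta[0] = Pi * B[:, O[0]]         # initialization: delta_1(i) = pi_i b_i(O_1)
  for t in range(1, T):
    scores = delta[t - 1][:, None] * A       # scores[i, j] = delta_{t-1}(i) * a_{ij}
    psi[t] = np.argmax(scores, axis=0)
    delta[t] = np.max(scores, axis=0) * B[:, O[t]]
  Q = np.zeros(T, dtype=int)
  Q[-1] = np.argmax(delta[-1])       # termination: \bar{q}_T
  for t in range(T - 2, -1, -1):     # backtracking
    Q[t] = psi[t + 1, Q[t + 1]]
  return Q, np.max(delta[-1])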

3.2 Numerical Stability

To avoid numerical underflow, replace the chain of products with a sum of logarithms\( ^{[1]} \):

\[ \delta_{t+1}(j) = \underset{i}{\max} [\delta_t(i) + \log a_{ij}] + \log b_j(O_{t+1}) \]
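
In code, the log-space recursion is a one-line change to the vectorized update above. A sketch of a single step, assuming logdelta_prev, logA, logB are the elementwise logarithms of \(\delta_{t-1}\), \(A\), \(B\), and o_t is the observed symbol:

import numpy as np

def log_viterbi_step(logdelta_prev, logA, logB, o_t):
  scores = logdelta_prev[:, None] + logA  # scores[i, j] = delta_{t-1}(i) + log a_{ij}
  psi_t = np.argmax(scores, axis=0)       # best predecessor of each state j
  logdelta_t = np.max(scores, axis=0) + logB[:, o_t]
  return logdelta_t, psi_t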

3.3 Code Experiment

import numpy as np
 
np.random.seed(42)
 
# T: length of the observation sequence
# N: number of states in the model
# M: number of distinct observation symbols
g_T, g_N, g_M = 30, 10, 20
# observation sequence
g_O = np.random.randint(low=0, high=g_M, size=g_T).flatten() # O = { O_1,O_2,\cdots,O_T } 
# initial state probabilities
g_Pi = np.random.dirichlet(np.ones(g_N), size=1).flatten() # \Pi = [\pi_i]
# state transition probabilities
g_A = np.random.dirichlet(np.ones(g_N), size=g_N) # A = [a_{ij}]
# observation emission probabilities
g_B = np.random.dirichlet(np.ones(g_M), size=g_N) # B = [b_j(m)]
# the parameters of the model
g_Lambda = (g_A, g_B, g_Pi) # \lambda = (A,B,\Pi)
 
# the probability of the model given observation sequence and state sequence
def P(Q, O, Lambda):
  r''' P(Q,O | \lambda) = \prod^T_{t=1} p(q_t | q_{t-1}, \lambda) p(O_t | q_t, \lambda) '''
  A, B, Pi = Lambda
  T = O.shape[0]
  assert( Q.shape == O.shape )
  assert( A.shape[0] == A.shape[1] == B.shape[0] == Pi.shape[0] )
  assert( Q.min() >= 0 and Q.max() <  A.shape[0] )
  assert( O.min() >= 0 and O.max() <  B.shape[1] )
  return np.prod([(A[Q[t-1], Q[t]] if t>0 else Pi[Q[t]]) * B[Q[t],O[t]] for t in range(T)])
 
# the log probability of the model given observation sequence and state sequence
def logP(Q, O, Lambda):
  r''' \log P(Q,O | \lambda) = \sum^T_{t=1} [ \log p(q_t | q_{t-1}, \lambda) + \log p(O_t | q_t, \lambda) ] '''
  A, B, Pi = Lambda
  T = O.shape[0]
  assert( Q.shape == O.shape )
  assert( A.shape[0] == A.shape[1] == B.shape[0] == Pi.shape[0] )
  assert( Q.min() >= 0 and Q.max() <  A.shape[0] )
  assert( O.min() >= 0 and O.max() <  B.shape[1] )
  logPi, logA, logB = np.log(Pi), np.log(A), np.log(B)
  return np.sum([(logA[Q[t-1], Q[t]] if t>0 else logPi[Q[t]]) + logB[Q[t],O[t]] for t in range(T)])
 
# find the best state sequence, given the model parameters and the observation sequence
def Viterbi(O, Lambda):
  A, B, Pi = Lambda
  T = O.shape[0]
  N = A.shape[0]
  assert( A.shape[0] == A.shape[1] == B.shape[0] == Pi.shape[0] )
  assert( O.min() >= 0 and O.max() <  B.shape[1] )  
  delta = np.zeros(shape=(T, N))
  psi = np.zeros(shape=(T, N), dtype=int)  # np.int was removed in NumPy 1.24; use the builtin int
  for t in range(T):
    # index t=0 corresponds to time 1, ..., index t=T-1 to time T
    if t == 0:
      # \delta_1(i) &= \pi_i b_i(O_1)
      delta[t] = Pi * B[:, O[t]]
      # \psi_1(i) = null: use -1 as a sentinel, since NaN cannot be stored in an integer array
      psi[t] = -1
    else:
      # \delta_t(j) &= \underset{i}{\max} [\delta_{t-1}(i) a_{ij}] b_j(O_t)
      # \psi_t(j) &= \underset{i}{\arg \max} [\delta_{t-1}(i) a_{ij}] 
      for j in range(N):
        max_delta = max([(delta[t-1, i] * A[i, j] * B[j, O[t]], i) for i in range(N)], key=lambda x:x[0])
        delta[t, j] = max_delta[0]
        psi[t, j] = max_delta[1]
 
  Q = np.zeros(shape=T, dtype=int)
  # \bar{P} &= \underset{i}{\max} \delta_T(i) 
  # \bar{q}_T &= \underset{i}{\arg \max} \delta_T(i) 
  P = np.max(delta[T-1])
  Q[T-1] = np.argmax(delta[T-1])
  # \bar{q}_t = \psi_{t+1}( \bar{q}_{t+1} ) \qquad \text{where} \qquad t=T-1,T-2,\cdots,2,1
  for t in range(T-2, -1, -1):
    Q[t] = psi[t+1, Q[t+1]]
 
  return Q, P
 
 
viterbi_Q, viterbi_P = Viterbi(g_O, g_Lambda)
 
# find the best state sequence in log space, given the model parameters and the observation sequence
def logViterbi(O, Lambda):
  logA, logB, logPi = np.log(Lambda[0]), np.log(Lambda[1]), np.log(Lambda[2])
  T = O.shape[0]
  N = logA.shape[0]
  assert( logA.shape[0] == logA.shape[1] == logB.shape[0] == logPi.shape[0] )
  assert( O.min() >= 0 and O.max() <  logB.shape[1] )  
  logdelta = np.zeros(shape=(T, N))
  logpsi = np.zeros(shape=(T, N), dtype=int)
  for t in range(T):
    # index t=0 corresponds to time 1, ..., index t=T-1 to time T
    if t == 0:
      # \delta_1(i) &= \pi_i b_i(O_1)
      logdelta[t] = logPi + logB[:, O[t]]
      # \psi_1(i) = null: use -1 as a sentinel, since NaN cannot be stored in an integer array
      logpsi[t] = -1
    else:
      # \delta_t(j) = \underset{i}{\max} [\delta_{t-1}(i) + \log a_{ij}] + \log b_j(O_t)
      # \psi_t(j) = \underset{i}{\arg \max} [\delta_{t-1}(i) + \log a_{ij}]
      for j in range(N):
        max_logdelta = max([(logdelta[t-1, i] + logA[i, j] + logB[j, O[t]], i) for i in range(N)], key=lambda x:x[0])
        logdelta[t, j] = max_logdelta[0]
        logpsi[t, j] = max_logdelta[1]
 
  logQ = np.zeros(shape=T, dtype=int)
  # \bar{P} &= \underset{i}{\max} \delta_T(i) 
  # \bar{q}_T &= \underset{i}{\arg \max} \delta_T(i) 
  logP = np.max(logdelta[T-1])
  logQ[T-1] = np.argmax(logdelta[T-1])
  # \bar{q}_t = \psi_{t+1}( \bar{q}_{t+1} ) \qquad \text{where} \qquad t=T-1,T-2,\cdots,2,1
  for t in range(T-2, -1, -1):
    logQ[t] = logpsi[t+1, logQ[t+1]]
 
  return logQ, logP
 
logviterbi_Q, logviterbi_P = logViterbi(g_O, g_Lambda)
 
assert( np.array_equal(logviterbi_Q, viterbi_Q) )
 
print(r'\bar{P} = ', viterbi_P)
print('P(Q, O) = ', P(viterbi_Q, g_O, g_Lambda))
print('Q = ', viterbi_Q)
print(r'\log \bar{P} = ', logviterbi_P)
print(r'\log P(Q, O) = ', logP(logviterbi_Q, g_O, g_Lambda))
print('logQ = ', logviterbi_Q)

output: \[ \begin{align} \bar{P} &= 9.511542024402762e-53 \\ P(Q, O) &= 9.511542024402753e-53 \\ Q &= [3 7 8 2 3 9 1 8 2 2 3 9 1 3 9 1 6 0 1 9 7 1 8 2 3 4 4 5 0 1] \\ \log \bar{P} &= -119.78450391759523 \\ \log P(Q, O) &= -119.78450391759522 \\ logQ &= [3 7 8 2 3 9 1 8 2 2 3 9 1 3 9 1 6 0 1 9 7 1 8 2 3 4 4 5 0 1] \end{align} \]
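
As an extra sanity check beyond the printed output (my addition, not part of the original experiment), Viterbi's answer can be verified by brute force on a model small enough to enumerate all \(N^T\) state sequences. The sketch below reuses P and Viterbi from the code above; the s_* names and sizes are hypothetical, chosen so the search space stays tiny:

from itertools import product

# hypothetical small model: N**T = 3**5 = 243 sequences, cheap to enumerate
s_T, s_N, s_M = 5, 3, 4
s_O = np.random.randint(0, s_M, size=s_T)
s_Pi = np.random.dirichlet(np.ones(s_N))
s_A = np.random.dirichlet(np.ones(s_N), size=s_N)
s_B = np.random.dirichlet(np.ones(s_M), size=s_N)
s_Lambda = (s_A, s_B, s_Pi)

# exhaustive argmax over all state sequences (ties have probability zero
# with continuous random parameters)
best_Q = max((np.array(q) for q in product(range(s_N), repeat=s_T)),
             key=lambda q: P(q, s_O, s_Lambda))
vit_Q, vit_P = Viterbi(s_O, s_Lambda)
assert np.array_equal(vit_Q, best_Q)
assert np.isclose(vit_P, P(best_Q, s_O, s_Lambda))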

References:
[1] Rabiner, L. R. 1989. "A Tutorial on Hidden Markov Models and Selected Applications in Speech Recognition." Proceedings of the IEEE 77(2): 257–286.
[2] Jurafsky, Daniel, and James H. Martin. Speech and Language Processing. §15.3 "Hidden Markov Models," p. 421.
