2 向前向后算法(forward-backward)
给定模型参数,计算给定任意观测序列\( O \)的概率\( ^{[2]} \).
\[ P(O|\lambda) = \sum_Q P(O, Q | \lambda) \]
\( Q \)有\( N^T \)种可能,很显然,计算复杂度\( O(T N^T) \)太高. 因此使用向前向后算法(forward-backward)实现可接受的计算复杂度.
2.1 向前算法(forward)
向前向后算法时间复杂度均为 \( O(N^2 T) \).
- 定义:
\[ \alpha_t(i) \equiv P(O_1 \cdots O_t, q_t = S_i | \lambda) \]
- 初始化:
\[ \begin{align}
\alpha_1(i) &\equiv P(O_1,q_1=S_i|\lambda) \\
&= P(O_1 | q_1=S_i,\lambda)P(q_1=S_i|\lambda) \\
&= \pi_i b_i(O_1)
\end{align} \]
- 递归:
\[ \begin{align}
\alpha_{t+1}(j) &\equiv P(O_1 \cdots O_{t+1}, q_{t+1}=S_j|\lambda) \\
&= P(O_1 \cdots O_{t+1} | q_{t+1}=S_j, \lambda) P(q_{t+1}=S_j | \lambda) \\
&= P(O_1 \cdots O_t|q_{t+1}=S_j,\lambda)P(O_{t+1}|q_{t+1}=S_j,\lambda)P(q_{t+1}=S_j|\lambda) \\
&= P(O_1 \cdots O_t, q_{t+1}=S_j|\lambda)P(O_{t+1}|q_{t+1}=S_j,\lambda) \\
&= P(O_{t+1}|q_{t+1}=S_j,\lambda) \sum_{i=1}^N P(O_1 \cdots O_t, q_t=S_i, q_{t+1}=S_j|\lambda) \\
&= P(O_{t+1}|q_{t+1}=S_j,\lambda) \sum_{i=1}^N P(O_1 \cdots O_t, q_{t+1}=S_j | q_t=S_i, \lambda)P(q_t=S_i|\lambda) \\
&= P(O_{t+1}|q_{t+1}=S_j,\lambda) \sum_{i=1}^N P(O_1 \cdots O_t | q_t=S_i, \lambda) P(q_{t+1} = S_j | q_t=S_i, \lambda) P(q_t=S_i|\lambda) \\
&= P(O_{t+1}|q_{t+1}=S_j,\lambda) \sum_{i=1}^N P(O_1 \cdots O_t, q_t=S_i | \lambda) P(q_{t+1} = S_j | q_t=S_i, \lambda) \\
&= \left[ \sum_{i=1}^N \alpha_t(i) a_{ij} \right] b_j(O_{t+1})
\end{align} \]
- 最后:
\[ \begin{align}
P(O|\lambda) &= \sum_{i=1}^N P(O, q_T = S_i | \lambda) \\
&= \sum_{i=1}^N \alpha_T(i)
\end{align} \]
2.1.1 数值稳定性
如同之前的模型概率,在这里同样存在大量小于1.0的数值相乘,且因为在过程中有求和,因此这里不适合直接使用对数概率。为了提高数值稳定性让其在机器精度范围内, 可将计算过程中的\( \alpha_{t}(i) \)缩放\( ^{[1][5]} \).
上述推导用到了链式法则(chain rule)\( ^{[3]} \)与条件概率(conditional probability)\( ^{[4]} \)的定义.
- 定义
\[ \begin{align}
C_t & \equiv P(O_t | O_1 \cdots O_{t-1}, \lambda) \\
\hat{\alpha}_t(i) & \equiv P(q_t = S_i | O_1 \cdots O_t, \lambda) \\
&= \frac{ P(q_t = S_i, O_1 \cdots O_t | \lambda) }{ P(O_1 \cdots O_t | \lambda) } \\
&= \frac{\alpha_t(i)}{\prod_{u=1}^t C_u}
\end{align} \]
- 递归
\[ \begin{align}
\hat{\alpha}_{t+1}(j) & \equiv P(q_{t+1} = S_j | O_1 \cdots O_{t+1}, \lambda) \\
&= \frac{ P(q_{t+1} = S_j, O_1 \cdots O_{t+1} | \lambda)}{ P( O_1 \cdots O_{t+1} | \lambda)} \\
&= \frac{\alpha_{t+1}(j)}{P( O_1 \cdots O_{t+1} | \lambda)} \\
&= \frac{ \left[ \sum_{i=1}^N \alpha_t(i) a_{ij} \right] b_j(O_{t+1}) }{P( O_1 \cdots O_{t+1} | \lambda)} \\
&=\frac{\left[ \sum_{i=1}^N P(O_1 \cdots O_t, q_t = S_i | \lambda) a_{ij} \right] b_j(O_{t+1})}{P(O_1 \cdots O_{t+1} | \lambda)} \\
&=\frac{\left[ \sum_{i=1}^N P(q_t = S_i | O_1 \cdots O_t, \lambda) a_{ij} \right] b_j(O_{t+1})}{P(O_{t+1} |O_1 \cdots O_t, \lambda)} \\
&=\frac{\left[ \sum_{i=1}^N \hat{\alpha_t}(i) a_{ij} \right] b_j(O_{t+1})}{P(O_{t+1} |O_1 \cdots O_t, \lambda)} \\
&=\frac{\left[ \sum_{i=1}^N \hat{\alpha_t}(i) a_{ij} \right] b_j(O_{t+1})}{C_{t+1}} \\
\text{where:} \\
\left[ \sum_{i=1}^N \hat{\alpha_t}(i) a_{ij} \right] b_j(O_{t+1}) &= \left[ \sum_{i=1}^N P(q_t = S_i | O_1 \cdots O_t, \lambda) P(q_{t+1}=S_j | q_t = S_i, \lambda) \right] P(O_{t+1} | q_{t+1}=S_j, \lambda) \\
&= \sum_{i=1}^N P(q_t = S_i | O_1 \cdots O_t, \lambda) P(q_{t+1}=S_j | q_t = S_i, \lambda) P(O_{t+1} | q_{t+1}=S_j, \lambda) \\
& = \sum_{i=1}^N P(q_t = S_i, q_{t+1}=S_j, O_{t+1} | O_1 \cdots O_t, \lambda) \\
& = P(q_{t+1}=S_j, O_{t+1} | O_1 \cdots O_t, \lambda) \\
\text{thus:} \\
C_{t+1} &= \sum_{j=1}^N P(q_{t+1}=S_j, O_{t+1} | O_1 \cdots O_t, \lambda) \\
&= P(O_{t+1} | O_1 \cdots O_t, \lambda) \\
\text{so:} \\
C_{t+1} &= \sum_{j=1}^N \left[ \sum_{i=1}^N \hat{\alpha_t}(i) a_{ij} \right] b_j(O_{t+1}) \\
\end{align} \]
- 最后
\[ \begin{align}
P(O | \lambda) &= \prod_{t=1}^T C_t \\
\log P(O | \lambda) &= \sum_{t=1}^T \log C_t
\end{align} \]
2.2 向后算法
- 定义
\[ \beta_t(i) \equiv P(O_{t+1} \cdots O_T | q_t = S_i, \lambda) \]
- 初始化:
\[ \beta_T(i) = 1 \]
- 递归:
\[ \begin{align}
\beta_t(i) &\equiv P(O_{t+1} \cdots O_T|q_t=S_i, \lambda) \\
&= \sum_{j=1}^N P(O_{t+1} \cdots O_T, q_{t+1}=S_j|q_t=S_i,\lambda) \\
&= \sum_{j=1}^N P(O_{t+1} \cdots O_T | q_{t+1}=S_j, q_t=S_i, \lambda) P(q_{t+1} = S_j | q_t=S_i,\lambda) \\
&= \sum_{j=1}^N P(O_{t+1} | q_{t+1}=S_j, q_t=S_i, \lambda) P(O_{t+2} \cdots O_T | q_{t+1}=S_j, q_t=S_i, \lambda) P(q_{t+1} = S_j | q_t=S_i,\lambda) \\
&= \sum_{j=1}^N P(O_{t+1} | q_{t+1} = S_j,\lambda)P(O_{t+2} \cdots O_T | q_{t+1}=S_j, \lambda) P(q_{t+1} = S_j|q_t=S_i,\lambda) \\
&= \sum_{j=1}^N a_{ij} b_j(O_{t+1}) \beta_{t+1}(j)
\end{align} \]
倒数第二步去掉条件概率中的 \( q_t=S_i \) 依据是HMM的两个条件独立性假设: 当前状态仅与上一步状态相关(与更早的状态无关), 且当前观测仅依赖当前状态(与其它状态和观测无关).
- 最后:
\[ \begin{align}
\beta_0(i) &= \sum_{j=1}^N \pi_{j} b_j(O_1) \beta_1(j) \\
P(O|\lambda) &= \beta_0(i) \quad \text{for any } i
\end{align} \]
2.2.1 数值稳定性
- 定义
\[ \begin{align}
\hat{\beta}_t(i) & \equiv \frac{ P(O_{t+1} \cdots O_T|q_t=S_i, \lambda) }{ P(O_{t+1} \cdots O_T | O_1 \cdots O_t, \lambda) } \\
&= \frac{ \beta_t(i) }{ \prod_{u=t+1}^T C_u }
\end{align} \]
缩放操作和之前的\( \hat{\alpha}_t(i) \)相同,只是前后顺序刚好相反. 注意:
\[ \begin{align}
\prod_{u=1}^t C_u \prod_{u=t+1}^T C_u &= P(O_1 \cdots O_t | \lambda) P(O_{t+1} \cdots O_T | O_1 \cdots O_t, \lambda) \\
&= P(O_1 \cdots O_T | \lambda) \\
\alpha_t(i) \beta_t(i) &= P(O_1 \cdots O_t, q_t = S_i | \lambda) P(O_{t+1} \cdots O_T | q_t = S_i, \lambda) \\
&= P(O_1 \cdots O_t, q_t = S_i, O_{t+1} \cdots O_T | \lambda) \\
\hat{\alpha}_t(i) \hat{\beta}_t(i) &= \frac{\alpha_t(i)}{\prod_{u=1}^t C_u} \frac{ \beta_t(i) }{ \prod_{u=t+1}^T C_u } \\
&= P(q_t = S_i | O_1 \cdots O_T, \lambda)
\end{align} \]
- 递归
\[ \begin{align}
\hat{\beta}_t(i) & \equiv \frac{ P(O_{t+1} \cdots O_T|q_t=S_i, \lambda) }{ P(O_{t+1} \cdots O_T | O_1 \cdots O_t, \lambda) } \\
&= \frac{ \beta_t(i) }{ \prod_{u=t+1}^T C_u } \\
&= \frac{ \sum_{j=1}^N a_{ij} b_j(O_{t+1}) \beta_{t+1}(j) }{ \prod_{u=t+1}^T C_u } \\
&= \frac{ \sum_{j=1}^N a_{ij} b_j(O_{t+1}) \beta_{t+1}(j) }{ C_{t+1} \prod_{u=t+2}^T C_u } \\
&= \frac{ \sum_{j=1}^N a_{ij} b_j(O_{t+1}) \frac{\beta_{t+1}(j)}{\prod_{u=t+2}^T C_u} }{ C_{t+1} } \\
&= \frac{ \sum_{j=1}^N a_{ij} b_j(O_{t+1}) \hat{\beta}_{t+1}(j)}{ C_{t+1} }
\end{align} \]
2.3 log-sum-exp
解决数值计算精度影响稳定性的另一个方法是使用log-sum-exp技巧\( ^{[6]} \).
考虑如下操作应用到\( \alpha_t(i), \beta_t(i) \)函数.
\[ \begin{align}
\log \sum_{i=1}^m \exp (a_i) &= \log \left[ \sum_{i=1}^m \exp(a_i - b) \exp(b)\right] \\
&= \log \left[ \exp(b) \sum_{i=1}^m \exp(a_i - b) \right] \\
&= b + \log \sum_{i=1}^m \exp(a_i - b)
\end{align} \]
后续应用推导省略,2.4 代码实验中给出了实现示例(alpha_trick, beta_trick). 任取一个有限的 \( b \) 提取到对数外即可(该恒等式对任意 \( b \) 成立; 实践中通常取求和项中的最大值, 使 \( \exp \) 的参数非正, 从而避免上溢), 于是求和内部也被转换为对数域的偏移量, 避免了指数操作后值超出计算机机器精度范围. 注意这里最终得到的是对数概率.
2.4 代码实验
import numpy as np

np.random.seed(42)

# T: length of the observation sequence
# N: number of states in the model
# M: number of distinct observation symbols
g_T, g_N, g_M = 100, 10, 1000

# observation sequence: O = { O_1, O_2, ..., O_T }
g_O = np.random.randint(low=0, high=g_M, size=g_T).flatten()

# initial state probabilities: \Pi = [\pi_i]
g_Pi = np.random.dirichlet(np.ones(g_N), size=1).flatten()

# state transition probabilities: A = [a_{ij}]
g_A = np.random.dirichlet(np.ones(g_N), size=g_N)

# observation emission probabilities: B = [b_j(m)]
g_B = np.random.dirichlet(np.ones(g_M), size=g_N)

# the parameters of the model: \lambda = (A, B, \Pi)
g_Lambda = (g_A, g_B, g_Pi)


# normal forward algorithm
class Alpha:
    """Plain (unscaled) forward algorithm.

    alpha_t(i) = P(O_1 ... O_t, q_t = S_i | lambda)

    Index convention: array row t (0-based) holds alpha at time t+1,
    i.e. 0 => t1, 1 => t2, ..., T-1 => T.
    """

    def __init__(self, O, Lambda):
        self.O = O
        self.A, self.B, self.Pi = Lambda
        self.T = O.shape[0]
        self.N = self.A.shape[0]
        self.M = self.B.shape[1]
        assert self.A.shape[0] == self.A.shape[1] == self.B.shape[0] == self.Pi.shape[0]
        assert self.O.min() >= 0 and self.O.max() < self.B.shape[1]
        self.alpha = np.zeros(shape=(self.T, self.N))
        for t in range(self.T):
            if t == 0:
                # initialization: alpha_1(i) = pi_i * b_i(O_1)
                self.alpha[t] = [self.Pi[i] * self.B[i, self.O[t]]
                                 for i in range(self.N)]
            else:
                # recursion: alpha_{t+1}(j) = [sum_i alpha_t(i) a_{ij}] * b_j(O_{t+1})
                self.alpha[t] = [sum([self.alpha[t - 1, h] * self.A[h, i]
                                      for h in range(self.N)]) * self.B[i, self.O[t]]
                                 for i in range(self.N)]

    def __call__(self, t, i):
        return self.alpha[t, i]


alpha = Alpha(g_O, g_Lambda)

# P(O|lambda) = sum_{i=1}^N alpha_T(i); time T lives at row g_T - 1
alpha_P = sum([alpha(g_T - 1, i) for i in range(g_N)])


# normal backward algorithm
class Beta:
    """Plain (unscaled) backward algorithm.

    beta_t(i) = P(O_{t+1} ... O_T | q_t = S_i, lambda)

    Index convention: array row t holds beta at time t,
    i.e. T => T, T-1 => T-1, ..., 1 => t1, 0 => t0;
    row T is the initialization beta_T(i) = 1.
    """

    def __init__(self, O, Lambda):
        self.O = O
        self.A, self.B, self.Pi = Lambda
        self.T = O.shape[0]
        self.N = self.A.shape[0]
        self.M = self.B.shape[1]
        assert self.A.shape[0] == self.A.shape[1] == self.B.shape[0] == self.Pi.shape[0]
        assert self.O.min() >= 0 and self.O.max() < self.B.shape[1]
        self.beta = np.ones(shape=(self.T + 1, self.N))
        for t in range(self.T - 1, -1, -1):
            if t == 0:
                # termination: beta_0 = sum_j pi_j b_j(O_1) beta_1(j)
                # (a scalar, broadcast across the row: beta_0(i) is the
                # same for every i, so P(O|lambda) = beta_0(any i))
                self.beta[t] = np.array([self.Pi[j] * self.B[j, self.O[t]] * self.beta[t + 1, j]
                                         for j in range(self.N)]).sum()
            else:
                # recursion: beta_t(i) = sum_j a_{ij} b_j(O_{t+1}) beta_{t+1}(j)
                # (0-based O[t] is the 1-based observation O_{t+1})
                self.beta[t] = np.array([self.A[:, j] * self.B[j, self.O[t]] * self.beta[t + 1, j]
                                         for j in range(self.N)]).sum(axis=0)

    def __call__(self, t, i):
        return self.beta[t, i]


beta = Beta(g_O, g_Lambda)

# P(O|lambda) = beta_0(i) for any i
beta_P = beta(0, 0)


# scaling forward algorithm
class Alpha_Hat:
    """Scaled forward algorithm.

    alpha_hat_t(i) = P(q_t = S_i | O_1 ... O_t, lambda)
    C_t            = P(O_t | O_1 ... O_{t-1}, lambda)

    Each row is the unscaled forward recursion applied to the previous
    scaled row; its sum is exactly C_t, and dividing by it keeps every
    entry within machine precision.  Index convention: 0 => t1, ..., T-1 => T.
    """

    def __init__(self, O, Lambda):
        self.O = O
        self.A, self.B, self.Pi = Lambda
        self.T = O.shape[0]
        self.N = self.A.shape[0]
        self.M = self.B.shape[1]
        assert self.A.shape[0] == self.A.shape[1] == self.B.shape[0] == self.Pi.shape[0]
        assert self.O.min() >= 0 and self.O.max() < self.B.shape[1]
        self.C = np.zeros(shape=(self.T,))
        self.alpha_hat = np.zeros(shape=(self.T, self.N))
        for t in range(self.T):
            if t == 0:
                self.alpha_hat[t] = [self.Pi[i] * self.B[i, self.O[t]]
                                     for i in range(self.N)]
            else:
                self.alpha_hat[t] = [sum([self.alpha_hat[t - 1, h] * self.A[h, i]
                                          for h in range(self.N)]) * self.B[i, self.O[t]]
                                     for i in range(self.N)]
            # the normalizer of row t is exactly C_t
            self.C[t] = self.alpha_hat[t].sum()
            self.alpha_hat[t] /= self.C[t]

    def __call__(self, t, i):
        return self.alpha_hat[t, i]


alpha_hat = Alpha_Hat(g_O, g_Lambda)

# P(O|lambda) = prod_{t=1}^T C_t
# log P(O|lambda) = sum_{t=1}^T log C_t
CPO = np.prod(alpha_hat.C)
logCPO = np.sum(np.log(alpha_hat.C))


# scaling backward algorithm
class Beta_Hat:
    """Scaled backward algorithm, reusing the forward scale factors C.

    beta_hat_t(i) = beta_t(i) / prod_{u=t+1}^T C_u

    Index convention matches Beta: T => T, ..., 1 => t1, 0 => t0.
    """

    def __init__(self, O, Lambda, C):
        self.O = O
        self.A, self.B, self.Pi = Lambda
        self.T = O.shape[0]
        self.N = self.A.shape[0]
        self.M = self.B.shape[1]
        self.C = C
        assert self.O.shape == self.C.shape
        assert self.A.shape[0] == self.A.shape[1] == self.B.shape[0] == self.Pi.shape[0]
        assert self.O.min() >= 0 and self.O.max() < self.B.shape[1]
        self.beta_hat = np.ones(shape=(self.T + 1, self.N))
        for t in range(self.T - 1, -1, -1):
            if t == 0:
                self.beta_hat[t] = np.array([self.Pi[j] * self.B[j, self.O[t]] * self.beta_hat[t + 1, j]
                                             for j in range(self.N)]).sum()
            else:
                self.beta_hat[t] = np.array([self.A[:, j] * self.B[j, self.O[t]] * self.beta_hat[t + 1, j]
                                             for j in range(self.N)]).sum(axis=0)
            # divide by C_{t+1} (0-based C[t]); see the scaled recursion
            self.beta_hat[t] /= self.C[t]

    def __call__(self, t, i):
        return self.beta_hat[t, i]


beta_hat = Beta_Hat(g_O, g_Lambda, alpha_hat.C)


# log-sum-exp trick for forward algorithm
class Alpha_Trick:
    """Forward algorithm in log space via the log-sum-exp trick.

    alpha_trick_t(i) = log P(O_1 ... O_t, q_t = S_i | lambda)

    The identity log sum exp(a_i) = b + log sum exp(a_i - b) holds for any
    finite b; the shift b is taken as the MAX of the previous log-alphas so
    every exponent argument is bounded above and exp cannot overflow.
    Index convention: 0 => t1, 1 => t2, ..., T-1 => T.
    """

    def __init__(self, O, Lambda):
        self.O = O
        self.A, self.B, self.Pi = np.log(Lambda[0]), np.log(Lambda[1]), np.log(Lambda[2])
        self.T = O.shape[0]
        self.N = self.A.shape[0]
        self.M = self.B.shape[1]
        assert self.A.shape[0] == self.A.shape[1] == self.B.shape[0] == self.Pi.shape[0]
        assert self.O.min() >= 0 and self.O.max() < self.B.shape[1]
        self.alpha_trick_log = np.zeros(shape=(self.T, self.N))
        for t in range(self.T):
            if t == 0:
                self.alpha_trick_log[t] = [self.Pi[i] + self.B[i, self.O[t]]
                                           for i in range(self.N)]
            else:
                b = max([self.alpha_trick_log[t - 1, h] for h in range(self.N)])
                self.alpha_trick_log[t] = [b + np.log(sum(
                    [np.exp(self.alpha_trick_log[t - 1, h] + self.A[h, i] + self.B[i, self.O[t]] - b)
                     for h in range(self.N)]))
                    for i in range(self.N)]

    def __call__(self, t, i):
        return self.alpha_trick_log[t, i]


alpha_trick = Alpha_Trick(g_O, g_Lambda)

# log P(O|lambda) = b + log( sum_{i=1}^N exp( alpha_trick_T(i) - b ) )
alpha_trick_b = max([alpha_trick(g_T - 1, i) for i in range(g_N)])
alpha_trick_logP = alpha_trick_b + np.log(sum(
    [np.exp(alpha_trick(g_T - 1, i) - alpha_trick_b) for i in range(g_N)]))


# log-sum-exp trick for backward algorithm
class Beta_Trick:
    """Backward algorithm in log space via the log-sum-exp trick.

    beta_trick_t(i) = log P(O_{t+1} ... O_T | q_t = S_i, lambda)

    Index convention matches Beta: T => T, ..., 1 => t1, 0 => t0.
    """

    def __init__(self, O, Lambda):
        self.O = O
        self.A, self.B, self.Pi = np.log(Lambda[0]), np.log(Lambda[1]), np.log(Lambda[2])
        self.T = O.shape[0]
        self.N = self.A.shape[0]
        self.M = self.B.shape[1]
        assert self.A.shape[0] == self.A.shape[1] == self.B.shape[0] == self.Pi.shape[0]
        assert self.O.min() >= 0 and self.O.max() < self.B.shape[1]
        # zeros == log(1): row T is beta_T(i) = 1 in log space
        self.beta_trick_log = np.zeros(shape=(self.T + 1, self.N))
        for t in range(self.T - 1, -1, -1):
            # shift by the max of the next row so exp cannot overflow
            b = max([self.beta_trick_log[t + 1, j] for j in range(self.N)])
            if t == 0:
                self.beta_trick_log[t] = b + np.log(np.exp(
                    [self.Pi[j] + self.B[j, self.O[t]] + self.beta_trick_log[t + 1, j] - b
                     for j in range(self.N)]).sum())
            else:
                self.beta_trick_log[t] = b + np.log(np.exp(
                    [self.A[:, j] + self.B[j, self.O[t]] + self.beta_trick_log[t + 1, j] - b
                     for j in range(self.N)]).sum(axis=0))

    def __call__(self, t, i):
        return self.beta_trick_log[t, i]


beta_trick = Beta_Trick(g_O, g_Lambda)

# log P(O|lambda) = beta_trick_0(i) for any i
beta_trick_logP = beta_trick(0, 0)

t = g_T // 2  # 0 => t1, 1 => t2, ..., T-1 => T

print(alpha_P, r'= \sum_{i=1}^N \alpha_T(i)')
print(beta_P, r'= \sum_{j=1}^N \pi_{j} b_j(O_1) \beta_1(j)')
print(CPO, r'= \prod_{t=1}^T C_t')
print(np.exp(logCPO), r'= \exp \left[ \sum_{t=1}^T \log C_t \right]')
# alpha row t is time t+1 and beta row t+1 is time t+1, so the indices line up
print(np.sum([alpha(t, i) * beta(t + 1, i) for i in range(g_N)]),
      r'= \sum_{i=1}^N \alpha_t(i) \beta_t(i)')
print(np.sum([alpha_hat(t, i) * beta_hat(t + 1, i) * CPO for i in range(g_N)]),
      r'= \sum_{i=1}^N \left[ \hat{\alpha}_t(i) \hat{\beta}_t(i) \prod_{u=1}^T C_u \right]')
print(np.exp(alpha_trick_logP),
      r'= \exp\left[ b + \log \left[ \sum_{i=1}^N \exp \left[ \alpha_T^{trick}(i) - b \right] \right] \right]')
print(np.exp(beta_trick_logP),
      r'= \exp\left[ b + \log \left[ \sum_{i=1}^N \exp \left[ \log \pi_i + \log b_i(O_1) + \beta^{trick}_1(i) - b \right] \right] \right]')
output:\(P(O | \lambda)\)
\[\begin{align} \sum_{i=1}^N \alpha_T(i) &= 6.8789476175101185e-304 \\ \sum_{j=1}^N \pi_{j} b_j(O_1) \beta_1(j) &= 6.878947617510119e-304 \\ \prod_{t=1}^T C_t &= 6.878947617510125e-304 \\ \exp \left[ \sum_{t=1}^T \log C_t \right] &= 6.878947617510068e-304 \\ \sum_{i=1}^N \alpha_t(i) \beta_t(i) &= 6.878947617510122e-304 \\ \sum_{i=1}^N \left[ \hat{\alpha}_t(i) \hat{\beta}_t(i) \prod_{u=1}^T C_u \right] &= 6.8789476175101225e-304 \\ \exp\left[ b + \log \left[ \sum_{i=1}^N \exp \left[ \alpha_T^{trick}(i) - b \right] \right] \right] &= 6.87894761751085e-304 \\ \exp\left[ b + \log \left[ \sum_{i=1}^N \exp \left[ \log \pi_i + \log b_i(O_1) + \beta^{trick}_1(i) - b \right] \right] \right] &= 6.878947617510068e-304 \end{align}\]
References:
[1] Rabiner, L. R. 1989. “A Tutorial on Hidden Markov Models and Selected Applications in Speech Recognition.” Proceedings of the IEEE 77:257–286.
[2] Daniel Jurafsky, James H. Martin “Speech and Language Processing” 15.3 Hidden Markov Models :P421
[3] Chain rule (probability) From Wikipedia
[4] Conditional probability From Wikipedia
[5] Christopher M. Bishop “Pattern Recognition and
Machine Learning” 2006 Springer 13.2.4 Scaling factors p627-629.
[6] Advanced Stochastic Modeling Course material for STA 531
Leave a Reply