2 向前向后算法(forward-backward)
给定模型参数,计算给定任意观测序列\( O \)的概率\( ^{[2]} \).
\[ P(O|\lambda) = \sum_Q P(O, Q | \lambda) \]
\( Q \)有\( N^T \)种可能,很显然,直接枚举的计算复杂度\( O(TN^T) \)太高. 因此使用向前向后算法(forward-backward)实现可接受的计算复杂度.
2.1 向前算法(forward)
向前向后算法时间复杂度均为 \( O(N^2 T) \).
- 定义:
\[ \alpha_t(i) \equiv P(O_1 \cdots O_t, q_t = S_i | \lambda) \]
- 初始化:
\[ \begin{align}
\alpha_1(i) &\equiv P(O_1,q_1=S_i|\lambda) \\
&= P(O_1 | q_1=S_i,\lambda)P(q_1=S_i|\lambda) \\
&= \pi_i b_i(O_1)
\end{align} \]
- 递归:
\[ \begin{align}
\alpha_{t+1}(j) &\equiv P(O_1 \cdots O_{t+1}, q_{t+1}=S_j|\lambda) \\
&= P(O_1 \cdots O_{t+1} | q_{t+1}=S_j, \lambda) P(q_{t+1}=S_j | \lambda) \\
&= P(O_1 \cdots O_t|q_{t+1}=S_j,\lambda)P(O_{t+1}|q_{t+1}=S_j,\lambda)P(q_{t+1}=S_j|\lambda) \\
&= P(O_1 \cdots O_t, q_{t+1}=S_j|\lambda)P(O_{t+1}|q_{t+1}=S_j,\lambda) \\
&= P(O_{t+1}|q_{t+1}=S_j,\lambda) \sum_{i=1}^N P(O_1 \cdots O_t, q_t=S_i, q_{t+1}=S_j|\lambda) \\
&= P(O_{t+1}|q_{t+1}=S_j,\lambda) \sum_{i=1}^N P(O_1 \cdots O_t, q_{t+1}=S_j | q_t=S_i, \lambda)P(q_t=S_i|\lambda) \\
&= P(O_{t+1}|q_{t+1}=S_j,\lambda) \sum_{i=1}^N P(O_1 \cdots O_t | q_t=S_i, \lambda) P(q_{t+1} = S_j | q_t=S_i, \lambda) P(q_t=S_i|\lambda) \\
&= P(O_{t+1}|q_{t+1}=S_j,\lambda) \sum_{i=1}^N P(O_1 \cdots O_t, q_t=S_i | \lambda) P(q_{t+1} = S_j | q_t=S_i, \lambda) \\
&= \left[ \sum_{i=1}^N \alpha_t(i) a_{ij} \right] b_j(O_{t+1})
\end{align} \]
- 最后:
\[ \begin{align}
P(O|\lambda) &= \sum_{i=1}^N P(O, q_T = S_i | \lambda) \\
&= \sum_{i=1}^N \alpha_T(i)
\end{align} \]
2.1.1 数值稳定性
如同之前的模型概率,在这里同样存在大量小于1.0的数值相乘,且因为在过程中有求和,因此这里不适合直接使用对数概率。为了提高数值稳定性让其在机器精度范围内, 可将计算过程中的\( \alpha_{t}(i) \)缩放\( ^{[1][5]} \).
Chain rule (probability)\( ^{[3]} \) Conditional probability\( ^{[4]} \)
- 定义
\[ \begin{align}
C_t & \equiv P(O_t | O_1 \cdots O_{t-1}, \lambda) \\
\hat{\alpha}_t(i) & \equiv P(q_t = S_i | O_1 \cdots O_t, \lambda) \\
&= \frac{ P(q_t = S_i, O_1 \cdots O_t | \lambda) }{ P(O_1 \cdots O_t | \lambda) } \\
&= \frac{\alpha_t(i)}{\prod_{u=1}^t C_u}
\end{align} \]
- 递归
\[ \begin{align}
\hat{\alpha}_{t+1}(j) & \equiv P(q_{t+1} = S_j | O_1 \cdots O_{t+1}, \lambda) \\
&= \frac{ P(q_{t+1} = S_j, O_1 \cdots O_{t+1} | \lambda)}{ P( O_1 \cdots O_{t+1} | \lambda)} \\
&= \frac{\alpha_{t+1}(j)}{P( O_1 \cdots O_{t+1} | \lambda)} \\
&= \frac{ \left[ \sum_{i=1}^N \alpha_t(i) a_{ij} \right] b_j(O_{t+1}) }{P( O_1 \cdots O_{t+1} | \lambda)} \\
&=\frac{\left[ \sum_{i=1}^N P(O_1 \cdots O_t, q_t = S_i | \lambda) a_{ij} \right] b_j(O_{t+1})}{P(O_1 \cdots O_{t+1} | \lambda)} \\
&=\frac{\left[ \sum_{i=1}^N P(q_t = S_i | O_1 \cdots O_t, \lambda) a_{ij} \right] b_j(O_{t+1})}{P(O_{t+1} |O_1 \cdots O_t, \lambda)} \\
&=\frac{\left[ \sum_{i=1}^N \hat{\alpha_t}(i) a_{ij} \right] b_j(O_{t+1})}{P(O_{t+1} |O_1 \cdots O_t, \lambda)} \\
&=\frac{\left[ \sum_{i=1}^N \hat{\alpha_t}(i) a_{ij} \right] b_j(O_{t+1})}{C_{t+1}} \\
\text{where:} \\
\left[ \sum_{i=1}^N \hat{\alpha_t}(i) a_{ij} \right] b_j(O_{t+1}) &= \left[ \sum_{i=1}^N P(q_t = S_i | O_1 \cdots O_t, \lambda) P(q_{t+1}=S_j | q_t = S_i, \lambda) \right] P(O_{t+1} | q_{t+1}=S_j, \lambda) \\
&= \sum_{i=1}^N P(q_t = S_i | O_1 \cdots O_t, \lambda) P(q_{t+1}=S_j | q_t = S_i, \lambda) P(O_{t+1} | q_{t+1}=S_j, \lambda) \\
& = \sum_{i=1}^N P(q_t = S_i, q_{t+1}=S_j, O_{t+1} | O_1 \cdots O_t, \lambda) \\
& = P(q_{t+1}=S_j, O_{t+1} | O_1 \cdots O_t, \lambda) \\
\text{thus:} \\
C_{t+1} &= \sum_{j=1}^N P(q_{t+1}=S_j, O_{t+1} | O_1 \cdots O_t, \lambda) \\
&= P(O_{t+1} | O_1 \cdots O_t, \lambda) \\
\text{so:} \\
C_{t+1} &= \sum_{j=1}^N \left[ \sum_{i=1}^N \hat{\alpha_t}(i) a_{ij} \right] b_j(O_{t+1}) \\
\end{align} \]
- 最后
\[ \begin{align}
P(O | \lambda) &= \prod_{t=1}^T C_t \\
\log P(O | \lambda) &= \sum_{t=1}^T \log C_t
\end{align} \]
2.2 向后算法
- 定义
\[ \beta_t(i) \equiv P(O_{t+1} \cdots O_T | q_t = S_i, \lambda) \]
- 初始化:
\[ \beta_T(i) = 1 \]
- 递归:
\[ \begin{align}
\beta_t(i) &\equiv P(O_{t+1} \cdots O_T|q_t=S_i, \lambda) \\
&= \sum_{j=1}^N P(O_{t+1} \cdots O_T, q_{t+1}=S_j|q_t=S_i,\lambda) \\
&= \sum_{j=1}^N P(O_{t+1} \cdots O_T | q_{t+1}=S_j, q_t=S_i, \lambda) P(q_{t+1} = S_j | q_t=S_i,\lambda) \\
&= \sum_{j=1}^N P(O_{t+1} | q_{t+1}=S_j, q_t=S_i, \lambda) P(O_{t+2} \cdots O_T | q_{t+1}=S_j, q_t=S_i, \lambda) P(q_{t+1} = S_j | q_t=S_i,\lambda) \\
&= \sum_{j=1}^N P(O_{t+1} | q_{t+1} = S_j,\lambda)P(O_{t+2} \cdots O_T | q_{t+1}=S_j, \lambda) P(q_{t+1} = S_j|q_t=S_i,\lambda) \\
&= \sum_{j=1}^N a_{ij} b_j(O_{t+1}) \beta_{t+1}(j)
\end{align} \]
倒数第二步去掉条件概率中的 \( q_t=S_i \) 依据是HMM假设当前状态仅与上一步状态相关,与上一步之前的状态无关.
- 最后:
\[ \begin{align}
\beta_0(i) &= \sum_{j=1}^N \pi_{j} b_j(O_1) \beta_1(j) \\
P(O|\lambda) &= \beta_0(i) \quad \text{where } i \text{ is any value}
\end{align} \]
2.2.1 数值稳定性
- 定义
\[ \begin{align}
\hat{\beta}_t(i) & \equiv \frac{ P(O_{t+1} \cdots O_T|q_t=S_i, \lambda) }{ P(O_{t+1} \cdots O_T | O_1 \cdots O_t, \lambda) } \\
&= \frac{ \beta_t(i) }{ \prod_{u=t+1}^T C_u }
\end{align} \]
缩放操作和之前的\( \hat{\alpha}_t(i) \)相同,只是前后顺序刚好相反. 注意:
\[ \begin{align}
\prod_{u=1}^t C_u \prod_{u=t+1}^T C_u &= P(O_1 \cdots O_t | \lambda) P(O_{t+1} \cdots O_T | O_1 \cdots O_t, \lambda) \\
&= P(O_1 \cdots O_T | \lambda) \\
\alpha_t(i) \beta_t(i) &= P(O_1 \cdots O_t, q_t = S_i | \lambda) P(O_{t+1} \cdots O_T | q_t = S_i, \lambda) \\
&= P(O_1 \cdots O_t, q_t = S_i, O_{t+1} \cdots O_T | \lambda) \\
\hat{\alpha}_t(i) \hat{\beta}_t(i) &= \frac{\alpha_t(i)}{\prod_{u=1}^t C_u} \frac{ \beta_t(i) }{ \prod_{u=t+1}^T C_u } \\
&= P(q_t = S_i | O_1 \cdots O_T, \lambda)
\end{align} \]
- 递归
\[ \begin{align}
\hat{\beta}_t(i) & \equiv \frac{ P(O_{t+1} \cdots O_T|q_t=S_i, \lambda) }{ P(O_{t+1} \cdots O_T | O_1 \cdots O_t, \lambda) } \\
&= \frac{ \beta_t(i) }{ \prod_{u=t+1}^T C_u } \\
&= \frac{ \sum_{j=1}^N a_{ij} b_j(O_{t+1}) \beta_{t+1}(j) }{ \prod_{u=t+1}^T C_u } \\
&= \frac{ \sum_{j=1}^N a_{ij} b_j(O_{t+1}) \beta_{t+1}(j) }{ C_{t+1} \prod_{u=t+2}^T C_u } \\
&= \frac{ \sum_{j=1}^N a_{ij} b_j(O_{t+1}) \frac{\beta_{t+1}(j)}{\prod_{u=t+2}^T C_u} }{ C_{t+1} } \\
&= \frac{ \sum_{j=1}^N a_{ij} b_j(O_{t+1}) \hat{\beta}_{t+1}(j)}{ C_{t+1} }
\end{align} \]
2.3 log-sum-exp
解决数值计算精度影响稳定性的另一个方法是使用log-sum-exp技巧\( ^{[6]} \).
考虑如下操作应用到\( \alpha_t(i), \beta_t(i) \)函数.
\[ \begin{align}
\log \sum_{i=1}^m \exp (a_i) &= \log \left[ \sum_{i=1}^m \exp(a_i - b) \exp(b)\right] \\
&= \log \left[ \exp(b) \sum_{i=1}^m \exp(a_i - b) \right] \\
&= b + \log \sum_{i=1}^m \exp(a_i - b)
\end{align} \]
后续应用推导省略,2.4 代码实验中给出了实现示例(alpha_trick, beta_trick). 每一步把一个基准对数概率\( b \)提取到对数外即可(该恒等式对任意\( b \)都精确成立; 通常取最大值,使所有\( \exp \)的参数不超过0,从而杜绝上溢;若取最小值虽可避免下溢,但存在上溢风险), 于是求和内部被转换为平移后的指数求和,避免了指数操作后值超出计算机机器精度范围. 注意这里最终得到的是对数概率.
2.4 代码实验
import numpy as np
np.random.seed(42)  # fixed seed so every run reproduces the same model and observation sequence
# T: length of observation sequence
# N: number of states in the model
# M: number of distinct observation symbols
g_T, g_N, g_M = 100,10,1000
# observation sequence
g_O = np.random.randint(low=0, high=g_M, size=g_T).flatten() # O = { O_1,O_2,\cdots,O_T }
# initial state probabilities (one Dirichlet draw, so the entries sum to 1)
g_Pi = np.random.dirichlet(np.ones(g_N), size=1).flatten() # \Pi = [\pi_i]
# state transition probabilities (each of the N rows sums to 1)
g_A = np.random.dirichlet(np.ones(g_N), size=g_N) # A = [a_{ij}]
# observation emission probabilities (each of the N rows sums to 1)
g_B = np.random.dirichlet(np.ones(g_M), size=g_N) # B = [b_j(m)]
# the parameters of the model
g_Lambda = (g_A, g_B, g_Pi) # \lambda = (A,B,\Pi)
# normal forward algorithm (naive, no scaling)
class Alpha:
    def __init__(self, O, Lambda):
        """Forward pass of the HMM forward-backward algorithm.

        Fills self.alpha with alpha_t(i) = P(O_1 ... O_t, q_t = S_i | lambda).
        Row indices are zero-based: row 0 holds alpha_1 and row T-1 holds alpha_T.

        O      -- 1-D integer array of observation symbols, values in [0, M)
        Lambda -- tuple (A, B, Pi): transition matrix, emission matrix,
                  initial state distribution
        """
        self.O = O
        self.A, self.B, self.Pi = Lambda
        self.T = O.shape[0]
        self.N = self.A.shape[0]
        self.M = self.B.shape[1]
        assert( self.A.shape[0] == self.A.shape[1] == self.B.shape[0] == self.Pi.shape[0] )
        assert( self.O.min() >= 0 and self.O.max() < self.B.shape[1] )
        self.alpha = np.zeros(shape=(self.T, self.N))
        # initialization: alpha_1(i) = pi_i * b_i(O_1)
        for state in range(self.N):
            self.alpha[0, state] = self.Pi[state] * self.B[state, self.O[0]]
        # induction: alpha_{t+1}(j) = [ sum_i alpha_t(i) a_ij ] * b_j(O_{t+1})
        for step in range(1, self.T):
            for state in range(self.N):
                acc = sum([self.alpha[step-1, prev] * self.A[prev, state] for prev in range(self.N)])
                self.alpha[step, state] = acc * self.B[state, self.O[step]]
    def __call__(self, t, i):
        """Return alpha_t(i); t is the zero-based row index."""
        return self.alpha[t, i]
# run the naive forward pass on the sampled model
alpha = Alpha(g_O, g_Lambda)
# termination: P(O|\lambda) = \sum_{i=1}^N \alpha_T(i)
# notice: param t: 0=>t1, 1=>t2, ..., T-1=>T
alpha_P = sum([ alpha(g_T-1, i) for i in range(g_N) ])
# normal backward algorithm (naive, no scaling)
class Beta:
    def __init__(self, O, Lambda):
        """Backward pass of the HMM forward-backward algorithm.

        Fills self.beta with beta_t(i) = P(O_{t+1} ... O_T | q_t = S_i, lambda).
        Row t corresponds directly to model time t: row T is the all-ones base
        case, and every column of row 0 equals P(O | lambda).
        """
        self.O = O
        self.A, self.B, self.Pi = Lambda
        self.T = O.shape[0]
        self.N = self.A.shape[0]
        self.M = self.B.shape[1]
        assert( self.A.shape[0] == self.A.shape[1] == self.B.shape[0] == self.Pi.shape[0] )
        assert( self.O.min() >= 0 and self.O.max() < self.B.shape[1] )
        self.beta = np.ones(shape=(self.T+1, self.N))
        for step in reversed(range(self.T)):
            if step > 0:
                # induction: beta_t(i) = sum_j a_ij b_j(O_{t+1}) beta_{t+1}(j)
                per_next = np.array([ self.A[:, nxt] * self.B[nxt, self.O[step]] * self.beta[step+1, nxt] for nxt in range(self.N) ])
                self.beta[step] = per_next.sum(axis=0)
            else:
                # termination helper: beta_0(i) = sum_j pi_j b_j(O_1) beta_1(j);
                # the same scalar is broadcast into every column of row 0
                terms = np.array([ self.Pi[nxt] * self.B[nxt, self.O[0]] * self.beta[1, nxt] for nxt in range(self.N) ])
                self.beta[0] = terms.sum()
    def __call__(self, t, i):
        """Return beta_t(i); row indices follow model time directly."""
        return self.beta[t, i]
beta = Beta(g_O, g_Lambda)
# termination: P(O|\lambda) = \beta_0(i) where i is any value (all columns of row 0 are equal)
# notice: param t: T=>T,T-1=>T-1,...,1=>t1,0=>t0
beta_P = beta(0,0)
# scaling forward algorithm
class Alpha_Hat:
    def __init__(self, O, Lambda):
        """Scaled forward pass (Rabiner-style rescaling).

        alpha_hat_t(i) = P(q_t = S_i | O_1 ... O_t, lambda), and C[t] is the
        per-step normalizer, so prod_t C[t] = P(O | lambda) and
        sum_t log C[t] = log P(O | lambda).
        Row indices are zero-based: row 0 holds time 1, row T-1 holds time T.
        """
        self.O = O
        self.A, self.B, self.Pi = Lambda
        self.T = O.shape[0]
        self.N = self.A.shape[0]
        self.M = self.B.shape[1]
        assert( self.A.shape[0] == self.A.shape[1] == self.B.shape[0] == self.Pi.shape[0] )
        assert( self.O.min() >= 0 and self.O.max() < self.B.shape[1] )
        self.C = np.zeros(shape=(self.T,))
        self.alpha_hat = np.zeros(shape=(self.T, self.N))
        for step in range(self.T):
            if step == 0:
                # unnormalized alpha_1(i) = pi_i * b_i(O_1)
                for state in range(self.N):
                    self.alpha_hat[0, state] = self.Pi[state] * self.B[state, self.O[0]]
            else:
                # unnormalized recursion on the already-normalized previous row
                for state in range(self.N):
                    acc = sum([self.alpha_hat[step-1, prev] * self.A[prev, state] for prev in range(self.N)])
                    self.alpha_hat[step, state] = acc * self.B[state, self.O[step]]
            # the row sum is exactly C_{t+1}; dividing renormalizes the row to sum 1
            self.C[step] = self.alpha_hat[step].sum()
            self.alpha_hat[step] /= self.C[step]
    def __call__(self, t, i):
        """Return alpha_hat_t(i); t is the zero-based row index."""
        return self.alpha_hat[t, i]
alpha_hat = Alpha_Hat(g_O, g_Lambda)
# P(O | \lambda) = \prod_{t=1}^T C_t
# \log P(O | \lambda) = \sum_{t=1}^T \log C_t
# notice: param t: 0=>t1, 1=>t2, ..., T-1=>T
CPO = np.prod(alpha_hat.C)  # direct product; underflows for long sequences, logCPO is the stable form
logCPO = np.sum( np.log(alpha_hat.C) )
# scaling backward algorithm
class Beta_Hat:
    def __init__(self, O, Lambda, C):
        """Scaled backward pass using the forward pass's scaling factors.

        beta_hat_t(i) = beta_t(i) / prod_{u=t+1}^T C_u, so each row stays O(1).
        C must be the scaling vector produced by Alpha_Hat on the same O.
        Row t corresponds directly to model time t; row T is the base case.
        """
        self.O = O
        self.A, self.B, self.Pi = Lambda
        self.T = O.shape[0]
        self.N = self.A.shape[0]
        self.M = self.B.shape[1]
        self.C = C
        assert( self.O.shape == self.C.shape )
        assert( self.A.shape[0] == self.A.shape[1] == self.B.shape[0] == self.Pi.shape[0] )
        assert( self.O.min() >= 0 and self.O.max() < self.B.shape[1] )
        self.beta_hat = np.ones(shape=(self.T+1, self.N))
        for step in reversed(range(self.T)):
            if step > 0:
                # same induction as the unscaled beta, on scaled successor values
                per_next = np.array([ self.A[:, nxt] * self.B[nxt, self.O[step]] * self.beta_hat[step+1, nxt] for nxt in range(self.N) ])
                self.beta_hat[step] = per_next.sum(axis=0)
            else:
                # termination helper, broadcast into every column of row 0
                terms = np.array([ self.Pi[nxt] * self.B[nxt, self.O[0]] * self.beta_hat[1, nxt] for nxt in range(self.N) ])
                self.beta_hat[0] = terms.sum()
            # C[step] is C_{t+1} in the one-based math notation
            self.beta_hat[step] /= self.C[step]
    def __call__(self, t, i):
        """Return beta_hat_t(i); row indices follow model time directly."""
        return self.beta_hat[t, i]
# the backward scaling reuses the scaling factors C computed by the forward pass
beta_hat = Beta_Hat(g_O, g_Lambda, alpha_hat.C)
# log-sum-exp trick for forward algorithm
class Alpha_Trick:
    def __init__(self, O, Lambda):
        """Forward algorithm computed entirely in log space.

        alpha_trick_log[t, i] = log P(O_1 ... O_t, q_t = S_i | lambda).
        Each induction step uses the log-sum-exp identity
            log sum_h exp(a_h) = b + log sum_h exp(a_h - b),
        which is exact for any shift b. Here b = max_h alpha_trick_log[t-1, h]:
        since log A and log B are <= 0, every exp() argument is <= 0, so exp()
        can never overflow. (The original used min, which makes the arguments
        >= 0 and can overflow to inf when the spread across states is large;
        max is the conventional, overflow-safe choice.)
        notice: param t: 0=>t1, 1=>t2, ..., T-1=>T
        """
        self.O = O
        # keep the model parameters as log-probabilities
        self.A, self.B, self.Pi = np.log(Lambda[0]), np.log(Lambda[1]), np.log(Lambda[2])
        self.T = O.shape[0]
        self.N = self.A.shape[0]
        self.M = self.B.shape[1]
        assert( self.A.shape[0] == self.A.shape[1] == self.B.shape[0] == self.Pi.shape[0] )
        assert( self.O.min() >= 0 and self.O.max() < self.B.shape[1] )
        self.alpha_trick_log = np.zeros(shape=(self.T, self.N))
        for t in range(self.T):
            if t == 0:
                # log alpha_1(i) = log pi_i + log b_i(O_1)
                self.alpha_trick_log[t] = [self.Pi[i] + self.B[i, self.O[t]] for i in range(self.N)]
            else:
                # shift by the maximum of the previous row: all exp() args <= 0
                b = max([self.alpha_trick_log[t-1, h] for h in range(self.N)])
                self.alpha_trick_log[t] = [b + np.log(sum( [np.exp(self.alpha_trick_log[t-1, h] + self.A[h, i] + self.B[i, self.O[t]] - b) for h in range(self.N)] )) for i in range(self.N)]
    def __call__(self, t, i):
        """Return log alpha_t(i); t is the zero-based row index."""
        return self.alpha_trick_log[t, i]
alpha_trick = Alpha_Trick(g_O, g_Lambda)
# termination: \log P(O|\lambda) = b + \log( \sum_{i=1}^N \exp( \alpha_trick_T(i) - b ) )
# NOTE(review): the shift b here is the min; the identity is exact for any b, but the
# conventional choice is the max, which bounds every exp() argument by 0 and rules out overflow.
# notice: param t: 0=>t1, 1=>t2, ..., T-1=>T
alpha_trick_b = min( [ alpha_trick(g_T-1, i) for i in range(g_N) ] )
alpha_trick_logP = alpha_trick_b + np.log( sum([ np.exp(alpha_trick(g_T-1, i) - alpha_trick_b) for i in range(g_N) ]) )
# log-sum-exp trick for backward algorithm
class Beta_Trick:
    def __init__(self, O, Lambda):
        """Backward algorithm computed entirely in log space.

        beta_trick_log[t, i] = log P(O_{t+1} ... O_T | q_t = S_i, lambda).
        The log-sum-exp shift b is the maximum of the successor row: since
        log A, log B <= 0, every exp() argument is <= 0 and exp() cannot
        overflow. (The original used min, which allows overflow to inf when
        the spread across states is large.)
        notice: param t: T=>T, T-1=>T-1, ..., 1=>t1, 0=>t0
        """
        self.O = O
        # keep the model parameters as log-probabilities
        self.A, self.B, self.Pi = np.log(Lambda[0]), np.log(Lambda[1]), np.log(Lambda[2])
        self.T = O.shape[0]
        self.N = self.A.shape[0]
        self.M = self.B.shape[1]
        assert( self.A.shape[0] == self.A.shape[1] == self.B.shape[0] == self.Pi.shape[0] )
        assert( self.O.min() >= 0 and self.O.max() < self.B.shape[1] )
        # base case: log beta_T(i) = log 1 = 0 (zeros directly, not log(ones))
        self.beta_trick_log = np.zeros(shape=(self.T+1, self.N))
        for t in range(self.T-1, -1, -1):
            # shift by the maximum of the successor row: all exp() args <= 0
            b = max([self.beta_trick_log[t+1, j] for j in range(self.N)])
            if t == 0:
                self.beta_trick_log[t] = b + np.log(np.array(np.exp([ self.Pi[j] + self.B[j, self.O[t]] + self.beta_trick_log[t+1, j] - b for j in range(self.N)])).sum())
            else:
                self.beta_trick_log[t] = b + np.log(np.array(np.exp([ self.A[:,j] + self.B[j, self.O[t]] + self.beta_trick_log[t+1, j] - b for j in range(self.N)])).sum(axis=0))
    def __call__(self, t, i):
        """Return log beta_t(i); row indices follow model time directly."""
        return self.beta_trick_log[t, i]
beta_trick = Beta_Trick(g_O, g_Lambda)
# \log P(O|\lambda) = \beta_trick_0(i) where i is any value
# notice: param t: T=>T,T-1=>T-1,...,1=>t1,0=>t0
beta_trick_logP = beta_trick(0,0)
t = g_T//2 # mid-sequence index for the consistency checks; 0=>t1, 1=>t2, ..., T-1=>T
# every value printed below is an estimate of the same P(O|\lambda), so they should all
# agree up to floating-point rounding. alpha rows are zero-based while beta rows follow
# model time directly, hence alpha(t,i) pairs with beta(t+1,i) at the same time step.
print(alpha_P, '= \sum_{i=1}^N \\alpha_T(i)')
print(beta_P, '= \sum_{j=1}^N \pi_{j} b_j(O_1) \\beta_1(j)')
print(CPO, '= \prod_{t=1}^T C_t')
print(np.exp(logCPO), '= \exp \left[ \sum_{t=1}^T \log C_t \\right]')
print(np.sum([alpha(t,i) * beta(t+1,i) for i in range(g_N)]), '= \sum_{i=1}^N \\alpha_t(i) \\beta_t(i)')
print(np.sum([alpha_hat(t,i) * beta_hat(t+1,i) * CPO for i in range(g_N)]), '= \sum_{i=1}^N \left[ \hat{\\alpha}_t(i) \hat{\\beta}_t(i) \prod_{u=1}^T C_u \\right]')
print(np.exp(alpha_trick_logP), '= \exp\left[ b + \log \left[ \sum_{i=1}^N \exp \left[ \\alpha_T^{trick}(i) - b \\right] \\right] \\right]')
print(np.exp(beta_trick_logP), '= \exp\left[ b + \log \left[ \sum_{i=1}^N \exp \left[ \log \pi_i + \log b_i(O_1) + \\beta^{trick}_1(i) - b \\right] \\right] \\right]')
output:\(P(O | \lambda)\)
\[\begin{align} \sum_{i=1}^N \alpha_T(i) &= 6.8789476175101185e-304 \\ \sum_{j=1}^N \pi_{j} b_j(O_1) \beta_1(j) &= 6.878947617510119e-304 \\ \prod_{t=1}^T C_t &= 6.878947617510125e-304 \\ \exp \left[ \sum_{t=1}^T \log C_t \right] &= 6.878947617510068e-304 \\ \sum_{i=1}^N \alpha_t(i) \beta_t(i) &= 6.878947617510122e-304 \\ \sum_{i=1}^N \left[ \hat{\alpha}_t(i) \hat{\beta}_t(i) \prod_{u=1}^T C_u \right] &= 6.8789476175101225e-304 \\ \exp\left[ b + \log \left[ \sum_{i=1}^N \exp \left[ \alpha_T^{trick}(i) - b \right] \right] \right] &= 6.87894761751085e-304 \\ \exp\left[ b + \log \left[ \sum_{i=1}^N \exp \left[ \log \pi_i + \log b_i(O_1) + \beta^{trick}_1(i) - b \right] \right] \right] &= 6.878947617510068e-304 \end{align}\]
References:
[1] Rabiner, L. R. 1989. “A Tutorial on Hidden Markov Models and Selected Applications in Speech Recognition.” Proceedings of the IEEE 77:257–286.
[2] Daniel Jurafsky, James H. Martin “Speech and Language Processing” 15.3 Hidden Markov Models :P421
[3] Chain rule (probability) From Wikipedia
[4] Conditional probability From Wikipedia
[5] Christopher M. Bishop “Pattern Recognition and
Machine Learning” 2006 Springer 13.2.4 Scaling factors p627-629.
[6] Advanced Stochastic Modeling Course material for STA 531
Leave a Reply