1 /**
2  * @file   word_align.c
3  *
4  * <JA>
5  * @brief  ñ�졦���ǡ�����ñ�̤Υ��饤�����
6  *
7  * �����Ǥϡ�ǧ����̤��Ф������ϲ����Υ��饤����Ȥ���Ϥ��뤿���
8  * �ؿ����������Ƥ��ޤ�.
9  *
10  * Julius/Julian �Ǥϡ�ǧ����̤ˤ����Ƥ���ñ��䲻�ǡ����뤤��HMM�ξ��֤�
11  * ���줾�����ϲ����Τɤζ�֤˥ޥå������Τ����Τ뤳�Ȥ��Ǥ��ޤ�.
12  * ������Τʥ��饤����Ȥ���뤿��ˡ�Julius/Julian �Ǥ�ǧ�����
13  * �����ޤ������Ѥ����ˡ�ǧ��������ä��������줿ǧ����̤�ñ�����
14  * �Ф��ơ����餿��� forced alignment ��¹Ԥ��Ƥ��ޤ�.
15  * </JA>
16  *
17  * <EN>
18  * @brief  Forced alignment by word / phoneme / state unit.
19  *
20  * This file defines functions for performing forced alignment of
21  * recognized words.  The forced alignment is implemented in Julius/Julian
22  * to get the best matching segmentation of recognized word sequence
23  * upon input speech.  Word-level, phoneme-level and HMM state-level
24  * alignment can be obtained.
25  *
26  * Julius/Julian performs the forced alignment as a post-processing of
27  * recognition process.  Recomputation of Viterbi path on the recognized
28  * word sequence toward input speech will be done after the recognition
29  * to get better alignment.
30  *
31  * </EN>
32  *
33  * @author Akinobu Lee
34  * @date   Sat Sep 24 16:09:46 2005
35  *
36  * $Revision: 1.5 $
37  *
38  */
39 /*
40  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
41  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
42  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
43  * All rights reserved
44  */
45 
46 #include <julius/julius.h>
47 
48 /**
49  * <JA>
50  * Ϳ����줿ñ������HMM��Ϣ�뤷��ʸ���Τ�HMM���ۤ���.
51  *
52  * @param wseq [in] ñ����
53  * @param num [in] @a wseq �ο�
54  * @param has_sp_ret [out] ���硼�ȥݡ������³�������������˥åȤξ���
55  * @param num_ret [out] ���ۤ��줿HMM�˴ޤޤ�벻��HMM�ο�
56  * @param end_ret [out] ���饤����Ȥζ��ڤ�Ȥʤ�����ֹ����
57  * @param per_what [in] ñ�졦���ǡ����֤Τɤ�ñ�̤ǥ��饤����Ȥ��뤫�����
58  * @param r [in] ǧ��������������
59  *
60  * @return ���餿�˳���դ���줿ʸ���Τ���魯HMM��ǥ���ؤΥݥ������֤�.
61  * </JA>
62  * <EN>
63  * Make the whole sentence HMM from given word sequence by connecting
64  * each phoneme HMM.
65  *
66  * @param wseq [in] word sequence to align
67  * @param num [in] number of @a wseq
68  * @param has_sp_ret [out] unit information of whether it can be followed by a short-pause
69  * @param num_ret [out] number of HMM contained in the generated sentence HMM
70  * @param end_ret [out] sequence of state location as alignment unit
71  * @param per_what [in] specify the alignment unit (word / phoneme / state)
72  * @param r [in] recognition process instance
73  *
74  * @return newly malloced HMM sequences.
75  * </EN>
76  */
77 static HMM_Logical **
make_phseq(WORD_ID * wseq,short num,boolean ** has_sp_ret,int * num_ret,int ** end_ret,int per_what,RecogProcess * r)78 make_phseq(WORD_ID *wseq, short num, boolean **has_sp_ret, int *num_ret, int **end_ret, int per_what,
79 	   RecogProcess *r)
80 {
81   HMM_Logical **ph;		/* phoneme sequence */
82   boolean *has_sp;
83   int k;
84   int phnum;			/* num of above */
85   WORD_ID tmpw, w;
86   int i, j, pn, st, endn;
87   HMM_Logical *tmpp, *ret;
88   WORD_INFO *winfo;
89   HTK_HMM_INFO *hmminfo;
90   boolean enable_iwsp;		/* for multipath */
91 
92   winfo = r->lm->winfo;
93   hmminfo = r->am->hmminfo;
94   if (hmminfo->multipath) enable_iwsp = r->lm->config->enable_iwsp;
95 
96   /* make ph[] from wseq[] */
97   /* 1. calc total phone num and malloc */
98   phnum = 0;
99   for (w=0;w<num;w++) phnum += winfo->wlen[wseq[w]];
100   ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * phnum);
101 
102   if (hmminfo->multipath && enable_iwsp) {
103     has_sp = (boolean *)mymalloc(sizeof(boolean) * phnum);
104   } else {
105     has_sp = NULL;
106   }
107   /* 2. make phoneme sequence */
108   st = 0;
109   if (hmminfo->multipath) st++;
110   pn = 0;
111   endn = 0;
112   for (w=0;w<num;w++) {
113     tmpw = wseq[w];
114     for (i=0;i<winfo->wlen[tmpw];i++) {
115       tmpp = winfo->wseq[tmpw][i];
116       /* handle cross-word context dependency */
117       if (r->ccd_flag) {
118 	if (w > 0 && i == 0) {	/* word head */
119 
120 	  if ((ret = get_left_context_HMM(tmpp, ph[pn-1]->name, hmminfo)) != NULL) {
121 	    tmpp = ret;
122 	  }
123 	  /* if triphone not found, fallback to bi/mono-phone  */
124 	  /* use pseudo phone when no bi-phone found in alignment... */
125 	}
126 	if (w < num-1 && i == winfo->wlen[tmpw] - 1) { /* word tail */
127 	  if ((ret = get_right_context_HMM(tmpp, winfo->wseq[wseq[w+1]][0]->name, hmminfo)) != NULL) {
128 	    tmpp = ret;
129 	  }
130 	}
131       }
132       ph[pn] = tmpp;
133       if (hmminfo->multipath && enable_iwsp) {
134 	if (i == winfo->wlen[tmpw] - 1) {
135 	  has_sp[pn] = TRUE;
136 	} else {
137 	  has_sp[pn] = FALSE;
138 	}
139       }
140       if (per_what == PER_STATE) {
141 	for (j=0;j<hmm_logical_state_num(tmpp)-2;j++) {
142 	  (*end_ret)[endn++] = st + j;
143 	}
144 	if (hmminfo->multipath && enable_iwsp && has_sp[pn]) {
145 	  for (k=0;k<hmm_logical_state_num(hmminfo->sp)-2;k++) {
146 	    (*end_ret)[endn++] = st + j + k;
147 	  }
148 	}
149       }
150       st += hmm_logical_state_num(tmpp) - 2;
151       if (hmminfo->multipath && enable_iwsp && has_sp[pn]) {
152 	st += hmm_logical_state_num(hmminfo->sp) - 2;
153       }
154       if (per_what == PER_PHONEME) (*end_ret)[endn++] = st - 1;
155       pn++;
156     }
157     if (per_what == PER_WORD) (*end_ret)[endn++] = st - 1;
158   }
159   *num_ret = phnum;
160   *has_sp_ret = has_sp;
161   return ph;
162 }
163 
164 
165 /**
166  * <JA>
167  * ʸ���Τ�HMM���ۤ���Viterbi���饤����Ȥ�¹Ԥ�����̤���Ϥ���.
168  *
169  * @param words [in] ʸ�������魯ñ����
170  * @param wnum [in] @a words ��Ĺ��
171  * @param param [in] ������ħ�ѥ�᡼����
172  * @param per_what [in] ñ�졦���ǡ����֤Τɤ�ñ�̤ǥ��饤����Ȥ��뤫�����
173  * @param align [out] ���饤����ȷ�̤��Ǽ����Sentence��¤��
174  * @param r [i/o] ǧ��������������
175  * </JA>
176  * <EN>
177  * Build sentence HMM, call viterbi_segment() and output result.
178  *
179  * @param words [in] word sequence of the sentence
180  * @param wnum [in] number of words in @a words
181  * @param param [in] input parameter vector
182  * @param per_what [in] specify the alignment unit (word / phoneme / state)
183  * @param align [out] Sentence data area to store the alignment result
184  * @param r [i/o] recognition process instance
185  * </EN>
186  */
187 static void
do_align(WORD_ID * words,short wnum,HTK_Param * param,int per_what,SentenceAlign * align,RecogProcess * r)188 do_align(WORD_ID *words, short wnum, HTK_Param *param, int per_what, SentenceAlign *align, RecogProcess *r)
189 {
190   HMM_Logical **phones;		/* phoneme sequence */
191   boolean *has_sp;		/* whether phone can follow short pause */
192   int k;
193   int phonenum;			/* num of above */
194   HMM *shmm;			/* sentence HMM */
195   int *end_state;		/* state number of word ends */
196   int *end_frame;		/* segmented last frame of words */
197   LOGPROB *end_score;		/* normalized score of each words */
198   LOGPROB allscore;		/* total score of this word sequence */
199   WORD_ID w;
200   int i, rlen;
201   int end_num = 0;
202   int *id_seq, *phloc = NULL, *stloc = NULL;
203   int j,n,p;
204   WORD_INFO *winfo;
205   HTK_HMM_INFO *hmminfo;
206   boolean enable_iwsp;		/* for multipath */
207 
208   winfo = r->lm->winfo;
209   hmminfo = r->am->hmminfo;
210   if (hmminfo->multipath) enable_iwsp = r->lm->config->enable_iwsp;
211 
212   /* initialize result storage buffer */
213   switch(per_what) {
214   case PER_WORD:
215     jlog("ALIGN: === word alignment begin ===\n");
216     end_num = wnum;
217     phloc = (int *)mymalloc(sizeof(int)*wnum);
218     i = 0;
219     for(w=0;w<wnum;w++) {
220       phloc[w] = i;
221       i += winfo->wlen[words[w]];
222     }
223     break;
224   case PER_PHONEME:
225     jlog("ALIGN: === phoneme alignment begin ===\n");
226     end_num = 0;
227     for(w=0;w<wnum;w++) end_num += winfo->wlen[words[w]];
228     break;
229   case PER_STATE:
230     jlog("ALIGN: === state alignment begin ===\n");
231     end_num = 0;
232     for(w=0;w<wnum;w++) {
233       for (i=0;i<winfo->wlen[words[w]]; i++) {
234 	end_num += hmm_logical_state_num(winfo->wseq[words[w]][i]) - 2;
235       }
236       if (hmminfo->multipath && enable_iwsp) {
237 	end_num += hmm_logical_state_num(hmminfo->sp) - 2;
238       }
239     }
240     phloc = (int *)mymalloc(sizeof(int)*end_num);
241     stloc = (int *)mymalloc(sizeof(int)*end_num);
242     {
243       n = 0;
244       p = 0;
245       for(w=0;w<wnum;w++) {
246 	for(i=0;i<winfo->wlen[words[w]]; i++) {
247 	  for(j=0; j<hmm_logical_state_num(winfo->wseq[words[w]][i]) - 2; j++) {
248 	    phloc[n] = p;
249 	    stloc[n] = j + 1;
250 	    n++;
251 	  }
252 	  if (hmminfo->multipath && enable_iwsp && i == winfo->wlen[words[w]] - 1) {
253 	    for(k=0;k<hmm_logical_state_num(hmminfo->sp)-2;k++) {
254 	      phloc[n] = p;
255 	      stloc[n] = j + 1 + k + end_num;
256 	      n++;
257 	    }
258 	  }
259 	  p++;
260 	}
261       }
262     }
263 
264     break;
265   }
266   end_state = (int *)mymalloc(sizeof(int) * end_num);
267 
268   /* make phoneme sequence word sequence */
269   phones = make_phseq(words, wnum, &has_sp, &phonenum, &end_state, per_what, r);
270   /* build the sentence HMMs */
271   shmm = new_make_word_hmm(hmminfo, phones, phonenum, has_sp);
272   if (shmm == NULL) {
273     j_internal_error("Error: failed to make word hmm for alignment\n");
274   }
275 
276   /* call viterbi segmentation function */
277   allscore = viterbi_segment(shmm, param, r->wchmm->hmmwrk, hmminfo->multipath, end_state, end_num, &id_seq, &end_frame, &end_score, &rlen);
278 
279   /* store result to s */
280   align->num = rlen;
281   align->unittype = per_what;
282   align->begin_frame = (int *)mymalloc(sizeof(int) * rlen);
283   align->end_frame   = (int *)mymalloc(sizeof(int) * rlen);
284   align->avgscore    = (LOGPROB *)mymalloc(sizeof(LOGPROB) * rlen);
285   for(i=0;i<rlen;i++) {
286     align->begin_frame[i] = (i == 0) ? 0 : end_frame[i-1] + 1;
287     align->end_frame[i]   = end_frame[i];
288     align->avgscore[i]    = end_score[i];
289   }
290   switch(per_what) {
291   case PER_WORD:
292     align->w = (WORD_ID *)mymalloc(sizeof(WORD_ID) * rlen);
293     for(i=0;i<rlen;i++) {
294       align->w[i] = words[id_seq[i]];
295     }
296     break;
297   case PER_PHONEME:
298     align->ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * rlen);
299     for(i=0;i<rlen;i++) {
300       align->ph[i] = phones[id_seq[i]];
301     }
302     break;
303   case PER_STATE:
304     align->ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * rlen);
305     align->loc = (short *)mymalloc(sizeof(short) * rlen);
306     if (hmminfo->multipath) align->is_iwsp = (boolean *)mymalloc(sizeof(boolean) * rlen);
307     for(i=0;i<rlen;i++) {
308       align->ph[i]  = phones[phloc[id_seq[i]]];
309       if (hmminfo->multipath) {
310 	if (enable_iwsp && stloc[id_seq[i]] > end_num) {
311 	  align->loc[i] = stloc[id_seq[i]] - end_num;
312 	  align->is_iwsp[i] = TRUE;
313 	} else {
314 	  align->loc[i] = stloc[id_seq[i]];
315 	  align->is_iwsp[i] = FALSE;
316 	}
317       } else {
318 	align->loc[i] = stloc[id_seq[i]];
319       }
320     }
321     break;
322   }
323 
324   align->allscore = allscore;
325 
326   free_hmm(shmm);
327   free(id_seq);
328   free(phones);
329   if (has_sp) free(has_sp);
330   free(end_score);
331   free(end_frame);
332   free(end_state);
333 
334   switch(per_what) {
335   case PER_WORD:
336     free(phloc);
337     break;
338   case PER_PHONEME:
339     break;
340   case PER_STATE:
341     free(phloc);
342     free(stloc);
343   }
344 
345 }
346 
347 /**
348  * <JA>
349  * ñ�줴�Ȥ� forced alignment ��Ԥ�.
350  *
351  * @param words [in] ñ����
352  * @param wnum [in] @a words ��ñ���
353  * @param param [in] ������ħ�٥��ȥ���
354  * @param align [out] ���饤����ȷ�̤��Ǽ����Sentence��¤��
355  * @param r [i/o] ǧ��������������
356  * </JA>
357  * <EN>
358  * Do forced alignment per word for the given word sequence.
359  *
360  * @param words [in] word sequence
361  * @param wnum [in] length of @a words
362  * @param param [in] input parameter vectors
363  * @param align [out] Sentence data area to store the alignment result
364  * @param r [i/o] recognition process instance
365  * </EN>
366  * @callgraph
367  * @callergraph
368  */
369 void
word_align(WORD_ID * words,short wnum,HTK_Param * param,SentenceAlign * align,RecogProcess * r)370 word_align(WORD_ID *words, short wnum, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
371 {
372   do_align(words, wnum, param, PER_WORD, align, r);
373 }
374 
375 /**
376  * <JA>
377  * ñ�줴�Ȥ� forced alignment ��Ԥ���ñ�줬�ս��Ϳ���������
378  *
379  * @param revwords [in] ñ����ʵս��
380  * @param wnum [in] @a revwords ��ñ���
381  * @param param [in] ������ħ�٥��ȥ���
382  * @param align [out] ���饤����ȷ�̤��Ǽ����Sentence��¤��
383  * @param r [i/o] ǧ��������������
384  * </JA>
385  * <EN>
386  * Do forced alignment per word for the given word sequence (reversed order).
387  *
388  * @param revwords [in] word sequence in reversed direction
389  * @param wnum [in] length of @a revwords
390  * @param param [in] input parameter vectors
391  * @param align [out] Sentence data area to store the alignment result
392  * @param r [i/o] recognition process instance
393  * </EN>
394  * @callgraph
395  * @callergraph
396  */
397 void
word_rev_align(WORD_ID * revwords,short wnum,HTK_Param * param,SentenceAlign * align,RecogProcess * r)398 word_rev_align(WORD_ID *revwords, short wnum, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
399 {
400   WORD_ID *words;		/* word sequence (true order) */
401   int w;
402   words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * wnum);
403   for (w=0;w<wnum;w++) words[w] = revwords[wnum-w-1];
404   do_align(words, wnum, param, PER_WORD, align, r);
405   free(words);
406 }
407 
408 /**
409  * <JA>
410  * ���Ǥ��Ȥ� forced alignment ��Ԥ�.
411  *
412  * @param words [in] ñ����
413  * @param num [in] @a words ��ñ���
414  * @param param [in] ������ħ�٥��ȥ���
415  * @param align [out] ���饤����ȷ�̤��Ǽ����Sentence��¤��
416  * @param r [i/o] ǧ��������������
417  * </JA>
418  * <EN>
419  * Do forced alignment per phoneme for the given word sequence.
420  *
421  * @param words [in] word sequence
422  * @param num [in] length of @a words
423  * @param param [in] input parameter vectors
424  * @param align [out] Sentence data area to store the alignment result
425  * @param r [i/o] recognition process instance
426  * </EN>
427  * @callgraph
428  * @callergraph
429  */
430 void
phoneme_align(WORD_ID * words,short num,HTK_Param * param,SentenceAlign * align,RecogProcess * r)431 phoneme_align(WORD_ID *words, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
432 {
433   do_align(words, num, param, PER_PHONEME, align, r);
434 }
435 
436 /**
437  * <JA>
438  * ���Ǥ��Ȥ� forced alignment ��Ԥ���ñ�줬�ս��Ϳ���������
439  *
440  * @param revwords [in] ñ����ʵս��
441  * @param num [in] @a revwords ��ñ���
442  * @param param [in] ������ħ�٥��ȥ���
443  * @param align [out] ���饤����ȷ�̤��Ǽ����Sentence��¤��
444  * @param r [i/o] ǧ��������������
445  * </JA>
446  * <EN>
447  * Do forced alignment per phoneme for the given word sequence (reversed order).
448  *
449  * @param revwords [in] word sequence in reversed direction
450  * @param num [in] length of @a revwords
451  * @param param [in] input parameter vectors
452  * @param align [out] Sentence data area to store the alignment result
453  * @param r [i/o] recognition process instance
454  * </EN>
455  * @callgraph
456  * @callergraph
457  */
458 void
phoneme_rev_align(WORD_ID * revwords,short num,HTK_Param * param,SentenceAlign * align,RecogProcess * r)459 phoneme_rev_align(WORD_ID *revwords, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
460 {
461   WORD_ID *words;		/* word sequence (true order) */
462   int p;
463   words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * num);
464   for (p=0;p<num;p++) words[p] = revwords[num-p-1];
465   do_align(words, num, param, PER_PHONEME, align, r);
466   free(words);
467 }
468 
469 /**
470  * <JA>
471  * HMM���֤��Ȥ� forced alignment ��Ԥ�.
472  *
473  * @param words [in] ñ����
474  * @param num [in] @a words ��ñ���
475  * @param param [in] ������ħ�٥��ȥ���
476  * @param align [out] ���饤����ȷ�̤��Ǽ����Sentence��¤��
477  * @param r [i/o] ǧ��������������
478  * </JA>
479  * <EN>
480  * Do forced alignment per HMM state for the given word sequence.
481  *
482  * @param words [in] word sequence
483  * @param num [in] length of @a words
484  * @param param [in] input parameter vectors
485  * @param align [out] Sentence data area to store the alignment result
486  * @param r [i/o] recognition process instance
487  * </EN>
488  * @callgraph
489  * @callergraph
490  */
491 void
state_align(WORD_ID * words,short num,HTK_Param * param,SentenceAlign * align,RecogProcess * r)492 state_align(WORD_ID *words, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
493 {
494   do_align(words, num, param, PER_STATE, align, r);
495 }
496 
497 /**
498  * <JA>
499  * HMM���֤��Ȥ� forced alignment ��Ԥ���ñ�줬�ս��Ϳ���������
500  *
501  * @param revwords [in] ñ����ʵս��
502  * @param num [in] @a revwords ��ñ���
503  * @param param [in] ������ħ�٥��ȥ���
504  * @param align [out] ���饤����ȷ�̤��Ǽ����Sentence��¤��
505  * @param r [i/o] ǧ��������������
506  * </JA>
507  * <EN>
508  * Do forced alignment per state for the given word sequence (reversed order).
509  *
510  * @param revwords [in] word sequence in reversed direction
511  * @param num [in] length of @a revwords
512  * @param param [in] input parameter vectors
513  * @param align [out] Sentence data area to store the alignment result
514  * @param r [i/o] recognition process instance
515  * </EN>
516  * @callgraph
517  * @callergraph
518  */
519 void
state_rev_align(WORD_ID * revwords,short num,HTK_Param * param,SentenceAlign * align,RecogProcess * r)520 state_rev_align(WORD_ID *revwords, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
521 {
522   WORD_ID *words;		/* word sequence (true order) */
523   int p;
524   words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * num);
525   for (p=0;p<num;p++) words[p] = revwords[num-p-1];
526   do_align(words, num, param, PER_STATE, align, r);
527   free(words);
528 }
529 
530 /**
531  * <JA>
532  * ǧ����̤��Ф���ɬ�פʥ��饤����Ȥ����Ƽ¹Ԥ��롥
533  *
534  * @param r [i/o] ǧ��������������
535  * @param param [in] ������ħ�٥��ȥ���
536  * </JA>
537  * <EN>
538  * Do required forced alignment for the recognition results
539  *
540  * @param r [i/o] recognition process instance
541  * @param param [in] input parameter vectors
542  * </EN>
543  * @callgraph
544  * @callergraph
545  */
546 void
do_alignment_all(RecogProcess * r,HTK_Param * param)547 do_alignment_all(RecogProcess *r, HTK_Param *param)
548 {
549   int n;
550   Sentence *s;
551   SentenceAlign *now, *prev;
552 
553   for(n = 0; n < r->result.sentnum; n++) {
554     s = &(r->result.sent[n]);
555     /* do forced alignment if needed */
556     if (r->config->annotate.align_result_word_flag) {
557       now = result_align_new();
558       word_align(s->word, s->word_num, param, now, r);
559       if (s->align == NULL) s->align = now;
560       else prev->next = now;
561       prev = now;
562     }
563     if (r->config->annotate.align_result_phoneme_flag) {
564       now = result_align_new();
565       phoneme_align(s->word, s->word_num, param, now, r);
566       if (s->align == NULL) s->align = now;
567       else prev->next = now;
568       prev = now;
569     }
570     if (r->config->annotate.align_result_state_flag) {
571       now = result_align_new();
572       state_align(s->word, s->word_num, param, now, r);
573       if (s->align == NULL) s->align = now;
574       else prev->next = now;
575       prev = now;
576     }
577   }
578 }
579 
580 /* end of file */
581