1 /**
2 * @file word_align.c
3 *
4 * <JA>
5 * @brief ñ�졦���ǡ�����ñ�̤Υ��饤�����
6 *
7 * �����Ǥϡ�ǧ����̤��Ф������ϲ����Υ��饤����Ȥ���Ϥ��뤿���
8 * �ؿ����������Ƥ��ޤ�.
9 *
10 * Julius/Julian �Ǥϡ�ǧ����̤ˤ����Ƥ���ñ��䲻�ǡ����뤤��HMM�ξ��֤�
11 * ���줾�����ϲ����Τɤζ�֤˥ޥå������Τ����Τ뤳�Ȥ��Ǥ��ޤ�.
12 * ������Τʥ��饤����Ȥ���뤿��ˡ�Julius/Julian �Ǥ�ǧ�����
13 * �����ޤ������Ѥ����ˡ�ǧ��������ä��������줿ǧ����̤�ñ�����
14 * �Ф��ơ����餿��� forced alignment ��¹Ԥ��Ƥ��ޤ�.
15 * </JA>
16 *
17 * <EN>
18 * @brief Forced alignment by word / phoneme / state unit.
19 *
20 * This file defines functions for performing forced alignment of
21 * recognized words. The forced alignment is implemented in Julius/Julian
22 * to get the best matching segmentation of recognized word sequence
23 * upon input speech. Word-level, phoneme-level and HMM state-level
24 * alignment can be obtained.
25 *
26 * Julius/Julian performs the forced alignment as a post-processing of
27 * recognition process. Recomputation of Viterbi path on the recognized
28 * word sequence toward input speech will be done after the recognition
29 * to get better alignment.
30 *
31 * </EN>
32 *
33 * @author Akinobu Lee
34 * @date Sat Sep 24 16:09:46 2005
35 *
36 * $Revision: 1.5 $
37 *
38 */
39 /*
40 * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
41 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
42 * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
43 * All rights reserved
44 */
45
46 #include <julius/julius.h>
47
48 /**
49 * <JA>
50 * Ϳ����줿ñ����HMM��Ϣ�뤷��ʸ���Τ�HMM���ۤ���.
51 *
52 * @param wseq [in] ��
53 * @param num [in] @a wseq �ο�
54 * @param has_sp_ret [out] ���硼�ȥݡ������³�������������˥åȤξ���
55 * @param num_ret [out] ���ۤ��줿HMM�˴ޤޤ�벻��HMM�ο�
56 * @param end_ret [out] ���饤����Ȥζ��ڤ�Ȥʤ�����ֹ����
57 * @param per_what [in] ñ�졦���ǡ����֤Τɤ�ñ�̤ǥ��饤����Ȥ��뤫�����
58 * @param r [in] ǧ������������
59 *
60 * @return ���餿�˳���դ���줿ʸ���Τ�魯HMM��ǥ���ؤΥݥ����֤�.
61 * </JA>
62 * <EN>
63 * Make the whole sentence HMM from given word sequence by connecting
64 * each phoneme HMM.
65 *
66 * @param wseq [in] word sequence to align
67 * @param num [in] number of @a wseq
68 * @param has_sp_ret [out] unit information of whether it can be followed by a short-pause
69 * @param num_ret [out] number of HMM contained in the generated sentence HMM
70 * @param end_ret [out] sequence of state location as alignment unit
71 * @param per_what [in] specify the alignment unit (word / phoneme / state)
72 * @param r [in] recognition process instance
73 *
74 * @return newly malloced HMM sequences.
75 * </EN>
76 */
77 static HMM_Logical **
make_phseq(WORD_ID * wseq,short num,boolean ** has_sp_ret,int * num_ret,int ** end_ret,int per_what,RecogProcess * r)78 make_phseq(WORD_ID *wseq, short num, boolean **has_sp_ret, int *num_ret, int **end_ret, int per_what,
79 RecogProcess *r)
80 {
81 HMM_Logical **ph; /* phoneme sequence */
82 boolean *has_sp;
83 int k;
84 int phnum; /* num of above */
85 WORD_ID tmpw, w;
86 int i, j, pn, st, endn;
87 HMM_Logical *tmpp, *ret;
88 WORD_INFO *winfo;
89 HTK_HMM_INFO *hmminfo;
90 boolean enable_iwsp; /* for multipath */
91
92 winfo = r->lm->winfo;
93 hmminfo = r->am->hmminfo;
94 if (hmminfo->multipath) enable_iwsp = r->lm->config->enable_iwsp;
95
96 /* make ph[] from wseq[] */
97 /* 1. calc total phone num and malloc */
98 phnum = 0;
99 for (w=0;w<num;w++) phnum += winfo->wlen[wseq[w]];
100 ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * phnum);
101
102 if (hmminfo->multipath && enable_iwsp) {
103 has_sp = (boolean *)mymalloc(sizeof(boolean) * phnum);
104 } else {
105 has_sp = NULL;
106 }
107 /* 2. make phoneme sequence */
108 st = 0;
109 if (hmminfo->multipath) st++;
110 pn = 0;
111 endn = 0;
112 for (w=0;w<num;w++) {
113 tmpw = wseq[w];
114 for (i=0;i<winfo->wlen[tmpw];i++) {
115 tmpp = winfo->wseq[tmpw][i];
116 /* handle cross-word context dependency */
117 if (r->ccd_flag) {
118 if (w > 0 && i == 0) { /* word head */
119
120 if ((ret = get_left_context_HMM(tmpp, ph[pn-1]->name, hmminfo)) != NULL) {
121 tmpp = ret;
122 }
123 /* if triphone not found, fallback to bi/mono-phone */
124 /* use pseudo phone when no bi-phone found in alignment... */
125 }
126 if (w < num-1 && i == winfo->wlen[tmpw] - 1) { /* word tail */
127 if ((ret = get_right_context_HMM(tmpp, winfo->wseq[wseq[w+1]][0]->name, hmminfo)) != NULL) {
128 tmpp = ret;
129 }
130 }
131 }
132 ph[pn] = tmpp;
133 if (hmminfo->multipath && enable_iwsp) {
134 if (i == winfo->wlen[tmpw] - 1) {
135 has_sp[pn] = TRUE;
136 } else {
137 has_sp[pn] = FALSE;
138 }
139 }
140 if (per_what == PER_STATE) {
141 for (j=0;j<hmm_logical_state_num(tmpp)-2;j++) {
142 (*end_ret)[endn++] = st + j;
143 }
144 if (hmminfo->multipath && enable_iwsp && has_sp[pn]) {
145 for (k=0;k<hmm_logical_state_num(hmminfo->sp)-2;k++) {
146 (*end_ret)[endn++] = st + j + k;
147 }
148 }
149 }
150 st += hmm_logical_state_num(tmpp) - 2;
151 if (hmminfo->multipath && enable_iwsp && has_sp[pn]) {
152 st += hmm_logical_state_num(hmminfo->sp) - 2;
153 }
154 if (per_what == PER_PHONEME) (*end_ret)[endn++] = st - 1;
155 pn++;
156 }
157 if (per_what == PER_WORD) (*end_ret)[endn++] = st - 1;
158 }
159 *num_ret = phnum;
160 *has_sp_ret = has_sp;
161 return ph;
162 }
163
164
165 /**
166 * <JA>
167 * ʸ���Τ�HMM���ۤ���Viterbi���饤����Ȥ�¹Ԥ�����̤���Ϥ���.
168 *
169 * @param words [in] ʸ�����魯ñ����
170 * @param wnum [in] @a words ����
171 * @param param [in] ������ħ�ѥ�����
172 * @param per_what [in] ñ�졦���ǡ����֤Τɤ�ñ�̤ǥ��饤����Ȥ��뤫�����
173 * @param align [out] ���饤����ȷ�̤��Ǽ����Sentence��¤��
174 * @param r [i/o] ǧ������������
175 * </JA>
176 * <EN>
177 * Build sentence HMM, call viterbi_segment() and output result.
178 *
179 * @param words [in] word sequence of the sentence
180 * @param wnum [in] number of words in @a words
181 * @param param [in] input parameter vector
182 * @param per_what [in] specify the alignment unit (word / phoneme / state)
183 * @param align [out] Sentence data area to store the alignment result
184 * @param r [i/o] recognition process instance
185 * </EN>
186 */
187 static void
do_align(WORD_ID * words,short wnum,HTK_Param * param,int per_what,SentenceAlign * align,RecogProcess * r)188 do_align(WORD_ID *words, short wnum, HTK_Param *param, int per_what, SentenceAlign *align, RecogProcess *r)
189 {
190 HMM_Logical **phones; /* phoneme sequence */
191 boolean *has_sp; /* whether phone can follow short pause */
192 int k;
193 int phonenum; /* num of above */
194 HMM *shmm; /* sentence HMM */
195 int *end_state; /* state number of word ends */
196 int *end_frame; /* segmented last frame of words */
197 LOGPROB *end_score; /* normalized score of each words */
198 LOGPROB allscore; /* total score of this word sequence */
199 WORD_ID w;
200 int i, rlen;
201 int end_num = 0;
202 int *id_seq, *phloc = NULL, *stloc = NULL;
203 int j,n,p;
204 WORD_INFO *winfo;
205 HTK_HMM_INFO *hmminfo;
206 boolean enable_iwsp; /* for multipath */
207
208 winfo = r->lm->winfo;
209 hmminfo = r->am->hmminfo;
210 if (hmminfo->multipath) enable_iwsp = r->lm->config->enable_iwsp;
211
212 /* initialize result storage buffer */
213 switch(per_what) {
214 case PER_WORD:
215 jlog("ALIGN: === word alignment begin ===\n");
216 end_num = wnum;
217 phloc = (int *)mymalloc(sizeof(int)*wnum);
218 i = 0;
219 for(w=0;w<wnum;w++) {
220 phloc[w] = i;
221 i += winfo->wlen[words[w]];
222 }
223 break;
224 case PER_PHONEME:
225 jlog("ALIGN: === phoneme alignment begin ===\n");
226 end_num = 0;
227 for(w=0;w<wnum;w++) end_num += winfo->wlen[words[w]];
228 break;
229 case PER_STATE:
230 jlog("ALIGN: === state alignment begin ===\n");
231 end_num = 0;
232 for(w=0;w<wnum;w++) {
233 for (i=0;i<winfo->wlen[words[w]]; i++) {
234 end_num += hmm_logical_state_num(winfo->wseq[words[w]][i]) - 2;
235 }
236 if (hmminfo->multipath && enable_iwsp) {
237 end_num += hmm_logical_state_num(hmminfo->sp) - 2;
238 }
239 }
240 phloc = (int *)mymalloc(sizeof(int)*end_num);
241 stloc = (int *)mymalloc(sizeof(int)*end_num);
242 {
243 n = 0;
244 p = 0;
245 for(w=0;w<wnum;w++) {
246 for(i=0;i<winfo->wlen[words[w]]; i++) {
247 for(j=0; j<hmm_logical_state_num(winfo->wseq[words[w]][i]) - 2; j++) {
248 phloc[n] = p;
249 stloc[n] = j + 1;
250 n++;
251 }
252 if (hmminfo->multipath && enable_iwsp && i == winfo->wlen[words[w]] - 1) {
253 for(k=0;k<hmm_logical_state_num(hmminfo->sp)-2;k++) {
254 phloc[n] = p;
255 stloc[n] = j + 1 + k + end_num;
256 n++;
257 }
258 }
259 p++;
260 }
261 }
262 }
263
264 break;
265 }
266 end_state = (int *)mymalloc(sizeof(int) * end_num);
267
268 /* make phoneme sequence word sequence */
269 phones = make_phseq(words, wnum, &has_sp, &phonenum, &end_state, per_what, r);
270 /* build the sentence HMMs */
271 shmm = new_make_word_hmm(hmminfo, phones, phonenum, has_sp);
272 if (shmm == NULL) {
273 j_internal_error("Error: failed to make word hmm for alignment\n");
274 }
275
276 /* call viterbi segmentation function */
277 allscore = viterbi_segment(shmm, param, r->wchmm->hmmwrk, hmminfo->multipath, end_state, end_num, &id_seq, &end_frame, &end_score, &rlen);
278
279 /* store result to s */
280 align->num = rlen;
281 align->unittype = per_what;
282 align->begin_frame = (int *)mymalloc(sizeof(int) * rlen);
283 align->end_frame = (int *)mymalloc(sizeof(int) * rlen);
284 align->avgscore = (LOGPROB *)mymalloc(sizeof(LOGPROB) * rlen);
285 for(i=0;i<rlen;i++) {
286 align->begin_frame[i] = (i == 0) ? 0 : end_frame[i-1] + 1;
287 align->end_frame[i] = end_frame[i];
288 align->avgscore[i] = end_score[i];
289 }
290 switch(per_what) {
291 case PER_WORD:
292 align->w = (WORD_ID *)mymalloc(sizeof(WORD_ID) * rlen);
293 for(i=0;i<rlen;i++) {
294 align->w[i] = words[id_seq[i]];
295 }
296 break;
297 case PER_PHONEME:
298 align->ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * rlen);
299 for(i=0;i<rlen;i++) {
300 align->ph[i] = phones[id_seq[i]];
301 }
302 break;
303 case PER_STATE:
304 align->ph = (HMM_Logical **)mymalloc(sizeof(HMM_Logical *) * rlen);
305 align->loc = (short *)mymalloc(sizeof(short) * rlen);
306 if (hmminfo->multipath) align->is_iwsp = (boolean *)mymalloc(sizeof(boolean) * rlen);
307 for(i=0;i<rlen;i++) {
308 align->ph[i] = phones[phloc[id_seq[i]]];
309 if (hmminfo->multipath) {
310 if (enable_iwsp && stloc[id_seq[i]] > end_num) {
311 align->loc[i] = stloc[id_seq[i]] - end_num;
312 align->is_iwsp[i] = TRUE;
313 } else {
314 align->loc[i] = stloc[id_seq[i]];
315 align->is_iwsp[i] = FALSE;
316 }
317 } else {
318 align->loc[i] = stloc[id_seq[i]];
319 }
320 }
321 break;
322 }
323
324 align->allscore = allscore;
325
326 free_hmm(shmm);
327 free(id_seq);
328 free(phones);
329 if (has_sp) free(has_sp);
330 free(end_score);
331 free(end_frame);
332 free(end_state);
333
334 switch(per_what) {
335 case PER_WORD:
336 free(phloc);
337 break;
338 case PER_PHONEME:
339 break;
340 case PER_STATE:
341 free(phloc);
342 free(stloc);
343 }
344
345 }
346
347 /**
348 * <JA>
349 * ñ�줴�Ȥ� forced alignment ��Ԥ�.
350 *
351 * @param words [in] ��
352 * @param wnum [in] @a words ���
353 * @param param [in] ������ħ�٥��ȥ���
354 * @param align [out] ���饤����ȷ�̤��Ǽ����Sentence��¤��
355 * @param r [i/o] ǧ������������
356 * </JA>
357 * <EN>
358 * Do forced alignment per word for the given word sequence.
359 *
360 * @param words [in] word sequence
361 * @param wnum [in] length of @a words
362 * @param param [in] input parameter vectors
363 * @param align [out] Sentence data area to store the alignment result
364 * @param r [i/o] recognition process instance
365 * </EN>
366 * @callgraph
367 * @callergraph
368 */
369 void
word_align(WORD_ID * words,short wnum,HTK_Param * param,SentenceAlign * align,RecogProcess * r)370 word_align(WORD_ID *words, short wnum, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
371 {
372 do_align(words, wnum, param, PER_WORD, align, r);
373 }
374
375 /**
376 * <JA>
377 * ñ�줴�Ȥ� forced alignment ��Ԥ���ñ�줬�ս��Ϳ���������
378 *
379 * @param revwords [in] ñ����ʵս��
380 * @param wnum [in] @a revwords ���
381 * @param param [in] ������ħ�٥��ȥ���
382 * @param align [out] ���饤����ȷ�̤��Ǽ����Sentence��¤��
383 * @param r [i/o] ǧ������������
384 * </JA>
385 * <EN>
386 * Do forced alignment per word for the given word sequence (reversed order).
387 *
388 * @param revwords [in] word sequence in reversed direction
389 * @param wnum [in] length of @a revwords
390 * @param param [in] input parameter vectors
391 * @param align [out] Sentence data area to store the alignment result
392 * @param r [i/o] recognition process instance
393 * </EN>
394 * @callgraph
395 * @callergraph
396 */
397 void
word_rev_align(WORD_ID * revwords,short wnum,HTK_Param * param,SentenceAlign * align,RecogProcess * r)398 word_rev_align(WORD_ID *revwords, short wnum, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
399 {
400 WORD_ID *words; /* word sequence (true order) */
401 int w;
402 words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * wnum);
403 for (w=0;w<wnum;w++) words[w] = revwords[wnum-w-1];
404 do_align(words, wnum, param, PER_WORD, align, r);
405 free(words);
406 }
407
408 /**
409 * <JA>
410 * ���Ǥ��Ȥ� forced alignment ��Ԥ�.
411 *
412 * @param words [in] ��
413 * @param num [in] @a words ���
414 * @param param [in] ������ħ�٥��ȥ���
415 * @param align [out] ���饤����ȷ�̤��Ǽ����Sentence��¤��
416 * @param r [i/o] ǧ������������
417 * </JA>
418 * <EN>
419 * Do forced alignment per phoneme for the given word sequence.
420 *
421 * @param words [in] word sequence
422 * @param num [in] length of @a words
423 * @param param [in] input parameter vectors
424 * @param align [out] Sentence data area to store the alignment result
425 * @param r [i/o] recognition process instance
426 * </EN>
427 * @callgraph
428 * @callergraph
429 */
430 void
phoneme_align(WORD_ID * words,short num,HTK_Param * param,SentenceAlign * align,RecogProcess * r)431 phoneme_align(WORD_ID *words, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
432 {
433 do_align(words, num, param, PER_PHONEME, align, r);
434 }
435
436 /**
437 * <JA>
438 * ���Ǥ��Ȥ� forced alignment ��Ԥ���ñ�줬�ս��Ϳ���������
439 *
440 * @param revwords [in] ñ����ʵս��
441 * @param num [in] @a revwords ���
442 * @param param [in] ������ħ�٥��ȥ���
443 * @param align [out] ���饤����ȷ�̤��Ǽ����Sentence��¤��
444 * @param r [i/o] ǧ������������
445 * </JA>
446 * <EN>
447 * Do forced alignment per phoneme for the given word sequence (reversed order).
448 *
449 * @param revwords [in] word sequence in reversed direction
450 * @param num [in] length of @a revwords
451 * @param param [in] input parameter vectors
452 * @param align [out] Sentence data area to store the alignment result
453 * @param r [i/o] recognition process instance
454 * </EN>
455 * @callgraph
456 * @callergraph
457 */
458 void
phoneme_rev_align(WORD_ID * revwords,short num,HTK_Param * param,SentenceAlign * align,RecogProcess * r)459 phoneme_rev_align(WORD_ID *revwords, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
460 {
461 WORD_ID *words; /* word sequence (true order) */
462 int p;
463 words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * num);
464 for (p=0;p<num;p++) words[p] = revwords[num-p-1];
465 do_align(words, num, param, PER_PHONEME, align, r);
466 free(words);
467 }
468
469 /**
470 * <JA>
471 * HMM���֤��Ȥ� forced alignment ��Ԥ�.
472 *
473 * @param words [in] ��
474 * @param num [in] @a words ���
475 * @param param [in] ������ħ�٥��ȥ���
476 * @param align [out] ���饤����ȷ�̤��Ǽ����Sentence��¤��
477 * @param r [i/o] ǧ������������
478 * </JA>
479 * <EN>
480 * Do forced alignment per HMM state for the given word sequence.
481 *
482 * @param words [in] word sequence
483 * @param num [in] length of @a words
484 * @param param [in] input parameter vectors
485 * @param align [out] Sentence data area to store the alignment result
486 * @param r [i/o] recognition process instance
487 * </EN>
488 * @callgraph
489 * @callergraph
490 */
491 void
state_align(WORD_ID * words,short num,HTK_Param * param,SentenceAlign * align,RecogProcess * r)492 state_align(WORD_ID *words, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
493 {
494 do_align(words, num, param, PER_STATE, align, r);
495 }
496
497 /**
498 * <JA>
499 * HMM���֤��Ȥ� forced alignment ��Ԥ���ñ�줬�ս��Ϳ���������
500 *
501 * @param revwords [in] ñ����ʵս��
502 * @param num [in] @a revwords ���
503 * @param param [in] ������ħ�٥��ȥ���
504 * @param align [out] ���饤����ȷ�̤��Ǽ����Sentence��¤��
505 * @param r [i/o] ǧ������������
506 * </JA>
507 * <EN>
508 * Do forced alignment per state for the given word sequence (reversed order).
509 *
510 * @param revwords [in] word sequence in reversed direction
511 * @param num [in] length of @a revwords
512 * @param param [in] input parameter vectors
513 * @param align [out] Sentence data area to store the alignment result
514 * @param r [i/o] recognition process instance
515 * </EN>
516 * @callgraph
517 * @callergraph
518 */
519 void
state_rev_align(WORD_ID * revwords,short num,HTK_Param * param,SentenceAlign * align,RecogProcess * r)520 state_rev_align(WORD_ID *revwords, short num, HTK_Param *param, SentenceAlign *align, RecogProcess *r)
521 {
522 WORD_ID *words; /* word sequence (true order) */
523 int p;
524 words = (WORD_ID *)mymalloc(sizeof(WORD_ID) * num);
525 for (p=0;p<num;p++) words[p] = revwords[num-p-1];
526 do_align(words, num, param, PER_STATE, align, r);
527 free(words);
528 }
529
530 /**
531 * <JA>
532 * ǧ����̤��Ф���ɬ�פʥ��饤����Ȥ����Ƽ¹Ԥ��롥
533 *
534 * @param r [i/o] ǧ������������
535 * @param param [in] ������ħ�٥��ȥ���
536 * </JA>
537 * <EN>
538 * Do required forced alignment for the recognition results
539 *
540 * @param r [i/o] recognition process instance
541 * @param param [in] input parameter vectors
542 * </EN>
543 * @callgraph
544 * @callergraph
545 */
546 void
do_alignment_all(RecogProcess * r,HTK_Param * param)547 do_alignment_all(RecogProcess *r, HTK_Param *param)
548 {
549 int n;
550 Sentence *s;
551 SentenceAlign *now, *prev;
552
553 for(n = 0; n < r->result.sentnum; n++) {
554 s = &(r->result.sent[n]);
555 /* do forced alignment if needed */
556 if (r->config->annotate.align_result_word_flag) {
557 now = result_align_new();
558 word_align(s->word, s->word_num, param, now, r);
559 if (s->align == NULL) s->align = now;
560 else prev->next = now;
561 prev = now;
562 }
563 if (r->config->annotate.align_result_phoneme_flag) {
564 now = result_align_new();
565 phoneme_align(s->word, s->word_num, param, now, r);
566 if (s->align == NULL) s->align = now;
567 else prev->next = now;
568 prev = now;
569 }
570 if (r->config->annotate.align_result_state_flag) {
571 now = result_align_new();
572 state_align(s->word, s->word_num, param, now, r);
573 if (s->align == NULL) s->align = now;
574 else prev->next = now;
575 prev = now;
576 }
577 }
578 }
579
580 /* end of file */
581