1 /* ====================================================================
2 * Copyright (c) 1999-2004 Carnegie Mellon University. All rights
3 * reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * This work was supported in part by funding from the Defense Advanced
18 * Research Projects Agency and the National Science Foundation of the
19 * United States of America, and the CMU Sphinx Speech Consortium.
20 *
21 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
22 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
25 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 *
33 * ====================================================================
34 *
35 */
36 /*
37 * fast_algo_struct.c -- Various forms of pruning beam
38 *
39 * **********************************************
40 * CMU ARPA Speech Project
41 *
42 * Copyright (c) 1999 Carnegie Mellon University.
43 * ALL RIGHTS RESERVED.
44 * **********************************************
45 *
46 * HISTORY
47 * $Log$
48 * Revision 1.7 2006/02/22 16:39:43 arthchan2003
49 * Merged from SPHINX3_5_2_RCI_IRII_BRANCH: 1, Initialize beam->n_ciphone properly, 2, use ckd_free instead of free, use float64 for subvqbeam and cipbeam. 3, Add a proper free function for fast_gmm_free
50 *
51 * Revision 1.5.4.4 2005/10/17 04:43:57 arthchan2003
52 * Free fast_gmm_t.
53 *
54 * Revision 1.5.4.3 2005/09/25 19:35:26 arthchan2003
55 * Change realloc to calloc. Could be permanent if we found that there is no need to reallocate the array.
56 *
57 * Revision 1.5.4.2 2005/07/24 01:29:54 arthchan2003
58 * Set #ci phone.
59 *
60 * Revision 1.5.4.1 2005/07/03 22:53:15 arthchan2003
61 * 1, Changed free to ckd_free, 2, Join from HEAD, using float64 instead of float32.
62 *
63 * Revision 1.6 2005/06/30 13:08:44 egouvea
64 * Beams in linear scale have to be float64, since they can be easily defined as < 1e-40
65 *
66 * Revision 1.5 2005/06/21 18:26:38 arthchan2003
67 * Log. fast_algo_struct.c go through major changes in the gentle
68 * refactoring process. It is the location of several wrapper structures
69 * that control fast search. That includes beam_t for storing beams and
70 * scores. pl_t for storing structure for phoneme lookahead, histprune_t
71 * for storing structures for histogram pruning. Lastly
72 * fast_algo_struct_t, for storing structures for fast GMM
73 * computation.
74 *
75 * Log. General Remark All of them now has consistent inteface, _init,
76 * _report and _free. They are respectively used for allocation,
77 * reporting and deallocation of the routine. Doxygen documentation are
78 * fixed for all structures.
79 *
80 * Log. Individual changes; beam_t start to own bestscore, bestwordscore,
81 * wordbestscores, wordbestexits. They were owned by kb_t. histprune_t
82 * now wrapped up maxwpf, maxhmmpdf, maxhistpf and
83 * hmm_hist_binsize. Currently, the beam size determination routine is
84 * controlled by search implementation modules. It is done because
85 * wrapping that operation up means we may need to introduce a bridge
86 * between beam_t and histprune_t. pl_t is now owning heuristic type,
87 * the phoneme lookahead beam size. It also wrapped up phoneme heuristic
88 * computation.
89 *
90 * Revision 1.6 2005/04/21 23:50:26 archan
91 * Some more refactoring on the how reporting of structures inside kbcore_t is done, it is now 50% nice. Also added class-based LM test case into test-decode.sh.in. At this moment, everything in search mode 5 is already done. It is time to test the idea whether the search can really be used.
92 *
93 * Revision 1.5 2005/04/20 03:33:54 archan
94 * Remove pl_win and pl_win_strt, Now consider them as the parameters of the search abstraction in srch.c
95 *
96 * Revision 1.4 2005/03/30 01:22:46 archan
97 * Fixed mistakes in last updates. Add
98 *
99 *
100 * 09-Feb-2000 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
101 * Started.
102 */
103
104
105 #include "fast_algo_struct.h"
106 #include "logs3.h"
107
108
109 beam_t *
beam_init(float64 hmm,float64 ptr,float64 wd,float64 wdend,int32 ptranskip,int32 n_ciphone,logmath_t * logmath)110 beam_init(float64 hmm, float64 ptr, float64 wd, float64 wdend,
111 int32 ptranskip, int32 n_ciphone, logmath_t *logmath)
112 {
113 beam_t *beam;
114
115 beam = (beam_t *) ckd_calloc(1, sizeof(beam_t));
116
117 beam->hmm = logs3(logmath, hmm);
118 beam->ptrans = logs3(logmath, ptr);
119 beam->word = logs3(logmath, wd);
120 beam->wordend = logs3(logmath, wdend);
121 beam->ptranskip = ptranskip;
122 beam->bestscore = MAX_NEG_INT32;
123 beam->bestwordscore = MAX_NEG_INT32;
124 beam->n_ciphone = n_ciphone;
125
126 beam->wordbestscores = (int32 *) ckd_calloc(n_ciphone, sizeof(int32));
127 beam->wordbestexits = (int32 *) ckd_calloc(n_ciphone, sizeof(int32));
128
129 return beam;
130 }
131
132 void
beam_report(beam_t * b)133 beam_report(beam_t * b)
134 {
135 E_INFO_NOFN("Initialization of beam_t, report:\n");
136 E_INFO_NOFN("Parameters used in Beam Pruning of Viterbi Search:\n");
137 E_INFO_NOFN("Beam=%d\n", b->hmm);
138 E_INFO_NOFN("PBeam=%d\n", b->ptrans);
139 E_INFO_NOFN("WBeam=%d (Skip=%d)\n", b->word, b->ptranskip);
140 E_INFO_NOFN("WEndBeam=%d \n", b->wordend);
141 E_INFO_NOFN("No of CI Phone assumed=%d \n", b->n_ciphone);
142 E_INFO_NOFN("\n");
143 }
144
145 void
beam_free(beam_t * b)146 beam_free(beam_t * b)
147 {
148 if (b) {
149 if (b->wordbestscores) {
150 free(b->wordbestscores);
151 }
152 if (b->wordbestexits) {
153 free(b->wordbestexits);
154 }
155 free(b);
156 }
157 }
158
159 pl_t *
pl_init(int32 pheurtype,int32 pl_beam,int32 n_ciphone,logmath_t * logmath)160 pl_init(int32 pheurtype, int32 pl_beam, int32 n_ciphone, logmath_t *logmath)
161 {
162 pl_t *pl;
163 pl = (pl_t *) ckd_calloc(1, sizeof(pl_t));
164
165 pl->pheurtype = pheurtype;
166 pl->pl_beam = logs3(logmath, pl_beam);
167 pl->n_ciphone = n_ciphone;
168 pl->phn_heur_list = (int32 *) ckd_calloc(pl->n_ciphone, sizeof(int32));
169
170 return pl;
171 }
172
173 void
pl_free(pl_t * pl)174 pl_free(pl_t * pl)
175 {
176 if (pl) {
177 if (pl->phn_heur_list)
178 ckd_free((void *) pl->phn_heur_list);
179
180 ckd_free((void *) pl);
181 }
182 }
183
184 void
pl_report(pl_t * pl)185 pl_report(pl_t * pl)
186 {
187
188 E_INFO_NOFN("Initialization of pl_t, report:\n");
189 E_INFO_NOFN("Parameters used in phoneme lookahead:\n");
190 E_INFO_NOFN("Phoneme look-ahead type = %d\n", pl->pheurtype);
191 E_INFO_NOFN("Phoneme look-ahead beam size = %d\n", pl->pl_beam);
192 E_INFO_NOFN("No of CI Phones assumed=%d \n", pl->n_ciphone);
193 E_INFO_NOFN("\n");
194 }
195
196 /* Determine which set of phonemes should be active in next stage
197 using the lookahead information*/
198 /* Notice that this loop can be further optimized by implementing
199 it incrementally*/
200 /* ARCHAN and JSHERWAN Eventually, this is implemented as a function */
201 /* Notice that this loop can be further optimized by implementing it incrementally*/
202
203 #define _CHECKUNDERFLOW_ 1
204
205 static int32
NO_UFLOW_ADD(int32 a,int32 b)206 NO_UFLOW_ADD(int32 a, int32 b)
207 {
208 int32 c;
209 #ifdef _CHECKUNDERFLOW_
210 c = a + b;
211 c = (c > 0 && a < 0 && b < 0) ? MAX_NEG_INT32 : c;
212 #else
213 c = a + b;
214 #endif
215 return c;
216 }
217
218
219 void
pl_computePhnHeur(mdef_t * md,ascr_t * a,pl_t * pl,int32 heutype,int32 win_strt,int32 win_efv)220 pl_computePhnHeur(mdef_t * md, ascr_t * a, pl_t * pl, int32 heutype,
221 int32 win_strt, int32 win_efv)
222 {
223 int32 nState;
224 int32 i, j;
225 int32 curPhn, curFrmPhnVar; /* variables for phoneme lookahead computation */
226 int32 *ph_lst;
227
228 nState = mdef_n_emit_state(md);
229 ph_lst = pl->phn_heur_list;
230
231 /* Initializing all the phoneme heuristics for each phone to be 0 */
232 for (j = 0; j == md->cd2cisen[j]; j++) {
233 curPhn = md->sen2cimap[j]; /*Just to save a warning */
234 ph_lst[curPhn] = 0;
235 }
236
237 /* 20040503: ARCHAN, the code can be reduced to 10 lines, it is so
238 organized such that there is no overhead in checking the
239 heuristic type in the inner loop.
240 */
241 /* One trick we use is to use sen2cimap to check phoneme ending boundary */
242
243
244 if (heutype == 1) { /* Taking Max */
245 for (i = win_strt; i < win_efv; i++) {
246 curPhn = 0;
247 curFrmPhnVar = MAX_NEG_INT32;
248 for (j = 0; j == md->cd2cisen[j]; j++) {
249 if (curFrmPhnVar < a->cache_ci_senscr[i][j])
250 curFrmPhnVar = a->cache_ci_senscr[i][j];
251
252 curPhn = md->sen2cimap[j];
253 /* Update at the phone_end boundary */
254 if (curPhn != md->sen2cimap[j + 1]) {
255 ph_lst[curPhn] =
256 NO_UFLOW_ADD(ph_lst[curPhn], curFrmPhnVar);
257 curFrmPhnVar = MAX_NEG_INT32;
258 }
259 }
260 }
261 }
262 else if (heutype == 2) {
263 for (i = win_strt; i < win_efv; i++) {
264 curPhn = 0;
265 curFrmPhnVar = MAX_NEG_INT32;
266 for (j = 0; j == md->cd2cisen[j]; j++) {
267 curFrmPhnVar =
268 NO_UFLOW_ADD(a->cache_ci_senscr[i][j], curFrmPhnVar);
269 curPhn = md->sen2cimap[j];
270
271 /* Update at the phone_end boundary */
272 if (curPhn != md->sen2cimap[j + 1]) {
273 curFrmPhnVar /= nState; /* ARCHAN: I hate to do division ! */
274 ph_lst[curPhn] =
275 NO_UFLOW_ADD(ph_lst[curPhn], curFrmPhnVar);
276 curFrmPhnVar = MAX_NEG_INT32;
277 }
278 }
279 }
280 }
281 else if (heutype == 3) {
282 for (i = win_strt; i < win_efv; i++) {
283 curPhn = 0;
284 curFrmPhnVar = MAX_NEG_INT32;
285 for (j = 0; j == md->cd2cisen[j]; j++) {
286 if (curPhn == 0 || curPhn != md->sen2cimap[j - 1]) /* dangerous hack! */
287 ph_lst[curPhn] =
288 NO_UFLOW_ADD(ph_lst[curPhn],
289 a->cache_ci_senscr[i][j]);
290
291 curPhn = md->sen2cimap[j];
292
293 if (curFrmPhnVar < a->cache_ci_senscr[i][j])
294 curFrmPhnVar = a->cache_ci_senscr[i][j];
295
296 /* Update at the phone_end boundary */
297 if (md->sen2cimap[j] != md->sen2cimap[j + 1]) {
298 ph_lst[curPhn] =
299 NO_UFLOW_ADD(ph_lst[curPhn], curFrmPhnVar);
300 curFrmPhnVar = MAX_NEG_INT32;
301 }
302 }
303 }
304 }
305
306 #if 0
307 for (j = 0; j == md->cd2cisen[j]; j++) {
308 curPhn = md->cd2cisen[j];
309 E_INFO("phoneme heuristics scores at phn %d is %d\n", j,
310 kb->phn_list[mdef->sen2cimap[j]]);
311 }
312 #endif
313
314
315 }
316
317
318 histprune_t *
histprune_init(int32 maxhmm,int32 maxhist,int32 maxword,int32 hmmhistbinsize,int32 numNodes)319 histprune_init(int32 maxhmm, int32 maxhist, int32 maxword,
320 int32 hmmhistbinsize, int32 numNodes)
321 {
322 histprune_t *h;
323 int32 n;
324
325 h = (histprune_t *) ckd_calloc(1, sizeof(histprune_t));
326 h->maxwpf = maxword;
327 h->maxhmmpf = maxhmm;
328 h->maxhistpf = maxhist;
329 h->hmm_hist_binsize = hmmhistbinsize;
330
331 n = numNodes;
332 n /= h->hmm_hist_binsize;
333
334 h->hmm_hist_bins = n + 1;
335
336 h->hmm_hist = (int32 *) ckd_calloc(h->hmm_hist_bins, sizeof(int32));
337
338
339 return h;
340 }
341
342 void
histprune_zero_histbin(histprune_t * h)343 histprune_zero_histbin(histprune_t * h)
344 {
345 int32 *hmm_hist;
346 int32 numhistbins; /* Local version of number of histogram bins, don't expect it to change */
347 int32 i;
348
349 hmm_hist = h->hmm_hist;
350 numhistbins = h->hmm_hist_bins;
351
352 for (i = 0; i < numhistbins; i++)
353 hmm_hist[i] = 0;
354
355 }
356
357
358 void
histprune_update_histbinsize(histprune_t * h,int32 hmmhistbinsize,int32 numNodes)359 histprune_update_histbinsize(histprune_t * h,
360 int32 hmmhistbinsize, int32 numNodes)
361 {
362 int32 n;
363 h->hmm_hist_binsize = hmmhistbinsize;
364 n = numNodes;
365 n /= h->hmm_hist_binsize;
366
367 h->hmm_hist_bins = n + 1;
368
369 h->hmm_hist =
370 (int32 *) ckd_realloc(h->hmm_hist,
371 h->hmm_hist_bins * sizeof(int32));
372 }
373
374 void
histprune_free(histprune_t * h)375 histprune_free(histprune_t * h)
376 {
377 if (h != NULL) {
378 if (h->hmm_hist != NULL) {
379 ckd_free(h->hmm_hist);
380 }
381 free(h);
382 }
383 }
384
385 void
histprune_showhistbin(histprune_t * hp,int32 nfr,char * uttid)386 histprune_showhistbin(histprune_t * hp, int32 nfr, char *uttid)
387 {
388 int32 i, j, k;
389
390 if (nfr == 0) {
391 nfr = 1;
392 E_WARN("Set number of frame to 1\n");
393 }
394
395 for (j = hp->hmm_hist_bins - 1; (j >= 0) && (hp->hmm_hist[j] == 0);
396 --j);
397 E_INFO("HMMHist[0..%d](%s):", j, uttid);
398 for (i = 0, k = 0; i <= j; i++) {
399 k += hp->hmm_hist[i];
400 E_INFOCONT(" %d(%d)", hp->hmm_hist[i], (k * 100) / nfr);
401 }
402 E_INFOCONT("\n");
403 }
404
405
406 void
histprune_report(histprune_t * h)407 histprune_report(histprune_t * h)
408 {
409 E_INFO_NOFN("Initialization of histprune_t, report:\n");
410 E_INFO_NOFN("Parameters used in histogram pruning:\n");
411 E_INFO_NOFN("Max. HMM per frame=%d\n", h->maxhmmpf);
412 E_INFO_NOFN("Max. history per frame=%d\n", h->maxhistpf);
413 E_INFO_NOFN("Max. word per frame=%d\n", h->maxwpf);
414 E_INFO_NOFN("Size of histogram bin=%d\n", h->hmm_hist_binsize);
415 E_INFO_NOFN("No. of histogram bin=%d\n", h->hmm_hist_bins);
416 E_INFO_NOFN("\n");
417 }
418
419 fast_gmm_t *
fast_gmm_init(int32 down_sampling_ratio,int32 mode_cond_ds,int32 mode_dist_ds,int32 isGS4GS,int32 isSVQ4SVQ,float64 subvqbeam,float64 cipbeam,float32 tighten_factor,int32 maxcd,int32 n_ci_sen,logmath_t * logmath)420 fast_gmm_init(int32 down_sampling_ratio,
421 int32 mode_cond_ds,
422 int32 mode_dist_ds,
423 int32 isGS4GS,
424 int32 isSVQ4SVQ,
425 float64 subvqbeam,
426 float64 cipbeam,
427 float32 tighten_factor, int32 maxcd, int32 n_ci_sen,
428 logmath_t *logmath)
429 {
430 fast_gmm_t *fg;
431
432 fg = (fast_gmm_t *) ckd_calloc(1, sizeof(fast_gmm_t));
433
434 fg->rec_bst_senscr = 0;
435 fg->last_feat = NULL;
436
437 fg->gs4gs = isGS4GS;
438 fg->svq4svq = isSVQ4SVQ;
439 fg->downs = (downsampling_t *) ckd_calloc(1, sizeof(downsampling_t));
440 fg->gmms = (gmm_select_t *) ckd_calloc(1, sizeof(gmm_select_t));
441 fg->gaus = (gau_select_t *) ckd_calloc(1, sizeof(gau_select_t));
442
443 fg->gmms->ci_pbeam = logs3(logmath, cipbeam);
444 fg->gmms->tighten_factor = tighten_factor;
445 if (fg->gmms->ci_pbeam < -10000000)
446 E_INFO
447 ("Virtually no CI phone beam is applied now. (ci_pbeam <-1000000)\n");
448 fg->gmms->ci_occu = (int32 *) ckd_calloc(n_ci_sen, sizeof(int32));
449 fg->gmms->idx = (int32 *) ckd_calloc(n_ci_sen, sizeof(int32));
450 fg->gmms->max_cd = maxcd;
451
452 fg->gaus->rec_bstcid = -1;
453
454 fg->gaus->subvqbeam = logs3(logmath, subvqbeam);
455
456 fg->downs->ds_ratio = down_sampling_ratio;
457 fg->downs->cond_ds = mode_cond_ds;
458 fg->downs->dist_ds = mode_dist_ds;
459 fg->downs->skip_count = 0;
460
461 if (fg->downs->cond_ds && fg->downs->dist_ds)
462 E_FATAL("-cond_ds and -dist_ds cannot be specified together\n");
463
464
465
466 return fg;
467 }
468
469 void
fast_gmm_free(fast_gmm_t * fg)470 fast_gmm_free(fast_gmm_t * fg)
471 {
472 if (fg) {
473 if (fg->gmms->ci_occu)
474 ckd_free(fg->gmms->ci_occu);
475 if (fg->gmms->idx)
476 ckd_free(fg->gmms->idx);
477
478 if (fg->gmms)
479 ckd_free(fg->gmms);
480
481 if (fg->gaus)
482 ckd_free(fg->gaus);
483
484 if (fg->downs)
485 ckd_free(fg->downs);
486
487 ckd_free(fg);
488 }
489 }
490
491
492 void
fast_gmm_report(fast_gmm_t * f)493 fast_gmm_report(fast_gmm_t * f)
494 {
495 assert(f);
496 E_INFO_NOFN("Initialization of fast_gmm_t, report:\n");
497 E_INFO_NOFN("Parameters used in Fast GMM computation:\n");
498 E_INFO_NOFN
499 (" Frame-level: Down Sampling Ratio %d, Conditional Down Sampling? %d, Distance-based Down Sampling? %d\n",
500 f->downs->ds_ratio, f->downs->cond_ds, f->downs->dist_ds);
501 E_INFO_NOFN(" GMM-level: CI phone beam %d. MAX CD %d\n",
502 f->gmms->ci_pbeam, f->gmms->max_cd);
503 E_INFO_NOFN
504 ("Gaussian-level: GS map would be used for Gaussian Selection? =%d, SVQ would be used as Gaussian Score? =%d SubVQ Beam %d\n",
505 f->gs4gs, f->svq4svq, f->gaus->subvqbeam);
506
507 E_INFO_NOFN("\n");
508
509 }
510