1 /* ====================================================================
2  * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights
3  * reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in
14  *    the documentation and/or other materials provided with the
15  *    distribution.
16  *
17  * This work was supported in part by funding from the Defense Advanced
18  * Research Projects Agency and the National Science Foundation of the
19  * United States of America, and the CMU Sphinx Speech Consortium.
20  *
21  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
22  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
25  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  *
33  * ====================================================================
34  *
35  */
36 /*
37  * fast_algo_struct.c -- Various forms of pruning beam
38  *
39  * **********************************************
40  * CMU ARPA Speech Project
41  *
42  * Copyright (c) 1999 Carnegie Mellon University.
43  * ALL RIGHTS RESERVED.
44  * **********************************************
45  *
46  * HISTORY
47  * $Log$
48  * Revision 1.7  2006/02/22  16:39:43  arthchan2003
49  * Merged from SPHINX3_5_2_RCI_IRII_BRANCH: 1, Initialize beam->n_ciphone properly, 2, use ckd_free instead of free, use float64 for subvqbeam and cipbeam.  3, Add a proper free function for fast_gmm_free
50  *
51  * Revision 1.5.4.4  2005/10/17 04:43:57  arthchan2003
52  * Free fast_gmm_t.
53  *
54  * Revision 1.5.4.3  2005/09/25 19:35:26  arthchan2003
55  * Change realloc to calloc. Could be permanent if we found that there is no need to reallocate the array.
56  *
57  * Revision 1.5.4.2  2005/07/24 01:29:54  arthchan2003
58  * Set #ci phone.
59  *
60  * Revision 1.5.4.1  2005/07/03 22:53:15  arthchan2003
61  * 1, Changed free to ckd_free, 2, Join from HEAD, using float64 instead of float32.
62  *
63  * Revision 1.6  2005/06/30 13:08:44  egouvea
64  * Beams in linear scale have to be float64, since they can be easily defined as < 1e-40
65  *
66  * Revision 1.5  2005/06/21 18:26:38  arthchan2003
67  * Log. fast_algo_struct.c go through major changes in the gentle
68  * refactoring process. It is the location of several wrapper structures
69  * that control fast search.  That includes beam_t for storing beams and
70  * scores. pl_t for storing structure for phoneme lookahead, histprune_t
71  * for storing structures for histogram pruning. Lastly
72  * fast_algo_struct_t, for storing structures for fast GMM
73  * computation.
74  *
75  * Log. General Remark All of them now has consistent inteface, _init,
76  * _report and _free.  They are respectively used for allocation,
77  * reporting and deallocation of the routine. Doxygen documentation are
78  * fixed for all structures.
79  *
80  * Log. Individual changes; beam_t start to own bestscore, bestwordscore,
81  * wordbestscores, wordbestexits. They were owned by kb_t. histprune_t
82  * now wrapped up maxwpf, maxhmmpdf, maxhistpf and
83  * hmm_hist_binsize. Currently, the beam size determination routine is
84  * controlled by search implementation modules.  It is done because
85  * wrapping that operation up means we may need to introduce a bridge
86  * between beam_t and histprune_t.  pl_t is now owning heuristic type,
87  * the phoneme lookahead beam size. It also wrapped up phoneme heuristic
88  * computation.
89  *
90  * Revision 1.6  2005/04/21 23:50:26  archan
91  * Some more refactoring on the how reporting of structures inside kbcore_t is done, it is now 50% nice. Also added class-based LM test case into test-decode.sh.in.  At this moment, everything in search mode 5 is already done.  It is time to test the idea whether the search can really be used.
92  *
93  * Revision 1.5  2005/04/20 03:33:54  archan
94  * Remove pl_win and pl_win_strt, Now consider them as the parameters of the search abstraction in srch.c
95  *
96  * Revision 1.4  2005/03/30 01:22:46  archan
97  * Fixed mistakes in last updates. Add
98  *
99  *
100  * 09-Feb-2000	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
101  * 		Started.
102  */
103 
104 
105 #include "fast_algo_struct.h"
106 #include "logs3.h"
107 
108 
109 beam_t *
beam_init(float64 hmm,float64 ptr,float64 wd,float64 wdend,int32 ptranskip,int32 n_ciphone,logmath_t * logmath)110 beam_init(float64 hmm, float64 ptr, float64 wd, float64 wdend,
111           int32 ptranskip, int32 n_ciphone, logmath_t *logmath)
112 {
113     beam_t *beam;
114 
115     beam = (beam_t *) ckd_calloc(1, sizeof(beam_t));
116 
117     beam->hmm = logs3(logmath, hmm);
118     beam->ptrans = logs3(logmath, ptr);
119     beam->word = logs3(logmath, wd);
120     beam->wordend = logs3(logmath, wdend);
121     beam->ptranskip = ptranskip;
122     beam->bestscore = MAX_NEG_INT32;
123     beam->bestwordscore = MAX_NEG_INT32;
124     beam->n_ciphone = n_ciphone;
125 
126     beam->wordbestscores = (int32 *) ckd_calloc(n_ciphone, sizeof(int32));
127     beam->wordbestexits = (int32 *) ckd_calloc(n_ciphone, sizeof(int32));
128 
129     return beam;
130 }
131 
132 void
beam_report(beam_t * b)133 beam_report(beam_t * b)
134 {
135     E_INFO_NOFN("Initialization of beam_t, report:\n");
136     E_INFO_NOFN("Parameters used in Beam Pruning of Viterbi Search:\n");
137     E_INFO_NOFN("Beam=%d\n", b->hmm);
138     E_INFO_NOFN("PBeam=%d\n", b->ptrans);
139     E_INFO_NOFN("WBeam=%d (Skip=%d)\n", b->word, b->ptranskip);
140     E_INFO_NOFN("WEndBeam=%d \n", b->wordend);
141     E_INFO_NOFN("No of CI Phone assumed=%d \n", b->n_ciphone);
142     E_INFO_NOFN("\n");
143 }
144 
145 void
beam_free(beam_t * b)146 beam_free(beam_t * b)
147 {
148     if (b) {
149         if (b->wordbestscores) {
150             free(b->wordbestscores);
151         }
152         if (b->wordbestexits) {
153             free(b->wordbestexits);
154         }
155         free(b);
156     }
157 }
158 
159 pl_t *
pl_init(int32 pheurtype,int32 pl_beam,int32 n_ciphone,logmath_t * logmath)160 pl_init(int32 pheurtype, int32 pl_beam, int32 n_ciphone, logmath_t *logmath)
161 {
162     pl_t *pl;
163     pl = (pl_t *) ckd_calloc(1, sizeof(pl_t));
164 
165     pl->pheurtype = pheurtype;
166     pl->pl_beam = logs3(logmath, pl_beam);
167     pl->n_ciphone = n_ciphone;
168     pl->phn_heur_list = (int32 *) ckd_calloc(pl->n_ciphone, sizeof(int32));
169 
170     return pl;
171 }
172 
173 void
pl_free(pl_t * pl)174 pl_free(pl_t * pl)
175 {
176     if (pl) {
177         if (pl->phn_heur_list)
178             ckd_free((void *) pl->phn_heur_list);
179 
180         ckd_free((void *) pl);
181     }
182 }
183 
184 void
pl_report(pl_t * pl)185 pl_report(pl_t * pl)
186 {
187 
188     E_INFO_NOFN("Initialization of pl_t, report:\n");
189     E_INFO_NOFN("Parameters used in phoneme lookahead:\n");
190     E_INFO_NOFN("Phoneme look-ahead        type = %d\n", pl->pheurtype);
191     E_INFO_NOFN("Phoneme look-ahead beam   size = %d\n", pl->pl_beam);
192     E_INFO_NOFN("No of CI Phones assumed=%d \n", pl->n_ciphone);
193     E_INFO_NOFN("\n");
194 }
195 
196 /* Determine which set of phonemes should be active in next stage
197    using the lookahead information*/
198 /* Notice that this loop can be further optimized by implementing
199    it incrementally*/
200 /* ARCHAN and JSHERWAN Eventually, this is implemented as a function */
201 /* Notice that this loop can be further optimized by implementing it incrementally*/
202 
203 #define _CHECKUNDERFLOW_ 1
204 
205 static int32
NO_UFLOW_ADD(int32 a,int32 b)206 NO_UFLOW_ADD(int32 a, int32 b)
207 {
208     int32 c;
209 #ifdef _CHECKUNDERFLOW_
210     c = a + b;
211     c = (c > 0 && a < 0 && b < 0) ? MAX_NEG_INT32 : c;
212 #else
213     c = a + b;
214 #endif
215     return c;
216 }
217 
218 
219 void
pl_computePhnHeur(mdef_t * md,ascr_t * a,pl_t * pl,int32 heutype,int32 win_strt,int32 win_efv)220 pl_computePhnHeur(mdef_t * md, ascr_t * a, pl_t * pl, int32 heutype,
221                   int32 win_strt, int32 win_efv)
222 {
223     int32 nState;
224     int32 i, j;
225     int32 curPhn, curFrmPhnVar; /* variables for phoneme lookahead computation */
226     int32 *ph_lst;
227 
228     nState = mdef_n_emit_state(md);
229     ph_lst = pl->phn_heur_list;
230 
231     /* Initializing all the phoneme heuristics for each phone to be 0 */
232     for (j = 0; j == md->cd2cisen[j]; j++) {
233         curPhn = md->sen2cimap[j];      /*Just to save a warning */
234         ph_lst[curPhn] = 0;
235     }
236 
237     /* 20040503: ARCHAN, the code can be reduced to 10 lines, it is so
238        organized such that there is no overhead in checking the
239        heuristic type in the inner loop.
240      */
241     /* One trick we use is to use sen2cimap to check phoneme ending boundary */
242 
243 
244     if (heutype == 1) {         /* Taking Max */
245         for (i = win_strt; i < win_efv; i++) {
246             curPhn = 0;
247             curFrmPhnVar = MAX_NEG_INT32;
248             for (j = 0; j == md->cd2cisen[j]; j++) {
249                 if (curFrmPhnVar < a->cache_ci_senscr[i][j])
250                     curFrmPhnVar = a->cache_ci_senscr[i][j];
251 
252                 curPhn = md->sen2cimap[j];
253                 /* Update at the phone_end boundary */
254                 if (curPhn != md->sen2cimap[j + 1]) {
255                     ph_lst[curPhn] =
256                         NO_UFLOW_ADD(ph_lst[curPhn], curFrmPhnVar);
257                     curFrmPhnVar = MAX_NEG_INT32;
258                 }
259             }
260         }
261     }
262     else if (heutype == 2) {
263         for (i = win_strt; i < win_efv; i++) {
264             curPhn = 0;
265             curFrmPhnVar = MAX_NEG_INT32;
266             for (j = 0; j == md->cd2cisen[j]; j++) {
267                 curFrmPhnVar =
268                     NO_UFLOW_ADD(a->cache_ci_senscr[i][j], curFrmPhnVar);
269                 curPhn = md->sen2cimap[j];
270 
271                 /* Update at the phone_end boundary */
272                 if (curPhn != md->sen2cimap[j + 1]) {
273                     curFrmPhnVar /= nState;     /* ARCHAN: I hate to do division ! */
274                     ph_lst[curPhn] =
275                         NO_UFLOW_ADD(ph_lst[curPhn], curFrmPhnVar);
276                     curFrmPhnVar = MAX_NEG_INT32;
277                 }
278             }
279         }
280     }
281     else if (heutype == 3) {
282         for (i = win_strt; i < win_efv; i++) {
283             curPhn = 0;
284             curFrmPhnVar = MAX_NEG_INT32;
285             for (j = 0; j == md->cd2cisen[j]; j++) {
286                 if (curPhn == 0 || curPhn != md->sen2cimap[j - 1])      /* dangerous hack! */
287                     ph_lst[curPhn] =
288                         NO_UFLOW_ADD(ph_lst[curPhn],
289                                      a->cache_ci_senscr[i][j]);
290 
291                 curPhn = md->sen2cimap[j];
292 
293                 if (curFrmPhnVar < a->cache_ci_senscr[i][j])
294                     curFrmPhnVar = a->cache_ci_senscr[i][j];
295 
296                 /* Update at the phone_end boundary */
297                 if (md->sen2cimap[j] != md->sen2cimap[j + 1]) {
298                     ph_lst[curPhn] =
299                         NO_UFLOW_ADD(ph_lst[curPhn], curFrmPhnVar);
300                     curFrmPhnVar = MAX_NEG_INT32;
301                 }
302             }
303         }
304     }
305 
306 #if 0
307     for (j = 0; j == md->cd2cisen[j]; j++) {
308         curPhn = md->cd2cisen[j];
309         E_INFO("phoneme heuristics scores at phn %d is %d\n", j,
310                kb->phn_list[mdef->sen2cimap[j]]);
311     }
312 #endif
313 
314 
315 }
316 
317 
318 histprune_t *
histprune_init(int32 maxhmm,int32 maxhist,int32 maxword,int32 hmmhistbinsize,int32 numNodes)319 histprune_init(int32 maxhmm, int32 maxhist, int32 maxword,
320                int32 hmmhistbinsize, int32 numNodes)
321 {
322     histprune_t *h;
323     int32 n;
324 
325     h = (histprune_t *) ckd_calloc(1, sizeof(histprune_t));
326     h->maxwpf = maxword;
327     h->maxhmmpf = maxhmm;
328     h->maxhistpf = maxhist;
329     h->hmm_hist_binsize = hmmhistbinsize;
330 
331     n = numNodes;
332     n /= h->hmm_hist_binsize;
333 
334     h->hmm_hist_bins = n + 1;
335 
336     h->hmm_hist = (int32 *) ckd_calloc(h->hmm_hist_bins, sizeof(int32));
337 
338 
339     return h;
340 }
341 
342 void
histprune_zero_histbin(histprune_t * h)343 histprune_zero_histbin(histprune_t * h)
344 {
345     int32 *hmm_hist;
346     int32 numhistbins;          /* Local version of number of histogram bins, don't expect it to change */
347     int32 i;
348 
349     hmm_hist = h->hmm_hist;
350     numhistbins = h->hmm_hist_bins;
351 
352     for (i = 0; i < numhistbins; i++)
353         hmm_hist[i] = 0;
354 
355 }
356 
357 
358 void
histprune_update_histbinsize(histprune_t * h,int32 hmmhistbinsize,int32 numNodes)359 histprune_update_histbinsize(histprune_t * h,
360                              int32 hmmhistbinsize, int32 numNodes)
361 {
362     int32 n;
363     h->hmm_hist_binsize = hmmhistbinsize;
364     n = numNodes;
365     n /= h->hmm_hist_binsize;
366 
367     h->hmm_hist_bins = n + 1;
368 
369     h->hmm_hist =
370         (int32 *) ckd_realloc(h->hmm_hist,
371                               h->hmm_hist_bins * sizeof(int32));
372 }
373 
374 void
histprune_free(histprune_t * h)375 histprune_free(histprune_t * h)
376 {
377     if (h != NULL) {
378         if (h->hmm_hist != NULL) {
379             ckd_free(h->hmm_hist);
380         }
381         free(h);
382     }
383 }
384 
385 void
histprune_showhistbin(histprune_t * hp,int32 nfr,char * uttid)386 histprune_showhistbin(histprune_t * hp, int32 nfr, char *uttid)
387 {
388     int32 i, j, k;
389 
390     if (nfr == 0) {
391         nfr = 1;
392         E_WARN("Set number of frame to 1\n");
393     }
394 
395     for (j = hp->hmm_hist_bins - 1; (j >= 0) && (hp->hmm_hist[j] == 0);
396          --j);
397     E_INFO("HMMHist[0..%d](%s):", j, uttid);
398     for (i = 0, k = 0; i <= j; i++) {
399         k += hp->hmm_hist[i];
400         E_INFOCONT(" %d(%d)", hp->hmm_hist[i], (k * 100) / nfr);
401     }
402     E_INFOCONT("\n");
403 }
404 
405 
406 void
histprune_report(histprune_t * h)407 histprune_report(histprune_t * h)
408 {
409     E_INFO_NOFN("Initialization of histprune_t, report:\n");
410     E_INFO_NOFN("Parameters used in histogram pruning:\n");
411     E_INFO_NOFN("Max.     HMM per frame=%d\n", h->maxhmmpf);
412     E_INFO_NOFN("Max. history per frame=%d\n", h->maxhistpf);
413     E_INFO_NOFN("Max.    word per frame=%d\n", h->maxwpf);
414     E_INFO_NOFN("Size of histogram  bin=%d\n", h->hmm_hist_binsize);
415     E_INFO_NOFN("No.  of histogram  bin=%d\n", h->hmm_hist_bins);
416     E_INFO_NOFN("\n");
417 }
418 
419 fast_gmm_t *
fast_gmm_init(int32 down_sampling_ratio,int32 mode_cond_ds,int32 mode_dist_ds,int32 isGS4GS,int32 isSVQ4SVQ,float64 subvqbeam,float64 cipbeam,float32 tighten_factor,int32 maxcd,int32 n_ci_sen,logmath_t * logmath)420 fast_gmm_init(int32 down_sampling_ratio,
421               int32 mode_cond_ds,
422               int32 mode_dist_ds,
423               int32 isGS4GS,
424               int32 isSVQ4SVQ,
425               float64 subvqbeam,
426               float64 cipbeam,
427               float32 tighten_factor, int32 maxcd, int32 n_ci_sen,
428               logmath_t *logmath)
429 {
430     fast_gmm_t *fg;
431 
432     fg = (fast_gmm_t *) ckd_calloc(1, sizeof(fast_gmm_t));
433 
434     fg->rec_bst_senscr = 0;
435     fg->last_feat = NULL;
436 
437     fg->gs4gs = isGS4GS;
438     fg->svq4svq = isSVQ4SVQ;
439     fg->downs = (downsampling_t *) ckd_calloc(1, sizeof(downsampling_t));
440     fg->gmms = (gmm_select_t *) ckd_calloc(1, sizeof(gmm_select_t));
441     fg->gaus = (gau_select_t *) ckd_calloc(1, sizeof(gau_select_t));
442 
443     fg->gmms->ci_pbeam = logs3(logmath, cipbeam);
444     fg->gmms->tighten_factor = tighten_factor;
445     if (fg->gmms->ci_pbeam < -10000000)
446         E_INFO
447             ("Virtually no CI phone beam is applied now. (ci_pbeam <-1000000)\n");
448     fg->gmms->ci_occu = (int32 *) ckd_calloc(n_ci_sen, sizeof(int32));
449     fg->gmms->idx = (int32 *) ckd_calloc(n_ci_sen, sizeof(int32));
450     fg->gmms->max_cd = maxcd;
451 
452     fg->gaus->rec_bstcid = -1;
453 
454     fg->gaus->subvqbeam = logs3(logmath, subvqbeam);
455 
456     fg->downs->ds_ratio = down_sampling_ratio;
457     fg->downs->cond_ds = mode_cond_ds;
458     fg->downs->dist_ds = mode_dist_ds;
459     fg->downs->skip_count = 0;
460 
461     if (fg->downs->cond_ds && fg->downs->dist_ds)
462         E_FATAL("-cond_ds and -dist_ds cannot be specified together\n");
463 
464 
465 
466     return fg;
467 }
468 
469 void
fast_gmm_free(fast_gmm_t * fg)470 fast_gmm_free(fast_gmm_t * fg)
471 {
472     if (fg) {
473         if (fg->gmms->ci_occu)
474             ckd_free(fg->gmms->ci_occu);
475         if (fg->gmms->idx)
476             ckd_free(fg->gmms->idx);
477 
478         if (fg->gmms)
479             ckd_free(fg->gmms);
480 
481         if (fg->gaus)
482             ckd_free(fg->gaus);
483 
484         if (fg->downs)
485             ckd_free(fg->downs);
486 
487         ckd_free(fg);
488     }
489 }
490 
491 
492 void
fast_gmm_report(fast_gmm_t * f)493 fast_gmm_report(fast_gmm_t * f)
494 {
495     assert(f);
496     E_INFO_NOFN("Initialization of fast_gmm_t, report:\n");
497     E_INFO_NOFN("Parameters used in Fast GMM computation:\n");
498     E_INFO_NOFN
499         ("   Frame-level: Down Sampling Ratio %d, Conditional Down Sampling? %d, Distance-based Down Sampling? %d\n",
500          f->downs->ds_ratio, f->downs->cond_ds, f->downs->dist_ds);
501     E_INFO_NOFN("     GMM-level: CI phone beam %d. MAX CD %d\n",
502                 f->gmms->ci_pbeam, f->gmms->max_cd);
503     E_INFO_NOFN
504         ("Gaussian-level: GS map would be used for Gaussian Selection? =%d, SVQ would be used as Gaussian Score? =%d SubVQ Beam %d\n",
505          f->gs4gs, f->svq4svq, f->gaus->subvqbeam);
506 
507     E_INFO_NOFN("\n");
508 
509 }
510