1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * lm.c -- Disk-based backoff word trigram LM module.
39  *
40  * **********************************************
41  * CMU ARPA Speech Project
42  *
43  * Copyright (c) 1997 Carnegie Mellon University.
44  * ALL RIGHTS RESERVED.
45  * **********************************************
46  *
47  * HISTORY
48  * $Log: lm.c,v $
49  * Revision 1.20  2006/03/03 20:02:38  arthchan2003
50  * Removed C++ styles comment. This will make options -ansi and -std=c89 happy
51  *
52  * Revision 1.19  2006/03/02 22:11:56  arthchan2003
53  * Fixed dox-doc.
54  *
55  * Revision 1.18  2006/03/01 20:03:55  arthchan2003
56  * Do encoding conversion when the encodings are different. This will avoid a lot of weird characters.
57  *
58  * Revision 1.17  2006/02/24 13:38:08  arthchan2003
59  * Added lm_read, it is a simple version of lm_read_advance.
60  *
61  * Revision 1.16  2006/02/23 04:16:29  arthchan2003
62  * Merged from SPHINX3_5_2_RCI_IRII_BRANCH:
63  * Splited the original lm.c into five parts,
64  * a, lm.c - a controller of other subroutines.
65  * b, lm_3g.c - implement TXT-based lm operations
66  * c, lm_3g_dmp.c - implement DMP-based lm operations
67  * d, lm_attfsm.c - implement FSM-based lm operations
68  * e, lmset.c - implement sets of lm.
69  *
70  *
71  * Revision 1.14.4.9  2006/01/16 19:56:37  arthchan2003
72  * 1, lm_rawscore doesn't need a language weight, 2, Support dumping the LM in FST format.  This code used Yannick Esteve's and LIUM code.
73  *
74  * Revision 1.14.4.8  2005/11/17 06:18:49  arthchan2003
75  * Added a string encoding conversion routine in lm.c. Currently it only works for converting hex to its value.
76  *
77  * Revision 1.14.4.7  2005/10/17 04:49:13  arthchan2003
78  * Free resource of lm_t and lmset_t correctly.
79  *
80  * Revision 1.14.4.6  2005/09/07 23:30:26  arthchan2003
81  * Changed error message for LM dump.
82  *
83  * Revision 1.14.4.5  2005/08/02 21:10:18  arthchan2003
84  * Added function declaration for lm_read_dump.
85  *
86  * Revision 1.14.4.4  2005/07/17 05:24:23  arthchan2003
87  * (Incomplete) Added lm_arbitrary.[ch], an arbitrary n-gram data structure.  Far from completed. Don't expect too much.
88  *
89  * Revision 1.14.4.3  2005/07/13 01:44:17  arthchan2003
90  * 1, Moved text formatted LM code into lm_3g.c, 2 Changed lm_read such that it will work with both TXT file format and DMP file format. 3,  Added function lm_write to handle lm writing.
91  *
92  * Revision 1.14.4.2  2005/07/05 21:31:25  arthchan2003
93  * Merged from HEAD.
94  *
95  * Revision 1.15  2005/07/05 13:12:37  dhdfu
96  * Add new arguments to logs3_init() in some tests, main_ep
97  *
98  * Revision 1.14.4.1  2005/07/03 22:58:56  arthchan2003
99  * tginfo and membg 's memory were not deallocated at all. This change fixed it.
100  *
101  * Revision 1.14  2005/06/21 22:24:02  arthchan2003
102  * Log. In this change, I introduced a new interface for lm ,which is
103  * call lmset_t. lmset_t wraps up multiple lm, n_lm, n_alloclm into the
104  * same structure and handle LM initialization (lm_init) switching,
105  * (lmset_curlm_widx), delete LM (lmset_delete_lm).  The internal
106  * structure is called lmarray and is an array of pointers of lm.  The
107  * current lm is always maintained and pointed by a pointer called cur_lm
108  * . This substantially clarify the structure of the code.  At this
109  * check-in, not every core function of lmset is completed.
110  * e.g. lmset_add_lm because that required testing of several LM reading
111  * routines and could be quite time-consuming.
112  *
113  * Log. Another notable change is the fact dict2lmwid map is started to
114  * be part of the LM. The reason of this is clearly described inside the
115  * code. Don't want to repeat here.
116  *
117  * Log. The new interface has been already used broadly in both Sphinx
118  * 3.0 and sphinx 3.x family of tools.
119  *
120  * Revision 1.4  2005/06/18 03:22:28  archan
121  * Add lmset_init. A wrapper function of various LM initialization and initialize an lmset It is now used in decode, livepretend, dag and astar.
122  *
123  * Revision 1.3  2005/06/17 23:44:40  archan
124  * Sphinx3 to s3.generic, 1, Support -lmname in decode and livepretend.  2, Wrap up the initialization of dict2lmwid to lm initialization. 3, add Dave's trick in LM switching in mode 4 of the search.
125  *
126  * Revision 1.2  2005/05/10 21:21:53  archan
127  * Three functionalities added but not tested. Code on 1) addition/deletion of LM in mode 4. 2) reading text-based LM 3) Converting txt-based LM to dmp-based LM.
128  *
129  * Revision 1.1  2005/05/04 06:08:07  archan
130  * Refactor all lm routines except fillpen.c into ./libs3decoder/liblm/ . This will be equivalent to ./lib/liblm in future.
131  *
132  * Revision 1.6  2005/05/04 04:02:24  archan
133  * Implementation of lm addition, deletion in (mode 4) time-switching tree implementation of search.  Not yet tested. Just want to keep up my own momentum.
134  *
135  * Revision 1.5  2005/04/20 03:37:59  archan
136  * LM code changes: functions are added to set, add and delete LM from the lmset, change the legacy lmset data structure to contain n_lm and n_alloc_lm.
137  *
138  * Revision 1.4  2005/03/30 16:28:34  archan
139  * delete test-full.log alog
140  *
141  * Revision 1.3  2005/03/30 01:22:47  archan
142  * Fixed mistakes in last updates. Add
143  *
144  *
145  * 20.Apr.2001  RAH (rhoughton@mediasite.com, ricky.houghton@cs.cmu.edu)
146  *              Adding lm_free() to free allocated memory
147  *
148  * 30-Dec-2000  Rita Singh (rsingh@cs.cmu.edu) at Carnegie Mellon University
149  *		Removed language weight application to wip. To maintain
150  *		comparability between s3decode and current decoder. Does
151  *		not affect decoding performance.
152  *
153  * 23-Feb-2000	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
154  * 		Bugfix: Applied language weight to word insertion penalty.
155  *
156  * 24-Jun-97	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
157  * 		Added lm_t.access_type; made lm_wid externally visible.
158  *
159  * 24-Jun-97	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
160  * 		Added lm_t.log_bg_seg_sz and lm_t.bg_seg_sz.
161  *
162  * 13-Feb-97	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University.
163  * 		Creating from original S3 version.
164  */
165 
166 
167 #include <string.h>
168 
169 #include "lm.h"
170 #include "bio.h"
171 #include "logs3.h"
172 #include "wid.h"
173 #include "encoding.h"
174 
175 /*ARCHAN, 20041112: NOP, NO STATIC VARIABLES! */
176 
177 extern lm_t *lm_read_txt(const char *filename, /**< The file name */
178                          const int lminmemory,  /**< Whether using in memory LM */
179                          int *err_no, /**< Input/Output: Depends on the problem that LM
180 					reading encounters, it could be errors
181 					from -2 (LM_OFFSET_TOO_LARGE) to
182 					-15 (LM_CANNOT_ALLOCATE).  Please checkout
183 					lm.h for details.
184 				     */
185                          int32 isforced32bit, /** Input: normally, we should let lm_read_txt
186 						 to decide whether a file is 32 bit or not.
187 						 When the lm_read_txt couldn't decide that before
188 						 reading or if more specificially when we hit
189 						 the LM segment size problems. Then this bit
190 						 will alter the reading behavior to 32 bit.
191 					     */
192                          logmath_t *logmath
193     );
194 
195 extern lm_t *lm_read_dump(const char *file,  /**< The file name*/
196                           int lminmemory,  /**< Whether using in memory LM */
197                           logmath_t *logmath
198     );
199 
200 
201 int32 lm3g_dump(char const *file,   /**< the file name */
202                 lm_t * model,       /**< the langauge model for output */
203                 char const *lmfile,   /**< the lm file name */
204                 int32 mtime,   /**< LM file modification date */
205                 int32 noBits   /**< Number of bits of DMP format */
206     );
207 
208 /**
209    Writer of lm in ARPA text format
210  */
211 int32 lm_write_arpa_text(lm_t * lmp, /**< the pointer of the language model */
212                          const char *outputfn, /**< the output file name */
213                          const char *inputenc, /**< The input encoding method */
214                          const char *outputenc /**< The output encoding method */
215     );
216 
217 /**
218    Writer of lm in FST format
219  */
220 
221 int32 lm_write_att_fsm(lm_t * lm, /**< the languauge model pointer */
222                        const char *filename/**< output file name */
223     );
224 
225 
226 /**
227    The function to return whether an LM should be 32bit or not.
228    It is decided by whether we are using 32bit mode DMP.  Or whether
229    it is LMTXT_VERSION but with more than 0xffff words.  The final
230    criterion is when LMFORCE_TXT32VERSION.
231  */
232 int32
lm_is32bits(lm_t * lm)233 lm_is32bits(lm_t * lm)
234 {
235     if (lm->version == LMDMP_VERSION_TG_32BIT)
236         return 1;
237     if (lm->version == LMFORCED_TXT32VERSION)
238         return 1;
239     if (lm->version == LMTXT_VERSION && lm->n_ug > LM_LEGACY_CONSTANT)
240         return 1;
241     if (lm->version == LMFST_VERSION && lm->n_ug > LM_LEGACY_CONSTANT)
242         return 1;
243 
244     return 0;
245 }
246 
247 
248 int32
lm_get_classid(lm_t * model,const char * name)249 lm_get_classid(lm_t * model, const char *name)
250 {
251     int32 i;
252 
253     if (!model->lmclass)
254         return BAD_LMCLASSID;
255 
256     for (i = 0; i < model->n_lmclass; i++) {
257         if (strcmp(lmclass_getname(model->lmclass[i]), name) == 0)
258             return (i + LM_CLASSID_BASE);
259     }
260     return BAD_LMCLASSID;
261 }
262 
263 
264 
265 void
lm_null_struct(lm_t * lm)266 lm_null_struct(lm_t * lm)
267 {
268     lm->name = NULL;
269     lm->wordstr = NULL;
270 
271     lm->ug = NULL;
272     lm->bg = NULL;
273     lm->tg = NULL;
274     lm->membg = NULL;
275     lm->tginfo = NULL;
276     lm->tgcache = NULL;
277     lm->dict2lmwid = NULL;
278 
279     lm->bg32 = NULL;
280     lm->tg32 = NULL;
281     lm->membg32 = NULL;
282     lm->tginfo32 = NULL;
283     lm->tgcache32 = NULL;
284 
285     lm->bgprob = NULL;
286     lm->tgprob = NULL;
287     lm->tgbowt = NULL;
288 
289     lm->tg_segbase = NULL;
290     lm->lmclass = NULL;
291     lm->inclass_ugscore = NULL;
292     lm->logmath = NULL;
293 }
294 
295 /* Apply unigram weight; should be part of LM creation, but... */
296 static void
lm_uw(lm_t * lm,float64 uw)297 lm_uw(lm_t * lm, float64 uw)
298 {
299     int32 i, loguw, loguw_, loguniform, p1, p2;
300 
301     /* Interpolate unigram probs with uniform PDF, with weight uw */
302     loguw = logs3(lm->logmath, uw);
303     loguw_ = logs3(lm->logmath, 1.0 - uw);
304     loguniform = logs3(lm->logmath, 1.0 / (lm->n_ug - 1));   /* Skipping S3_START_WORD */
305 
306     for (i = 0; i < lm->n_ug; i++) {
307         if (strcmp(lm->wordstr[i], S3_START_WORD) != 0) {
308             p1 = lm->ug[i].prob.l + loguw;
309             p2 = loguniform + loguw_;
310             lm->ug[i].prob.l = logmath_add(lm->logmath, p1, p2);
311         }
312     }
313 }
314 
315 
316 static void
lm2logs3(lm_t * lm,float64 uw)317 lm2logs3(lm_t * lm, float64 uw)
318 {
319     int32 i;
320 
321     for (i = 0; i < lm->n_ug; i++) {
322         lm->ug[i].prob.l = logmath_log10_to_log(lm->logmath, lm->ug[i].prob.f);
323 
324         /* This prevent underflow if the backoff value is too small
325            It happens sometimes in cmu-lmtk V3's lm_combine.
326          */
327 
328         if (lm->ug[i].bowt.f < MIN_PROB_F)
329             lm->ug[i].bowt.f = MIN_PROB_F;
330 
331         lm->ug[i].bowt.l = logmath_log10_to_log(lm->logmath, lm->ug[i].bowt.f);
332     }
333 
334     lm_uw(lm, uw);
335 
336     for (i = 0; i < lm->n_bgprob; i++)
337         lm->bgprob[i].l = logmath_log10_to_log(lm->logmath, lm->bgprob[i].f);
338 
339     if (lm->n_tg > 0) {
340         for (i = 0; i < lm->n_tgprob; i++)
341             lm->tgprob[i].l = logmath_log10_to_log(lm->logmath, lm->tgprob[i].f);
342         for (i = 0; i < lm->n_tgbowt; i++) {
343 
344             if (lm->tgbowt[i].f < MIN_PROB_F)
345                 lm->tgbowt[i].f = MIN_PROB_F;
346 
347             lm->tgbowt[i].l = logmath_log10_to_log(lm->logmath, lm->tgbowt[i].f);
348         }
349     }
350 }
351 
352 
353 void
lm_set_param(lm_t * lm,float64 lw,float64 wip)354 lm_set_param(lm_t * lm, float64 lw, float64 wip)
355 {
356     int32 i, iwip;
357     float64 f;
358 
359     if (lw <= 0.0)
360         E_FATAL("lw = %e\n", lw);
361     if (wip <= 0.0)
362         E_FATAL("wip = %e\n", wip);
363 #if 0                           /* No lang weight on wip */
364     iwip = logs3(lm->logmath, wip) * lw;
365 #endif
366     iwip = logs3(lm->logmath, wip);
367 
368     f = lw / lm->lw;
369 
370     for (i = 0; i < lm->n_ug; i++) {
371         lm->ug[i].prob.l =
372             (int32) ((lm->ug[i].prob.l - lm->wip) * f) + iwip;
373         lm->ug[i].bowt.l = (int32) (lm->ug[i].bowt.l * f);
374     }
375 
376     for (i = 0; i < lm->n_bgprob; i++)
377         lm->bgprob[i].l = (int32) ((lm->bgprob[i].l - lm->wip) * f) + iwip;
378 
379     if (lm->n_tg > 0) {
380         for (i = 0; i < lm->n_tgprob; i++)
381             lm->tgprob[i].l =
382                 (int32) ((lm->tgprob[i].l - lm->wip) * f) + iwip;
383         for (i = 0; i < lm->n_tgbowt; i++)
384             lm->tgbowt[i].l = (int32) (lm->tgbowt[i].l * f);
385     }
386 
387     lm->lw = (float32) lw;
388     lm->wip = iwip;
389 }
390 
391 
392 int32
lm_add_wordlist(lm_t * lm,dict_t * dict,const char * filename)393 lm_add_wordlist(lm_t * lm,      /**< In/Out: a modified LM structure */
394                 dict_t * dict,      /**< In: a dictionary */
395                 const char *filename       /**< In: a file that contains a
396 					list of word one wants to
397 					add*/
398     )
399 {
400     FILE *fp;
401     char string[1024];
402     char word[1024];
403     int32 n;
404 
405     fp = NULL;
406     if ((fp = fopen(filename, "r")) == NULL) {
407         E_ERROR("Cannot open file %s\n", filename);
408         return LM_FAIL;
409     }
410 
411     while (fgets(string, sizeof(string), fp) != NULL) {
412         n = sscanf(string, "%s", word);
413         if (n != 1) {
414             E_INFO
415                 ("Detecting more than 1 word in one line. Only using the first word. \n");
416             return LM_FAIL;
417         }
418         E_INFO("%s\n", word);
419         if (lm_add_word_to_ug(lm, dict, word) == LM_FAIL)
420             E_INFO("Fail to add word %s into the unigram\n", word);
421     }
422 
423     if (lm == NULL) {
424         E_ERROR("LM pointer is NULL.  lm_add_wordlist failed.\n");
425         return LM_FAIL;
426     }
427 
428     fclose(fp);
429     return LM_SUCCESS;
430 }
431 
432 /*
433   INCOMPLETE
434  */
435 int32
lm_add_word_to_ug(lm_t * lm,dict_t * dict,const char * newword)436 lm_add_word_to_ug(lm_t * lm,      /**<In/Out: a modified LM structure */
437                   dict_t * dict,      /**< In: an initialized dictionary structure */
438                   const char *newword       /**< In: a new word */
439     )
440 {
441     s3wid_t w;
442     s3lmwid_t lwid;
443     void *id;
444     int32 classid = BAD_LMCLASSID;
445 
446   /** ARCHAN 20060320
447       Add a word into the unigram.
448       look up the dictionary and see whether it exists in the dictionary
449       Looks alike with wid.c's logic at this point.
450 
451       We also avoid the addition of classes at this point because that
452       could complicated things quite a lot */
453 
454   /** Reallocate the size of lm->ug, lm->wordstr
455       Update the value lm->n_ug, lm->max_ug;
456    */
457 
458     if (hash_table_lookup(lm->HT, newword, &id) == 0) {
459         E_WARN("The word %s already exists in the language model \n",
460                newword);
461         return LM_FAIL;
462     }
463 
464     lm->n_ug = lm->n_ug + 1;
465     lm->max_ug = lm->n_ug;
466 
467     E_INFO("lm->n_ug %d\n", lm->n_ug);
468     lm->ug = (ug_t *) ckd_realloc(lm->ug, (lm->n_ug + 1) * sizeof(ug_t));       /* Yes, +2 look at NewUnigramModel(n_ug+1) */
469     lm->wordstr =
470         (char **) ckd_realloc(lm->wordstr, (lm->n_ug) * sizeof(char *));
471 
472   /** Reallocate the size of lm->membg
473       and lm->tginfo
474   */
475 
476     if (!lm->is32bits) {
477         lm->membg =
478             (membg_t *) ckd_realloc(lm->membg,
479                                     (lm->n_ug) * sizeof(membg_t));
480         lm->tginfo =
481             (tginfo_t **) ckd_realloc(lm->tginfo,
482                                       (lm->n_ug) * sizeof(tginfo_t *));
483         lm->tginfo[lm->n_ug - 1] = NULL;
484     }
485     else {
486         lm->membg32 =
487             (membg32_t *) ckd_realloc(lm->membg32,
488                                       (lm->n_ug) * sizeof(membg32_t));
489         lm->tginfo32 =
490             (tginfo32_t **) ckd_realloc(lm->tginfo32,
491                                         (lm->n_ug) * sizeof(tginfo32_t *));
492         lm->tginfo32[lm->n_ug - 1] = NULL;
493     }
494 
495 
496     E_WARN("Invoke incomplete lm_add_word_to_ug\n");
497 
498   /** Insert the entry into lm->ug and lm->wordstr */
499 
500     /*
501        This part is not compeleted, prob.f should be the second best
502        unigram probability.  This is a fairly standard that was used by
503        Dragon and also recommended by Roni.
504      */
505 
506     lm->ug[lm->n_ug].prob.f = -99.0;
507     lm->ug[lm->n_ug].bowt.f = -99.0;
508     lm->ug[lm->n_ug].dictwid = lm->n_ug;        /* See the comment in ug_t, this is not exactly correct
509                                                    externally application needs to set it again.
510                                                  */
511 
512     /* Supposingly, the bigram should follow the unigram order.
513        Because, we have no bigram inserted in this case, the
514        unigram.firstbg will just follow the previous one.  */
515 
516     lm->ug[lm->n_ug].firstbg = lm->ug[lm->n_ug - 1].firstbg;
517 
518     lm->wordstr[lm->n_ug - 1] = (char *) ckd_salloc(newword);
519 
520     hash_table_enter(lm->HT, lm->wordstr[lm->n_ug - 1], (void *)(long)(lm->n_ug - 1));
521 
522     if (dict != NULL) {
523                   /** If dictionary is initialized and used in this context */
524     /** Insert the mapping from LM WID to dictionary Word ID  */
525         w = dict_wordid(dict, newword);
526 
527         if (lm->lmclass)
528             classid = lm_get_classid(lm, newword);
529 
530         lwid = lm->dict2lmwid[w];
531 
532         E_INFO("%d\n", lwid);
533 
534         if (IS_S3WID(w)) {
535             if ((lm->lmclass) && (classid != BAD_LMCLASSID)) {
536                 E_ERROR("%s is both a word and an LM class name\n",
537                         lm_wordstr(lm, lm->n_ug - 1));
538                 return LM_FAIL;
539             }
540             else {
541                 if (dict_filler_word(dict, w))
542                     E_ERROR("Filler dictionary word '%s' found in LM\n",
543                             lm_wordstr(lm, lm->n_ug - 1));
544 
545                 if (w != dict_basewid(dict, w)) {
546                     E_ERROR
547                         ("LM word '%s' is an alternative pronunciation in dictionary\n",
548                          lm_wordstr(lm, lm->n_ug - 1));
549 
550                     w = dict_basewid(dict, w);
551                     lm_lmwid2dictwid(lm, lm->n_ug - 1) = w;
552                 }
553 
554                 for (; IS_S3WID(w); w = dict_nextalt(dict, w))
555                     lm->dict2lmwid[w] = (s3lmwid32_t) (lm->n_ug - 1);
556             }
557         }
558         else {
559             E_ERROR
560                 ("Thew new word is not in the dictionary.  We will not do anything in this case\n");
561             return LM_FAIL;
562         }
563 
564     }
565     return LM_SUCCESS;
566 }
567 
568 lm_t *
lm_read(const char * file,const char * lmname,cmd_ln_t * config,logmath_t * logmath)569 lm_read(const char *file, const char *lmname, cmd_ln_t *config, logmath_t *logmath)
570 {
571     return lm_read_advance(file,
572                            lmname,
573                            cmd_ln_float32_r(config, "-lw"),
574                            cmd_ln_float32_r(config, "-wip"),
575                            cmd_ln_float32_r(config, "-uw"), 0, NULL, 1, logmath);
576 }
577 
578 lm_t *
lm_read_advance(const char * file,const char * lmname,float64 lw,float64 wip,float64 uw,int32 ndict,const char * fmt,int32 applyWeight,logmath_t * logmath)579 lm_read_advance(const char *file, const char *lmname, float64 lw,
580                 float64 wip, float64 uw, int32 ndict, const char *fmt,
581                 int32 applyWeight, logmath_t *logmath)
582 {
583     return lm_read_advance2(file, lmname, lw, wip, uw, ndict, fmt, applyWeight, 0, logmath);
584 }
585 
586 lm_t *
lm_read_advance2(const char * file,const char * lmname,float64 lw,float64 wip,float64 uw,int32 ndict,const char * fmt,int32 applyWeight,int lminmemory,logmath_t * logmath)587 lm_read_advance2(const char *file, const char *lmname, float64 lw,
588                  float64 wip, float64 uw, int32 ndict, const char *fmt,
589                  int32 applyWeight, int lminmemory, logmath_t *logmath)
590 {
591     int32 i, u;
592     lm_t *lm;
593     int32 err_no;
594 
595     if (!file)
596         E_FATAL("No LM file\n");
597     if (lw <= 0.0)
598         E_FATAL("lw = %e\n", lw);
599     if (wip <= 0.0)
600         E_FATAL("wip = %e\n", wip);
601     if ((uw < 0.0) || (uw > 1.0))
602         E_FATAL("uw = %e\n", uw);
603 
604     /* HACK: At this part, one should check whether the LM name is being used already */
605 
606     E_INFO("LM read('%s', lw= %.2f, wip= %.2f, uw= %.2f)\n", file, lw, wip,
607            uw);
608     E_INFO("Reading LM file %s (LM name \"%s\")\n", file, lmname);
609 
610     /* First it will try to decide whether the file a .DMP file */
611     /* ARCHAN: We should provide function pointer implementation at here. */
612     if (fmt == NULL) {
613         /**Automatically decide the LM format */
614         lm = lm_read_dump(file, lminmemory, logmath);
615         if (lm == NULL) {
616             E_INFO("In lm_read, LM is not a DMP file. Trying to read it as a txt file\n");
617             if (lminmemory == 0) {
618                 E_WARN("On-disk LM not supported for text files, reading it into memory.\n");
619                 lminmemory = 1;
620             }
621             lm = lm_read_txt(file, lminmemory, &err_no, 0, logmath); /* Not forcing 32bit LM */
622             if (lm == NULL) {
623                 if (err_no == LM_OFFSET_TOO_LARGE) {
624                     E_INFO
625                         ("In lm read, LM is not a DMP, it is likely to be a ARPA format file. But the LM hits the limit of legacy 16 bit format. Force LM reading to 32bit now\n");
626 
627                     /* This only happens when both TXT & DMP format reading have problems */
628                     lm = lm_read_txt(file, lminmemory, &err_no, 1, logmath);      /* Now force 32bit LM */
629                     if (lm == NULL) {
630                         E_INFO
631                             ("Panic: In lm_read, LM is not DMP format, it is likely to be ARPA format and hits legacy 16 bit format problem. But when forcing to 32bit LM, problem still couldn't be solved.\n");
632                         return NULL;
633                     }
634                 }
635                 else {
636                     E_INFO("Lm is both not DMP and TXT format\n");
637                     return NULL;
638                 }
639             }
640         }
641     }
642     else if (!strcmp(fmt, "TXT")) {
643         lm = lm_read_txt(file, lminmemory, &err_no, 0, logmath);  /* Not forcing 32bit LM */
644         if (lm == NULL) {
645             if (err_no == LM_OFFSET_TOO_LARGE) {
646                 E_INFO
647                     ("In lm read, LM is not a DMP, it is likely to be a ARPA format file. But the LM hits the limit of legacy 16 bit format. Force LM reading to 32bit now\n");
648 
649                 /* This only happens when both TXT & DMP format reading have problems */
650                 lm = lm_read_txt(file, lminmemory, &err_no, 1, logmath);  /* Now force 32bit LM */
651                 if (lm == NULL) {
652                     E_INFO
653                         ("Panic: In lm_read, LM is not DMP format, it is likely to be ARPA format and hits legacy 16 bit format problem. But when forcing to 32bit LM, problem still couldn't be solved.\n");
654                     return NULL;
655                 }
656             }
657             else {
658                 E_INFO("LM is not in TXT format\n");
659                 return NULL;
660             }
661         }
662 
663     }
664     else if (!strcmp(fmt, "DMP")) {
665         lm = lm_read_dump(file, lminmemory, logmath);
666         if (lm == NULL) {
667             E_INFO
668                 ("In lm_read, a DMP format reader is called, but lm cannot be read, Diagnosis: LM is corrupted or not enough memory.\n");
669             return NULL;
670         }
671     }
672     else if (!strcmp(fmt, "TXT32")) {
673         lm = lm_read_txt(file, lminmemory, &err_no, 1, logmath);
674         if (lm == NULL) {
675             E_INFO("In lm_read, failed to read lm in txt format. .\n");
676             return NULL;
677         }
678     }
679     else {
680         E_INFO("Unknown format (%s) is specified\n", fmt);
681         return NULL;
682     }
683 
684 
685     lm->name = ckd_salloc(lmname);
686     lm->inputenc = IND_BADENCODING;
687     lm->outputenc = IND_BADENCODING;
688 
689     lm->is32bits = lm_is32bits(lm);
690 
691     E_INFO("The LM routine is operating at %d bits mode\n",
692            lm->is32bits ? 32 : 16);
693 
694     /* Initialize the fast trigram cache, with all entries invalid */
695     if (lm->n_tg > 0) {
696         if (lm->is32bits) {
697             lm->tgcache32 =
698                 (lm_tgcache_entry32_t *) ckd_calloc(LM_TGCACHE_SIZE,
699                         sizeof
700                         (lm_tgcache_entry32_t));
701             for (i = 0; i < LM_TGCACHE_SIZE; i++)
702                 lm->tgcache32[i].lwid[0] = (s3lmwid32_t) BAD_LMWID(lm);
703         }
704         else {
705             lm->tgcache =
706                 (lm_tgcache_entry_t *) ckd_calloc(LM_TGCACHE_SIZE,
707                         sizeof(lm_tgcache_entry_t));
708             for (i = 0; i < LM_TGCACHE_SIZE; i++)
709                 lm->tgcache[i].lwid[0] = (s3lmwid_t) BAD_LMWID(lm);
710         }
711     }
712 
713     if (applyWeight) {
714         lm2logs3(lm, uw);       /* Applying unigram weight; convert to logs3 values */
715 
716         /* Apply the new lw and wip values */
717         lm->lw = 1.0;           /* The initial settings for lw and wip */
718         lm->wip = 0;            /* logs3(1.0) */
719         lm_set_param(lm, lw, wip);
720     }
721 
722 
723     assert(lm);
724     /* Set the size of dictionary */
725     lm->dict_size = ndict;
726     /*    E_INFO("lm->dict %d\n",lm->dict_size); */
727     for (u = 0; u < lm->n_ug; u++)
728         lm->ug[u].dictwid = BAD_S3WID;
729 
730 
731     return lm;
732 }
733 
734 /*
735   This convert every string in the lm from lmp->inputenc to
736   lm->outputenc.  This function assumes the caller has checked the
737   encoding schemes appropriateness.
738 
739   (Caution!) At 20051115, the method is specific and only support hex
740   to value conversion.  The code also hasn't considered that output
741   encoding requires a longer length of string than the input encoding.
742  */
743 static void
lm_convert_encoding(lm_t * lmp)744 lm_convert_encoding(lm_t * lmp)
745 {
746     int i;
747 
748     E_INFO("Encoding Conversion\n");
749     for (i = 0; i < lmp->n_ug; i++) {
750 #if 0
751         E_INFO("%s\n", lmp->wordstr[i]);
752 #endif
753 
754         if (ishex(lmp->wordstr[i])) {
755             hextocode(lmp->wordstr[i]);
756         }
757 
758 #if 0
759         E_INFO("%s\n", lmp->wordstr[i]);
760 #endif
761     }
762 }
763 
764 int32
lm_write_advance(lm_t * lmp,const char * outputfn,const char * filename,const char * fmt,const char * inputenc,char * outputenc)765 lm_write_advance(lm_t * lmp, const char *outputfn, const char *filename,
766                  const char *fmt, const char *inputenc, char *outputenc)
767 {
768     /* This might be duplicated with the caller checking but was done for extra safety. */
769 
770     assert(encoding_resolve(inputenc, outputenc));
771 
772     lmp->inputenc = encoding_str2ind(inputenc);
773     lmp->outputenc = encoding_str2ind(outputenc);
774 
775     if (lmp->inputenc != lmp->outputenc) {
776         E_INFO("Did I come here?\n");
777         lm_convert_encoding(lmp);
778     }
779 
780     if (!strcmp(fmt, "TXT")) {
781 
782         return lm_write_arpa_text(lmp, outputfn, inputenc, outputenc);
783 
784     }
785     else if (!strcmp(fmt, "DMP")) {
786 
787         /* set mtime to be zero because sphinx3 has no mechanism to check
788            whether the file is generated earlier (at least for now.) */
789 
790         if (lm_is32bits(lmp)) {
791             E_INFO
792                 ("16 bit DMP format is specified but LM is decided to be 32 bit mode. (May be it has segment size which is large than 64k or programmer forced it).\n",
793                  LM_LEGACY_CONSTANT);
794             E_INFO("Now use 32 bits format.\n");
795             return lm3g_dump(outputfn, lmp, filename, 0, 32);
796         }
797         else {
798             return lm3g_dump(outputfn, lmp, filename, 0, 16);
799         }
800 
801 
802     }
803     else if (!strcmp(fmt, "DMP32")) {
804 
805         /* set mtime to be zero because sphinx3 has no mechanism to check
806            whether the file is generated earlier (at least for now.) */
807 
808         return lm3g_dump(outputfn, lmp, filename, 0, 32);
809 
810     }
811     else if (!strcmp(fmt, "FST")) {
812 
813         E_WARN("Invoke un-tested ATT-FSM writer\n");
814         return lm_write_att_fsm(lmp, outputfn);
815 
816     }
817     else {
818 
819         E_INFO("Unknown format (%s) is specified\n", fmt);
820         return LM_FAIL;
821     }
822 }
823 
824 int32
lm_write(lm_t * lmp,const char * outputfn,const char * filename,const char * fmt)825 lm_write(lm_t * lmp, const char *outputfn, const char *filename, const char *fmt)
826 {
827     return lm_write_advance(lmp, outputfn, filename, fmt, "iso8859-1",
828                             "iso8859-1");
829 }
830 
831 
832 /*
833  * Free stale bigram and trigram info, those not used since last reset.
834  */
835 void
lm_cache_reset(lm_t * lm)836 lm_cache_reset(lm_t * lm)
837 {
838     int32 i, n_bgfree, n_tgfree;
839     tginfo_t *tginfo, *next_tginfo, *prev_tginfo;
840     tginfo32_t *tginfo32, *next_tginfo32, *prev_tginfo32;
841     int32 is32bits;
842 
843     n_bgfree = n_tgfree = 0;
844 
845 
846     /* ARCHAN: RAH only short-circult this function only */
847     if (lm->isLM_IN_MEMORY)     /* RAH We are going to short circuit this if we are running with the lm in memory */
848         return;
849 
850     is32bits = lm->is32bits;
851 
852     if ((lm->n_bg > 0) && (!lm->bg)) {  /* Disk-based; free "stale" bigrams */
853 
854         if (is32bits) {
855             for (i = 0; i < lm->n_ug; i++) {
856                 if (lm->membg32[i].bg32 && (!lm->membg32[i].used)) {
857                     lm->n_bg_inmem -=
858                         lm->ug[i + 1].firstbg - lm->ug[i].firstbg;
859 
860                     ckd_free(lm->membg32[i].bg32);
861                     lm->membg32[i].bg32 = NULL;
862                     n_bgfree++;
863                 }
864 
865                 lm->membg32[i].used = 0;
866             }
867         }
868         else {
869             for (i = 0; i < lm->n_ug; i++) {
870                 if (lm->membg[i].bg && (!lm->membg[i].used)) {
871                     lm->n_bg_inmem -=
872                         lm->ug[i + 1].firstbg - lm->ug[i].firstbg;
873 
874                     ckd_free(lm->membg[i].bg);
875                     lm->membg[i].bg = NULL;
876                     n_bgfree++;
877                 }
878 
879                 lm->membg[i].used = 0;
880             }
881         }
882     }
883 
884     if (lm->n_tg > 0) {
885         if (is32bits) {
886             for (i = 0; i < lm->n_ug; i++) {
887                 prev_tginfo32 = NULL;
888                 for (tginfo32 = lm->tginfo32[i]; tginfo32;
889                      tginfo32 = next_tginfo32) {
890                     next_tginfo32 = tginfo32->next;
891 
892                     if (!tginfo32->used) {
893                         if ((!lm->tg32) && tginfo32->tg32) {
894                             lm->n_tg_inmem -= tginfo32->n_tg;
895                             ckd_free(tginfo32->tg32);
896                             n_tgfree++;
897                         }
898 
899                         ckd_free(tginfo32);
900                         if (prev_tginfo32)
901                             prev_tginfo32->next = next_tginfo32;
902                         else
903                             lm->tginfo32[i] = next_tginfo32;
904                     }
905                     else {
906                         tginfo32->used = 0;
907                         prev_tginfo32 = tginfo32;
908                     }
909                 }
910             }
911         }
912         else {
913             for (i = 0; i < lm->n_ug; i++) {
914                 prev_tginfo = NULL;
915                 for (tginfo = lm->tginfo[i]; tginfo; tginfo = next_tginfo) {
916                     next_tginfo = tginfo->next;
917 
918                     if (!tginfo->used) {
919                         if ((!lm->tg) && tginfo->tg) {
920                             lm->n_tg_inmem -= tginfo->n_tg;
921                             ckd_free(tginfo->tg);
922                             n_tgfree++;
923                         }
924 
925                         free(tginfo);
926                         if (prev_tginfo)
927                             prev_tginfo->next = next_tginfo;
928                         else
929                             lm->tginfo[i] = next_tginfo;
930                     }
931                     else {
932                         tginfo->used = 0;
933                         prev_tginfo = tginfo;
934                     }
935                 }
936             }
937         }
938     }
939 
940     if ((n_tgfree > 0) || (n_bgfree > 0)) {
941         E_INFO("%d tg frees, %d in mem; %d bg frees, %d in mem\n",
942                n_tgfree, lm->n_tg_inmem, n_bgfree, lm->n_bg_inmem);
943     }
944 }
945 
946 
947 void
lm_cache_stats_dump(lm_t * lm)948 lm_cache_stats_dump(lm_t * lm)
949 {
950     E_INFO
951         ("%9d tg(), %9d tgcache, %8d bo; %5d fills, %8d in mem (%.1f%%)\n",
952          lm->n_tg_score, lm->n_tgcache_hit, lm->n_tg_bo, lm->n_tg_fill,
953          lm->n_tg_inmem, (lm->n_tg_inmem * 100.0) / (lm->n_tg + 1));
954     E_INFO("%8d bg(), %8d bo; %5d fills, %8d in mem (%.1f%%)\n",
955            lm->n_bg_score, lm->n_bg_bo, lm->n_bg_fill, lm->n_bg_inmem,
956            (lm->n_bg_inmem * 100.0) / (lm->n_bg + 1));
957 
958     lm->n_tgcache_hit = 0;
959     lm->n_tg_fill = 0;
960     lm->n_tg_score = 0;
961     lm->n_tg_bo = 0;
962     lm->n_bg_fill = 0;
963     lm->n_bg_score = 0;
964     lm->n_bg_bo = 0;
965 }
966 
967 
968 int32
lm_ug_score(lm_t * lm,s3lmwid32_t lwid,s3wid_t wid)969 lm_ug_score(lm_t * lm, s3lmwid32_t lwid, s3wid_t wid)
970 {
971     if (NOT_LMWID(lm, lwid) || (lwid >= lm->n_ug))
972         E_FATAL("Bad argument (%d) to lm_ug_score\n", lwid);
973 
974     lm->access_type = 1;
975 
976     if (lm->inclass_ugscore)
977         return (lm->ug[lwid].prob.l + lm->inclass_ugscore[wid]);
978     else
979         return (lm->ug[lwid].prob.l);
980 }
981 
982 int32
lm_ug_exists(lm_t * lm,s3lmwid32_t lwid)983 lm_ug_exists(lm_t * lm, s3lmwid32_t lwid)
984 {
985     if (NOT_LMWID(lm, lwid) || (lwid >= lm->n_ug))
986         return 0;
987     else
988         return 1;
989 }
990 
991 
992 int32
lm_uglist(lm_t * lm,ug_t ** ugptr)993 lm_uglist(lm_t * lm, ug_t ** ugptr)
994 {
995     *ugptr = lm->ug;
996     return (lm->n_ug);
997 }
998 
999 
1000 /* This create a mapping from either the unigram or words in a class*/
1001 int32
lm_ug_wordprob(lm_t * lm,dict_t * dict,int32 th,wordprob_t * wp)1002 lm_ug_wordprob(lm_t * lm, dict_t * dict, int32 th, wordprob_t * wp)
1003 {
1004     int32 i, j, n, p;
1005     s3wid_t w, dictid;
1006     lmclass_t *lmclass;
1007     lmclass_word_t *lm_cw;
1008     n = lm->n_ug;
1009 
1010     for (i = 0, j = 0; i < n; i++) {
1011         w = lm->ug[i].dictwid;
1012         if (IS_S3WID(w)) {      /*Is w>0? Then it can be either wid or class id */
1013             if (w < LM_CLASSID_BASE) {  /*It is just a word */
1014                 if ((p = lm->ug[i].prob.l) >= th) {
1015                     wp[j].wid = w;
1016                     wp[j].prob = p;
1017                     j++;
1018                 }
1019             }
1020             else {              /* It is a class */
1021                 lmclass = LM_CLASSID_TO_CLASS(lm, w);   /* Get the class */
1022                 lm_cw = lmclass_firstword(lmclass);
1023                 while (lmclass_isword(lm_cw)) {
1024                     dictid = lmclass_getwid(lm_cw);
1025 
1026                     /*E_INFO("Lookup dict_id using dict_basewid %d\n",dictid); */
1027                     if (IS_S3WID(dictid)) {
1028                         if (dictid != dict_basewid(dict, dictid)) {
1029                             dictid = dict_basewid(dict, dictid);
1030                         }
1031                         if ((p =
1032                              lm->ug[i].prob.l +
1033                              lm->inclass_ugscore[dictid]) >= th) {
1034                             wp[j].wid = dictid;
1035                             wp[j].prob = lm->ug[i].prob.l;
1036                             j++;
1037                         }
1038                     }
1039                     else {
1040                         E_INFO("Word %s cannot be found \n",
1041                                lmclass_getword(lm_cw));
1042                     }
1043 
1044                     lm_cw = lmclass_nextword(lmclass, lm_cw);
1045 
1046                 }
1047             }
1048         }
1049     }
1050 
1051     return j;
1052 }
1053 
1054 
1055 /*
1056  * Load bigrams for the given unigram (LMWID) lw1 from disk into memory
1057  */
1058 static void
load_bg(lm_t * lm,s3lmwid32_t lw1)1059 load_bg(lm_t * lm, s3lmwid32_t lw1)
1060 {
1061     int32 i, n, b;
1062     bg_t *bg = NULL;
1063     bg32_t *bg32 = NULL;
1064 
1065     int32 mem_sz;
1066     int32 is32bits;
1067 
1068     b = lm->ug[lw1].firstbg;    /* Absolute first bg index for ug lw1 */
1069     n = lm->ug[lw1 + 1].firstbg - b;    /* Not including guard/sentinel */
1070 
1071     is32bits = lm->is32bits;
1072     mem_sz = is32bits ? sizeof(bg32_t) : sizeof(bg_t);
1073 
1074     if (lm->isLM_IN_MEMORY) {   /* RAH, if LM_IN_MEMORY, then we don't need to go get it. */
1075         if (is32bits)
1076             bg32 = lm->membg32[lw1].bg32 = &lm->bg32[b];
1077         else
1078             bg = lm->membg[lw1].bg = &lm->bg[b];
1079     }
1080     else {
1081         if (is32bits)
1082             bg32 = lm->membg32[lw1].bg32 =
1083                 (bg32_t *) ckd_calloc(n + 1, mem_sz);
1084         else
1085             bg = lm->membg[lw1].bg = (bg_t *) ckd_calloc(n + 1, mem_sz);
1086 
1087         if (fseek(lm->fp, lm->bgoff + b * mem_sz, SEEK_SET) < 0)
1088             E_FATAL_SYSTEM("fseek failed\n");
1089 
1090 
1091         /* Need to read n+1 because obtaining tg count for one bg also depends on next bg */
1092         if (is32bits) {
1093             if (fread(bg32, mem_sz, n + 1, lm->fp) != (size_t) (n + 1))
1094                 E_FATAL("fread failed\n");
1095             if (lm->byteswap) {
1096                 for (i = 0; i <= n; i++)
1097                     swap_bg32(&(bg32[i]));
1098             }
1099         }
1100         else {
1101             if (fread(bg, mem_sz, n + 1, lm->fp) != (size_t) (n + 1))
1102                 E_FATAL("fread failed\n");
1103             if (lm->byteswap) {
1104                 for (i = 0; i <= n; i++)
1105                     swap_bg(&(bg[i]));
1106             }
1107         }
1108     }
1109     lm->n_bg_fill++;
1110     lm->n_bg_inmem += n;
1111 }
1112 
1113 
1114 #define BINARY_SEARCH_THRESH	16
1115 
1116 /* Locate a specific bigram within a bigram list */
1117 int32
find_bg(bg_t * bg,int32 n,s3lmwid32_t w)1118 find_bg(bg_t * bg, int32 n, s3lmwid32_t w)
1119 {
1120     int32 i, b, e;
1121 
1122     /* Binary search until segment size < threshold */
1123     b = 0;
1124     e = n;
1125     while (e - b > BINARY_SEARCH_THRESH) {
1126         i = (b + e) >> 1;
1127         if (bg[i].wid < w)
1128             b = i + 1;
1129         else if (bg[i].wid > w)
1130             e = i;
1131         else
1132             return i;
1133     }
1134 
1135     /* Linear search within narrowed segment */
1136     for (i = b; (i < e) && (bg[i].wid != w); i++);
1137     return ((i < e) ? i : -1);
1138 }
1139 
1140 /* Locate a specific bigram within a bigram list */
1141 int32
find_bg32(bg32_t * bg,int32 n,s3lmwid32_t w)1142 find_bg32(bg32_t * bg, int32 n, s3lmwid32_t w)
1143 {
1144     int32 i, b, e;
1145 
1146     /* Binary search until segment size < threshold */
1147     b = 0;
1148     e = n;
1149     while (e - b > BINARY_SEARCH_THRESH) {
1150         i = (b + e) >> 1;
1151         if (bg[i].wid < w)
1152             b = i + 1;
1153         else if (bg[i].wid > w)
1154             e = i;
1155         else
1156             return i;
1157     }
1158 
1159     /* Linear search within narrowed segment */
1160     for (i = b; (i < e) && (bg[i].wid != w); i++);
1161     return ((i < e) ? i : -1);
1162 }
1163 
1164 
1165 /*** Begin lm_bglist*/
1166 int32
lm_bglist(lm_t * lm,s3lmwid32_t w1,bg_t ** bgptr,int32 * bowt)1167 lm_bglist(lm_t * lm, s3lmwid32_t w1, bg_t ** bgptr, int32 * bowt)
1168 {
1169     int32 n;
1170 
1171     if (NOT_LMWID(lm, w1) || (w1 >= lm->n_ug))
1172         E_FATAL("Bad w1 argument (%d) to lm_bglist\n", w1);
1173 
1174     n = (lm->n_bg > 0) ? lm->ug[w1 + 1].firstbg - lm->ug[w1].firstbg : 0;
1175 
1176     if (n > 0) {
1177         if (!lm->membg[w1].bg)
1178             load_bg(lm, w1);
1179         lm->membg[w1].used = 1;
1180 
1181         *bgptr = lm->membg[w1].bg;
1182         *bowt = lm->ug[w1].bowt.l;
1183     }
1184     else {
1185         *bgptr = NULL;
1186         *bowt = 0;
1187     }
1188 
1189     return (n);
1190 }
1191 
1192 int32
lm_bg32list(lm_t * lm,s3lmwid32_t w1,bg32_t ** bgptr,int32 * bowt)1193 lm_bg32list(lm_t * lm, s3lmwid32_t w1, bg32_t ** bgptr, int32 * bowt)
1194 {
1195     int32 n;
1196 
1197     if (NOT_LMWID(lm, w1) || (w1 >= lm->n_ug))
1198         E_FATAL("Bad w1 argument (%d) to lm_bglist\n", w1);
1199 
1200     n = (lm->n_bg > 0) ? lm->ug[w1 + 1].firstbg - lm->ug[w1].firstbg : 0;
1201 
1202     if (n > 0) {
1203         if (!lm->membg32[w1].bg32)
1204             load_bg(lm, w1);
1205         lm->membg32[w1].used = 1;
1206 
1207         *bgptr = lm->membg32[w1].bg32;
1208         *bowt = lm->ug[w1].bowt.l;
1209     }
1210     else {
1211         *bgptr = NULL;
1212         *bowt = 0;
1213     }
1214 
1215     return (n);
1216 }
1217 
1218 /*** End lm_bglist*/
1219 
1220 /*
1221  *  This function look-ups the bigram score of p(lw2|lw1)
1222  *  The information for lw2 and w2 are repeated because the legacy
1223  *  implementation(since s3.2) of vithist used only LM wid rather
1224  *  than dictionary wid.
1225  */
1226 
1227 int32
lm_bg_score(lm_t * lm,s3lmwid32_t lw1,s3lmwid32_t lw2,s3wid_t w2)1228 lm_bg_score(lm_t * lm, s3lmwid32_t lw1, s3lmwid32_t lw2, s3wid_t w2)
1229 {
1230     int32 i, n, score;
1231     bg_t *bg = NULL;
1232     bg32_t *bg32 = NULL;
1233     int32 is32bits;
1234 
1235     is32bits = lm->is32bits;
1236 
1237     if ((lm->n_bg == 0) || (NOT_LMWID(lm, lw1)))
1238         return (lm_ug_score(lm, lw2, w2));
1239 
1240     lm->n_bg_score++;
1241 
1242     if (NOT_LMWID(lm, lw2) || (lw2 >= lm->n_ug))
1243         E_FATAL("Bad lw2 argument (%d) to lm_bg_score\n", lw2);
1244 
1245     n = lm->ug[lw1 + 1].firstbg - lm->ug[lw1].firstbg;
1246 
1247     if (n > 0) {
1248         if (is32bits) {
1249             if (!lm->membg32[lw1].bg32)
1250                 load_bg(lm, lw1);
1251             lm->membg32[lw1].used = 1;
1252             bg32 = lm->membg32[lw1].bg32;
1253             i = find_bg32(bg32, n, lw2);
1254         }
1255         else {
1256             if (!lm->membg[lw1].bg)
1257                 load_bg(lm, lw1);
1258             lm->membg[lw1].used = 1;
1259             bg = lm->membg[lw1].bg;
1260             i = find_bg(bg, n, lw2);
1261         }
1262     }
1263     else
1264         i = -1;
1265 
1266     if (i >= 0) {
1267         if (is32bits)
1268             score = lm->bgprob[bg32[i].probid].l;
1269         else
1270             score = lm->bgprob[bg[i].probid].l;
1271 
1272         if (lm->inclass_ugscore) {      /*Only add within class prob if class information exists.
1273                                            Is actually ok to just add the score because if the word
1274                                            is not within-class. The returning scores will be 0. I just
1275                                            love to safe-guard it :-).
1276                                          */
1277             score += lm->inclass_ugscore[w2];
1278         }
1279 
1280         lm->access_type = 2;
1281     }
1282     else {
1283         lm->n_bg_bo++;
1284         lm->access_type = 1;
1285         score = lm->ug[lw1].bowt.l + lm->ug[lw2].prob.l;
1286     }
1287 
1288     return (score);
1289 }
1290 
1291 int32
lm_bg_exists(lm_t * lm,s3lmwid32_t lw1,s3lmwid32_t lw2)1292 lm_bg_exists(lm_t * lm, s3lmwid32_t lw1, s3lmwid32_t lw2)
1293 {
1294     int32 i, n, score;
1295     bg_t *bg = NULL;
1296     bg32_t *bg32 = NULL;
1297     int32 is32bits;
1298 
1299     is32bits = lm->is32bits;
1300 
1301     if ((lm->n_bg == 0) || (NOT_LMWID(lm, lw1)))
1302         return 0;
1303 
1304     if (NOT_LMWID(lm, lw2) || (lw2 >= lm->n_ug))
1305         return 0;
1306 
1307     n = lm->ug[lw1 + 1].firstbg - lm->ug[lw1].firstbg;
1308 
1309     if (n > 0) {
1310         if (is32bits) {
1311             if (!lm->membg32[lw1].bg32)
1312                 load_bg(lm, lw1);
1313             lm->membg32[lw1].used = 1;
1314             bg32 = lm->membg32[lw1].bg32;
1315             i = find_bg32(bg32, n, lw2);
1316         }
1317         else {
1318             if (!lm->membg[lw1].bg)
1319                 load_bg(lm, lw1);
1320             lm->membg[lw1].used = 1;
1321             bg = lm->membg[lw1].bg;
1322 
1323             i = find_bg(bg, n, lw2);
1324         }
1325     }
1326     else
1327         i = -1;
1328 
1329     if (i >= 0)
1330         return 1;
1331     else
1332         return 0;
1333 
1334 
1335     return (score);
1336 }
1337 
1338 
1339 static void
load_tg(lm_t * lm,s3lmwid32_t lw1,s3lmwid32_t lw2)1340 load_tg(lm_t * lm, s3lmwid32_t lw1, s3lmwid32_t lw2)
1341 {
1342     int32 i, n, b;
1343     int32 t = -1;               /* Let's make sure that if t isn't initialized after the
1344                                  * "if" statement below, it makes things go bad */
1345     bg_t *bg = NULL;
1346     bg32_t *bg32 = NULL;
1347     tg_t *tg = NULL;
1348     tg32_t *tg32 = NULL;
1349     tginfo_t *tginfo = NULL;
1350     tginfo32_t *tginfo32 = NULL;
1351     int32 mem_sz_tg, mem_sz_tginfo;
1352     int32 is32bits;
1353 
1354     is32bits = lm->is32bits;
1355     mem_sz_tg = is32bits ? sizeof(tg32_t) : sizeof(tg_t);
1356     mem_sz_tginfo = is32bits ? sizeof(tginfo32_t) : sizeof(tginfo_t);
1357 
1358     /* First allocate space for tg information for bg lw1,lw2 */
1359 
1360     if (is32bits) {
1361         tginfo32 = (tginfo32_t *) ckd_malloc(mem_sz_tginfo);
1362         tginfo32->w1 = lw1;
1363         tginfo32->tg32 = NULL;
1364         tginfo32->next = lm->tginfo32[lw2];
1365         lm->tginfo32[lw2] = tginfo32;
1366     }
1367     else {
1368         tginfo = (tginfo_t *) ckd_malloc(mem_sz_tginfo);
1369         tginfo->w1 = lw1;
1370         tginfo->tg = NULL;
1371         tginfo->next = lm->tginfo[lw2];
1372         lm->tginfo[lw2] = tginfo;
1373     }
1374 
1375     /* Locate bigram lw1,lw2 */
1376 
1377     b = lm->ug[lw1].firstbg;
1378     n = lm->ug[lw1 + 1].firstbg - b;
1379 
1380 
1381     /* Make sure bigrams for lw1, if any, loaded into memory */
1382     if (n > 0) {
1383         if (is32bits) {
1384             if (!lm->membg32[lw1].bg32)
1385                 load_bg(lm, lw1);
1386             lm->membg32[lw1].used = 1;
1387             bg32 = lm->membg32[lw1].bg32;
1388         }
1389         else {
1390             if (!lm->membg[lw1].bg)
1391                 load_bg(lm, lw1);
1392             lm->membg[lw1].used = 1;
1393             bg = lm->membg[lw1].bg;
1394         }
1395     }
1396 
1397     /* At this point, n = #bigrams for lw1 */
1398     if (n > 0
1399         && (i =
1400             is32bits ? find_bg32(bg32, n, lw2) : find_bg(bg, n,
1401                                                          lw2)) >= 0) {
1402 
1403         /*      if(i<0){
1404            E_INFO("What is the value of i %d, lw2 %d\n",i,lw2);
1405            } */
1406 
1407         if (i >= 0) {
1408             if (is32bits)
1409                 tginfo32->bowt = lm->tgbowt[bg32[i].bowtid].l;
1410             else
1411                 tginfo->bowt = lm->tgbowt[bg[i].bowtid].l;
1412 
1413 
1414             /* Find t = Absolute first trigram index for bigram lw1,lw2 */
1415             b += i;             /* b = Absolute index of bigram lw1,lw2 on disk */
1416             t = lm->tg_segbase[b >> lm->log_bg_seg_sz];
1417             t += is32bits ? bg32[i].firsttg : bg[i].firsttg;
1418 
1419             /*      E_INFO("%d %d\n",lm->tg_segbase[b >> lm->log_bg_seg_sz],t); */
1420             /* Find #tg for bigram w1,w2 */
1421             n = lm->tg_segbase[(b + 1) >> lm->log_bg_seg_sz];
1422             n += is32bits ? bg32[i + 1].firsttg : bg[i + 1].firsttg;
1423             n -= t;
1424 
1425             if (is32bits)
1426                 tginfo32->n_tg = n;
1427             else
1428                 tginfo->n_tg = n;
1429 
1430         }
1431 
1432     }
1433     else {                      /* No bigram w1,w2 */
1434 
1435         if (is32bits) {
1436             tginfo32->bowt = 0;
1437             n = tginfo32->n_tg = 0;
1438         }
1439         else {
1440             tginfo->bowt = 0;
1441             n = tginfo->n_tg = 0;
1442         }
1443     }
1444 
1445     /* "t" has not been assigned any meanigful value, so if you use it
1446      * beyond this point, make sure it's been properly assigned.
1447      */
1448     /*  assert (t != -1); */
1449 
1450     /* At this point, n = #trigrams for lw1,lw2.  Read them in */
1451 
1452     if (lm->isLM_IN_MEMORY) {
1453         /* RAH, already have this in memory */
1454         if (n > 0) {
1455             assert(t != -1);
1456             if (is32bits)
1457                 tg32 = tginfo32->tg32 = &lm->tg32[t];
1458             else
1459                 tg = tginfo->tg = &lm->tg[t];
1460         }
1461     }
1462     else {
1463         if (n > 0) {
1464 
1465             if (is32bits)
1466                 tg32 = tginfo32->tg32 =
1467                     (tg32_t *) ckd_calloc(n, mem_sz_tg);
1468             else
1469                 tg = tginfo->tg = (tg_t *) ckd_calloc(n, mem_sz_tg);
1470 
1471 
1472             if (fseek(lm->fp, lm->tgoff + t * mem_sz_tg, SEEK_SET) < 0)
1473                 E_FATAL_SYSTEM("fseek failed\n");
1474 
1475 
1476             if (is32bits) {
1477                 if (fread(tg32, mem_sz_tg, n, lm->fp) != (size_t) n)
1478                     E_FATAL("fread(tg32, %d at %d) failed\n", n,
1479                             lm->tgoff);
1480                 if (lm->byteswap) {
1481                     for (i = 0; i < n; i++) {
1482                         SWAP_INT32(&(tg32[i].wid));
1483                         SWAP_INT32(&(tg32[i].probid));
1484                     }
1485                 }
1486             }
1487             else {
1488                 if (fread(tg, mem_sz_tg, n, lm->fp) != (size_t) n)
1489                     E_FATAL("fread(tg, %d at %d) failed\n", n, lm->tgoff);
1490                 if (lm->byteswap) {
1491                     for (i = 0; i < n; i++) {
1492                         SWAP_INT16(&(tg[i].wid));
1493                         SWAP_INT16(&(tg[i].probid));
1494                     }
1495                 }
1496             }
1497         }
1498     }
1499     lm->n_tg_fill++;
1500     lm->n_tg_inmem += n;
1501 }
1502 
1503 
1504 /* Similar to find_bg */
1505 int32
find_tg(tg_t * tg,int32 n,s3lmwid32_t w)1506 find_tg(tg_t * tg, int32 n, s3lmwid32_t w)
1507 {
1508     int32 i, b, e;
1509 
1510     b = 0;
1511     e = n;
1512     while (e - b > BINARY_SEARCH_THRESH) {
1513         i = (b + e) >> 1;
1514         if (tg[i].wid < w)
1515             b = i + 1;
1516         else if (tg[i].wid > w)
1517             e = i;
1518         else
1519             return i;
1520     }
1521 
1522     for (i = b; (i < e) && (tg[i].wid != w); i++);
1523     return ((i < e) ? i : -1);
1524 }
1525 
1526 int32
find_tg32(tg32_t * tg,int32 n,s3lmwid32_t w)1527 find_tg32(tg32_t * tg, int32 n, s3lmwid32_t w)
1528 {
1529     int32 i, b, e;
1530 
1531     b = 0;
1532     e = n;
1533     while (e - b > BINARY_SEARCH_THRESH) {
1534         i = (b + e) >> 1;
1535         if (tg[i].wid < w)
1536             b = i + 1;
1537         else if (tg[i].wid > w)
1538             e = i;
1539         else
1540             return i;
1541     }
1542 
1543     for (i = b; (i < e) && (tg[i].wid != w); i++);
1544     return ((i < e) ? i : -1);
1545 }
1546 
1547 
1548 int32
lm_tglist(lm_t * lm,s3lmwid32_t lw1,s3lmwid32_t lw2,tg_t ** tgptr,int32 * bowt)1549 lm_tglist(lm_t * lm, s3lmwid32_t lw1, s3lmwid32_t lw2, tg_t ** tgptr,
1550           int32 * bowt)
1551 {
1552     tginfo_t *tginfo, *prev_tginfo;
1553 
1554     if (lm->n_tg <= 0) {
1555         *tgptr = NULL;
1556         *bowt = 0;
1557         return 0;
1558     }
1559 
1560     if (NOT_LMWID(lm, lw1) || (lw1 >= lm->n_ug))
1561         E_FATAL("Bad lw1 argument (%d) to lm_tglist\n", lw1);
1562     if (NOT_LMWID(lm, lw2) || (lw2 >= lm->n_ug))
1563         E_FATAL("Bad lw2 argument (%d) to lm_tglist\n", lw2);
1564 
1565     prev_tginfo = NULL;
1566     for (tginfo = lm->tginfo[lw2]; tginfo; tginfo = tginfo->next) {
1567         if (tginfo->w1 == lw1)
1568             break;
1569         prev_tginfo = tginfo;
1570     }
1571 
1572     if (!tginfo) {
1573         load_tg(lm, lw1, lw2);
1574         tginfo = lm->tginfo[lw2];
1575     }
1576     else if (prev_tginfo) {
1577         prev_tginfo->next = tginfo->next;
1578         tginfo->next = lm->tginfo[lw2];
1579         lm->tginfo[lw2] = tginfo;
1580     }
1581     tginfo->used = 1;
1582 
1583     *tgptr = tginfo->tg;
1584     *bowt = tginfo->bowt;
1585 
1586     return (tginfo->n_tg);
1587 }
1588 
1589 int32
lm_tg32list(lm_t * lm,s3lmwid32_t lw1,s3lmwid32_t lw2,tg32_t ** tgptr,int32 * bowt)1590 lm_tg32list(lm_t * lm, s3lmwid32_t lw1, s3lmwid32_t lw2, tg32_t ** tgptr,
1591             int32 * bowt)
1592 {
1593     tginfo32_t *tginfo32, *prev_tginfo32;
1594 
1595     if (lm->n_tg <= 0) {
1596         *tgptr = NULL;
1597         *bowt = 0;
1598         return 0;
1599     }
1600 
1601     if (NOT_LMWID(lm, lw1) || (lw1 >= lm->n_ug))
1602         E_FATAL("Bad lw1 argument (%d) to lm_tglist\n", lw1);
1603     if (NOT_LMWID(lm, lw2) || (lw2 >= lm->n_ug))
1604         E_FATAL("Bad lw2 argument (%d) to lm_tglist\n", lw2);
1605 
1606     prev_tginfo32 = NULL;
1607     for (tginfo32 = lm->tginfo32[lw2]; tginfo32; tginfo32 = tginfo32->next) {
1608         if (tginfo32->w1 == lw1)
1609             break;
1610         prev_tginfo32 = tginfo32;
1611     }
1612 
1613     if (!tginfo32) {
1614         load_tg(lm, lw1, lw2);
1615         tginfo32 = lm->tginfo32[lw2];
1616     }
1617     else if (prev_tginfo32) {
1618         prev_tginfo32->next = tginfo32->next;
1619         tginfo32->next = lm->tginfo32[lw2];
1620         lm->tginfo32[lw2] = tginfo32;
1621     }
1622     tginfo32->used = 1;
1623 
1624     *tgptr = tginfo32->tg32;
1625     *bowt = tginfo32->bowt;
1626 
1627     return (tginfo32->n_tg);
1628 }
1629 
1630 /*
1631  *  This function look-ups the trigram score of p(lw3|lw2,lw1)
1632  *  and compute the in-class ug probability of w3.
1633  *  The information for lw3 and w3 are repeated because the legacy
1634  *  implementation(since s3.2) of vithist used only LM wid rather
1635  *  than dictionary wid.
1636  *
1637  */
1638 
1639 int32
lm_tg_score(lm_t * lm,s3lmwid32_t lw1,s3lmwid32_t lw2,s3lmwid32_t lw3,s3wid_t w3)1640 lm_tg_score(lm_t * lm, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3,
1641             s3wid_t w3)
1642 {
1643     int32 i, h, n, score;
1644     tg_t *tg;
1645     tginfo_t *tginfo, *prev_tginfo;
1646     tg32_t *tg32;
1647     tginfo32_t *tginfo32, *prev_tginfo32;
1648     int32 is32bits;
1649 
1650     tg = NULL;
1651     tginfo = prev_tginfo = NULL;
1652 
1653     tg32 = NULL;
1654     tginfo32 = prev_tginfo32 = NULL;
1655 
1656 
1657     is32bits = lm->is32bits;
1658 
1659     /*    E_INFO("lw1 %d, lw2 %d, lw3 %d is32bits %d BAD_LMWID %d\n",lw1,lw2,lw3,is32bits, BAD_LMWID(lm)); */
1660 
1661     if ((lm->n_tg == 0) || (NOT_LMWID(lm, lw1)))
1662         return (lm_bg_score(lm, lw2, lw3, w3));
1663 
1664     lm->n_tg_score++;
1665 
1666     /*    E_INFO("lw1 %d, lw2 %d, lw3 %d is32bits %d BAD_LMWID %d\n",lw1,lw2,lw3,is32bits, BAD_LMWID(lm)); */
1667 
1668     if (NOT_LMWID(lm, lw1) || (lw1 >= lm->n_ug))
1669         E_FATAL("Bad lw1 argument (%d) to lm_tg_score\n", lw1);
1670     if (NOT_LMWID(lm, lw2) || (lw2 >= lm->n_ug))
1671         E_FATAL("Bad lw2 argument (%d) to lm_tg_score\n", lw2);
1672     if (NOT_LMWID(lm, lw3) || (lw3 >= lm->n_ug))
1673         E_FATAL("Bad lw3 argument (%d) to lm_tg_score\n", lw3);
1674 
1675     /* Lookup tgcache first; compute hash(lw1, lw2, lw3) */
1676     h = ((lw1 & 0x000003ff) << 21) + ((lw2 & 0x000003ff) << 11) +
1677         (lw3 & 0x000007ff);
1678     h %= LM_TGCACHE_SIZE;
1679 
1680 
1681     if (is32bits) {
1682         if ((lm->tgcache32[h].lwid[0] == lw1) &&
1683             (lm->tgcache32[h].lwid[1] == lw2) &&
1684             (lm->tgcache32[h].lwid[2] == lw3)) {
1685 
1686             lm->n_tgcache_hit++;
1687             return lm->tgcache32[h].lscr;
1688         }
1689 
1690         prev_tginfo32 = NULL;
1691         for (tginfo32 = lm->tginfo32[lw2]; tginfo32;
1692              tginfo32 = tginfo32->next) {
1693             if (tginfo32->w1 == lw1)
1694                 break;
1695             prev_tginfo32 = tginfo32;
1696         }
1697 
1698     }
1699     else {
1700         if ((lm->tgcache[h].lwid[0] == lw1) &&
1701             (lm->tgcache[h].lwid[1] == lw2) &&
1702             (lm->tgcache[h].lwid[2] == lw3)) {
1703 
1704             lm->n_tgcache_hit++;
1705             return lm->tgcache[h].lscr;
1706         }
1707 
1708         prev_tginfo = NULL;
1709         for (tginfo = lm->tginfo[lw2]; tginfo; tginfo = tginfo->next) {
1710             if (tginfo->w1 == lw1)
1711                 break;
1712             prev_tginfo = tginfo;
1713         }
1714     }
1715 
1716     if (is32bits) {
1717         if (!tginfo32) {
1718             load_tg(lm, lw1, lw2);
1719             tginfo32 = lm->tginfo32[lw2];
1720         }
1721         else if (prev_tginfo32) {
1722             prev_tginfo32->next = tginfo32->next;
1723             tginfo32->next = lm->tginfo32[lw2];
1724             lm->tginfo32[lw2] = tginfo32;
1725         }
1726         tginfo32->used = 1;
1727     }
1728     else {
1729         if (!tginfo) {
1730             load_tg(lm, lw1, lw2);
1731             tginfo = lm->tginfo[lw2];
1732         }
1733         else if (prev_tginfo) {
1734             prev_tginfo->next = tginfo->next;
1735             tginfo->next = lm->tginfo[lw2];
1736             lm->tginfo[lw2] = tginfo;
1737         }
1738         tginfo->used = 1;
1739     }
1740 
1741 
1742     /* Trigrams for w1,w2 now in memory; look for w1,w2,w3 */
1743     if (is32bits) {
1744         n = tginfo32->n_tg;
1745         tg32 = tginfo32->tg32;
1746         assert(tginfo32);
1747     }
1748     else {
1749         n = tginfo->n_tg;
1750         tg = tginfo->tg;
1751         assert(tginfo);
1752     }
1753 
1754     if (is32bits)
1755         i = find_tg32(tg32, n, lw3);
1756     else
1757         i = find_tg(tg, n, lw3);
1758 
1759     if (i >= 0) {
1760         if (is32bits)
1761             score = lm->tgprob[tg32[i].probid].l;
1762         else
1763             score = lm->tgprob[tg[i].probid].l;
1764 
1765         if (lm->inclass_ugscore) {      /*Only add within class prob if class information exists.
1766                                            Is actually ok to just add the score because if the word
1767                                            is not within-class. The returning scores will be 0.
1768                                          */
1769             score += lm->inclass_ugscore[w3];
1770         }
1771         lm->access_type = 3;
1772     }
1773     else {
1774         lm->n_tg_bo++;
1775         score = is32bits ? tginfo32->bowt : tginfo->bowt;
1776         score += lm_bg_score(lm, lw2, lw3, w3);
1777 
1778     }
1779 
1780     if (is32bits) {
1781         lm->tgcache32[h].lwid[0] = lw1;
1782         lm->tgcache32[h].lwid[1] = lw2;
1783         lm->tgcache32[h].lwid[2] = lw3;
1784         lm->tgcache32[h].lscr = score;
1785     }
1786     else {
1787         lm->tgcache[h].lwid[0] = lw1;
1788         lm->tgcache[h].lwid[1] = lw2;
1789         lm->tgcache[h].lwid[2] = lw3;
1790         lm->tgcache[h].lscr = score;
1791     }
1792 
1793 
1794 #if 0
1795     printf("      %5d %5d -> %8d\n", lw1, lw2, score);
1796     /* ENABLE this when you suspect the lm routine produce abnormal scores */
1797     if (score > 0) {
1798         E_INFO
1799             ("score %d >0 lm->ug[lw1].bowt.l %d lm_ug[lw2].prob.l %d, lw1 %d lw2 %d i, %d\n",
1800              score, lm->ug[lw1].bowt.l, lm->ug[lw2].bowt.l, lw1, lw2, i);
1801     }
1802 #endif
1803 
1804     return (score);
1805 }
1806 
1807 int32
lm_tg_exists(lm_t * lm,s3lmwid32_t lw1,s3lmwid32_t lw2,s3lmwid32_t lw3)1808 lm_tg_exists(lm_t * lm, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3)
1809 {
1810     int32 i, n;
1811     tg_t *tg;
1812     tginfo_t *tginfo, *prev_tginfo;
1813     tg32_t *tg32;
1814     tginfo32_t *tginfo32, *prev_tginfo32;
1815 
1816 
1817     int32 is32bits;
1818 
1819     tg = NULL;
1820     tginfo = prev_tginfo = NULL;
1821     tg32 = NULL;
1822     tginfo32 = prev_tginfo32 = NULL;
1823     is32bits = lm->is32bits;
1824 
1825     if ((lm->n_tg == 0) || (NOT_LMWID(lm, lw1)))
1826         return 0;
1827 
1828     if (NOT_LMWID(lm, lw1) || (lw1 >= lm->n_ug))
1829         return 0;
1830     if (NOT_LMWID(lm, lw2) || (lw2 >= lm->n_ug))
1831         return 0;
1832     if (NOT_LMWID(lm, lw3) || (lw3 >= lm->n_ug))
1833         return 0;
1834 
1835     if (is32bits) {
1836         prev_tginfo32 = NULL;
1837         for (tginfo32 = lm->tginfo32[lw2]; tginfo32;
1838              tginfo32 = tginfo32->next) {
1839             if (tginfo32->w1 == lw1)
1840                 break;
1841             prev_tginfo32 = tginfo32;
1842         }
1843     }
1844     else {
1845         prev_tginfo = NULL;
1846         for (tginfo = lm->tginfo[lw2]; tginfo; tginfo = tginfo->next) {
1847             if (tginfo->w1 == lw1)
1848                 break;
1849             prev_tginfo = tginfo;
1850         }
1851     }
1852 
1853     if (is32bits) {
1854         if (!tginfo32) {
1855             load_tg(lm, lw1, lw2);
1856             tginfo32 = lm->tginfo32[lw2];
1857         }
1858         else if (prev_tginfo32) {
1859             prev_tginfo32->next = tginfo32->next;
1860             tginfo32->next = lm->tginfo32[lw2];
1861             lm->tginfo32[lw2] = tginfo32;
1862         }
1863         tginfo32->used = 1;
1864         /* Trigrams for w1,w2 now in memory; look for w1,w2,w3 */
1865         n = tginfo32->n_tg;
1866         tg32 = tginfo32->tg32;
1867         assert(tginfo32);
1868     }
1869     else {
1870         if (!tginfo) {
1871             load_tg(lm, lw1, lw2);
1872             tginfo = lm->tginfo[lw2];
1873         }
1874         else if (prev_tginfo) {
1875             prev_tginfo->next = tginfo->next;
1876             tginfo->next = lm->tginfo[lw2];
1877             lm->tginfo[lw2] = tginfo;
1878         }
1879         tginfo->used = 1;
1880         /* Trigrams for w1,w2 now in memory; look for w1,w2,w3 */
1881         n = tginfo->n_tg;
1882         tg = tginfo->tg;
1883         assert(tginfo);
1884     }
1885 
1886     if (is32bits)
1887         i = find_tg32(tg32, n, lw3);
1888     else
1889         i = find_tg(tg, n, lw3);
1890 
1891     if (i >= 0)
1892         return 1;
1893     else
1894         return 0;
1895 }
1896 
1897 
1898 s3lmwid32_t
lm_wid(lm_t * lm,const char * word)1899 lm_wid(lm_t * lm, const char *word)
1900 {
1901     int32 i;
1902 
1903     for (i = 0; i < lm->n_ug; i++)
1904         if (strcmp(lm->wordstr[i], word) == 0)
1905             return ((s3lmwid32_t) i);
1906 
1907     return BAD_LMWID(lm);
1908 }
1909 
1910 void
lm_free(lm_t * lm)1911 lm_free(lm_t * lm)
1912 {
1913     int i;
1914     tginfo_t *tginfo;
1915     tginfo32_t *tginfo32;
1916 
1917     if (lm->fp)
1918         fclose(lm->fp);
1919 
1920     ckd_free((void *) lm->ug);
1921 
1922     for (i = 0; i < lm->n_ug; i++)
1923         ckd_free((void *) lm->wordstr[i]);      /*  */
1924     ckd_free((void *) lm->wordstr);
1925 
1926     if (lm->n_bg > 0) {
1927         if (lm->bg || lm->bg32) {       /* Memory-based; free all bg */
1928             if (lm->bg)
1929                 ckd_free(lm->bg);
1930             if (lm->bg32)
1931                 ckd_free(lm->bg32);
1932 
1933             if (lm->membg)
1934                 ckd_free(lm->membg);
1935             if (lm->membg32)
1936                 ckd_free(lm->membg32);
1937         }
1938         else {                  /* Disk-based; free in-memory bg */
1939             if (lm->membg) {
1940                 for (i = 0; i < lm->n_ug; ++i)
1941                     ckd_free(lm->membg[i].bg);
1942                 ckd_free(lm->membg);
1943             }
1944             if (lm->membg32) {
1945                 for (i = 0; i < lm->n_ug; ++i)
1946                     ckd_free(lm->membg32[i].bg32);
1947                 ckd_free(lm->membg32);
1948             }
1949         }
1950 
1951         ckd_free(lm->bgprob);
1952     }
1953 
1954     if (lm->n_tg > 0) {
1955         if (lm->tg)
1956             ckd_free((void *) lm->tg);
1957         if (lm->tg32)
1958             ckd_free((void *) lm->tg32);
1959 
1960         if (lm->tginfo) {
1961             for (i = 0; i < lm->n_ug; i++) {
1962                 if (lm->tginfo[i] != NULL) {
1963                     /* Free the whole linked list of tginfo. */
1964                     while (lm->tginfo[i]) {
1965                         tginfo = lm->tginfo[i];
1966                         lm->tginfo[i] = tginfo->next;
1967                         if (!lm->isLM_IN_MEMORY)
1968                             ckd_free(tginfo->tg);
1969                         ckd_free((void *) tginfo);
1970                     }
1971                 }
1972             }
1973             ckd_free((void *) lm->tginfo);
1974         }
1975         if (lm->tginfo32) {
1976             for (i = 0; i < lm->n_ug; i++) {
1977                 if (lm->tginfo32[i] != NULL) {
1978                     while (lm->tginfo32[i]) {
1979                         tginfo32 = lm->tginfo32[i];
1980                         lm->tginfo32[i] = tginfo32->next;
1981                         if (!lm->isLM_IN_MEMORY)
1982                             ckd_free(tginfo32->tg32);
1983                         ckd_free((void *) tginfo32);
1984                     }
1985                 }
1986             }
1987             ckd_free((void *) lm->tginfo32);
1988         }
1989 
1990 
1991 
1992         if (lm->tgcache)
1993             ckd_free((void *) lm->tgcache);
1994         if (lm->tgcache32)
1995             ckd_free((void *) lm->tgcache32);
1996 
1997         ckd_free((void *) lm->tg_segbase);
1998         ckd_free((void *) lm->tgprob);
1999         ckd_free((void *) lm->tgbowt);
2000     }
2001 
2002     if (lm->lmclass) {
2003         for (i = 0; i < lm->n_lmclass; ++i)
2004             lmclass_free(lm->lmclass[i]);
2005         ckd_free(lm->lmclass);
2006     }
2007 
2008     if (lm->inclass_ugscore) {
2009         ckd_free(lm->inclass_ugscore);
2010     }
2011 
2012     if (lm->HT) {
2013         hash_table_free(lm->HT);
2014     }
2015 
2016     if (lm->dict2lmwid) {
2017         ckd_free(lm->dict2lmwid);
2018     }
2019 
2020     if (lm->name)
2021         ckd_free(lm->name);
2022 
2023     ckd_free((void *) lm);
2024 }
2025 
2026 static void
copy_bgt_to_bg32t(bg_t * b16,bg32_t * b32)2027 copy_bgt_to_bg32t(bg_t * b16, bg32_t * b32)
2028 {
2029     b32->wid = (s3lmwid32_t) b16->wid;
2030     b32->probid = (uint32) b16->probid;
2031     b32->bowtid = (uint32) b16->bowtid;
2032     b32->firsttg = (uint32) b16->firsttg;
2033 }
2034 
2035 void
copy_bg_to_bg32(lm_t * lm)2036 copy_bg_to_bg32(lm_t * lm)
2037 {
2038     int i;
2039     assert(lm->bg32 == NULL);
2040     lm->bg32 = (bg32_t *) ckd_calloc(lm->n_bg + 1, sizeof(bg32_t));
2041 
2042     for (i = 0; i <= lm->n_bg; i++)
2043         copy_bgt_to_bg32t(&(lm->bg[i]), &(lm->bg32[i]));
2044 }
2045 
2046 static void
copy_bg32t_to_bgt(bg32_t * b32,bg_t * b16)2047 copy_bg32t_to_bgt(bg32_t * b32, bg_t * b16)
2048 {
2049     assert(b32->wid <= LM_LEGACY_CONSTANT);
2050     b16->wid = (s3lmwid_t) b32->wid;
2051     b16->probid = (uint16) b32->probid;
2052     b16->bowtid = (uint16) b32->bowtid;
2053     b16->firsttg = (uint16) b32->firsttg;
2054 }
2055 
2056 void
copy_bg32_to_bg(lm_t * lm)2057 copy_bg32_to_bg(lm_t * lm)
2058 {
2059     int i;
2060     assert(lm->bg == NULL);
2061     lm->bg = (bg_t *) ckd_calloc(lm->n_bg + 1, sizeof(bg_t));
2062 
2063     for (i = 0; i <= lm->n_bg; i++)
2064         copy_bg32t_to_bgt(&(lm->bg32[i]), &(lm->bg[i]));
2065 
2066 }
2067 
2068 void
swap_bg(bg_t * b16)2069 swap_bg(bg_t * b16)
2070 {
2071     SWAP_INT16(&(b16->wid));
2072     SWAP_INT16(&(b16->probid));
2073     SWAP_INT16(&(b16->bowtid));
2074     SWAP_INT16(&(b16->firsttg));
2075 }
2076 
2077 void
swap_bg32(bg32_t * b32)2078 swap_bg32(bg32_t * b32)
2079 {
2080     SWAP_INT32(&(b32->wid));
2081     SWAP_INT32(&(b32->probid));
2082     SWAP_INT32(&(b32->bowtid));
2083     SWAP_INT32(&(b32->firsttg));
2084 }
2085 
2086 static void
copy_tgt_to_tg32t(tg_t * t16,tg32_t * t32)2087 copy_tgt_to_tg32t(tg_t * t16, tg32_t * t32)
2088 {
2089     t32->wid = (s3lmwid32_t) t16->wid;
2090     t32->probid = (uint32) t16->probid;
2091 }
2092 
2093 
2094 
2095 void
copy_tg_to_tg32(lm_t * lm)2096 copy_tg_to_tg32(lm_t * lm)
2097 {
2098     int i;
2099     assert(lm->tg32 == NULL);
2100     lm->tg32 = (tg32_t *) ckd_calloc(lm->n_tg, sizeof(tg32_t));
2101 
2102     for (i = 0; i < lm->n_tg; i++)
2103         copy_tgt_to_tg32t(&(lm->tg[i]), &(lm->tg32[i]));
2104 
2105 }
2106 
2107 static void
copy_tg32t_to_tgt(tg32_t * t32,tg_t * t16)2108 copy_tg32t_to_tgt(tg32_t * t32, tg_t * t16)
2109 {
2110     t16->wid = (s3lmwid_t) t32->wid;
2111     t16->probid = (uint32) t32->probid;
2112 }
2113 
2114 
2115 void
copy_tg32_to_tg(lm_t * lm)2116 copy_tg32_to_tg(lm_t * lm)
2117 {
2118     int i;
2119     assert(lm->tg == NULL);
2120     lm->tg = (tg_t *) ckd_calloc(lm->n_tg, sizeof(tg_t));
2121 
2122     for (i = 0; i < lm->n_tg; i++)
2123         copy_tg32t_to_tgt(&(lm->tg32[i]), &(lm->tg[i]));
2124 
2125 }
2126 
2127 void
swap_tg(tg_t * t16)2128 swap_tg(tg_t * t16)
2129 {
2130     SWAP_INT16(&(t16->wid));
2131     SWAP_INT16(&(t16->probid));
2132 }
2133 
2134 void
swap_tg32(tg32_t * t32)2135 swap_tg32(tg32_t * t32)
2136 {
2137     SWAP_INT32(&(t32->wid));
2138     SWAP_INT32(&(t32->probid));
2139 }
2140 
2141 
2142 int32
lm_rawscore(lm_t * lm,int32 score)2143 lm_rawscore(lm_t * lm, int32 score)
2144 {
2145 
2146     score -= lm->wip;
2147     score /= (int32) lm->lw;
2148 
2149     return score;
2150 }
2151 
2152 void
lm_convert_structure(lm_t * model,int32 is32bits)2153 lm_convert_structure(lm_t * model, int32 is32bits)
2154 {
2155     /* Convert the data structure */
2156     if (is32bits) {             /* Convert from 16 bits to 32 bits */
2157         if (model->n_bg > 0) {
2158             if (model->bg32 == NULL) {
2159                 assert(model->bg != NULL);
2160                 copy_bg_to_bg32(model);
2161             }
2162         }
2163         if (model->n_tg > 0) {
2164             if (model->tg32 == NULL) {
2165                 assert(model->tg != NULL);
2166                 copy_tg_to_tg32(model);
2167             }
2168         }
2169     }
2170     else {                      /* Convert from 32 bits to 16 bits */
2171         if (model->n_bg > 0) {
2172             if (model->bg == NULL) {
2173                 assert(model->bg32 != NULL);
2174                 copy_bg32_to_bg(model);
2175             }
2176         }
2177         if (model->n_tg > 0) {
2178             if (model->tg == NULL) {
2179                 assert(model->tg32 != NULL);
2180                 copy_tg32_to_tg(model);
2181             }
2182         }
2183     }
2184 
2185     if (is32bits) {
2186         if (model->bg > 0)
2187             assert(model->bg32 != NULL);
2188         if (model->tg > 0)
2189             assert(model->tg32 != NULL);
2190     }
2191     else {
2192         if (model->bg > 0)
2193             assert(model->bg != NULL);
2194         if (model->tg > 0)
2195             assert(model->tg != NULL);
2196     }
2197 
2198 }
2199 
2200 
2201 
2202 #if (_LM_TEST_)
2203 static int32
sentence_lmscore(lm_t * lm,const char * line)2204 sentence_lmscore(lm_t * lm, const char *line)
2205 {
2206     char *word[1024];
2207     s3lmwid32_t w[1024];
2208     int32 nwd, score, tgscr;
2209     int32 i, j;
2210 
2211     if ((nwd = str2words(line, word, 1020)) < 0)
2212         E_FATAL("Increase word[] and w[] arrays size\n");
2213 
2214     w[0] = BAD_LMWID(lm);
2215     w[1] = lm_wid(lm, S3_START_WORD);
2216     if (NOT_LMWID(lm, w[1]))
2217         E_FATAL("Unknown word: %s\n", S3_START_WORD);
2218 
2219     for (i = 0; i < nwd; i++) {
2220         w[i + 2] = lm_wid(lm, word[i]);
2221         if (NOT_LMWID(lm, w[i + 2])) {
2222             E_ERROR("Unknown word: %s\n", word[i]);
2223             return 0;
2224         }
2225     }
2226 
2227     w[i + 2] = lm_wid(lm, S3_FINISH_WORD);
2228     if (NOT_LMWID(lm, w[i + 2]))
2229         E_FATAL("Unknown word: %s\n", S3_FINISH_WORD);
2230 
2231     score = 0;
2232     for (i = 0, j = 2; i <= nwd; i++, j++) {
2233         tgscr = lm_tg_score(lm, w[j - 2], w[j - 1], w[j]);
2234         score += tgscr;
2235         printf("\t%10d %s\n", tgscr, lm->wordstr[w[j]]);
2236     }
2237 
2238     return (score);
2239 }
2240 
2241 
/*
 * Interactive test driver (built only when _LM_TEST_ is set).
 *
 * Reads the LM named by argv[1], then repeatedly prompts for a sentence
 * on stdin, scores it with sentence_lmscore() and prints the total.
 * At end of input the LM is freed and the program exits with status 0.
 */
int
main(int32 argc, char *argv[])
{
    char line[4096];
    int32 score;
    size_t len;
    lm_t *lm;

    if (argc < 2)
        E_FATAL("Usage: %s <LMdumpfile>\n", argv[0]);

    logs3_init(1.0001, 1, 1);
    lm = lm_read(argv[1], 9.5, 0.2);

    if (1) {                    /* Short cut this so we can test for memory leaks */
        for (;;) {
            printf("> ");
            if (fgets(line, sizeof(line), stdin) == NULL)
                break;

            score = sentence_lmscore(lm, line);

            /* Strip the trailing newline; an empty buffer has none, so
             * never look at line[-1]. */
            len = strlen(line);
            if ((len > 0) && (line[len - 1] == '\n'))
                line[len - 1] = '\0';
            printf("LMScr(%s) = %d\n", line, score);
        }
    }
    lm_free(lm);
    exit(0);
}
2271 #endif
2272