1 /* ====================================================================
2  * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights
3  * reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in
14  *    the documentation and/or other materials provided with the
15  *    distribution.
16  *
17  * This work was supported in part by funding from the Defense Advanced
18  * Research Projects Agency and the National Science Foundation of the
19  * United States of America, and the CMU Sphinx Speech Consortium.
20  *
21  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
22  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
25  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  *
33  * ====================================================================
34  *
35  */
36 /*
37  * lm_3g.c -- Darpa Trigram LM module (adapted to Sphinx 3)
38  *
39  * **********************************************
40  * CMU ARPA Speech Project
41  *
42  * Copyright (c) 1997 Carnegie Mellon University.
43  * ALL RIGHTS RESERVED.
44  * **********************************************
45  *
46  * HISTORY
47  * $Log: lm_3g.c,v $
48  * Revision 1.3  2006/03/01 20:05:09  arthchan2003
49  * Pretty format the lm dumping and make numbers in 4 decimals only.
50  *
51  * Revision 1.2  2006/02/23 04:08:36  arthchan2003
52  * Merged from branch SPHINX3_5_2_RCI_IRII_BRANCH
53  * 1, Added lm_3g.c - a TXT-based LM routines.
54  * 2, Added lm_3g_dmp.c - a DMP-based LM routines.
55  * 3, (Contributed by LIUM) Added lm_attfsm.c - convert lm to FSM
56  * 4, Added lmset.c - a wrapper for the lmset_t structure.
57  *
58  * Revision 1.1.2.4  2006/01/16 19:58:25  arthchan2003
59  * Small change to make function public.
60  *
61  * Revision 1.1.2.3  2005/11/17 06:21:05  arthchan2003
62  * 1, Change all lm_write_arpa_* functions' s3wid_t to s3lmwid_t.  2, Make the writing code to be changed more easily.
63  *
64  * Revision 1.1.2.2  2005/07/28 20:04:20  dhdfu
65  * Make LM writing not crash for bigram LM
66  *
67  * Revision 1.1.2.1  2005/07/13 01:39:55  arthchan2003
68  * Added lm_3g.c, which take cares of reading and writing of LM into the lm_t (3-gram specific lm structure.)
69  *
70  */
71 
72 #include <string.h>
73 
74 #include "pio.h"
75 #include "lm.h"
76 #include "bio.h"
77 #include "logs3.h"
78 #include "wid.h"
79 
80 /*#define LEGACY_MAX_SORTED_ENTRIES	65534*/
81 
82 /*
83    20060321 ARCHAN: Why MAX_SORTED_ENTRIES = 200000?
84 
85    Generally, MAX_SORTED_ENTRIES relates the quantization schemes of
86    the log probabilities values and back-off weights.  In the
87    Sphinx/CMUCamLMtk case.  This is usually represented as
88    -x.xxxx. That is to say maximally 100000 could be involved.  It is
89    also possible that value such as -99.0 and other special values is
90    involved.   Therefore, doubling the size is reasonable measure.
91 
92    When we use better precision, say 5 decimal places, we should
93    revise the validity of the above schemes.
94  */
95 #define MAX_SORTED_ENTRIES 200000
96 #define QUANTIZATION_MULTIPLIER 10000
97 #define QUANTIZATION_DIVISOR 0.0001
98 
99 
100 #define FIRST_BG(m,u)		((m)->ug[u].firstbg)
101 #define TSEG_BASE(m,b)		((m)->tg_segbase[(b)>>LOG2_BG_SEG_SZ])
102 
103 #define FIRST_TG(m,b)		(TSEG_BASE((m),(b))+((m)->bg[b].firsttg))
104 #define FIRST_TG32(m,b)		(TSEG_BASE((m),(b))+((m)->bg32[b].firsttg))
105 
106 
107 static int32
108 wstr2wid(lm_t * model,              /**< an LM */
109          char *w                    /**< a pre-allocated word string */
110     )
111 {
112     void *val;
113 
114     if (hash_table_lookup(model->HT, w, &val) != 0)
115         return NO_WORD;
116     return ((int32)(long)val);
117 }
118 
119 /*
120  * Initialize sorted list with the 0-th entry = MIN_PROB_F, which may be needed
121  * to replace spurious values in the Darpa LM file.
122  */
123 static void
124 init_sorted_list(sorted_list_t * l            /**< a sorted list */
125     )
126 {
127     l->list =
128         (sorted_entry_t *) ckd_calloc(MAX_SORTED_ENTRIES,
129                                       sizeof(sorted_entry_t));
130     l->list[0].val.f = MIN_PROB_F;
131     l->list[0].lower = 0;
132     l->list[0].higher = 0;
133     l->free = 1;
134 }
135 
136 /**
137  * Free a sorted list
138  */
139 static void
140 free_sorted_list(sorted_list_t * l             /**< a sorted list */
141     )
142 {
143     free(l->list);
144 }
145 
146 static lmlog_t *
147 vals_in_sorted_list(sorted_list_t * l)
148 {
149     lmlog_t *vals;
150     int32 i;
151 
152     vals = (lmlog_t *) ckd_calloc(l->free, sizeof(lmlog_t));
153     for (i = 0; i < l->free; i++)
154         vals[i].f = l->list[i].val.f;
155     return (vals);
156 }
157 
158 static int32
159 sorted_id(sorted_list_t * l, float *val)
160 {
161     int32 i = 0;
162 
163     for (;;) {
164         if (*val == l->list[i].val.f)
165             return (i);
166         if (*val < l->list[i].val.f) {
167             if (l->list[i].lower == 0) {
168 
169                 if (l->free >= MAX_SORTED_ENTRIES)
170                     E_INFO("sorted list overflow\n");
171 
172                 l->list[i].lower = l->free;
173                 (l->free)++;
174                 i = l->list[i].lower;
175                 l->list[i].val.f = *val;
176                 return (i);
177             }
178             else
179                 i = l->list[i].lower;
180         }
181         else {
182             if (l->list[i].higher == 0) {
183 
184                 if (l->free >= MAX_SORTED_ENTRIES)
185                     E_INFO("sorted list overflow\n");
186 
187                 l->list[i].higher = l->free;
188                 (l->free)++;
189                 i = l->list[i].higher;
190                 l->list[i].val.f = *val;
191                 return (i);
192             }
193             else
194                 i = l->list[i].higher;
195         }
196     }
197 }
198 
199 /**
200  * Read and return #unigrams, #bigrams, #trigrams as stated in input file.
201  *
202  * @warning The internal buffer has size 256.
203  *
204  * @return LM_NO_DATA_MARK if there is "\\data\\" mark. LM_UNKNOWN_NG if an
205  * unknown K of K-gram appears.  LM_BAD_LM_COUNT if a bad LM counts. LM_SUCCESS
206  * if the whole reading succeeds.
207  *
208  */
209 static int
210 ReadNgramCounts(FILE * fp, int32 * n_ug,   /**< Out: number of unigram read */
211                 int32 * n_bg,              /**< Out: number of bigram read */
212                 int32 * n_tg               /**< Out: number of trigram read */
213     )
214 {
215     char string[256];
216     int32 ngram, ngram_cnt;
217 
218     /* skip file until past the '\data\' marker */
219 
220 
221     do {
222         fgets(string, sizeof(string), fp);
223     } while ((strcmp(string, "\\data\\\n") != 0) && (!feof(fp)));
224 
225 
226     if (strcmp(string, "\\data\\\n") != 0) {
227 
228         E_WARN("No \\data\\ mark in LM file\n");
229         return LM_NO_DATA_MARK;
230     }
231 
232 
233     *n_ug = *n_bg = *n_tg = 0;
234     while (fgets(string, sizeof(string), fp) != NULL) {
235         if (sscanf(string, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
236             break;
237 
238         switch (ngram) {
239         case 1:
240             *n_ug = ngram_cnt;
241             break;
242         case 2:
243             *n_bg = ngram_cnt;
244             break;
245         case 3:
246             *n_tg = ngram_cnt;
247             break;
248         default:
249             E_WARN("Unknown ngram (%d)\n", ngram);
250             return LM_UNKNOWN_NG;
251             break;
252         }
253     }
254 
255     /* Position file to just after the unigrams header '\1-grams:\' */
256     while ((strcmp(string, "\\1-grams:\n") != 0) && (!feof(fp)))
257         fgets(string, sizeof(string), fp);
258 
259     /* Check counts;  NOTE: #trigrams *CAN* be 0 */
260     if ((*n_ug <= 0) || (*n_bg <= 0) || (*n_tg < 0)) {
261         E_WARN("Bad or missing ngram count\n");
262         return LM_BAD_LM_COUNT;
263     }
264     return LM_SUCCESS;
265 }
266 
267 /**
268    Allocate a new unigram table.  Initially all dictionary words
269    are defined as NO_WORD, the probabilities and backoff weights
270    are -99.
271 
272    @return a pointer of unigram if succeed, NULL if failed.
273  */
274 ug_t *
275 NewUnigramTable(int32 n_ug)
276 {
277     ug_t *table;
278     int32 i;
279     table = NULL;
280 
281     table = (ug_t *) ckd_calloc(n_ug, sizeof(ug_t));
282     if (table == NULL) {
283         E_WARN("Fail to allocate the unigram table\n");
284         return NULL;
285     }
286     for (i = 0; i < n_ug; i++) {
287         table[i].dictwid = NO_WORD;
288         table[i].prob.f = -99.0;
289         table[i].bowt.f = -99.0;
290     }
291     return table;
292 }
293 
294 /**
295    Allocate a new model.  That includes the tables of unigram, bigram
296    and trigram.
297 
298    FIXME: should have more comments.
299  */
300 static lm_t *
301 NewModel(int32 n_ug, int32 n_bg, int32 n_tg, int32 version)
302 {
303     lm_t *model;
304 
305     model = (lm_t *) ckd_calloc(1, sizeof(lm_t));
306 
307     lm_null_struct(model);
308     /*
309      * Allocate one extra unigram and bigram entry: sentinels to terminate
310      * followers (bigrams and trigrams, respectively) of previous entry.
311      */
312     model->ug = NewUnigramTable(n_ug + 1);
313 
314     model->version = version;
315     if (model->version == LMTXT_VERSION)
316         model->is32bits = n_ug > LM_LEGACY_CONSTANT;
317     else if (model->version == LMFORCED_TXT32VERSION)
318         model->is32bits = 1;
319 
320     if (model->is32bits) {
321         model->bg32 = (bg32_t *) ckd_calloc(n_bg + 1, sizeof(bg32_t));
322         if (n_tg > 0)
323             model->tg32 = (tg32_t *) ckd_calloc(n_tg + 1, sizeof(tg32_t));
324 
325     }
326     else {
327         model->bg = (bg_t *) ckd_calloc(n_bg + 1, sizeof(bg_t));
328         if (n_tg > 0)
329             model->tg = (tg_t *) ckd_calloc(n_tg + 1, sizeof(tg_t));
330     }
331 
332     if (n_tg > 0) {
333         model->tg_segbase =
334             (int32 *) ckd_calloc((n_bg + 1) / BG_SEG_SZ + 1,
335                                  sizeof(int32));
336 
337         E_INFO("%8d = tseg_base entries allocated\n",
338                (n_bg + 1) / BG_SEG_SZ + 1);
339 
340     }
341 
342     /*This part will not be compiled */
343     model->max_ug = model->n_ug = n_ug;
344     model->n_bg = n_bg;
345     model->n_tg = n_tg;
346 
347     model->HT = hash_table_new(model->n_ug, HASH_CASE_YES);
348     model->log_bg_seg_sz = LOG2_BG_SEG_SZ;      /* Default */
349 
350     return model;
351 }
352 
353 /*
354  * Read in the unigrams from given file into the LM structure model.  On
355  * entry to this procedure, the file pointer is positioned just after the
356  * header line '\1-grams:'.
357  *
358  * @return LM_BAD_LM_COUNT, if there is a unigram which is large than what
359  * specify by the header. LM_SUCCESS if reading succeeds.
360  */
361 static int32
362 ReadUnigrams(FILE * fp, lm_t * model  /**< An LM where unigram will be filled in */
363     )
364 {
365     char string[256];
366     char name[128];
367     int32 wcnt, i;
368     float p1, bo_wt;
369     s3lmwid32_t startwid, endwid;
370 
371     E_INFO("Reading unigrams\n");
372 
373     wcnt = 0;
374     while ((fgets(string, sizeof(string), fp) != NULL) &&
375            (strcmp(string, "\\2-grams:\n") != 0)) {
376         if (sscanf(string, "%f %s %f", &p1, name, &bo_wt) != 3) {
377             if (string[0] != '\n')
378                 E_WARN("Format error; unigram ignored:%s", string);
379             continue;
380         }
381 
382         if (wcnt >= model->n_ug) {
383             E_WARN("Too many unigrams\n");
384             return LM_BAD_LM_COUNT;
385         }
386 
387         /* Associate name with word id */
388         /* This is again not local */
389         model->wordstr[wcnt] = (char *) ckd_salloc(name);
390         hash_table_enter(model->HT, model->wordstr[wcnt], (void *)(long)wcnt);
391         model->ug[wcnt].prob.f = p1;
392         model->ug[wcnt].bowt.f = bo_wt;
393         model->ug[wcnt].dictwid = wcnt;
394         wcnt++;
395     }
396 
397     if (model->n_ug != wcnt) {
398         E_WARN("lm_t.n_ug(%d) != #unigrams read(%d)\n", model->n_ug, wcnt);
399         model->n_ug = wcnt;
400     }
401 
402     startwid = endwid = BAD_LMWID(model);
403 
404     for (i = 0; i < model->n_ug; i++) {
405         if (strcmp(model->wordstr[i], S3_START_WORD) == 0)
406             startwid = i;
407         else if (strcmp(model->wordstr[i], S3_FINISH_WORD) == 0)
408             endwid = i;
409     }
410 
411     /* Force ugprob(<s>) = MIN_PROB_F */
412     if (IS_LMWID(model, startwid)) {
413         model->ug[startwid].prob.f = MIN_PROB_F;
414         model->startlwid = startwid;
415     }
416 
417     /* Force bowt(</s>) = MIN_PROB_F */
418     if (IS_LMWID(model, endwid)) {
419         model->ug[endwid].bowt.f = MIN_PROB_F;
420         model->finishlwid = endwid;
421     }
422 
423     return LM_SUCCESS;
424 }
425 
426 /*
427  * Read bigrams from given file into given model structure.  File may be arpabo
428  * or arpabo-id format, depending on idfmt = 0 or 1.
429  *
430  * @return LM_UNKNOWN_WORDS if a word appears but it couldn't be found
431  * in the unigram.  LM_BAD_BIGRAM if the word id is bad or out of bound
432  * LM_TOO_MANY if there are too many n-grams than specified in the header.
433  * LM_SUCCESS if the reading succeeds.
434  */
435 static int32
436 ReadBigrams(FILE * fp, lm_t * model, int32 idfmt)
437 {
438     char string[1024], word1[256], word2[256];
439     int32 w1, w2, prev_w1, bgcount, p;
440     bg_t *bgptr;
441     bg32_t *bgptr32;
442     float p2, bo_wt;
443     int32 n_fld, n;
444     int32 is32bits;
445 
446     E_INFO("Reading bigrams\n");
447 
448     bgcount = 0;
449     bgptr = model->bg;
450     bgptr32 = model->bg32;
451 
452     prev_w1 = -1;
453     n_fld = (model->n_tg > 0) ? 4 : 3;
454 
455     bo_wt = 0.0;
456     is32bits = model->is32bits;
457     while (fgets(string, sizeof(string), fp) != NULL) {
458         if (!idfmt)
459             n = sscanf(string, "%f %s %s %f", &p2, word1, word2, &bo_wt);
460         else
461             n = sscanf(string, "%f %d %d %f", &p2, &w1, &w2, &bo_wt);
462         if (n < n_fld) {
463             if (string[0] != '\n')
464                 break;
465             continue;
466         }
467 
468         if (!idfmt) {
469             if ((w1 = wstr2wid(model, word1)) == NO_WORD) {
470                 E_WARN("Unknown word: %s\n", word1);
471                 return LM_UNKNOWN_WORDS;
472             }
473             if ((w2 = wstr2wid(model, word2)) == NO_WORD) {
474                 E_WARN("Unknown word: %s\n", word2);
475                 return LM_UNKNOWN_WORDS;
476             }
477         }
478         else {
479             if ((w1 >= model->n_ug) || (w2 >= model->n_ug) || (w1 < 0)
480                 || (w2 < 0)) {
481                 E_WARN("Bad bigram: %s", string);
482                 return LM_BAD_BIGRAM;
483             }
484         }
485 
486         /* HACK!! to quantize probs to 4 decimal digits */
487         p = p2 * QUANTIZATION_MULTIPLIER;
488         p2 = p * QUANTIZATION_DIVISOR;
489         p = bo_wt * QUANTIZATION_MULTIPLIER;
490         bo_wt = p * QUANTIZATION_DIVISOR;
491 
492         if (bgcount >= model->n_bg) {
493             E_WARN("Too many bigrams\n");
494             return LM_TOO_MANY_NGRAM;
495         }
496 
497         if (is32bits) {
498             bgptr32->wid = w2;
499             bgptr32->probid = sorted_id(&(model->sorted_prob2), &p2);
500             if (model->n_tg > 0)
501                 bgptr32->bowtid =
502                     sorted_id(&(model->sorted_bowt2), &bo_wt);
503 
504         }
505         else {
506             bgptr->wid = w2;
507             bgptr->probid = sorted_id(&(model->sorted_prob2), &p2);
508             if (model->n_tg > 0)
509                 bgptr->bowtid = sorted_id(&(model->sorted_bowt2), &bo_wt);
510         }
511 
512         if (w1 != prev_w1) {
513             if (w1 < prev_w1)
514                 E_INFO("Bigrams not in unigram order\n");
515 
516             for (prev_w1++; prev_w1 <= w1; prev_w1++)
517                 model->ug[prev_w1].firstbg = bgcount;
518             prev_w1 = w1;
519         }
520 
521         bgcount++;
522 
523         if (is32bits)
524             bgptr32++;
525         else
526             bgptr++;
527 
528         if ((bgcount & 0x0000ffff) == 0) {
529             E_INFO_NOFN("Processing bigram .\n");
530         }
531     }
532     if ((strcmp(string, "\\end\\\n") != 0)
533         && (strcmp(string, "\\3-grams:\n") != 0)) {
534         E_WARN("Bad bigram: %s\n", string);
535         return LM_BAD_BIGRAM;
536     }
537 
538     for (prev_w1++; prev_w1 <= model->n_ug; prev_w1++)
539         model->ug[prev_w1].firstbg = bgcount;
540 
541     return LM_SUCCESS;
542 }
543 
544 /*
545  * Reading Trigrams
546  *
547  * Very similar to ReadBigrams (in the sense that they are both
548  * written in C.)
549  *
550  * @return LM_UNKNOWN_WORDS when an unknown word couldn't be found.
551  * LM_BAD_TRIGRAM when a bad trigram is found. LM_NO_MINUS_1GRAM if
552  * the corresponding bigram of a trigram couldn't be
553  * found. LM_OFFSET_TOO_LARGE when in 16 bit mode, the trigram count
554  * of a segment is too huge. LM_BAD_TRIGRAM when a bad trigram is
555  * found.  LM_SUCCESS when the whole reading is ok.
556  */
557 static int
558 ReadTrigrams(FILE * fp, lm_t * model, int32 idfmt)
559 {
560     char string[1024], word1[256], word2[256], word3[256];
561     int32 i, n, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg,
562         p;
563     int32 seg, prev_seg, prev_seg_lastbg;
564     tg_t *tgptr;
565     tg32_t *tgptr32;
566     bg_t *bgptr;
567     bg32_t *bgptr32;
568     float p3;
569     int32 is32bits;
570 
571     E_INFO("Reading trigrams\n");
572 
573     is32bits = model->is32bits;
574     tgcount = 0;
575     tgptr = model->tg;
576     tgptr32 = model->tg32;
577     prev_w1 = -1;
578     prev_w2 = -1;
579     prev_bg = -1;
580     prev_seg = -1;
581 
582     while (fgets(string, sizeof(string), fp) != NULL) {
583         if (!idfmt)
584             n = sscanf(string, "%f %s %s %s", &p3, word1, word2, word3);
585         else
586             n = sscanf(string, "%f %d %d %d", &p3, &w1, &w2, &w3);
587         if (n != 4) {
588             if (string[0] != '\n')
589                 break;
590             continue;
591         }
592 
593         if (!idfmt) {
594             if ((w1 = wstr2wid(model, word1)) == NO_WORD) {
595                 E_WARN("Unknown word: %s\n", word1);
596                 return LM_UNKNOWN_WORDS;
597             }
598             if ((w2 = wstr2wid(model, word2)) == NO_WORD) {
599                 E_WARN("Unknown word: %s\n", word2);
600                 return LM_UNKNOWN_WORDS;
601             }
602             if ((w3 = wstr2wid(model, word3)) == NO_WORD) {
603                 E_WARN("Unknown word: %s\n", word3);
604                 return LM_UNKNOWN_WORDS;
605             }
606         }
607         else {
608             if ((w1 >= model->n_ug) || (w2 >= model->n_ug)
609                 || (w3 >= model->n_ug) || (w1 < 0) || (w2 < 0) || (w3 < 0)) {
610                 E_WARN("Bad trigram: %s\n", string);
611                 return LM_BAD_TRIGRAM;
612             }
613         }
614 
615         /* HACK!! to quantize probs to 4 decimal digits */
616         p = p3 * QUANTIZATION_MULTIPLIER;
617         p3 = p * QUANTIZATION_DIVISOR;
618 
619         if (tgcount >= model->n_tg) {
620             E_WARN("Too many trigrams\n");
621             return LM_TOO_MANY_NGRAM;
622         }
623 
624         if (is32bits) {
625             tgptr32->wid = w3;
626             tgptr32->probid = sorted_id(&model->sorted_prob3, &p3);
627 
628         }
629         else {
630             tgptr->wid = w3;
631             tgptr->probid = sorted_id(&model->sorted_prob3, &p3);
632         }
633 
634         if ((w1 != prev_w1) || (w2 != prev_w2)) {
635             /* Trigram for a new bigram; update tg info for all previous bigrams */
636             if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2)))
637                 E_INFO("Trigrams not in bigram order\n");
638 
639 
640             bg = (w1 != prev_w1) ? model->ug[w1].firstbg : prev_bg + 1;
641             endbg = model->ug[w1 + 1].firstbg;
642 
643             if (is32bits) {
644                 bgptr32 = model->bg32 + bg;
645                 for (; (bg < endbg) && (bgptr32->wid != w2);
646                      bg++, bgptr32++);
647             }
648             else {
649                 bgptr = model->bg + bg;
650                 for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++);
651             }
652 
653             if (bg >= endbg) {
654                 E_WARN("Missing bigram for trigram: %s", string);
655                 return LM_NO_MINUS_1GRAM;
656             }
657 
658             /* bg = bigram entry index for <w1,w2>.  Update tseg_base */
659             seg = bg >> LOG2_BG_SEG_SZ;
660             for (i = prev_seg + 1; i <= seg; i++)
661                 model->tg_segbase[i] = tgcount;
662 
663             /*      E_INFO("bg %d, seg %d, prev_seg %d, tgcount %d, tg_segbase[prev_seg] %d, tgoff %d\n",bg,seg,prev_seg,tgcount,model->tg_segbase[prev_seg],tgcount - model->tg_segbase[prev_seg]); */
664             /* Update trigrams pointers for all bigrams until bg */
665             if (prev_seg < seg) {
666                 int32 tgoff = 0;
667 
668                 if (prev_seg >= 0) {
669                     tgoff = tgcount - model->tg_segbase[prev_seg];
670 
671                     /*              E_INFO("Offset %d tgcount %d, seg_base %d from tseg_base > %d, prev_seg %d seg %d\n",
672                        tgoff, tgcount, model->tg_segbase[prev_seg],LM_LEGACY_CONSTANT,prev_seg,seg); */
673 
674                     if (!is32bits) {
675                         if (tgoff > LM_LEGACY_CONSTANT) {
676                             E_WARN
677                                 ("Offset %d tgcount %d, seg_base %d from tseg_base > %d\n",
678                                  tgoff, tgcount,
679                                  model->tg_segbase[prev_seg],
680                                  LM_LEGACY_CONSTANT);
681                             return LM_OFFSET_TOO_LARGE;
682                         }
683                     }
684                 }
685 
686                 prev_seg_lastbg = ((prev_seg + 1) << LOG2_BG_SEG_SZ) - 1;
687 
688                 if (is32bits) {
689                     bgptr32 = model->bg32 + prev_bg;
690                     for (++prev_bg, ++bgptr32; prev_bg <= prev_seg_lastbg;
691                          prev_bg++, bgptr32++)
692                         bgptr32->firsttg = tgoff;
693                     for (; prev_bg <= bg; prev_bg++, bgptr32++)
694                         bgptr32->firsttg = 0;
695                 }
696                 else {
697                     bgptr = model->bg + prev_bg;
698                     for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg;
699                          prev_bg++, bgptr++)
700                         bgptr->firsttg = tgoff;
701                     for (; prev_bg <= bg; prev_bg++, bgptr++)
702                         bgptr->firsttg = 0;
703                 }
704 
705             }
706             else {
707                 int32 tgoff;
708 
709                 tgoff = tgcount - model->tg_segbase[prev_seg];
710 
711                 if (!is32bits) {
712                     if (tgoff > LM_LEGACY_CONSTANT) {
713                         E_WARN
714                             ("Offset %d tgcount %d, seg_base %d from tseg_base > %d\n",
715                              tgoff, tgcount, model->tg_segbase[prev_seg],
716                              LM_LEGACY_CONSTANT);
717                         return LM_OFFSET_TOO_LARGE;
718                     }
719                 }
720 
721                 if (is32bits) {
722                     bgptr32 = model->bg32 + prev_bg;
723                     for (++prev_bg, ++bgptr32; prev_bg <= bg;
724                          prev_bg++, bgptr32++)
725                         bgptr32->firsttg = tgoff;
726                 }
727                 else {
728                     bgptr = model->bg + prev_bg;
729                     for (++prev_bg, ++bgptr; prev_bg <= bg;
730                          prev_bg++, bgptr++)
731                         bgptr->firsttg = tgoff;
732                 }
733             }
734 
735             prev_w1 = w1;
736             prev_w2 = w2;
737             prev_bg = bg;
738             prev_seg = seg;
739         }
740 
741         tgcount++;
742         /*      E_INFO("\ntg_count %d: This line: %s, w1 %d, prev_w1 %d, w2 %d, prev_2 %d, w3 %d\n\n",
743            tgcount,string,
744            w1,prev_w1,
745            w2,prev_w2,
746            w3
747            ); */
748 
749         if (is32bits) {
750             tgptr32++;
751         }
752         else
753             tgptr++;
754 
755         if ((tgcount & 0x0000ffff) == 0) {
756             E_INFO_NOFN("Processing trigram .\n");
757         }
758     }
759     if (strcmp(string, "\\end\\\n") != 0) {
760         E_WARN("Bad trigram: %s\n", string);
761         return LM_BAD_TRIGRAM;
762     }
763 
764     for (prev_bg++; prev_bg <= model->n_bg; prev_bg++) {
765         if ((prev_bg & (BG_SEG_SZ - 1)) == 0)
766             model->tg_segbase[prev_bg >> LOG2_BG_SEG_SZ] = tgcount;
767 
768         if (!is32bits) {
769             if ((tgcount - model->tg_segbase[prev_bg >> LOG2_BG_SEG_SZ]) >
770                 LM_LEGACY_CONSTANT) {
771                 E_WARN
772                     ("Offset %d tgcount %d, seg_base %d from tseg_base > %d\n",
773                      model->tg_segbase[prev_seg >> LOG2_BG_SEG_SZ],
774                      tgcount,
775                      model->tg_segbase[prev_seg >> LOG2_BG_SEG_SZ],
776                      LM_LEGACY_CONSTANT);
777                 return LM_OFFSET_TOO_LARGE;
778             }
779         }
780 
781         if (is32bits) {
782             model->bg32[prev_bg].firsttg =
783                 tgcount - model->tg_segbase[prev_bg >> LOG2_BG_SEG_SZ];
784         }
785         else {
786             model->bg[prev_bg].firsttg =
787                 tgcount - model->tg_segbase[prev_bg >> LOG2_BG_SEG_SZ];
788         }
789     }
790 
791     return LM_SUCCESS;
792 }
793 
794 /**
795    This is a function to read an ARPA-based LM.
796 
797    (200060708) At this point, it is not memory leak-free.
798 
799    @return an initialized LM if everything is alright. NULL, if something goes
800    wrong.
801 
802  */
803 lm_t *
804 lm_read_txt(const char *filename,        /**< Input: The file name*/
805             int32 lminmemory, /**< Input: Whether lm is in memory */
806             int32 * err_no,   /**< Input/Output: Depends on the problem that LM
807 				 reading encounters, it could be errors
808 				 from -2 (LM_OFFSET_TOO_LARGE) to
809 				 -15 (LM_CANNOT_ALLOCATE).  Please checkout
810 				 lm.h for details.
811 			      */
812             int32 isforced32bit, /** Input: normally, we should let lm_read_txt
813 				    to decide whether a file is 32 bit or not.
814 				    When the lm_read_txt couldn't decide that before
815 				    reading or if more specificially when we hit
816 				    the LM segment size problems. Then this bit
817 				    will alter the reading behavior to 32 bit.
818 				*/
819 	    logmath_t *logmath
820     )
821 {
822     lm_t *model;
823     FILE *fp = NULL;
824     int32 usingPipe = FALSE;
825     int32 n_unigram;
826     int32 n_bigram;
827     int32 n_trigram;
828     int32 idfmt = 0;
829     int32 _errmsg;
830 
831     E_INFO("Reading LM file %s\n", filename);
832 
833     fp = fopen_comp(filename, "r", &usingPipe);
834     if (fp == NULL) {
835         E_WARN("failed to read filename for LM\n");
836         *err_no = LM_FILE_NOT_FOUND;
837         return NULL;
838     }
839 
840     _errmsg = ReadNgramCounts(fp, &n_unigram, &n_bigram, &n_trigram);
841     if (_errmsg != LM_SUCCESS) {
842         E_WARN("Couldnt' read the ngram count\n");
843         *err_no = _errmsg;
844         fclose(fp);
845         return NULL;
846     }
847 
848     E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
849     /* HACK! This should be something provided by the dictionary What is dict_size? */
850 
851     model = NewModel(n_unigram, n_bigram, n_trigram,
852                      isforced32bit ?
853                      LMFORCED_TXT32VERSION : LMTXT_VERSION);
854     if (model == NULL) {
855         E_WARN
856             ("Cannot allocate tables for new lm with ug %d, bg %d, tg %d\n",
857              n_unigram, n_bigram, n_trigram);
858         *err_no = LM_CANNOT_ALLOCATE;
859         fclose(fp);
860         return NULL;
861     }
862 
863     model->n_ng = 1;
864     model->logmath = logmath;
865 
866     model->isLM_IN_MEMORY = lminmemory;
867 
868     model->is32bits = lm_is32bits(model);
869     if (model->is32bits)
870         E_INFO("Is 32 bits %d, lm->version %d\n", model->is32bits,
871                model->version);
872 
873     /* ARCHAN. Should do checking as well. I was lazy.
874      */
875 
876     if (model->n_bg > 0) {
877         model->n_ng = 2;
878         if (model->is32bits) {
879             model->membg32 =
880                 (membg32_t *) ckd_calloc(model->n_ug, sizeof(membg32_t));
881 
882         }
883         else {
884             model->membg =
885                 (membg_t *) ckd_calloc(model->n_ug, sizeof(membg_t));
886 
887         }
888     }
889 
890     if (model->n_tg > 0) {
891         model->n_ng = 3;
892 
893         if (model->is32bits) {
894             model->tginfo32 =
895                 (tginfo32_t **) ckd_calloc(model->n_ug,
896                                            sizeof(tginfo32_t *));
897 
898         }
899         else {
900             model->tginfo =
901                 (tginfo_t **) ckd_calloc(model->n_ug, sizeof(tginfo_t *));
902         }
903     }
904 
905 
906     /* Have to put it somewhere in lm as a kind of buffer */
907     model->wordstr = (char **) ckd_calloc(n_unigram, sizeof(char *));
908 
909     /* control the lm dumping mechanism */
910 
911     _errmsg = ReadUnigrams(fp, model);
912     if (_errmsg != LM_SUCCESS) {
913         *err_no = _errmsg;
914         fclose(fp);
915         return NULL;
916     }
917 
918     E_INFO("%8d = #unigrams created\n", model->n_ug);
919 
920     init_sorted_list(&(model->sorted_prob2));
921     if (model->n_tg > 0)
922         init_sorted_list(&(model->sorted_bowt2));
923 
924     _errmsg = ReadBigrams(fp, model, idfmt);
925     if (_errmsg != LM_SUCCESS) {
926         *err_no = _errmsg;
927         fclose(fp);
928         return NULL;
929     }
930 
931     model->n_bg = FIRST_BG(model, model->n_ug);
932     model->n_bgprob = model->sorted_prob2.free;
933     model->bgprob = vals_in_sorted_list(&(model->sorted_prob2));
934     free_sorted_list(&(model->sorted_prob2));
935 
936     E_INFO("%8d = #bigrams created\n", model->n_bg);
937     E_INFO("%8d = #prob2 entries\n", model->n_bgprob);
938 
939     if (model->n_tg > 0) {
940         /* Create trigram bo-wts array */
941         model->n_tgbowt = model->sorted_bowt2.free;
942         model->tgbowt = vals_in_sorted_list(&(model->sorted_bowt2));
943         free_sorted_list(&(model->sorted_bowt2));
944         E_INFO("%8d = #bo_wt2 entries\n", model->n_tgbowt);
945 
946         init_sorted_list(&(model->sorted_prob3));
947 
948         _errmsg = ReadTrigrams(fp, model, idfmt);
949         if (_errmsg != LM_SUCCESS) {
950             *err_no = _errmsg;
951             fclose(fp);
952             return NULL;
953         }
954 
955         model->n_tg =
956             model->is32bits ? FIRST_TG32(model,
957                                          model->n_bg) : FIRST_TG(model,
958                                                                  model->
959                                                                  n_bg);
960         model->n_tgprob = model->sorted_prob3.free;
961         model->tgprob = vals_in_sorted_list(&(model->sorted_prob3));
962         E_INFO("%8d = #trigrams created\n", model->n_tg);
963         E_INFO("%8d = #prob3 entries\n", model->n_tgprob);
964 
965         free_sorted_list(&model->sorted_prob3);
966     }
967 
968     fclose(fp);
969     *err_no = LM_SUCCESS;
970     return model;
971 }
972 
973 
974 static char const *txtheader[] = {
975     "#############################################################################",
976     "#Copyright (c) 1999-2004 Carnegie Mellon University.  All rights reserved",
977     "#############################################################################",
978     "#=============================================================================",
979     "#===============  This file was produced by the CMU Sphinx 3.X  ==============",
980     "#=============================================================================",
981     "#############################################################################",
982     "This file is in the ARPA-standard format introduced by Doug Paul.",
983     "",
984     "p(wd3|wd1,wd2)= if(trigram exists)           p_3(wd1,wd2,wd3)",
985     "                else if(bigram w1,w2 exists) bo_wt_2(w1,w2)*p(wd3|wd2)",
986     "                else                         p(wd3|w2)",
987     "",
988     "p(wd2|wd1)= if(bigram exists) p_2(wd1,wd2)",
989     "            else              bo_wt_1(wd1)*p_1(wd2)",
990     "",
991     "All probs and back-off weights (bo_wt) are given in log10 form.",
992     "\\",
993     "Data formats:",
994     "",
995     "Beginning of data mark: \\data\\",
996     NULL
997 };
998 
999 /**
1000    Write an arpa LM header (Sphinx 3.x -specific)
1001  */
1002 static void
1003 lm_write_arpa_header(lm_t * lmp,            /**< The LM pointer */
1004                      FILE * fp              /**< The file pointer */
1005     )
1006 {
1007     /* Print header */
1008     int32 i, j;
1009     for (i = 0; txtheader[i] != NULL; i++)
1010         fprintf(fp, "%s\n", txtheader[i]);
1011 
1012     for (i = 1; i <= lmp->n_ng; i++) {
1013         fprintf(fp, "ngram %d=nr            # number of %d-grams\n", i, i);
1014     }
1015     fprintf(fp, "\n");
1016     for (i = 1; i <= lmp->n_ng; i++) {
1017         fprintf(fp, "\\%d-grams:\n", i);
1018         fprintf(fp, "p_%d     ", i);
1019         for (j = 1; j <= i; j++) {
1020             fprintf(fp, "wd_%d ", j);
1021         }
1022         if (i == lmp->n_ng) {
1023             fprintf(fp, "\n");
1024         }
1025         else {
1026             fprintf(fp, "bo_wt_%d\n", i);
1027         }
1028     }
1029 
1030     fprintf(fp, "\n");
1031     fprintf(fp, "end of data mark: \\end\\\n");
1032     fprintf(fp, "\n");
1033 }
1034 
1035 /**
1036    Write the n-gram counts for ARPA file format.
1037  */
1038 static void
1039 lm_write_arpa_count(lm_t * lmp,            /**< The LM pointer */
1040                     FILE * fp              /**< The file pointer */
1041     )
1042 {
1043     fprintf(fp, "\\data\\\n");
1044 
1045     fprintf(fp, "ngram %d=%d\n", 1, lmp->n_ug);
1046     if (lmp->n_bg)
1047         fprintf(fp, "ngram %d=%d\n", 2, lmp->n_bg);
1048     if (lmp->n_tg)
1049         fprintf(fp, "ngram %d=%d\n", 3, lmp->n_tg);
1050 
1051     fprintf(fp, "\n");
1052 
1053 }
1054 
1055 /**
1056    Write unigram
1057  */
1058 
1059 static void
1060 lm_write_arpa_unigram(lm_t * lmp,            /**< The LM pointer */
1061                       FILE * fp              /**< The file pointer */
1062     )
1063 {
1064     int32 i;
1065     fprintf(fp, "\\1-grams:\n");
1066     for (i = 0; i < lmp->n_ug; i++) {
1067         fprintf(fp, "%.4f ", lmp->ug[i].prob.f);
1068         fprintf(fp, "%s", lmp->wordstr[i]);
1069         fprintf(fp, " ");
1070         fprintf(fp, "%.4f\n", lmp->ug[i].bowt.f);
1071     }
1072     fprintf(fp, "\n");
1073 }
1074 
1075 /**
1076    Write bigram in arpa format
1077  */
1078 
1079 static void
1080 lm_write_arpa_bigram(lm_t * lmp, FILE * fp)
1081 {
1082     int32 i, j;
1083     int32 n, b;
1084     s3lmwid32_t lw1, lw2;
1085     int32 is32bit;
1086     uint32 probid;
1087     uint32 bowtid;
1088 
1089     fprintf(fp, "\\2-grams:\n");
1090 
1091     /* Decide whether we are working on 32bits
1092        either lmp->bg is NULL or number of ug need more than 16 bits to represent.
1093      */
1094     is32bit = lm_is32bits(lmp);
1095 
1096     for (i = 0; i <= lmp->n_ug - 1; i++) {
1097         b = lmp->ug[i].firstbg;
1098         n = lmp->ug[i + 1].firstbg;
1099 
1100         for (j = b; j < n; j++) {
1101             lw1 = i;
1102             if (is32bit) {
1103                 assert(lmp->bg32 != NULL);
1104                 lw2 = lmp->bg32[j].wid;
1105                 probid = lmp->bg32[j].probid;
1106                 bowtid = lmp->bg32[j].bowtid;
1107 
1108             }
1109             else {
1110                 assert(lmp->bg != NULL);
1111                 lw2 = (s3lmwid32_t) lmp->bg[j].wid;
1112                 probid = (int32) lmp->bg[j].probid;
1113                 bowtid = (int32) lmp->bg[j].bowtid;
1114             }
1115 
1116             fprintf(fp, "%.4f ", lmp->bgprob[probid].f);
1117             fprintf(fp, "%s", lmp->wordstr[lw1]);
1118             fprintf(fp, " ");
1119             fprintf(fp, "%s", lmp->wordstr[lw2]);
1120 
1121             if (lmp->tgbowt) {
1122                 fprintf(fp, " ");
1123                 fprintf(fp, "%.4f\n", lmp->tgbowt[bowtid].f);
1124             }
1125             else
1126                 fprintf(fp, "\n");
1127 
1128         }
1129     }
1130 
1131     fprintf(fp, "\n");
1132 }
1133 
1134 /**
1135    Write trigram in arpa format
1136  */
1137 
1138 static void
1139 lm_write_arpa_trigram(lm_t * lmp,            /**< The pointer of LM */
1140                       FILE * fp              /**< A FILE pointer */
1141     )
1142 {
1143     int32 i, j, k;
1144     int32 b_bg, n_bg;
1145     int32 b_tg, n_tg;
1146     s3lmwid32_t lw1, lw2, lw3;
1147     uint32 probid;
1148     int32 is32bit;
1149 
1150     is32bit = lm_is32bits(lmp);
1151 
1152     fprintf(fp, "\\3-grams:\n");
1153     for (i = 0; i <= lmp->n_ug - 1; i++) {
1154         b_bg = lmp->ug[i].firstbg;
1155         n_bg = lmp->ug[i + 1].firstbg;
1156 
1157         for (j = b_bg; j <= n_bg - 1; j++) {
1158 
1159             if (is32bit) {
1160                 assert(lmp->bg32);
1161                 b_tg =
1162                     lmp->tg_segbase[j >> lmp->log_bg_seg_sz] +
1163                     lmp->bg32[j].firsttg;
1164                 n_tg =
1165                     lmp->tg_segbase[(j + 1) >> lmp->log_bg_seg_sz] +
1166                     lmp->bg32[j + 1].firsttg;
1167             }
1168             else {
1169                 assert(lmp->bg);
1170                 b_tg =
1171                     lmp->tg_segbase[j >> lmp->log_bg_seg_sz] +
1172                     lmp->bg[j].firsttg;
1173                 n_tg =
1174                     lmp->tg_segbase[(j + 1) >> lmp->log_bg_seg_sz] +
1175                     lmp->bg[j + 1].firsttg;
1176             }
1177 
1178             for (k = b_tg; k < n_tg; k++) {
1179 
1180                 if (is32bit) {
1181                     assert(lmp->bg32 && lmp->tg32);
1182                     lw1 = i;
1183                     lw2 = lmp->bg32[j].wid;
1184                     lw3 = lmp->tg32[k].wid;
1185                     probid = lmp->tg32[k].probid;
1186                 }
1187                 else {
1188                     assert(lmp->bg && lmp->tg);
1189                     lw1 = i;
1190                     lw2 = (s3lmwid32_t) lmp->bg[j].wid;
1191                     lw3 = (s3lmwid32_t) lmp->tg[k].wid;
1192                     probid = (uint32) lmp->tg[k].probid;
1193                 }
1194 
1195                 fprintf(fp, "%.4f ", lmp->tgprob[probid].f);
1196                 fprintf(fp, "%s", lmp->wordstr[lw1]);
1197                 fprintf(fp, " ");
1198                 fprintf(fp, "%s", lmp->wordstr[lw2]);
1199                 fprintf(fp, " ");
1200                 fprintf(fp, "%s", lmp->wordstr[lw3]);
1201                 fprintf(fp, "\n");
1202 
1203             }
1204         }
1205     }
1206 
1207 }
1208 
1209 /**
1210    Write the end for an arpa file format
1211  */
1212 static void
1213 lm_write_arpa_end(lm_t * lmp,            /**< The LM pointer */
1214                   FILE * fp              /**< The FILE Pointer */
1215     )
1216 {
1217     fprintf(fp, "\\end\\\n");
1218 }
1219 
1220 /**
1221  * Write an LM with ARPA file format
1222  */
1223 int32
1224 lm_write_arpa_text(lm_t * lmp, const char *outputfn)
1225 {
1226     FILE *fp;
1227     int is32bit;
1228 
1229     E_INFO("Dumping LM to %s\n", outputfn);
1230     if ((fp = fopen(outputfn, "w")) == NULL) {
1231         E_ERROR("Cannot create file %s\n", outputfn);
1232         return LM_FAIL;
1233     }
1234 
1235     is32bit = lm_is32bits(lmp);
1236     lm_write_arpa_header(lmp, fp);
1237     lm_write_arpa_count(lmp, fp);
1238     lm_write_arpa_unigram(lmp, fp);
1239 
1240     lm_convert_structure(lmp, is32bit);
1241 
1242     if (lmp->n_bg > 0)
1243         lm_write_arpa_bigram(lmp, fp);
1244     if (lmp->n_tg > 0)
1245         lm_write_arpa_trigram(lmp, fp);
1246     lm_write_arpa_end(lmp, fp);
1247 
1248     fclose(fp);
1249     return LM_SUCCESS;
1250 }
1251