1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 
38 /*
39  * lm_3g_dmp.c -- DMP format LM manipulation.
40  *
41  * **********************************************
42  * CMU ARPA Speech Project
43  *
44  * Copyright (c) 1997 Carnegie Mellon University.
45  * ALL RIGHTS RESERVED.
46  * **********************************************
47  *
48  * HISTORY
49  * $Log: lm_3g_dmp.c,v $
50  * Revision 1.4  2006/03/03 00:42:36  egouvea
51  * In bio.h, definition of REVERSE_SWAP_... depends on WORDS_BIGENDIAN,
52  * since __BIG_ENDIAN__ isn't defined.
53  *
54  * In lm_3g_dmp.c, swap bigram and trigram values if needed.
55  *
56  * In lm_convert regresssion test, allow for tolerance (< 0.0002) when
57  * comparing the results.
58  *
59  * Revision 1.3  2006/03/02 00:35:08  arthchan2003
60  * Merged the logic in share/lm3g2dmp to here.  It will take care the situation when log_bg_seg_sz is different. (Must be an old format Ravi played with in the past). This will match the reading code also generalize the old sphinx 2's logic a little bit.
61  *
62  * Revision 1.2  2006/02/23 04:08:36  arthchan2003
63  * Merged from branch SPHINX3_5_2_RCI_IRII_BRANCH
64  * 1, Added lm_3g.c - a TXT-based LM routines.
65  * 2, Added lm_3g_dmp.c - a DMP-based LM routines.
66  * 3, (Contributed by LIUM) Added lm_attfsm.c - convert lm to FSM
67  * 4, Added lmset.c - a wrapper for the lmset_t structure.
68  *
69  * Revision 1.1.2.1  2005/07/17 05:23:25  arthchan2003
70  * added lm_3g_dmp.c and lmset.c, split it out from lm.c to avoid overcrowding situation in it.
71  *
72  *
73  *
74  */
75 
76 #include <string.h>
77 
78 #include <lm.h>
79 #include <s3types.h>
80 #include <bio.h>
81 
/**
 * ARCHAN 20060302: magic header string of the DMP format.
 *
 * Please do not change it.  Legacy code uses this string to match the
 * header of the LM DMP model.  If we change it, lm3g_read_dump won't
 * work.  Within this file it is written by lm3g_dump_write_header()
 * and compared by lm_read_dump_header().
 */
const char *darpa_hdr = "Darpa Trigram LM";


/* NOTE(review): presumably selectors for the 32-bit / 16-bit DMP
 * layouts; they are not referenced in the visible part of this file. */
#define IS32BITS 1
#define IS16BITS 0
93 
94 static void
fwrite_int32(FILE * fp,int32 val)95 fwrite_int32(FILE * fp, int32 val)
96 {
97     REVERSE_SENSE_SWAP_INT32(val);
98     fwrite(&val, sizeof(int32), 1, fp);
99 }
100 
101 static void
fwrite_ug(FILE * fp,ug_t * ug)102 fwrite_ug(FILE * fp, ug_t * ug)
103 {
104     ug_t tmp_ug = *ug;
105 
106     REVERSE_SENSE_SWAP_INT32(tmp_ug.dictwid);
107     REVERSE_SENSE_SWAP_INT32(tmp_ug.prob.l);
108     REVERSE_SENSE_SWAP_INT32(tmp_ug.bowt.l);
109     REVERSE_SENSE_SWAP_INT32(tmp_ug.firstbg);
110     fwrite(&tmp_ug, sizeof(ug_t), 1, fp);
111 }
112 
113 static void
fwrite_bg(FILE * fp,bg_t * bg)114 fwrite_bg(FILE * fp, bg_t * bg)
115 {
116     bg_t tmp_bg = *bg;
117 
118     REVERSE_SENSE_SWAP_INT16(tmp_bg.wid);
119     REVERSE_SENSE_SWAP_INT16(tmp_bg.probid);
120     REVERSE_SENSE_SWAP_INT16(tmp_bg.bowtid);
121     REVERSE_SENSE_SWAP_INT16(tmp_bg.firsttg);
122     fwrite(&tmp_bg, sizeof(bg_t), 1, fp);
123 }
124 
125 static void
fwrite_bg32(FILE * fp,bg32_t * bg)126 fwrite_bg32(FILE * fp, bg32_t * bg)
127 {
128     bg32_t tmp_bg = *bg;
129 
130     REVERSE_SENSE_SWAP_INT32(tmp_bg.wid);
131     REVERSE_SENSE_SWAP_INT32(tmp_bg.probid);
132     REVERSE_SENSE_SWAP_INT32(tmp_bg.bowtid);
133     REVERSE_SENSE_SWAP_INT32(tmp_bg.firsttg);
134     fwrite(&tmp_bg, sizeof(bg32_t), 1, fp);
135 }
136 
137 static void
fwrite_tg(FILE * fp,tg_t * tg)138 fwrite_tg(FILE * fp, tg_t * tg)
139 {
140     tg_t tmp_tg = *tg;
141 
142     REVERSE_SENSE_SWAP_INT16(tmp_tg.wid);
143     REVERSE_SENSE_SWAP_INT16(tmp_tg.probid);
144     fwrite(&tmp_tg, sizeof(tg_t), 1, fp);
145 }
146 
147 static void
fwrite_tg32(FILE * fp,tg32_t * tg)148 fwrite_tg32(FILE * fp, tg32_t * tg)
149 {
150     tg32_t tmp_tg = *tg;
151 
152     REVERSE_SENSE_SWAP_INT32(tmp_tg.wid);
153     REVERSE_SENSE_SWAP_INT32(tmp_tg.probid);
154     fwrite(&tmp_tg, sizeof(tg32_t), 1, fp);
155 }
156 
157 
/**
 * Human-readable description of the DMP file layout, one entry per
 * line, terminated by NULL.  lm3g_dump_write_fmtdesc() copies these
 * strings verbatim into every dump file, so their text is part of the
 * written output.
 */
static const char *fmtdesc[] = {
    "BEGIN FILE FORMAT DESCRIPTION",
    "Header string length (int32) and string (including trailing 0)",
    "Original LM filename string-length (int32) and filename (including trailing 0)",
    "(int32) version number (present iff value <= 0)",
    "(int32) original LM file modification timestamp (iff version# present)",
    "(int32) string-length and string (including trailing 0) (iff version# present)",
    "... previous entry continued any number of times (iff version# present)",
    "(int32) 0 (terminating sequence of strings) (iff version# present)",
    "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)",
    "(int32) lm_t.ucount (must be > 0)",
    "(int32) lm_t.bcount",
    "(int32) lm_t.tcount",
    "lm_t.ucount+1 unigrams (including sentinel)",
    "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3",
    "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)",
    "(int32) lm_t.n_prob2",
    "(int32) lm_t.prob2[]",
    "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)",
    "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)",
    "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)",
    "(int32) Sum(all word string-lengths, including trailing 0 for each)",
    "All word strings (including trailing 0 for each)",
    "END FILE FORMAT DESCRIPTION",
    NULL,
};
189 
190 void
lm3g_dump_write_header(FILE * fp)191 lm3g_dump_write_header(FILE * fp)
192 {
193     int32 k;
194     k = strlen(darpa_hdr) + 1;
195     fwrite_int32(fp, k);
196     fwrite(darpa_hdr, sizeof(char), k, fp);
197 }
198 
199 void
lm3g_dump_write_lm_filename(FILE * fp,const char * lmfile)200 lm3g_dump_write_lm_filename(FILE * fp, const char *lmfile)
201 {
202     int32 k;
203 
204     k = strlen(lmfile) + 1;
205     fwrite_int32(fp, k);
206     fwrite(lmfile, sizeof(char), k, fp);
207 
208 }
209 
210 void
lm3g_dump_write_version(FILE * fp,lm_t * model,int32 mtime,int32 is32bits)211 lm3g_dump_write_version(FILE * fp, lm_t * model, int32 mtime,
212                         int32 is32bits)
213 {
214     if (!is32bits) {
215         if (model->log_bg_seg_sz != LOG2_BG_SEG_SZ) {   /* Hack!! */
216             E_WARN("log_bg_seg_sz is different from default");
217             fwrite_int32(fp, LMDMP_VERSION_TG_16BIT_V2);        /* version # */
218         }
219         else {
220             fwrite_int32(fp, LMDMP_VERSION_TG_16BIT);   /* version # */
221         }
222     }
223     else
224         fwrite_int32(fp, LMDMP_VERSION_TG_32BIT);       /* version # */
225 
226     fwrite_int32(fp, mtime);
227 }
228 
229 void
lm3g_dump_write_ngram_counts(FILE * fp,lm_t * model)230 lm3g_dump_write_ngram_counts(FILE * fp, lm_t * model)
231 {
232     fwrite_int32(fp, model->n_ug);
233     fwrite_int32(fp, model->n_bg);
234     fwrite_int32(fp, model->n_tg);
235 }
236 
237 void
lm3g_dump_write_fmtdesc(FILE * fp)238 lm3g_dump_write_fmtdesc(FILE * fp)
239 {
240     int32 i, k;
241     long pos;
242 
243     /* Write file format description into header */
244     for (i = 0; fmtdesc[i] != NULL; i++) {
245         k = strlen(fmtdesc[i]) + 1;
246         fwrite_int32(fp, k);
247         fwrite(fmtdesc[i], sizeof(char), k, fp);
248     }
249     /* Pad it out in order to achieve 32-bit alignment */
250     pos = ftell(fp);
251     k = pos & 3;
252     if (k) {
253         fwrite_int32(fp, 4-k);
254         fwrite("!!!!", 1, 4-k, fp);
255     }
256     fwrite_int32(fp, 0);
257 }
258 
259 void
lm3g_dump_write_unigram(FILE * fp,lm_t * model)260 lm3g_dump_write_unigram(FILE * fp, lm_t * model)
261 {
262     int32 i;
263     for (i = 0; i <= model->n_ug; i++)
264         fwrite_ug(fp, &(model->ug[i]));
265 
266 }
267 
268 
269 void
lm3g_dump_write_bigram(FILE * fp,lm_t * model,int32 is32bits)270 lm3g_dump_write_bigram(FILE * fp, lm_t * model, int32 is32bits)
271 {
272     int32 i;
273     for (i = 0; i <= model->n_bg; i++) {
274         if (is32bits)
275             fwrite_bg32(fp, &(model->bg32[i]));
276         else
277             fwrite_bg(fp, &(model->bg[i]));
278     }
279 
280 }
281 
282 void
lm3g_dump_write_trigram(FILE * fp,lm_t * model,int32 is32bits)283 lm3g_dump_write_trigram(FILE * fp, lm_t * model, int32 is32bits)
284 {
285     int32 i;
286     for (i = 0; i < model->n_tg; i++) {
287         if (is32bits)
288             fwrite_tg32(fp, &(model->tg32[i]));
289         else
290             fwrite_tg(fp, &(model->tg[i]));
291     }
292 }
293 
294 void
lm3g_dump_write_bgprob(FILE * fp,lm_t * model)295 lm3g_dump_write_bgprob(FILE * fp, lm_t * model)
296 {
297     int32 i;
298     fwrite_int32(fp, model->n_bgprob);
299     for (i = 0; i < model->n_bgprob; i++)
300         fwrite_int32(fp, model->bgprob[i].l);
301 }
302 
303 void
lm3g_dump_write_tgbowt(FILE * fp,lm_t * model)304 lm3g_dump_write_tgbowt(FILE * fp, lm_t * model)
305 {
306     int32 i;
307     fwrite_int32(fp, model->n_tgbowt);
308     for (i = 0; i < model->n_tgbowt; i++)
309         fwrite_int32(fp, model->tgbowt[i].l);
310 }
311 
312 void
lm3g_dump_write_tgprob(FILE * fp,lm_t * model)313 lm3g_dump_write_tgprob(FILE * fp, lm_t * model)
314 {
315     int32 i;
316     fwrite_int32(fp, model->n_tgprob);
317     for (i = 0; i < model->n_tgprob; i++)
318         fwrite_int32(fp, model->tgprob[i].l);
319 }
320 
321 void
lm3g_dump_write_tg_segbase(FILE * fp,lm_t * model)322 lm3g_dump_write_tg_segbase(FILE * fp, lm_t * model)
323 {
324     int32 i, k;
325     k = (model->n_bg + 1) / BG_SEG_SZ + 1;
326     fwrite_int32(fp, k);
327     for (i = 0; i < k; i++)
328         fwrite_int32(fp, model->tg_segbase[i]);
329 }
330 
331 void
lm3g_dump_write_wordstr(FILE * fp,lm_t * model)332 lm3g_dump_write_wordstr(FILE * fp, lm_t * model)
333 {
334     int32 i, k;
335     k = 0;
336     for (i = 0; i < model->n_ug; i++)
337         k += strlen(model->wordstr[i]) + 1;
338     fwrite_int32(fp, k);
339     for (i = 0; i < model->n_ug; i++)
340         fwrite(model->wordstr[i], sizeof(char),
341                strlen(model->wordstr[i]) + 1, fp);
342 }
343 
344 int32
lm3g_dump(char const * file,lm_t * model,char const * lmfile,int32 mtime,int32 noBits)345 lm3g_dump(char const *file,         /**< the file name */
346           lm_t * model,             /**< the langauge model for output */
347           char const *lmfile,         /**< the */
348           int32 mtime,         /**< LM file modification date */
349           int32 noBits         /**< Number of bits of DMP format */
350     )
351 {
352     FILE *fp;
353     int32 is32bits;
354 
355     if (noBits != 16 && noBits != 32) {
356         E_ERROR("No of Bits specified is not 16 or 32\n");
357         return LM_FAIL;
358     }
359 
360     is32bits = (noBits == 32);
361 
362     if (!is32bits && model->n_ug > LM_LEGACY_CONSTANT) {
363         E_ERROR
364             ("Number of words is larger than %d, but 16 bits models were used\n",
365              LM_LEGACY_CONSTANT);
366         return LM_FAIL;
367     }
368     /*
369      * If is32bits,
370      */
371 
372     E_INFO("Dumping LM to %s\n", file);
373     if ((fp = fopen(file, "wb")) == NULL) {
374         E_ERROR("Cannot create file %s\n", file);
375         return LM_FAIL;
376     }
377 
378     lm3g_dump_write_header(fp);
379     lm3g_dump_write_lm_filename(fp, lmfile);
380     lm3g_dump_write_version(fp, model, mtime, is32bits);
381 
382     /* Write version# and LM file modification date */
383     lm3g_dump_write_fmtdesc(fp);
384 
385     /* HACK!! Write only if different from previous version */
386     if (model->log_bg_seg_sz != LOG2_BG_SEG_SZ)
387         fwrite_int32(fp, model->log_bg_seg_sz);
388 
389     lm3g_dump_write_ngram_counts(fp, model);
390 
391     if (!is32bits && model->n_ug > LM_LEGACY_CONSTANT) {
392         E_ERROR
393             ("The model is a 16 bits' one but the number of unigram has more thant 65535 words (>16 bits)");
394         return LM_FAIL;
395     }
396 
397     lm3g_dump_write_unigram(fp, model);
398 
399     /**
400        20060302 ARCHAN
401        This part is where the 16/32 bits differ
402      */
403 
404     lm_convert_structure(model, is32bits);
405     lm3g_dump_write_bigram(fp, model, is32bits);
406     lm3g_dump_write_trigram(fp, model, is32bits);
407 
408     /**************************************/
409 
410     lm3g_dump_write_bgprob(fp, model);
411 
412     if (model->n_tg > 0) {
413         lm3g_dump_write_tgbowt(fp, model);
414         lm3g_dump_write_tgprob(fp, model);
415         lm3g_dump_write_tg_segbase(fp, model);
416     }
417 
418     lm3g_dump_write_wordstr(fp, model);
419 
420     fclose(fp);
421     return LM_SUCCESS;
422 }
423 
424 
425 static int32
lm_fread_int32(lm_t * lm)426 lm_fread_int32(lm_t * lm)
427 {
428     int32 val;
429 
430     if (fread(&val, sizeof(int32), 1, lm->fp) != 1)
431         E_FATAL("fread failed\n");
432     if (lm->byteswap)
433         SWAP_INT32(&val);
434     return (val);
435 }
436 
437 
438 /**
439       20060303: ARCHAN
440 
441       lm_read_dump_header will read in the DMP format. What it will do
442       is to compare the value read in with the darpa_hdr ("Darpa
443       Trigram LM").  If it matches, that means there is no byte
444       swap. If it doesn't, we will try to swap the value and match the
445       header again.  If it still doesn't work, that means something is
446       wrong. (e.g. Format problem of the DMP file).
447 
448       This process will also allow us to know the byte-order of the
449       DMP file. Swapping could then automatically done in the code.
450  */
451 static int32
lm_read_dump_header(lm_t * lm,const char * file)452 lm_read_dump_header(lm_t * lm,             /**< The LM */
453                     const char *file              /**< The file we are reading */
454     )
455 {
456     int32 k;
457     char str[1024];
458 
459     /* Standard header string-size; set byteswap flag based on this */
460     if (fread(&k, sizeof(int32), 1, lm->fp) != 1)
461         E_FATAL("fread(%s) failed\n", file);
462 
463     if ((size_t) k == strlen(darpa_hdr) + 1)
464         lm->byteswap = 0;
465     else {
466         SWAP_INT32(&k);
467         if ((size_t) k == strlen(darpa_hdr) + 1)
468             lm->byteswap = 1;
469         else {
470             SWAP_INT32(&k);
471             E_INFO("Bad magic number: %d(%08x), not an LM dumpfile??\n", k,
472                    k);
473             return LM_FAIL;
474         }
475     }
476 
477     /* Read and verify standard header string */
478     if (fread(str, sizeof(char), k, lm->fp) != (size_t) k) {
479         E_ERROR("fread(%s) failed\n", file);
480         return LM_FAIL;
481     }
482     if (strncmp(str, darpa_hdr, k) != 0) {
483         E_ERROR("Bad header\n");
484         return LM_FAIL;
485     }
486 
487     return LM_SUCCESS;
488 
489 }
490 
491 static int32
lm_read_lmfilename(lm_t * lm,const char * file)492 lm_read_lmfilename(lm_t * lm,             /**< The LM */
493                    const char *file              /**< The file we are reading */
494     )
495 {
496     int32 k;
497     char str[1024];
498 
499     /* Original LM filename string size and string */
500     k = lm_fread_int32(lm);
501     if ((k < 1) || (k > 1024)) {
502         E_ERROR("Bad original filename size: %d\n", k);
503         return LM_FAIL;
504     }
505     if (fread(str, sizeof(char), k, lm->fp) != (size_t) k) {
506         E_ERROR("fread(%s) failed\n", file);
507         return LM_FAIL;
508     }
509 
510     return LM_SUCCESS;
511 }
512 
/**
   20060303 ARCHAN:

   lm_read_dump_ver_nug reads the version number and the number of
   unigrams from an LM dump file.  The two are related for legacy
   reasons.  Here is a survey of what our past routines did, as of
   timestamp 20060303.

   Before Sphinx 3.X (X<4), routines for reading the DMP format of an
   LM appeared in three places.  The first is Sphinx 2's lm3g_load,
   which doesn't take care of version=LMDMP_VERSION_TG_16BIT_V2.  The
   second is share's lm3g_load, which takes care of
   version=LMDMP_VERSION_TG_16BIT (-1),
   version=LMDMP_VERSION_TG_16BIT_V2 (-2) and version >
   LMDMP_VERSIONNULL (0).  The last one is Sphinx 3's, which is
   essentially a quick hack of Sphinx 2's version.  (Note: because of
   the legacy naming system, version > 0 here is actually the oldest
   version.)

   What is in the version, then?  From the source code, you can trace
   the story back.  At the beginning, the version field was used to
   store the number of unigrams.  Hence it can be a number larger
   than LMDMP_VERSIONNULL (0).

   However, the programmer quickly found that it doesn't make sense
   to do that.  Hence a real version number soon appeared: a value
   <= 0 in this field really is the version number of the LM dump.

   Here is one small problem: the programmer found that log_bg_seg_sz
   needed to be changed.  So he decided to introduce
   version=LMDMP_VERSION_TG_16BIT_V2, i.e. a version that doesn't
   follow the current default value of log_bg_seg_sz (=9).  (An
   earlier revision of this note named LMDMP_VERSION_TG_32BIT here,
   but both lm3g_dump_write_version and lm_read_dump_ver_nug tie a
   non-default log_bg_seg_sz to LMDMP_VERSION_TG_16BIT_V2.)

   At 20060303, the current code assumes all versions
   <LMDMP_VERSION_TG_32BIT are equivalent.  This is likely to change
   because we might need to introduce versions 3, 4 and 5.
 */
550 
551 static int32
lm_read_dump_ver_nug(lm_t * lm,const char * file)552 lm_read_dump_ver_nug(lm_t * lm,             /**< The LM*/
553                      const char *file              /**< The file we are reading */
554     )
555 {
556     int32 k;
557     char str[1024];
558 
559     /* Version#.  If present (must be <= 0); otherwise it's actually the unigram count */
560     lm->version = lm_fread_int32(lm);
561 
562     if (lm->version <= 0) {
563         /* Read and skip orginal file timestamp;
564            ARCHAN: Unlike the sphinx2's code, currently, the timestamp
565            is not compared in Sphinx 3.
566          */
567         k = lm_fread_int32(lm);
568 
569         /* Read and skip format description */
570         for (;;) {
571             if ((k = lm_fread_int32(lm)) == 0)
572                 break;
573             if (fread(str, sizeof(char), k, lm->fp) != (size_t) k) {
574                 E_ERROR("fread(%s) failed\n", file);
575                 return LM_FAIL;
576             }
577         }
578 
579         /* Read log_bg_seg_sz if present */
580 
581         /* ARCHAN 20060304
582            use lm->version == -2 (LMDMP_VERSION_TG_16BIT_V2) instead of lm->version <2,
583            This is different from share's version
584          */
585         if (lm->version == LMDMP_VERSION_TG_16BIT_V2) {
586             k = lm_fread_int32(lm);
587             if ((k < 1) || (k > 15)) {
588                 E_ERROR("log2(bg_seg_sz) %d outside range 1..15 \n", k);
589                 return LM_FAIL;
590             }
591             lm->log_bg_seg_sz = k;
592         }
593         else {
594             lm->log_bg_seg_sz = LOG2_BG_SEG_SZ; /* Default */
595         }
596 
597         /* Read #ug */
598         lm->n_ug = lm_fread_int32(lm);
599 
600     }
601     else {
602         /* oldest dump file version has no version# or any of the above */
603         if (lm->version > lm->n_ug) {
604             E_ERROR("LM.ucount(%d) out of range [1..%d]\n", lm->version,
605                     lm->n_ug);
606             return LM_FAIL;
607         }
608 
609         /* No version number, actually a unigram count */
610         lm->n_ug = lm->version;
611         lm->log_bg_seg_sz = LOG2_BG_SEG_SZ;     /* Default */
612     }
613 
614 
615     lm->is32bits = lm_is32bits(lm);
616     if ((lm->n_ug <= 0) || (lm->n_ug >= MAX_LMWID(lm))) {
617         E_ERROR("Bad #ug: %u (must be >0, <%u) Version %d\n", lm->n_ug,
618                 MAX_LMWID(lm), lm->version);
619         return LM_FAIL;
620     }
621 
622     lm->bg_seg_sz = 1 << lm->log_bg_seg_sz;
623 
624     if (lm->version == LMDMP_VERSION_TG_32BIT) {
625         E_INFO("Reading LM in 32 bits format\n");
626     }
627     else if (lm->version > LMDMP_VERSIONNULL ||
628              lm->version == LMDMP_VERSION_TG_16BIT ||
629              lm->version == LMDMP_VERSION_TG_16BIT_V2) {
630         E_INFO("Reading LM in 16 bits format\n");
631     }
632 
633     return LM_SUCCESS;
634 }
635 
636 static int32
lm_read_dump_ng_counts(lm_t * lm,const char * file)637 lm_read_dump_ng_counts(lm_t * lm, const char *file)
638 {
639     /* #bigrams */
640     lm->n_bg = lm_fread_int32(lm);
641     if (lm->n_bg < 0) {
642         E_ERROR("Bad #bigrams: %d\n", lm->n_bg);
643         return LM_FAIL;
644     }
645 
646     /* #trigrams */
647     lm->n_tg = lm_fread_int32(lm);
648     if (lm->n_tg < 0) {
649         E_ERROR("Bad #trigrams: %d\n", lm->n_tg);
650         return LM_FAIL;
651     }
652 
653     if (lm->n_bg > 0)
654         lm->n_ng = 2;
655 
656     if (lm->n_tg > 0)
657         lm->n_ng = 3;
658 
659     return LM_SUCCESS;
660 }
661 
662 
663 static int32
lm_read_dump_ug(lm_t * lm,const char * file)664 lm_read_dump_ug(lm_t * lm, const char *file)
665 {
666     int32 i;
667 
668     assert(lm->n_ug > 0);
669 
670     /* Read ug; remember sentinel ug at the end! */
671     lm->ug = (ug_t *) ckd_calloc(lm->n_ug + 1, sizeof(ug_t));
672     if (fread(lm->ug, sizeof(ug_t), lm->n_ug + 1, lm->fp) !=
673         (size_t) (lm->n_ug + 1)) {
674         E_ERROR("unigram fread(%s) failed\n", file);
675         return LM_FAIL;
676         /*        E_FATAL("fread(%s) failed\n", file); */
677     }
678 
679     if (lm->byteswap) {
680         for (i = 0; i <= lm->n_ug; i++) {
681             SWAP_INT32(&(lm->ug[i].prob.l));
682             SWAP_INT32(&(lm->ug[i].bowt.l));
683             SWAP_INT32(&(lm->ug[i].firstbg));
684         }
685     }
686     E_INFO("Read %8d unigrams [in memory]\n", lm->n_ug);
687     return LM_SUCCESS;
688 }
689 
690 
691 
692 /**
693    Reading bigram in the DMP format.
694 
695    When lm->isLM_IN_MEMORY is turned on.  A memory space will be
696    allocated based.  Recorded the offset of bigram. Then the lm will be
697    read from the file in one piece (lm->n_bg+1 *sizeof(bg_t)
698 
699    When lm->isLM_IN_MEMORY is turned off, we will just skip
700    (lm->n_bg+1 * sizeof(bg_t)) byte memory and recorded the offset of
701    bigram. In this case, the program will be operated in disk mode.
702 
703    ARCHAN 20060304, First introduced 32 bits reading.  This is whether
704    the code is 32bit or not, lm->bg32 or lm->bg (16bits) will be used.
705  */
706 static int32
lm_read_dump_bg(lm_t * lm,const char * file,int32 is32bits)707 lm_read_dump_bg(lm_t * lm,             /**< LM */
708                 const char *file,              /**< file we are reading */
709                 int32 is32bits                 /**< Is it a 32 bits reading? */
710     )
711 {
712     int32 i;
713     int32 mem_sz;
714     void *lmptr;
715     assert(lm->n_bg > 0);
716 
717     mem_sz = is32bits ? sizeof(bg32_t) : sizeof(bg_t);
718     lmptr = NULL;
719 
720   /** Allocate memory */
721     if (lm->isLM_IN_MEMORY) {   /* Remember the sentinel */
722         if ((lmptr = ckd_calloc(lm->n_bg + 1, mem_sz)) == NULL) {
723             E_ERROR
724                 ("Fail to allocate memory with size %d for bigram reading. Each bigram with size\n",
725                  lm->n_bg + 1, mem_sz);
726             return LM_FAIL;
727         }
728     }
729     else {
730         lmptr = NULL;
731     }
732 
733     if (lm->n_bg > 0) {
734 
735         lm->bgoff = ftell(lm->fp);
736 
737         if (lm->isLM_IN_MEMORY) {
738             if (is32bits) {
739                 lm->bg32 = (bg32_t *) lmptr;
740                 fread(lm->bg32, lm->n_bg + 1, mem_sz, lm->fp);
741                 if (lm->byteswap) {
742                     for (i = 0; i <= lm->n_bg; i++)
743                         swap_bg32(&(lm->bg32[i]));
744                 }
745             }
746             else {
747                 lm->bg = (bg_t *) lmptr;
748                 fread(lm->bg, lm->n_bg + 1, mem_sz, lm->fp);
749                 if (lm->byteswap) {
750                     for (i = 0; i <= lm->n_bg; i++)
751                         swap_bg(&(lm->bg[i]));
752                 }
753             }
754 
755             E_INFO("Read %8d bigrams [in memory]\n", lm->n_bg);
756         }
757         else {
758             fseek(lm->fp, (lm->n_bg + 1) * mem_sz, SEEK_CUR);
759             E_INFO("%8d bigrams [on disk]\n", lm->n_bg);
760         }
761 
762     }
763 
764     return LM_SUCCESS;
765 }
766 
767 /*
768 
769   Similar to lm_read_dump_bg, note instead of lm->n_tg+1, we are
770   working on lm->n_tg here.
771   @see lm_read_dump_bg
772  */
773 
774 static int32
lm_read_dump_tg(lm_t * lm,const char * file,int is32bits)775 lm_read_dump_tg(lm_t * lm,             /**< LM */
776                 const char *file,              /**< file we are reading */
777                 int is32bits               /**< Whether the data structure is 32 bits */
778     )
779 {
780     int32 i;
781     int32 mem_sz;
782     void *lmptr;
783     /* Number of Trigrams might be zero
784      */
785 
786 
787     assert(lm->n_tg >= 0);
788 
789     mem_sz = is32bits ? sizeof(tg32_t) : sizeof(tg_t);
790     lmptr = NULL;
791 
792     if (lm->isLM_IN_MEMORY && lm->n_tg > 0) {
793         if ((lmptr = ckd_calloc(lm->n_tg + 1, mem_sz)) == NULL) {
794             E_ERROR
795                 ("Fail to allocate memory with size %d for trigram reading.  Each trigram with mem_sz\n",
796                  lm->n_tg + 1, mem_sz);
797             return LM_FAIL;
798         }
799 
800     }
801     else
802         lmptr = NULL;
803 
804     if (lm->n_tg > 0) {         /* Read bigrams; remember sentinel at the end */
805 
806         lm->tgoff = ftell(lm->fp);
807 
808         if (lm->isLM_IN_MEMORY) {
809             if (is32bits) {
810                 lm->tg32 = (tg32_t *) lmptr;
811                 fread(lm->tg32, lm->n_tg, mem_sz, lm->fp);
812                 if (lm->byteswap) {
813                     for (i = 0; i <= lm->n_tg - 1; i++) {
814                         swap_tg32(&(lm->tg32[i]));
815                     }
816                 }
817             }
818             else {
819                 lm->tg = (tg_t *) lmptr;
820                 fread(lm->tg, lm->n_tg, mem_sz, lm->fp);
821                 if (lm->byteswap) {
822                     for (i = 0; i <= lm->n_tg - 1; i++) {
823                         swap_tg(&(lm->tg[i]));
824                     }
825                 }
826             }
827 
828             E_INFO("Read %8d trigrams [in memory]\n", lm->n_tg);
829         }
830         else {
831             fseek(lm->fp, (lm->n_tg) * mem_sz, SEEK_CUR);
832             E_INFO("%8d bigrams [on disk]\n", lm->n_tg);
833         }
834     }
835     return LM_SUCCESS;
836 }
837 
838 static int32
lm_read_dump_calloc_membg_tginfo(lm_t * lm,const char * file,int is32bits)839 lm_read_dump_calloc_membg_tginfo(lm_t * lm, const char *file, int is32bits)
840 {
841     void *lmptr, *lmptr2;
842     int32 mem_sz, mem_sz2;
843 
844     lmptr = lmptr2 = NULL;
845     mem_sz = is32bits ? sizeof(membg32_t) : sizeof(membg_t);
846     mem_sz2 = is32bits ? sizeof(tginfo32_t *) : sizeof(tginfo_t *);
847 
848     if (lm->n_bg > 0) {
849         if ((lmptr = ckd_calloc(lm->n_ug, mem_sz)) == NULL) {
850             E_ERROR("Failed to allocate memory for membg.\n");
851             return LM_FAIL;
852         }
853     }
854 
855     if (lm->n_tg > 0) {
856         if ((lmptr2 = ckd_calloc(lm->n_ug, mem_sz2)) == NULL) {
857             E_ERROR("Failed to allocate memory for tginfo.\n");
858             return LM_FAIL;
859         }
860     }
861 
862     if (is32bits) {
863         lm->membg32 = (membg32_t *) lmptr;
864         lm->tginfo32 = (tginfo32_t **) lmptr2;
865     }
866     else {
867         lm->membg = (membg_t *) lmptr;
868         lm->tginfo = (tginfo_t **) lmptr2;
869     }
870     return LM_SUCCESS;
871 
872 }
873 
874 static int32
lm_read_dump_bgprob(lm_t * lm,const char * file,int32 is32bits)875 lm_read_dump_bgprob(lm_t * lm, const char *file, int32 is32bits)
876 {
877     int32 i;
878     uint32 upper_limit;
879 
880     upper_limit = is32bits ? LM_SPHINX_CONSTANT : LM_LEGACY_CONSTANT;
881     /*  E_INFO("%d upper_limit\n",upper_limit); */
882     if (lm->n_bg > 0) {
883         /* Bigram probs table size */
884         lm->n_bgprob = lm_fread_int32(lm);
885         if ((lm->n_bgprob <= 0) || (lm->n_bgprob > upper_limit)) {
886             E_ERROR("Bad bigram prob table size: %d\n", lm->n_bgprob);
887             return LM_FAIL;
888         }
889 
890         /* Allocate and read bigram probs table */
891         lm->bgprob = (lmlog_t *) ckd_calloc(lm->n_bgprob, sizeof(lmlog_t));
892         if (fread(lm->bgprob, sizeof(lmlog_t), lm->n_bgprob, lm->fp) !=
893             (size_t) lm->n_bgprob) {
894             E_ERROR("fread(%s) failed\n", file);
895             return LM_FAIL;
896         }
897         if (lm->byteswap) {
898             for (i = 0; i < lm->n_bgprob; i++)
899                 SWAP_INT32(&(lm->bgprob[i].l));
900         }
901 
902         E_INFO("%8d bigram prob entries\n", lm->n_bgprob);
903     }
904     return LM_SUCCESS;
905 
906 }
907 
908 static int32
lm_read_dump_tgbowt(lm_t * lm,const char * file,int32 is32bits)909 lm_read_dump_tgbowt(lm_t * lm, const char *file, int32 is32bits)
910 {
911     int32 i;
912     uint32 upper_limit;
913 
914     upper_limit = is32bits ? LM_SPHINX_CONSTANT : LM_LEGACY_CONSTANT;
915 
916     if (lm->n_tg > 0) {
917         /* Trigram bowt table size */
918         lm->n_tgbowt = lm_fread_int32(lm);
919         if ((lm->n_tgbowt <= 0) || (lm->n_tgbowt > upper_limit)) {
920             E_ERROR("Bad trigram bowt table size: %d\n", lm->n_tgbowt);
921             return LM_FAIL;
922         }
923 
924         /* Allocate and read trigram bowt table */
925         lm->tgbowt = (lmlog_t *) ckd_calloc(lm->n_tgbowt, sizeof(lmlog_t));
926         if (fread(lm->tgbowt, sizeof(lmlog_t), lm->n_tgbowt, lm->fp) !=
927             (size_t) lm->n_tgbowt) {
928 
929             E_ERROR("fread(%s) failed\n", file);
930             return LM_FAIL;
931         }
932         if (lm->byteswap) {
933             for (i = 0; i < lm->n_tgbowt; i++)
934                 SWAP_INT32(&(lm->tgbowt[i].l));
935         }
936         E_INFO("%8d trigram bowt entries\n", lm->n_tgbowt);
937     }
938     return LM_SUCCESS;
939 }
940 
941 static int32
lm_read_dump_tgprob(lm_t * lm,const char * file,int32 is32bits)942 lm_read_dump_tgprob(lm_t * lm, const char *file, int32 is32bits)
943 {
944     int32 i;
945     uint32 upper_limit;
946 
947     upper_limit = is32bits ? LM_SPHINX_CONSTANT : LM_LEGACY_CONSTANT;
948 
949     if (lm->n_tg > 0) {
950         lm->n_tgprob = lm_fread_int32(lm);
951         if ((lm->n_tgprob <= 0) || (lm->n_tgprob > upper_limit)) {
952             E_ERROR("Bad trigram bowt table size: %d\n", lm->n_tgprob);
953             return LM_FAIL;
954         }
955 
956         /* Allocate and read trigram bowt table */
957         lm->tgprob = (lmlog_t *) ckd_calloc(lm->n_tgprob, sizeof(lmlog_t));
958         if (fread(lm->tgprob, sizeof(lmlog_t), lm->n_tgprob, lm->fp) !=
959             (size_t) lm->n_tgprob) {
960             E_ERROR("fread(%s) failed\n", file);
961             return LM_FAIL;
962         }
963         if (lm->byteswap) {
964             for (i = 0; i < lm->n_tgprob; i++)
965                 SWAP_INT32(&(lm->tgprob[i].l));
966         }
967         E_INFO("%8d trigram prob entries\n", lm->n_tgprob);
968     }
969 
970     return LM_SUCCESS;
971 }
972 
973 /*
974   The only function which doesn't require switching in lm_read_dump
975  */
976 static int32
lm_read_dump_tg_segbase(lm_t * lm,const char * file)977 lm_read_dump_tg_segbase(lm_t * lm, const char *file)
978 {
979     int i, k;
980     if (lm->n_tg > 0) {
981         /* Trigram seg table size */
982         k = lm_fread_int32(lm);
983         if (k != (lm->n_bg + 1) / lm->bg_seg_sz + 1) {
984             E_ERROR("Bad trigram seg table size: %d\n", k);
985             return LM_FAIL;
986         }
987 
988         /* Allocate and read trigram seg table */
989         lm->tg_segbase = (int32 *) ckd_calloc(k, sizeof(int32));
990         if (fread(lm->tg_segbase, sizeof(int32), k, lm->fp) != (size_t) k) {
991             E_ERROR("fread(%s) failed\n", file);
992             return LM_FAIL;
993         }
994         if (lm->byteswap) {
995             for (i = 0; i < k; i++)
996                 SWAP_INT32(&(lm->tg_segbase[i]));
997         }
998         E_INFO("%8d trigram segtable entries (%d segsize)\n", k,
999                lm->bg_seg_sz);
1000     }
1001     return LM_SUCCESS;
1002 }
1003 
1004 static int32
lm_read_dump_wordstr(lm_t * lm,const char * file,int32 is32bits)1005 lm_read_dump_wordstr(lm_t * lm, const char *file, int32 is32bits)
1006 {
1007     int32 i, j, k;
1008     char *tmp_word_str;
1009     s3lmwid32_t startwid, endwid;
1010 
1011     /* Read word string names */
1012     k = lm_fread_int32(lm);
1013     if (k <= 0) {
1014         E_ERROR("Bad wordstrings size: %d\n", k);
1015         return LM_FAIL;
1016     }
1017 
1018     tmp_word_str = (char *) ckd_calloc(k, sizeof(char));
1019     if (fread(tmp_word_str, sizeof(char), k, lm->fp) != (size_t) k) {
1020         E_ERROR("fread(%s) failed\n", file);
1021         return LM_FAIL;
1022     }
1023 
1024     /* First make sure string just read contains n_ug words (PARANOIA!!) */
1025     for (i = 0, j = 0; i < k; i++)
1026         if (tmp_word_str[i] == '\0')
1027             j++;
1028 
1029     if (j != lm->n_ug) {
1030         E_ERROR("Bad #words: %d\n", j);
1031         return LM_FAIL;
1032     }
1033 
1034 
1035     startwid = endwid = (s3lmwid32_t) BAD_LMWID(lm);
1036 
1037 
1038     lm->wordstr = (char **) ckd_calloc(lm->n_ug, sizeof(char *));
1039     j = 0;
1040     for (i = 0; i < lm->n_ug; i++) {
1041         if (strcmp(tmp_word_str + j, S3_START_WORD) == 0)
1042             startwid = i;
1043         else if (strcmp(tmp_word_str + j, S3_FINISH_WORD) == 0)
1044             endwid = i;
1045 
1046         lm->wordstr[i] = (char *) ckd_salloc(tmp_word_str + j);
1047 
1048         hash_table_enter(lm->HT, lm->wordstr[i], (void *)(long)i);
1049 
1050         j += strlen(tmp_word_str + j) + 1;
1051     }
1052     free(tmp_word_str);
1053     E_INFO("%8d word strings\n", i);
1054 
1055     /* Force ugprob(<s>) = MIN_PROB_F */
1056     if (IS_LMWID(lm, startwid)) {
1057         lm->ug[startwid].prob.f = MIN_PROB_F;
1058         lm->startlwid = startwid;
1059     }
1060 
1061     /* Force bowt(</s>) = MIN_PROB_F */
1062     if (IS_LMWID(lm, endwid)) {
1063         lm->ug[endwid].bowt.f = MIN_PROB_F;
1064         lm->finishlwid = endwid;
1065     }
1066     else {
1067         E_WARN("No </s> in LM!\n");
1068     }
1069 
1070     return LM_SUCCESS;
1071 }
1072 
1073 
1074 /**
1075   The core of reading the data structures from the LM file.  How it
1076   operates depends on the dump version.  Here is a summary of what's
1077   going on in each version.
1078 
1079   1, In version >0, version=-1(LMDMP_VERSION_TG_16BIT),
1080   -2(LMDMP_VERSION_TG_16BIT_V2),
1081 
1082   The code will read the file using the following sequence.
1083   -read unigram (*_dump_ug)
1084   -read bigram  (*_dump_bg)
1085   -read trigram (*_dump_tg)
1086   -create mem bigram
1087   -create trigram info
1088   -read the actual bigram probability (*_dump_bgprob)
1089   -read the actual trigram backoff weight (*_dump_tgbowt)
1090   -read the actual trigram probability  (*_dump_tgprob)
1091   -read the actual trigram segment base.  (*_dump_tgsegbase)
1092   -read the word str into the code.
1093 
1094   bigram, trigram, membg, tg_info are all in 16 bits.  unigram in
1095   Sphinx 2, Sphinx 3.x (x<4) legacy are already 32 bits.
1096 
1097   bgprob, tgbowt, tgprob, tgsegbase are arrays; their sizes are all
1098   controlled by a number which is an int32.  We are cool here.
1099 
1100   2, In version = -3 (LMDMP_VERSION_TG_32BIT)
1101 
1102   The code will read the file using the following sequence.
1103 
1104   -read unigram (*_dump_ug)
1105   -read bigram in 32 bits  (*_dump_bg)
1106   -read trigram in 32 bits (*_dump_tg)
1107   -create mem bigram in 32 bits
1108   -create trigram info in 32 bits.
1109   -read the actual bigram probability (*_dump_bgprob)
1110   -read the actual trigram backoff weight (*_dump_tgbowt)
1111   -read the actual trigram probability  (*_dump_tgprob)
1112   -read the actual trigram segment base.  (*_dump_tgsegbase)
1113   -read the word str into the code.
1114 
1115   Here, all data structures use 32-bit data structures or address
1116   arrays as int32 arrays.  However, because the legacy implementation
1117   checks the sizes of bgprob, tgbowt and tgprob, I conformed to this
1118   coding style.  So, except for _dump_ug and _dump_tgsegbase, all the
1119   routines now take an is32bits argument.  But the major differences
1120   between the two readings are mainly in _dump_bg and _dump_tg.
1121 
1122 
1123   On coding :
1124 
1125   Each LM DMP versions will just show out all the routines used.  We
1126   are aware that you could optimize it.  Please don't because it will
1127   kill readability in future.
1128 
1129   We also want to support LIUM's lm format and a general n-gram format
1130   in my mind. We will see.
1131 
1132  */
1133 
1134 static int32
lm_read_dump_ng(lm_t * lm,const char * file)1135 lm_read_dump_ng(lm_t * lm, const char *file)
1136 {
1137 
1138     if (lm->version == LMDMP_VERSION_TG_16BIT ||
1139         lm->version == LMDMP_VERSION_TG_16BIT_V2 ||
1140         lm->version >= LMDMP_VERSIONNULL) {
1141 
1142         if (lm_read_dump_ug(lm, file) == LM_FAIL) {
1143             E_ERROR("Error in reading unigram. \n");
1144             return LM_FAIL;
1145         }
1146 
1147         if (lm_read_dump_bg(lm, file, IS16BITS) == LM_FAIL) {
1148             E_ERROR("Error in reading bigram. \n");
1149             return LM_FAIL;
1150         }
1151 
1152         if (lm_read_dump_tg(lm, file, IS16BITS) == LM_FAIL) {
1153             E_ERROR("Error in reading trigram. \n");
1154             return LM_FAIL;
1155         }
1156 
1157         if (lm_read_dump_calloc_membg_tginfo(lm, file, IS16BITS) ==
1158             LM_FAIL) {
1159             E_ERROR
1160                 ("Error in allocating memory bigram and trigram info. \n");
1161             return LM_FAIL;
1162         }
1163 
1164         if (lm_read_dump_bgprob(lm, file, IS16BITS) == LM_FAIL) {
1165             E_ERROR("Error in reading bigram probability. \n");
1166             return LM_FAIL;
1167         }
1168 
1169         if (lm_read_dump_tgbowt(lm, file, IS16BITS) == LM_FAIL) {
1170             E_ERROR("Error in reading trigram back off weight. \n");
1171 
1172             return LM_FAIL;
1173         }
1174 
1175         if (lm_read_dump_tgprob(lm, file, IS16BITS) == LM_FAIL) {
1176             E_ERROR("Error in reading trigram probability. \n");
1177             return LM_FAIL;
1178         }
1179 
1180         if (lm_read_dump_tg_segbase(lm, file) == LM_FAIL) {
1181             E_ERROR("Error in reading trigram segment base. \n");
1182             return LM_FAIL;
1183         }
1184 
1185         if (lm_read_dump_wordstr(lm, file, IS16BITS) == LM_FAIL) {
1186             E_ERROR("Error in reading the word str.  \n");
1187             return LM_FAIL;
1188         }
1189     }
1190     else if (lm->version == LMDMP_VERSION_TG_32BIT) {
1191 
1192         if (lm_read_dump_ug(lm, file) == LM_FAIL) {
1193             E_ERROR("Error in reading unigram. \n");
1194             return LM_FAIL;
1195         }
1196 
1197         if (lm_read_dump_bg(lm, file, IS32BITS) == LM_FAIL) {
1198             E_ERROR("Error in reading bigram. \n");
1199             return LM_FAIL;
1200         }
1201 
1202         if (lm_read_dump_tg(lm, file, IS32BITS) == LM_FAIL) {
1203             E_ERROR("Error in reading trigram. \n");
1204             return LM_FAIL;
1205         }
1206 
1207         if (lm_read_dump_calloc_membg_tginfo(lm, file, IS32BITS) ==
1208             LM_FAIL) {
1209             E_ERROR
1210                 ("Error in allocating memory bigram and trigram info. \n");
1211             return LM_FAIL;
1212         }
1213 
1214         if (lm_read_dump_bgprob(lm, file, IS32BITS) == LM_FAIL) {
1215             E_ERROR("Error in reading bigram probability. \n");
1216             return LM_FAIL;
1217         }
1218 
1219         if (lm_read_dump_tgbowt(lm, file, IS32BITS) == LM_FAIL) {
1220             E_ERROR("Error in reading trigram back off weight. \n");
1221             return LM_FAIL;
1222         }
1223 
1224         if (lm_read_dump_tgprob(lm, file, IS32BITS) == LM_FAIL) {
1225             E_ERROR("Error in reading trigram probability. \n");
1226             return LM_FAIL;
1227         }
1228 
1229         if (lm_read_dump_tg_segbase(lm, file) == LM_FAIL) {
1230             E_ERROR("Error in reading trigram segment base. \n");
1231             return LM_FAIL;
1232         }
1233 
1234         if (lm_read_dump_wordstr(lm, file, IS32BITS) == LM_FAIL) {
1235             E_ERROR("Error in reading the word str.  \n");
1236             return LM_FAIL;
1237         }
1238 
1239     }
1240     else {
1241         E_ERROR("Error, Format %d is unknown\n", lm->version);
1242         return LM_FAIL;
1243     }
1244 
1245     return LM_SUCCESS;
1246 }
1247 
1248 /**
1249  * Read LM dump (<lmname>.DMP) file and make it the current LM.
1250  * Same interface as lm_read except that the filename refers to a .DMP file.
1251  */
1252 lm_t *
lm_read_dump(const char * file,int lminmemory,logmath_t * logmath)1253 lm_read_dump(const char *file,        /**< The file name*/
1254              int lminmemory,        /**< Whether using in memory LM */
1255              logmath_t *logmath
1256     )
1257 {
1258     lm_t *lm;
1259 
1260     lm = (lm_t *) ckd_calloc(1, sizeof(lm_t));
1261 
1262     lm_null_struct(lm);
1263 
1264     lm->isLM_IN_MEMORY = lminmemory;
1265     lm->n_ng = 1;
1266     lm->logmath = logmath;
1267 
1268 
1269     if ((lm->fp = fopen(file, "rb")) == NULL)
1270         E_FATAL_SYSTEM("fopen(%s,rb) failed\n", file);
1271 
1272     /** Read header and compare byte order */
1273     if (lm_read_dump_header(lm, file) == LM_FAIL) {
1274         E_ERROR("Error in reading the header of the DUMP file. \n");
1275         fclose(lm->fp);
1276         ckd_free(lm);
1277         return NULL;
1278     }
1279 
1280     /** Read the full path of file name of lm */
1281     if (lm_read_lmfilename(lm, file) == LM_FAIL) {
1282         E_ERROR("Error in reading the file name of lm. \n");
1283         fclose(lm->fp);
1284         ckd_free(lm);
1285         return NULL;
1286     }
1287 
1288     /** Read the version number and number of unigram */
1289     if (lm_read_dump_ver_nug(lm, file) == LM_FAIL) {
1290         E_ERROR
1291             ("Error in reading the version name and number of unigram\n");
1292         fclose(lm->fp);
1293         ckd_free(lm);
1294         return NULL;
1295     }
1296 
1297     /** Reading the count of ngrams. */
1298 
1299     if (lm_read_dump_ng_counts(lm, file) == LM_FAIL) {
1300         E_ERROR("Error in reading the ngram counts.  \n");
1301         fclose(lm->fp);
1302         ckd_free(lm);
1303         return NULL;
1304     }
1305 
1306     lm->HT = hash_table_new(lm->n_ug, HASH_CASE_YES);
1307 
1308 
1309     /** Reading the ngrams, the meat of the code. Also decide how
1310 	different versions of LM are read in.
1311      */
1312 
1313     if (lm_read_dump_ng(lm, file) == LM_FAIL) {
1314         E_ERROR("Error in reading the ngram.  \n");
1315         fclose(lm->fp);
1316         hash_table_free(lm->HT);
1317         ckd_free(lm);
1318         return NULL;
1319     }
1320 
1321 
1322     return lm;
1323 }
1324