1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3 * Copyright (c) 1999-2004 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37
38 /*
39 * lm_3g_dmp.c -- DMP format LM manipulation.
40 *
41 * **********************************************
42 * CMU ARPA Speech Project
43 *
44 * Copyright (c) 1997 Carnegie Mellon University.
45 * ALL RIGHTS RESERVED.
46 * **********************************************
47 *
48 * HISTORY
49 * $Log: lm_3g_dmp.c,v $
50 * Revision 1.4 2006/03/03 00:42:36 egouvea
51 * In bio.h, definition of REVERSE_SWAP_... depends on WORDS_BIGENDIAN,
52 * since __BIG_ENDIAN__ isn't defined.
53 *
54 * In lm_3g_dmp.c, swap bigram and trigram values if needed.
55 *
56 * In lm_convert regresssion test, allow for tolerance (< 0.0002) when
57 * comparing the results.
58 *
59 * Revision 1.3 2006/03/02 00:35:08 arthchan2003
60 * Merged the logic in share/lm3g2dmp to here. It will take care the situation when log_bg_seg_sz is different. (Must be an old format Ravi played with in the past). This will match the reading code also generalize the old sphinx 2's logic a little bit.
61 *
62 * Revision 1.2 2006/02/23 04:08:36 arthchan2003
63 * Merged from branch SPHINX3_5_2_RCI_IRII_BRANCH
64 * 1, Added lm_3g.c - a TXT-based LM routines.
65 * 2, Added lm_3g_dmp.c - a DMP-based LM routines.
66 * 3, (Contributed by LIUM) Added lm_attfsm.c - convert lm to FSM
67 * 4, Added lmset.c - a wrapper for the lmset_t structure.
68 *
69 * Revision 1.1.2.1 2005/07/17 05:23:25 arthchan2003
70 * added lm_3g_dmp.c and lmset.c, split it out from lm.c to avoid overcrowding situation in it.
71 *
72 *
73 *
74 */
75
76 #include <string.h>
77
78 #include <lm.h>
79 #include <s3types.h>
80 #include <bio.h>
81
/** ARCHAN 20060302:

   Please do not change this string.  Legacy code uses it to match
   the header of the LM DMP model.  If it is changed, lm3g_read_dump
   won't work.

   In this file the string is written first by lm3g_dump_write_header()
   and its length doubles as the magic number checked by
   lm_read_dump_header().
 */
const char *darpa_hdr = "Darpa Trigram LM";


/* Symbolic flag values for 32-bit vs. 16-bit DMP layouts.
   NOTE(review): not referenced in the visible part of this file. */
#define IS32BITS 1
#define IS16BITS 0
93
94 static void
fwrite_int32(FILE * fp,int32 val)95 fwrite_int32(FILE * fp, int32 val)
96 {
97 REVERSE_SENSE_SWAP_INT32(val);
98 fwrite(&val, sizeof(int32), 1, fp);
99 }
100
101 static void
fwrite_ug(FILE * fp,ug_t * ug)102 fwrite_ug(FILE * fp, ug_t * ug)
103 {
104 ug_t tmp_ug = *ug;
105
106 REVERSE_SENSE_SWAP_INT32(tmp_ug.dictwid);
107 REVERSE_SENSE_SWAP_INT32(tmp_ug.prob.l);
108 REVERSE_SENSE_SWAP_INT32(tmp_ug.bowt.l);
109 REVERSE_SENSE_SWAP_INT32(tmp_ug.firstbg);
110 fwrite(&tmp_ug, sizeof(ug_t), 1, fp);
111 }
112
113 static void
fwrite_bg(FILE * fp,bg_t * bg)114 fwrite_bg(FILE * fp, bg_t * bg)
115 {
116 bg_t tmp_bg = *bg;
117
118 REVERSE_SENSE_SWAP_INT16(tmp_bg.wid);
119 REVERSE_SENSE_SWAP_INT16(tmp_bg.probid);
120 REVERSE_SENSE_SWAP_INT16(tmp_bg.bowtid);
121 REVERSE_SENSE_SWAP_INT16(tmp_bg.firsttg);
122 fwrite(&tmp_bg, sizeof(bg_t), 1, fp);
123 }
124
125 static void
fwrite_bg32(FILE * fp,bg32_t * bg)126 fwrite_bg32(FILE * fp, bg32_t * bg)
127 {
128 bg32_t tmp_bg = *bg;
129
130 REVERSE_SENSE_SWAP_INT32(tmp_bg.wid);
131 REVERSE_SENSE_SWAP_INT32(tmp_bg.probid);
132 REVERSE_SENSE_SWAP_INT32(tmp_bg.bowtid);
133 REVERSE_SENSE_SWAP_INT32(tmp_bg.firsttg);
134 fwrite(&tmp_bg, sizeof(bg32_t), 1, fp);
135 }
136
137 static void
fwrite_tg(FILE * fp,tg_t * tg)138 fwrite_tg(FILE * fp, tg_t * tg)
139 {
140 tg_t tmp_tg = *tg;
141
142 REVERSE_SENSE_SWAP_INT16(tmp_tg.wid);
143 REVERSE_SENSE_SWAP_INT16(tmp_tg.probid);
144 fwrite(&tmp_tg, sizeof(tg_t), 1, fp);
145 }
146
147 static void
fwrite_tg32(FILE * fp,tg32_t * tg)148 fwrite_tg32(FILE * fp, tg32_t * tg)
149 {
150 tg32_t tmp_tg = *tg;
151
152 REVERSE_SENSE_SWAP_INT32(tmp_tg.wid);
153 REVERSE_SENSE_SWAP_INT32(tmp_tg.probid);
154 fwrite(&tmp_tg, sizeof(tg32_t), 1, fp);
155 }
156
157
/**
   Human-readable description of the DMP file layout.

   lm3g_dump_write_fmtdesc() writes each of these strings, preceded by
   its length, into the dump file; the list is terminated by NULL.
   The strings are part of the bytes written to disk, so do not edit
   them casually.
 */
static char const *fmtdesc[] = {
    "BEGIN FILE FORMAT DESCRIPTION",
    "Header string length (int32) and string (including trailing 0)",
    "Original LM filename string-length (int32) and filename (including trailing 0)",
    "(int32) version number (present iff value <= 0)",
    "(int32) original LM file modification timestamp (iff version# present)",
    "(int32) string-length and string (including trailing 0) (iff version# present)",
    "... previous entry continued any number of times (iff version# present)",
    "(int32) 0 (terminating sequence of strings) (iff version# present)",
    "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)",
    "(int32) lm_t.ucount (must be > 0)",
    "(int32) lm_t.bcount",
    "(int32) lm_t.tcount",
    "lm_t.ucount+1 unigrams (including sentinel)",
    "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3",
    "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)",
    "(int32) lm_t.n_prob2",
    "(int32) lm_t.prob2[]",
    "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)",
    "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)",
    "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)",
    "(int32) Sum(all word string-lengths, including trailing 0 for each)",
    "All word strings (including trailing 0 for each)",
    "END FILE FORMAT DESCRIPTION",
    NULL,
};
189
190 void
lm3g_dump_write_header(FILE * fp)191 lm3g_dump_write_header(FILE * fp)
192 {
193 int32 k;
194 k = strlen(darpa_hdr) + 1;
195 fwrite_int32(fp, k);
196 fwrite(darpa_hdr, sizeof(char), k, fp);
197 }
198
199 void
lm3g_dump_write_lm_filename(FILE * fp,const char * lmfile)200 lm3g_dump_write_lm_filename(FILE * fp, const char *lmfile)
201 {
202 int32 k;
203
204 k = strlen(lmfile) + 1;
205 fwrite_int32(fp, k);
206 fwrite(lmfile, sizeof(char), k, fp);
207
208 }
209
210 void
lm3g_dump_write_version(FILE * fp,lm_t * model,int32 mtime,int32 is32bits)211 lm3g_dump_write_version(FILE * fp, lm_t * model, int32 mtime,
212 int32 is32bits)
213 {
214 if (!is32bits) {
215 if (model->log_bg_seg_sz != LOG2_BG_SEG_SZ) { /* Hack!! */
216 E_WARN("log_bg_seg_sz is different from default");
217 fwrite_int32(fp, LMDMP_VERSION_TG_16BIT_V2); /* version # */
218 }
219 else {
220 fwrite_int32(fp, LMDMP_VERSION_TG_16BIT); /* version # */
221 }
222 }
223 else
224 fwrite_int32(fp, LMDMP_VERSION_TG_32BIT); /* version # */
225
226 fwrite_int32(fp, mtime);
227 }
228
229 void
lm3g_dump_write_ngram_counts(FILE * fp,lm_t * model)230 lm3g_dump_write_ngram_counts(FILE * fp, lm_t * model)
231 {
232 fwrite_int32(fp, model->n_ug);
233 fwrite_int32(fp, model->n_bg);
234 fwrite_int32(fp, model->n_tg);
235 }
236
237 void
lm3g_dump_write_fmtdesc(FILE * fp)238 lm3g_dump_write_fmtdesc(FILE * fp)
239 {
240 int32 i, k;
241 long pos;
242
243 /* Write file format description into header */
244 for (i = 0; fmtdesc[i] != NULL; i++) {
245 k = strlen(fmtdesc[i]) + 1;
246 fwrite_int32(fp, k);
247 fwrite(fmtdesc[i], sizeof(char), k, fp);
248 }
249 /* Pad it out in order to achieve 32-bit alignment */
250 pos = ftell(fp);
251 k = pos & 3;
252 if (k) {
253 fwrite_int32(fp, 4-k);
254 fwrite("!!!!", 1, 4-k, fp);
255 }
256 fwrite_int32(fp, 0);
257 }
258
259 void
lm3g_dump_write_unigram(FILE * fp,lm_t * model)260 lm3g_dump_write_unigram(FILE * fp, lm_t * model)
261 {
262 int32 i;
263 for (i = 0; i <= model->n_ug; i++)
264 fwrite_ug(fp, &(model->ug[i]));
265
266 }
267
268
269 void
lm3g_dump_write_bigram(FILE * fp,lm_t * model,int32 is32bits)270 lm3g_dump_write_bigram(FILE * fp, lm_t * model, int32 is32bits)
271 {
272 int32 i;
273 for (i = 0; i <= model->n_bg; i++) {
274 if (is32bits)
275 fwrite_bg32(fp, &(model->bg32[i]));
276 else
277 fwrite_bg(fp, &(model->bg[i]));
278 }
279
280 }
281
282 void
lm3g_dump_write_trigram(FILE * fp,lm_t * model,int32 is32bits)283 lm3g_dump_write_trigram(FILE * fp, lm_t * model, int32 is32bits)
284 {
285 int32 i;
286 for (i = 0; i < model->n_tg; i++) {
287 if (is32bits)
288 fwrite_tg32(fp, &(model->tg32[i]));
289 else
290 fwrite_tg(fp, &(model->tg[i]));
291 }
292 }
293
294 void
lm3g_dump_write_bgprob(FILE * fp,lm_t * model)295 lm3g_dump_write_bgprob(FILE * fp, lm_t * model)
296 {
297 int32 i;
298 fwrite_int32(fp, model->n_bgprob);
299 for (i = 0; i < model->n_bgprob; i++)
300 fwrite_int32(fp, model->bgprob[i].l);
301 }
302
303 void
lm3g_dump_write_tgbowt(FILE * fp,lm_t * model)304 lm3g_dump_write_tgbowt(FILE * fp, lm_t * model)
305 {
306 int32 i;
307 fwrite_int32(fp, model->n_tgbowt);
308 for (i = 0; i < model->n_tgbowt; i++)
309 fwrite_int32(fp, model->tgbowt[i].l);
310 }
311
312 void
lm3g_dump_write_tgprob(FILE * fp,lm_t * model)313 lm3g_dump_write_tgprob(FILE * fp, lm_t * model)
314 {
315 int32 i;
316 fwrite_int32(fp, model->n_tgprob);
317 for (i = 0; i < model->n_tgprob; i++)
318 fwrite_int32(fp, model->tgprob[i].l);
319 }
320
321 void
lm3g_dump_write_tg_segbase(FILE * fp,lm_t * model)322 lm3g_dump_write_tg_segbase(FILE * fp, lm_t * model)
323 {
324 int32 i, k;
325 k = (model->n_bg + 1) / BG_SEG_SZ + 1;
326 fwrite_int32(fp, k);
327 for (i = 0; i < k; i++)
328 fwrite_int32(fp, model->tg_segbase[i]);
329 }
330
331 void
lm3g_dump_write_wordstr(FILE * fp,lm_t * model)332 lm3g_dump_write_wordstr(FILE * fp, lm_t * model)
333 {
334 int32 i, k;
335 k = 0;
336 for (i = 0; i < model->n_ug; i++)
337 k += strlen(model->wordstr[i]) + 1;
338 fwrite_int32(fp, k);
339 for (i = 0; i < model->n_ug; i++)
340 fwrite(model->wordstr[i], sizeof(char),
341 strlen(model->wordstr[i]) + 1, fp);
342 }
343
344 int32
lm3g_dump(char const * file,lm_t * model,char const * lmfile,int32 mtime,int32 noBits)345 lm3g_dump(char const *file, /**< the file name */
346 lm_t * model, /**< the langauge model for output */
347 char const *lmfile, /**< the */
348 int32 mtime, /**< LM file modification date */
349 int32 noBits /**< Number of bits of DMP format */
350 )
351 {
352 FILE *fp;
353 int32 is32bits;
354
355 if (noBits != 16 && noBits != 32) {
356 E_ERROR("No of Bits specified is not 16 or 32\n");
357 return LM_FAIL;
358 }
359
360 is32bits = (noBits == 32);
361
362 if (!is32bits && model->n_ug > LM_LEGACY_CONSTANT) {
363 E_ERROR
364 ("Number of words is larger than %d, but 16 bits models were used\n",
365 LM_LEGACY_CONSTANT);
366 return LM_FAIL;
367 }
368 /*
369 * If is32bits,
370 */
371
372 E_INFO("Dumping LM to %s\n", file);
373 if ((fp = fopen(file, "wb")) == NULL) {
374 E_ERROR("Cannot create file %s\n", file);
375 return LM_FAIL;
376 }
377
378 lm3g_dump_write_header(fp);
379 lm3g_dump_write_lm_filename(fp, lmfile);
380 lm3g_dump_write_version(fp, model, mtime, is32bits);
381
382 /* Write version# and LM file modification date */
383 lm3g_dump_write_fmtdesc(fp);
384
385 /* HACK!! Write only if different from previous version */
386 if (model->log_bg_seg_sz != LOG2_BG_SEG_SZ)
387 fwrite_int32(fp, model->log_bg_seg_sz);
388
389 lm3g_dump_write_ngram_counts(fp, model);
390
391 if (!is32bits && model->n_ug > LM_LEGACY_CONSTANT) {
392 E_ERROR
393 ("The model is a 16 bits' one but the number of unigram has more thant 65535 words (>16 bits)");
394 return LM_FAIL;
395 }
396
397 lm3g_dump_write_unigram(fp, model);
398
399 /**
400 20060302 ARCHAN
401 This part is where the 16/32 bits differ
402 */
403
404 lm_convert_structure(model, is32bits);
405 lm3g_dump_write_bigram(fp, model, is32bits);
406 lm3g_dump_write_trigram(fp, model, is32bits);
407
408 /**************************************/
409
410 lm3g_dump_write_bgprob(fp, model);
411
412 if (model->n_tg > 0) {
413 lm3g_dump_write_tgbowt(fp, model);
414 lm3g_dump_write_tgprob(fp, model);
415 lm3g_dump_write_tg_segbase(fp, model);
416 }
417
418 lm3g_dump_write_wordstr(fp, model);
419
420 fclose(fp);
421 return LM_SUCCESS;
422 }
423
424
425 static int32
lm_fread_int32(lm_t * lm)426 lm_fread_int32(lm_t * lm)
427 {
428 int32 val;
429
430 if (fread(&val, sizeof(int32), 1, lm->fp) != 1)
431 E_FATAL("fread failed\n");
432 if (lm->byteswap)
433 SWAP_INT32(&val);
434 return (val);
435 }
436
437
438 /**
439 20060303: ARCHAN
440
441 lm_read_dump_header will read in the DMP format. What it will do
442 is to compare the value read in with the darpa_hdr ("Darpa
443 Trigram LM"). If it matches, that means there is no byte
444 swap. If it doesn't, we will try to swap the value and match the
445 header again. If it still doesn't work, that means something is
446 wrong. (e.g. Format problem of the DMP file).
447
448 This process will also allow us to know the byte-order of the
449 DMP file. Swapping could then automatically done in the code.
450 */
451 static int32
lm_read_dump_header(lm_t * lm,const char * file)452 lm_read_dump_header(lm_t * lm, /**< The LM */
453 const char *file /**< The file we are reading */
454 )
455 {
456 int32 k;
457 char str[1024];
458
459 /* Standard header string-size; set byteswap flag based on this */
460 if (fread(&k, sizeof(int32), 1, lm->fp) != 1)
461 E_FATAL("fread(%s) failed\n", file);
462
463 if ((size_t) k == strlen(darpa_hdr) + 1)
464 lm->byteswap = 0;
465 else {
466 SWAP_INT32(&k);
467 if ((size_t) k == strlen(darpa_hdr) + 1)
468 lm->byteswap = 1;
469 else {
470 SWAP_INT32(&k);
471 E_INFO("Bad magic number: %d(%08x), not an LM dumpfile??\n", k,
472 k);
473 return LM_FAIL;
474 }
475 }
476
477 /* Read and verify standard header string */
478 if (fread(str, sizeof(char), k, lm->fp) != (size_t) k) {
479 E_ERROR("fread(%s) failed\n", file);
480 return LM_FAIL;
481 }
482 if (strncmp(str, darpa_hdr, k) != 0) {
483 E_ERROR("Bad header\n");
484 return LM_FAIL;
485 }
486
487 return LM_SUCCESS;
488
489 }
490
491 static int32
lm_read_lmfilename(lm_t * lm,const char * file)492 lm_read_lmfilename(lm_t * lm, /**< The LM */
493 const char *file /**< The file we are reading */
494 )
495 {
496 int32 k;
497 char str[1024];
498
499 /* Original LM filename string size and string */
500 k = lm_fread_int32(lm);
501 if ((k < 1) || (k > 1024)) {
502 E_ERROR("Bad original filename size: %d\n", k);
503 return LM_FAIL;
504 }
505 if (fread(str, sizeof(char), k, lm->fp) != (size_t) k) {
506 E_ERROR("fread(%s) failed\n", file);
507 return LM_FAIL;
508 }
509
510 return LM_SUCCESS;
511 }
512
513 /**
514 20060303 ARCHAN:
515
516 lm_read_dump_ver_nug read in the version number and number of
517 unigram from a LM dump file. They are related because of legacy.
518 Here is a survey of what's going on in our past routines at
519 timestamp 20060303.
520
521 Before Sphinx 3.X (X<4), the routines of reading DMP format of LM
522 have appeared in 3 places. First place is Sphinx 2's lm3g_load
523 which doesn't take care of version=LMDMP_VERSION_TG_16BIT_V2. The
524 second place is share's lm3g_load which takes care of
525 version=LMDMP_VERSION_TG_16BIT (-1),
526 version=LMDMP_VERSION_TG_16BIT_V2 (-2) and version >
527 LMDMP_VERSIONNULL (0). The last one is Sphinx 3 which is
528 essentially a quick hack of Sphinx 2's version. (* Note because of
529 the legacy naming system version > 0 here is actually the oldest
530 version)
531
532 What is in the version then? From the source code, you could
533 backtrace the story. At the beginning, the version number is used
534 to store the number of unigram. Hence, it is a number which can be
535 larger than LMDMP_VERSIONNULL (0).
536
537 However, quickly, the programmer found that it doesn't make sense
538 to do that. Hence, version soon appear. version here really
539 mean the version number of the LM.
540
   Here is one small problem: the programmer found that log_bg_seg_sz
   needed to be changed. So he decided to introduce
   version=LMDMP_VERSION_TG_16BIT_V2, i.e. a version that doesn't follow
   the current default value of log_bg_seg_sz (=9).
545
546 At 20060303, the current code assume all versions
547 <LMDMP_VERSION_TG_32BIT are equivalent. This is likely to change
548 because we might need to introduce version 3, 4 and 5.
549 */
550
551 static int32
lm_read_dump_ver_nug(lm_t * lm,const char * file)552 lm_read_dump_ver_nug(lm_t * lm, /**< The LM*/
553 const char *file /**< The file we are reading */
554 )
555 {
556 int32 k;
557 char str[1024];
558
559 /* Version#. If present (must be <= 0); otherwise it's actually the unigram count */
560 lm->version = lm_fread_int32(lm);
561
562 if (lm->version <= 0) {
563 /* Read and skip orginal file timestamp;
564 ARCHAN: Unlike the sphinx2's code, currently, the timestamp
565 is not compared in Sphinx 3.
566 */
567 k = lm_fread_int32(lm);
568
569 /* Read and skip format description */
570 for (;;) {
571 if ((k = lm_fread_int32(lm)) == 0)
572 break;
573 if (fread(str, sizeof(char), k, lm->fp) != (size_t) k) {
574 E_ERROR("fread(%s) failed\n", file);
575 return LM_FAIL;
576 }
577 }
578
579 /* Read log_bg_seg_sz if present */
580
581 /* ARCHAN 20060304
582 use lm->version == -2 (LMDMP_VERSION_TG_16BIT_V2) instead of lm->version <2,
583 This is different from share's version
584 */
585 if (lm->version == LMDMP_VERSION_TG_16BIT_V2) {
586 k = lm_fread_int32(lm);
587 if ((k < 1) || (k > 15)) {
588 E_ERROR("log2(bg_seg_sz) %d outside range 1..15 \n", k);
589 return LM_FAIL;
590 }
591 lm->log_bg_seg_sz = k;
592 }
593 else {
594 lm->log_bg_seg_sz = LOG2_BG_SEG_SZ; /* Default */
595 }
596
597 /* Read #ug */
598 lm->n_ug = lm_fread_int32(lm);
599
600 }
601 else {
602 /* oldest dump file version has no version# or any of the above */
603 if (lm->version > lm->n_ug) {
604 E_ERROR("LM.ucount(%d) out of range [1..%d]\n", lm->version,
605 lm->n_ug);
606 return LM_FAIL;
607 }
608
609 /* No version number, actually a unigram count */
610 lm->n_ug = lm->version;
611 lm->log_bg_seg_sz = LOG2_BG_SEG_SZ; /* Default */
612 }
613
614
615 lm->is32bits = lm_is32bits(lm);
616 if ((lm->n_ug <= 0) || (lm->n_ug >= MAX_LMWID(lm))) {
617 E_ERROR("Bad #ug: %u (must be >0, <%u) Version %d\n", lm->n_ug,
618 MAX_LMWID(lm), lm->version);
619 return LM_FAIL;
620 }
621
622 lm->bg_seg_sz = 1 << lm->log_bg_seg_sz;
623
624 if (lm->version == LMDMP_VERSION_TG_32BIT) {
625 E_INFO("Reading LM in 32 bits format\n");
626 }
627 else if (lm->version > LMDMP_VERSIONNULL ||
628 lm->version == LMDMP_VERSION_TG_16BIT ||
629 lm->version == LMDMP_VERSION_TG_16BIT_V2) {
630 E_INFO("Reading LM in 16 bits format\n");
631 }
632
633 return LM_SUCCESS;
634 }
635
636 static int32
lm_read_dump_ng_counts(lm_t * lm,const char * file)637 lm_read_dump_ng_counts(lm_t * lm, const char *file)
638 {
639 /* #bigrams */
640 lm->n_bg = lm_fread_int32(lm);
641 if (lm->n_bg < 0) {
642 E_ERROR("Bad #bigrams: %d\n", lm->n_bg);
643 return LM_FAIL;
644 }
645
646 /* #trigrams */
647 lm->n_tg = lm_fread_int32(lm);
648 if (lm->n_tg < 0) {
649 E_ERROR("Bad #trigrams: %d\n", lm->n_tg);
650 return LM_FAIL;
651 }
652
653 if (lm->n_bg > 0)
654 lm->n_ng = 2;
655
656 if (lm->n_tg > 0)
657 lm->n_ng = 3;
658
659 return LM_SUCCESS;
660 }
661
662
663 static int32
lm_read_dump_ug(lm_t * lm,const char * file)664 lm_read_dump_ug(lm_t * lm, const char *file)
665 {
666 int32 i;
667
668 assert(lm->n_ug > 0);
669
670 /* Read ug; remember sentinel ug at the end! */
671 lm->ug = (ug_t *) ckd_calloc(lm->n_ug + 1, sizeof(ug_t));
672 if (fread(lm->ug, sizeof(ug_t), lm->n_ug + 1, lm->fp) !=
673 (size_t) (lm->n_ug + 1)) {
674 E_ERROR("unigram fread(%s) failed\n", file);
675 return LM_FAIL;
676 /* E_FATAL("fread(%s) failed\n", file); */
677 }
678
679 if (lm->byteswap) {
680 for (i = 0; i <= lm->n_ug; i++) {
681 SWAP_INT32(&(lm->ug[i].prob.l));
682 SWAP_INT32(&(lm->ug[i].bowt.l));
683 SWAP_INT32(&(lm->ug[i].firstbg));
684 }
685 }
686 E_INFO("Read %8d unigrams [in memory]\n", lm->n_ug);
687 return LM_SUCCESS;
688 }
689
690
691
692 /**
693 Reading bigram in the DMP format.
694
695 When lm->isLM_IN_MEMORY is turned on. A memory space will be
696 allocated based. Recorded the offset of bigram. Then the lm will be
697 read from the file in one piece (lm->n_bg+1 *sizeof(bg_t)
698
699 When lm->isLM_IN_MEMORY is turned off, we will just skip
700 (lm->n_bg+1 * sizeof(bg_t)) byte memory and recorded the offset of
701 bigram. In this case, the program will be operated in disk mode.
702
703 ARCHAN 20060304, First introduced 32 bits reading. This is whether
704 the code is 32bit or not, lm->bg32 or lm->bg (16bits) will be used.
705 */
706 static int32
lm_read_dump_bg(lm_t * lm,const char * file,int32 is32bits)707 lm_read_dump_bg(lm_t * lm, /**< LM */
708 const char *file, /**< file we are reading */
709 int32 is32bits /**< Is it a 32 bits reading? */
710 )
711 {
712 int32 i;
713 int32 mem_sz;
714 void *lmptr;
715 assert(lm->n_bg > 0);
716
717 mem_sz = is32bits ? sizeof(bg32_t) : sizeof(bg_t);
718 lmptr = NULL;
719
720 /** Allocate memory */
721 if (lm->isLM_IN_MEMORY) { /* Remember the sentinel */
722 if ((lmptr = ckd_calloc(lm->n_bg + 1, mem_sz)) == NULL) {
723 E_ERROR
724 ("Fail to allocate memory with size %d for bigram reading. Each bigram with size\n",
725 lm->n_bg + 1, mem_sz);
726 return LM_FAIL;
727 }
728 }
729 else {
730 lmptr = NULL;
731 }
732
733 if (lm->n_bg > 0) {
734
735 lm->bgoff = ftell(lm->fp);
736
737 if (lm->isLM_IN_MEMORY) {
738 if (is32bits) {
739 lm->bg32 = (bg32_t *) lmptr;
740 fread(lm->bg32, lm->n_bg + 1, mem_sz, lm->fp);
741 if (lm->byteswap) {
742 for (i = 0; i <= lm->n_bg; i++)
743 swap_bg32(&(lm->bg32[i]));
744 }
745 }
746 else {
747 lm->bg = (bg_t *) lmptr;
748 fread(lm->bg, lm->n_bg + 1, mem_sz, lm->fp);
749 if (lm->byteswap) {
750 for (i = 0; i <= lm->n_bg; i++)
751 swap_bg(&(lm->bg[i]));
752 }
753 }
754
755 E_INFO("Read %8d bigrams [in memory]\n", lm->n_bg);
756 }
757 else {
758 fseek(lm->fp, (lm->n_bg + 1) * mem_sz, SEEK_CUR);
759 E_INFO("%8d bigrams [on disk]\n", lm->n_bg);
760 }
761
762 }
763
764 return LM_SUCCESS;
765 }
766
767 /*
768
769 Similar to lm_read_dump_bg, note instead of lm->n_tg+1, we are
770 working on lm->n_tg here.
771 @see lm_read_dump_bg
772 */
773
774 static int32
lm_read_dump_tg(lm_t * lm,const char * file,int is32bits)775 lm_read_dump_tg(lm_t * lm, /**< LM */
776 const char *file, /**< file we are reading */
777 int is32bits /**< Whether the data structure is 32 bits */
778 )
779 {
780 int32 i;
781 int32 mem_sz;
782 void *lmptr;
783 /* Number of Trigrams might be zero
784 */
785
786
787 assert(lm->n_tg >= 0);
788
789 mem_sz = is32bits ? sizeof(tg32_t) : sizeof(tg_t);
790 lmptr = NULL;
791
792 if (lm->isLM_IN_MEMORY && lm->n_tg > 0) {
793 if ((lmptr = ckd_calloc(lm->n_tg + 1, mem_sz)) == NULL) {
794 E_ERROR
795 ("Fail to allocate memory with size %d for trigram reading. Each trigram with mem_sz\n",
796 lm->n_tg + 1, mem_sz);
797 return LM_FAIL;
798 }
799
800 }
801 else
802 lmptr = NULL;
803
804 if (lm->n_tg > 0) { /* Read bigrams; remember sentinel at the end */
805
806 lm->tgoff = ftell(lm->fp);
807
808 if (lm->isLM_IN_MEMORY) {
809 if (is32bits) {
810 lm->tg32 = (tg32_t *) lmptr;
811 fread(lm->tg32, lm->n_tg, mem_sz, lm->fp);
812 if (lm->byteswap) {
813 for (i = 0; i <= lm->n_tg - 1; i++) {
814 swap_tg32(&(lm->tg32[i]));
815 }
816 }
817 }
818 else {
819 lm->tg = (tg_t *) lmptr;
820 fread(lm->tg, lm->n_tg, mem_sz, lm->fp);
821 if (lm->byteswap) {
822 for (i = 0; i <= lm->n_tg - 1; i++) {
823 swap_tg(&(lm->tg[i]));
824 }
825 }
826 }
827
828 E_INFO("Read %8d trigrams [in memory]\n", lm->n_tg);
829 }
830 else {
831 fseek(lm->fp, (lm->n_tg) * mem_sz, SEEK_CUR);
832 E_INFO("%8d bigrams [on disk]\n", lm->n_tg);
833 }
834 }
835 return LM_SUCCESS;
836 }
837
838 static int32
lm_read_dump_calloc_membg_tginfo(lm_t * lm,const char * file,int is32bits)839 lm_read_dump_calloc_membg_tginfo(lm_t * lm, const char *file, int is32bits)
840 {
841 void *lmptr, *lmptr2;
842 int32 mem_sz, mem_sz2;
843
844 lmptr = lmptr2 = NULL;
845 mem_sz = is32bits ? sizeof(membg32_t) : sizeof(membg_t);
846 mem_sz2 = is32bits ? sizeof(tginfo32_t *) : sizeof(tginfo_t *);
847
848 if (lm->n_bg > 0) {
849 if ((lmptr = ckd_calloc(lm->n_ug, mem_sz)) == NULL) {
850 E_ERROR("Failed to allocate memory for membg.\n");
851 return LM_FAIL;
852 }
853 }
854
855 if (lm->n_tg > 0) {
856 if ((lmptr2 = ckd_calloc(lm->n_ug, mem_sz2)) == NULL) {
857 E_ERROR("Failed to allocate memory for tginfo.\n");
858 return LM_FAIL;
859 }
860 }
861
862 if (is32bits) {
863 lm->membg32 = (membg32_t *) lmptr;
864 lm->tginfo32 = (tginfo32_t **) lmptr2;
865 }
866 else {
867 lm->membg = (membg_t *) lmptr;
868 lm->tginfo = (tginfo_t **) lmptr2;
869 }
870 return LM_SUCCESS;
871
872 }
873
874 static int32
lm_read_dump_bgprob(lm_t * lm,const char * file,int32 is32bits)875 lm_read_dump_bgprob(lm_t * lm, const char *file, int32 is32bits)
876 {
877 int32 i;
878 uint32 upper_limit;
879
880 upper_limit = is32bits ? LM_SPHINX_CONSTANT : LM_LEGACY_CONSTANT;
881 /* E_INFO("%d upper_limit\n",upper_limit); */
882 if (lm->n_bg > 0) {
883 /* Bigram probs table size */
884 lm->n_bgprob = lm_fread_int32(lm);
885 if ((lm->n_bgprob <= 0) || (lm->n_bgprob > upper_limit)) {
886 E_ERROR("Bad bigram prob table size: %d\n", lm->n_bgprob);
887 return LM_FAIL;
888 }
889
890 /* Allocate and read bigram probs table */
891 lm->bgprob = (lmlog_t *) ckd_calloc(lm->n_bgprob, sizeof(lmlog_t));
892 if (fread(lm->bgprob, sizeof(lmlog_t), lm->n_bgprob, lm->fp) !=
893 (size_t) lm->n_bgprob) {
894 E_ERROR("fread(%s) failed\n", file);
895 return LM_FAIL;
896 }
897 if (lm->byteswap) {
898 for (i = 0; i < lm->n_bgprob; i++)
899 SWAP_INT32(&(lm->bgprob[i].l));
900 }
901
902 E_INFO("%8d bigram prob entries\n", lm->n_bgprob);
903 }
904 return LM_SUCCESS;
905
906 }
907
908 static int32
lm_read_dump_tgbowt(lm_t * lm,const char * file,int32 is32bits)909 lm_read_dump_tgbowt(lm_t * lm, const char *file, int32 is32bits)
910 {
911 int32 i;
912 uint32 upper_limit;
913
914 upper_limit = is32bits ? LM_SPHINX_CONSTANT : LM_LEGACY_CONSTANT;
915
916 if (lm->n_tg > 0) {
917 /* Trigram bowt table size */
918 lm->n_tgbowt = lm_fread_int32(lm);
919 if ((lm->n_tgbowt <= 0) || (lm->n_tgbowt > upper_limit)) {
920 E_ERROR("Bad trigram bowt table size: %d\n", lm->n_tgbowt);
921 return LM_FAIL;
922 }
923
924 /* Allocate and read trigram bowt table */
925 lm->tgbowt = (lmlog_t *) ckd_calloc(lm->n_tgbowt, sizeof(lmlog_t));
926 if (fread(lm->tgbowt, sizeof(lmlog_t), lm->n_tgbowt, lm->fp) !=
927 (size_t) lm->n_tgbowt) {
928
929 E_ERROR("fread(%s) failed\n", file);
930 return LM_FAIL;
931 }
932 if (lm->byteswap) {
933 for (i = 0; i < lm->n_tgbowt; i++)
934 SWAP_INT32(&(lm->tgbowt[i].l));
935 }
936 E_INFO("%8d trigram bowt entries\n", lm->n_tgbowt);
937 }
938 return LM_SUCCESS;
939 }
940
941 static int32
lm_read_dump_tgprob(lm_t * lm,const char * file,int32 is32bits)942 lm_read_dump_tgprob(lm_t * lm, const char *file, int32 is32bits)
943 {
944 int32 i;
945 uint32 upper_limit;
946
947 upper_limit = is32bits ? LM_SPHINX_CONSTANT : LM_LEGACY_CONSTANT;
948
949 if (lm->n_tg > 0) {
950 lm->n_tgprob = lm_fread_int32(lm);
951 if ((lm->n_tgprob <= 0) || (lm->n_tgprob > upper_limit)) {
952 E_ERROR("Bad trigram bowt table size: %d\n", lm->n_tgprob);
953 return LM_FAIL;
954 }
955
956 /* Allocate and read trigram bowt table */
957 lm->tgprob = (lmlog_t *) ckd_calloc(lm->n_tgprob, sizeof(lmlog_t));
958 if (fread(lm->tgprob, sizeof(lmlog_t), lm->n_tgprob, lm->fp) !=
959 (size_t) lm->n_tgprob) {
960 E_ERROR("fread(%s) failed\n", file);
961 return LM_FAIL;
962 }
963 if (lm->byteswap) {
964 for (i = 0; i < lm->n_tgprob; i++)
965 SWAP_INT32(&(lm->tgprob[i].l));
966 }
967 E_INFO("%8d trigram prob entries\n", lm->n_tgprob);
968 }
969
970 return LM_SUCCESS;
971 }
972
973 /*
974 The only function which doesn't require switching in lm_read_dump
975 */
976 static int32
lm_read_dump_tg_segbase(lm_t * lm,const char * file)977 lm_read_dump_tg_segbase(lm_t * lm, const char *file)
978 {
979 int i, k;
980 if (lm->n_tg > 0) {
981 /* Trigram seg table size */
982 k = lm_fread_int32(lm);
983 if (k != (lm->n_bg + 1) / lm->bg_seg_sz + 1) {
984 E_ERROR("Bad trigram seg table size: %d\n", k);
985 return LM_FAIL;
986 }
987
988 /* Allocate and read trigram seg table */
989 lm->tg_segbase = (int32 *) ckd_calloc(k, sizeof(int32));
990 if (fread(lm->tg_segbase, sizeof(int32), k, lm->fp) != (size_t) k) {
991 E_ERROR("fread(%s) failed\n", file);
992 return LM_FAIL;
993 }
994 if (lm->byteswap) {
995 for (i = 0; i < k; i++)
996 SWAP_INT32(&(lm->tg_segbase[i]));
997 }
998 E_INFO("%8d trigram segtable entries (%d segsize)\n", k,
999 lm->bg_seg_sz);
1000 }
1001 return LM_SUCCESS;
1002 }
1003
1004 static int32
lm_read_dump_wordstr(lm_t * lm,const char * file,int32 is32bits)1005 lm_read_dump_wordstr(lm_t * lm, const char *file, int32 is32bits)
1006 {
1007 int32 i, j, k;
1008 char *tmp_word_str;
1009 s3lmwid32_t startwid, endwid;
1010
1011 /* Read word string names */
1012 k = lm_fread_int32(lm);
1013 if (k <= 0) {
1014 E_ERROR("Bad wordstrings size: %d\n", k);
1015 return LM_FAIL;
1016 }
1017
1018 tmp_word_str = (char *) ckd_calloc(k, sizeof(char));
1019 if (fread(tmp_word_str, sizeof(char), k, lm->fp) != (size_t) k) {
1020 E_ERROR("fread(%s) failed\n", file);
1021 return LM_FAIL;
1022 }
1023
1024 /* First make sure string just read contains n_ug words (PARANOIA!!) */
1025 for (i = 0, j = 0; i < k; i++)
1026 if (tmp_word_str[i] == '\0')
1027 j++;
1028
1029 if (j != lm->n_ug) {
1030 E_ERROR("Bad #words: %d\n", j);
1031 return LM_FAIL;
1032 }
1033
1034
1035 startwid = endwid = (s3lmwid32_t) BAD_LMWID(lm);
1036
1037
1038 lm->wordstr = (char **) ckd_calloc(lm->n_ug, sizeof(char *));
1039 j = 0;
1040 for (i = 0; i < lm->n_ug; i++) {
1041 if (strcmp(tmp_word_str + j, S3_START_WORD) == 0)
1042 startwid = i;
1043 else if (strcmp(tmp_word_str + j, S3_FINISH_WORD) == 0)
1044 endwid = i;
1045
1046 lm->wordstr[i] = (char *) ckd_salloc(tmp_word_str + j);
1047
1048 hash_table_enter(lm->HT, lm->wordstr[i], (void *)(long)i);
1049
1050 j += strlen(tmp_word_str + j) + 1;
1051 }
1052 free(tmp_word_str);
1053 E_INFO("%8d word strings\n", i);
1054
1055 /* Force ugprob(<s>) = MIN_PROB_F */
1056 if (IS_LMWID(lm, startwid)) {
1057 lm->ug[startwid].prob.f = MIN_PROB_F;
1058 lm->startlwid = startwid;
1059 }
1060
1061 /* Force bowt(</s>) = MIN_PROB_F */
1062 if (IS_LMWID(lm, endwid)) {
1063 lm->ug[endwid].bowt.f = MIN_PROB_F;
1064 lm->finishlwid = endwid;
1065 }
1066 else {
1067 E_WARN("No </s> in LM!\n");
1068 }
1069
1070 return LM_SUCCESS;
1071 }
1072
1073
1074 /**
   The core of reading the data structures from the LM file.  How it
   operates depends on the dump version.  Here is a summary of what is
   going on in each version.
1078
1079 1, In version >0, version=-1(LMDMP_VERSION_TG_16BIT),
1080 -2(LMDMP_VERSION_TG_16BIT_V2),
1081
1082 The code will read the file using the following sequence.
1083 -read unigram (*_dump_ug)
1084 -read bigram (*_dump_bg)
1085 -read trigram (*_dump_tg)
1086 -create mem bigram
1087 -create trigram info
1088 -read the actual bigram probability (*_dump_bgprob)
1089 -read the actual trigram backoff weight (*_dump_tgbowt)
1090 -read the actual trigram probability (*_dump_tgprob)
1091 -read the actual trigram segment base. (*_dump_tgsegbase)
1092 -read the word str into the code.
1093
1094 bigram, trigram, membg, tg_info are all in 16 bits. unigram in
1095 Sphinx 2, Sphinx 3.x (x<4) legacy are already 32 bits.
1096
1097 bgprob, tgbowt, tgprob, tgsegbase are arrays, their size are all
1098 controlled by a number which in int32. We are cool here.
1099
1100 2, In version = -3 (LMDMP_VERSION_TG_32BIT)
1101
1102 The code will read the file using the following sequence.
1103
1104 -read unigram (*_dump_ug)
1105 -read bigram in 32 bits (*_dump_bg)
1106 -read trigram in 32 bits (*_dump_tg)
1107 -create mem bigram in 32 bits
1108 -create trigram info in 32 bits.
1109 -read the actual bigram probability (*_dump_bgprob)
1110 -read the actual trigram backoff weight (*_dump_tgbowt)
1111 -read the actual trigram probability (*_dump_tgprob)
1112 -read the actual trigram segment base. (*_dump_tgsegbase)
1113 -read the word str into the code.
1114
   Here, all data structures use 32-bit data structures, or address
   arrays as int32 arrays.  However, because the legacy implementation
   checks the size in bgprob, tgbowt and tgprob, I conformed to that
   coding style.  So, except for _dump_ug and _dump_tg_segbase, all the
   routines now take an is32bits argument.  The major difference
   between the two reading paths is mainly in _dump_bg and _dump_tg.
1121
1122
1123 On coding :
1124
   Each LM DMP version simply spells out all the routines it uses.  We
   are aware that you could optimize this.  Please don't, because it
   will kill readability in the future.
1128
1129 We also want to support LIUM's lm format and a general n-gram format
1130 in my mind. We will see.
1131
1132 */
1133
1134 static int32
lm_read_dump_ng(lm_t * lm,const char * file)1135 lm_read_dump_ng(lm_t * lm, const char *file)
1136 {
1137
1138 if (lm->version == LMDMP_VERSION_TG_16BIT ||
1139 lm->version == LMDMP_VERSION_TG_16BIT_V2 ||
1140 lm->version >= LMDMP_VERSIONNULL) {
1141
1142 if (lm_read_dump_ug(lm, file) == LM_FAIL) {
1143 E_ERROR("Error in reading unigram. \n");
1144 return LM_FAIL;
1145 }
1146
1147 if (lm_read_dump_bg(lm, file, IS16BITS) == LM_FAIL) {
1148 E_ERROR("Error in reading bigram. \n");
1149 return LM_FAIL;
1150 }
1151
1152 if (lm_read_dump_tg(lm, file, IS16BITS) == LM_FAIL) {
1153 E_ERROR("Error in reading trigram. \n");
1154 return LM_FAIL;
1155 }
1156
1157 if (lm_read_dump_calloc_membg_tginfo(lm, file, IS16BITS) ==
1158 LM_FAIL) {
1159 E_ERROR
1160 ("Error in allocating memory bigram and trigram info. \n");
1161 return LM_FAIL;
1162 }
1163
1164 if (lm_read_dump_bgprob(lm, file, IS16BITS) == LM_FAIL) {
1165 E_ERROR("Error in reading bigram probability. \n");
1166 return LM_FAIL;
1167 }
1168
1169 if (lm_read_dump_tgbowt(lm, file, IS16BITS) == LM_FAIL) {
1170 E_ERROR("Error in reading trigram back off weight. \n");
1171
1172 return LM_FAIL;
1173 }
1174
1175 if (lm_read_dump_tgprob(lm, file, IS16BITS) == LM_FAIL) {
1176 E_ERROR("Error in reading trigram probability. \n");
1177 return LM_FAIL;
1178 }
1179
1180 if (lm_read_dump_tg_segbase(lm, file) == LM_FAIL) {
1181 E_ERROR("Error in reading trigram segment base. \n");
1182 return LM_FAIL;
1183 }
1184
1185 if (lm_read_dump_wordstr(lm, file, IS16BITS) == LM_FAIL) {
1186 E_ERROR("Error in reading the word str. \n");
1187 return LM_FAIL;
1188 }
1189 }
1190 else if (lm->version == LMDMP_VERSION_TG_32BIT) {
1191
1192 if (lm_read_dump_ug(lm, file) == LM_FAIL) {
1193 E_ERROR("Error in reading unigram. \n");
1194 return LM_FAIL;
1195 }
1196
1197 if (lm_read_dump_bg(lm, file, IS32BITS) == LM_FAIL) {
1198 E_ERROR("Error in reading bigram. \n");
1199 return LM_FAIL;
1200 }
1201
1202 if (lm_read_dump_tg(lm, file, IS32BITS) == LM_FAIL) {
1203 E_ERROR("Error in reading trigram. \n");
1204 return LM_FAIL;
1205 }
1206
1207 if (lm_read_dump_calloc_membg_tginfo(lm, file, IS32BITS) ==
1208 LM_FAIL) {
1209 E_ERROR
1210 ("Error in allocating memory bigram and trigram info. \n");
1211 return LM_FAIL;
1212 }
1213
1214 if (lm_read_dump_bgprob(lm, file, IS32BITS) == LM_FAIL) {
1215 E_ERROR("Error in reading bigram probability. \n");
1216 return LM_FAIL;
1217 }
1218
1219 if (lm_read_dump_tgbowt(lm, file, IS32BITS) == LM_FAIL) {
1220 E_ERROR("Error in reading trigram back off weight. \n");
1221 return LM_FAIL;
1222 }
1223
1224 if (lm_read_dump_tgprob(lm, file, IS32BITS) == LM_FAIL) {
1225 E_ERROR("Error in reading trigram probability. \n");
1226 return LM_FAIL;
1227 }
1228
1229 if (lm_read_dump_tg_segbase(lm, file) == LM_FAIL) {
1230 E_ERROR("Error in reading trigram segment base. \n");
1231 return LM_FAIL;
1232 }
1233
1234 if (lm_read_dump_wordstr(lm, file, IS32BITS) == LM_FAIL) {
1235 E_ERROR("Error in reading the word str. \n");
1236 return LM_FAIL;
1237 }
1238
1239 }
1240 else {
1241 E_ERROR("Error, Format %d is unknown\n", lm->version);
1242 return LM_FAIL;
1243 }
1244
1245 return LM_SUCCESS;
1246 }
1247
1248 /**
1249 * Read LM dump (<lmname>.DMP) file and make it the current LM.
1250 * Same interface as lm_read except that the filename refers to a .DMP file.
1251 */
1252 lm_t *
lm_read_dump(const char * file,int lminmemory,logmath_t * logmath)1253 lm_read_dump(const char *file, /**< The file name*/
1254 int lminmemory, /**< Whether using in memory LM */
1255 logmath_t *logmath
1256 )
1257 {
1258 lm_t *lm;
1259
1260 lm = (lm_t *) ckd_calloc(1, sizeof(lm_t));
1261
1262 lm_null_struct(lm);
1263
1264 lm->isLM_IN_MEMORY = lminmemory;
1265 lm->n_ng = 1;
1266 lm->logmath = logmath;
1267
1268
1269 if ((lm->fp = fopen(file, "rb")) == NULL)
1270 E_FATAL_SYSTEM("fopen(%s,rb) failed\n", file);
1271
1272 /** Read header and compare byte order */
1273 if (lm_read_dump_header(lm, file) == LM_FAIL) {
1274 E_ERROR("Error in reading the header of the DUMP file. \n");
1275 fclose(lm->fp);
1276 ckd_free(lm);
1277 return NULL;
1278 }
1279
1280 /** Read the full path of file name of lm */
1281 if (lm_read_lmfilename(lm, file) == LM_FAIL) {
1282 E_ERROR("Error in reading the file name of lm. \n");
1283 fclose(lm->fp);
1284 ckd_free(lm);
1285 return NULL;
1286 }
1287
1288 /** Read the version number and number of unigram */
1289 if (lm_read_dump_ver_nug(lm, file) == LM_FAIL) {
1290 E_ERROR
1291 ("Error in reading the version name and number of unigram\n");
1292 fclose(lm->fp);
1293 ckd_free(lm);
1294 return NULL;
1295 }
1296
1297 /** Reading the count of ngrams. */
1298
1299 if (lm_read_dump_ng_counts(lm, file) == LM_FAIL) {
1300 E_ERROR("Error in reading the ngram counts. \n");
1301 fclose(lm->fp);
1302 ckd_free(lm);
1303 return NULL;
1304 }
1305
1306 lm->HT = hash_table_new(lm->n_ug, HASH_CASE_YES);
1307
1308
1309 /** Reading the ngrams, the meat of the code. Also decide how
1310 different versions of LM are read in.
1311 */
1312
1313 if (lm_read_dump_ng(lm, file) == LM_FAIL) {
1314 E_ERROR("Error in reading the ngram. \n");
1315 fclose(lm->fp);
1316 hash_table_free(lm->HT);
1317 ckd_free(lm);
1318 return NULL;
1319 }
1320
1321
1322 return lm;
1323 }
1324