1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3 * Copyright (c) 1999-2004 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37 /*
38 * lm.c -- Disk-based backoff word trigram LM module.
39 *
40 * **********************************************
41 * CMU ARPA Speech Project
42 *
43 * Copyright (c) 1997 Carnegie Mellon University.
44 * ALL RIGHTS RESERVED.
45 * **********************************************
46 *
47 * HISTORY
48 * $Log: lm.c,v $
49 * Revision 1.20 2006/03/03 20:02:38 arthchan2003
50 * Removed C++ styles comment. This will make options -ansi and -std=c89 happy
51 *
52 * Revision 1.19 2006/03/02 22:11:56 arthchan2003
53 * Fixed dox-doc.
54 *
55 * Revision 1.18 2006/03/01 20:03:55 arthchan2003
56 * Do encoding conversion when the encodings are different. This will avoid a lot of weird characters.
57 *
58 * Revision 1.17 2006/02/24 13:38:08 arthchan2003
59 * Added lm_read, it is a simple version of lm_read_advance.
60 *
61 * Revision 1.16 2006/02/23 04:16:29 arthchan2003
62 * Merged from SPHINX3_5_2_RCI_IRII_BRANCH:
63 * Splited the original lm.c into five parts,
64 * a, lm.c - a controller of other subroutines.
65 * b, lm_3g.c - implement TXT-based lm operations
66 * c, lm_3g_dmp.c - implement DMP-based lm operations
67 * d, lm_attfsm.c - implement FSM-based lm operations
68 * e, lmset.c - implement sets of lm.
69 *
70 *
71 * Revision 1.14.4.9 2006/01/16 19:56:37 arthchan2003
72 * 1, lm_rawscore doesn't need a language weight, 2, Support dumping the LM in FST format. This code used Yannick Esteve's and LIUM code.
73 *
74 * Revision 1.14.4.8 2005/11/17 06:18:49 arthchan2003
75 * Added a string encoding conversion routine in lm.c. Currently it only works for converting hex to its value.
76 *
77 * Revision 1.14.4.7 2005/10/17 04:49:13 arthchan2003
78 * Free resource of lm_t and lmset_t correctly.
79 *
80 * Revision 1.14.4.6 2005/09/07 23:30:26 arthchan2003
81 * Changed error message for LM dump.
82 *
83 * Revision 1.14.4.5 2005/08/02 21:10:18 arthchan2003
84 * Added function declaration for lm_read_dump.
85 *
86 * Revision 1.14.4.4 2005/07/17 05:24:23 arthchan2003
87 * (Incomplete) Added lm_arbitrary.[ch], an arbitrary n-gram data structure. Far from completed. Don't expect too much.
88 *
89 * Revision 1.14.4.3 2005/07/13 01:44:17 arthchan2003
90 * 1, Moved text formatted LM code into lm_3g.c, 2 Changed lm_read such that it will work with both TXT file format and DMP file format. 3, Added function lm_write to handle lm writing.
91 *
92 * Revision 1.14.4.2 2005/07/05 21:31:25 arthchan2003
93 * Merged from HEAD.
94 *
95 * Revision 1.15 2005/07/05 13:12:37 dhdfu
96 * Add new arguments to logs3_init() in some tests, main_ep
97 *
98 * Revision 1.14.4.1 2005/07/03 22:58:56 arthchan2003
99 * tginfo and membg 's memory were not deallocated at all. This change fixed it.
100 *
101 * Revision 1.14 2005/06/21 22:24:02 arthchan2003
102 * Log. In this change, I introduced a new interface for lm ,which is
103 * call lmset_t. lmset_t wraps up multiple lm, n_lm, n_alloclm into the
104 * same structure and handle LM initialization (lm_init) switching,
105 * (lmset_curlm_widx), delete LM (lmset_delete_lm). The internal
106 * structure is called lmarray and is an array of pointers of lm. The
107 * current lm is always maintained and pointed by a pointer called cur_lm
108 * . This substantially clarify the structure of the code. At this
109 * check-in, not every core function of lmset is completed.
110 * e.g. lmset_add_lm because that required testing of several LM reading
111 * routines and could be quite time-consuming.
112 *
113 * Log. Another notable change is the fact dict2lmwid map is started to
114 * be part of the LM. The reason of this is clearly described inside the
115 * code. Don't want to repeat here.
116 *
117 * Log. The new interface has been already used broadly in both Sphinx
118 * 3.0 and sphinx 3.x family of tools.
119 *
120 * Revision 1.4 2005/06/18 03:22:28 archan
121 * Add lmset_init. A wrapper function of various LM initialization and initialize an lmset It is now used in decode, livepretend, dag and astar.
122 *
123 * Revision 1.3 2005/06/17 23:44:40 archan
124 * Sphinx3 to s3.generic, 1, Support -lmname in decode and livepretend. 2, Wrap up the initialization of dict2lmwid to lm initialization. 3, add Dave's trick in LM switching in mode 4 of the search.
125 *
126 * Revision 1.2 2005/05/10 21:21:53 archan
127 * Three functionalities added but not tested. Code on 1) addition/deletion of LM in mode 4. 2) reading text-based LM 3) Converting txt-based LM to dmp-based LM.
128 *
129 * Revision 1.1 2005/05/04 06:08:07 archan
130 * Refactor all lm routines except fillpen.c into ./libs3decoder/liblm/ . This will be equivalent to ./lib/liblm in future.
131 *
132 * Revision 1.6 2005/05/04 04:02:24 archan
133 * Implementation of lm addition, deletion in (mode 4) time-switching tree implementation of search. Not yet tested. Just want to keep up my own momentum.
134 *
135 * Revision 1.5 2005/04/20 03:37:59 archan
136 * LM code changes: functions are added to set, add and delete LM from the lmset, change the legacy lmset data structure to contain n_lm and n_alloc_lm.
137 *
138 * Revision 1.4 2005/03/30 16:28:34 archan
139 * delete test-full.log alog
140 *
141 * Revision 1.3 2005/03/30 01:22:47 archan
142 * Fixed mistakes in last updates. Add
143 *
144 *
145 * 20.Apr.2001 RAH (rhoughton@mediasite.com, ricky.houghton@cs.cmu.edu)
146 * Adding lm_free() to free allocated memory
147 *
148 * 30-Dec-2000 Rita Singh (rsingh@cs.cmu.edu) at Carnegie Mellon University
149 * Removed language weight application to wip. To maintain
150 * comparability between s3decode and current decoder. Does
151 * not affect decoding performance.
152 *
153 * 23-Feb-2000 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
154 * Bugfix: Applied language weight to word insertion penalty.
155 *
156 * 24-Jun-97 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
157 * Added lm_t.access_type; made lm_wid externally visible.
158 *
159 * 24-Jun-97 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
160 * Added lm_t.log_bg_seg_sz and lm_t.bg_seg_sz.
161 *
162 * 13-Feb-97 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University.
163 * Creating from original S3 version.
164 */
165
166
167 #include <string.h>
168
169 #include "lm.h"
170 #include "bio.h"
171 #include "logs3.h"
172 #include "wid.h"
173 #include "encoding.h"
174
175 /*ARCHAN, 20041112: NOP, NO STATIC VARIABLES! */
176
177 extern lm_t *lm_read_txt(const char *filename, /**< The file name */
178 const int lminmemory, /**< Whether using in memory LM */
179 int *err_no, /**< Input/Output: Depends on the problem that LM
180 reading encounters, it could be errors
181 from -2 (LM_OFFSET_TOO_LARGE) to
182 -15 (LM_CANNOT_ALLOCATE). Please checkout
183 lm.h for details.
184 */
185 int32 isforced32bit, /** Input: normally, we should let lm_read_txt
186 to decide whether a file is 32 bit or not.
187 When the lm_read_txt couldn't decide that before
188 reading or if more specificially when we hit
189 the LM segment size problems. Then this bit
190 will alter the reading behavior to 32 bit.
191 */
192 logmath_t *logmath
193 );
194
195 extern lm_t *lm_read_dump(const char *file, /**< The file name*/
196 int lminmemory, /**< Whether using in memory LM */
197 logmath_t *logmath
198 );
199
200
201 int32 lm3g_dump(char const *file, /**< the file name */
202 lm_t * model, /**< the langauge model for output */
203 char const *lmfile, /**< the lm file name */
204 int32 mtime, /**< LM file modification date */
205 int32 noBits /**< Number of bits of DMP format */
206 );
207
208 /**
209 Writer of lm in ARPA text format
210 */
211 int32 lm_write_arpa_text(lm_t * lmp, /**< the pointer of the language model */
212 const char *outputfn, /**< the output file name */
213 const char *inputenc, /**< The input encoding method */
214 const char *outputenc /**< The output encoding method */
215 );
216
217 /**
218 Writer of lm in FST format
219 */
220
221 int32 lm_write_att_fsm(lm_t * lm, /**< the languauge model pointer */
222 const char *filename/**< output file name */
223 );
224
225
226 /**
227 The function to return whether an LM should be 32bit or not.
228 It is decided by whether we are using 32bit mode DMP. Or whether
229 it is LMTXT_VERSION but with more than 0xffff words. The final
230 criterion is when LMFORCE_TXT32VERSION.
231 */
232 int32
lm_is32bits(lm_t * lm)233 lm_is32bits(lm_t * lm)
234 {
235 if (lm->version == LMDMP_VERSION_TG_32BIT)
236 return 1;
237 if (lm->version == LMFORCED_TXT32VERSION)
238 return 1;
239 if (lm->version == LMTXT_VERSION && lm->n_ug > LM_LEGACY_CONSTANT)
240 return 1;
241 if (lm->version == LMFST_VERSION && lm->n_ug > LM_LEGACY_CONSTANT)
242 return 1;
243
244 return 0;
245 }
246
247
248 int32
lm_get_classid(lm_t * model,const char * name)249 lm_get_classid(lm_t * model, const char *name)
250 {
251 int32 i;
252
253 if (!model->lmclass)
254 return BAD_LMCLASSID;
255
256 for (i = 0; i < model->n_lmclass; i++) {
257 if (strcmp(lmclass_getname(model->lmclass[i]), name) == 0)
258 return (i + LM_CLASSID_BASE);
259 }
260 return BAD_LMCLASSID;
261 }
262
263
264
265 void
lm_null_struct(lm_t * lm)266 lm_null_struct(lm_t * lm)
267 {
268 lm->name = NULL;
269 lm->wordstr = NULL;
270
271 lm->ug = NULL;
272 lm->bg = NULL;
273 lm->tg = NULL;
274 lm->membg = NULL;
275 lm->tginfo = NULL;
276 lm->tgcache = NULL;
277 lm->dict2lmwid = NULL;
278
279 lm->bg32 = NULL;
280 lm->tg32 = NULL;
281 lm->membg32 = NULL;
282 lm->tginfo32 = NULL;
283 lm->tgcache32 = NULL;
284
285 lm->bgprob = NULL;
286 lm->tgprob = NULL;
287 lm->tgbowt = NULL;
288
289 lm->tg_segbase = NULL;
290 lm->lmclass = NULL;
291 lm->inclass_ugscore = NULL;
292 lm->logmath = NULL;
293 }
294
295 /* Apply unigram weight; should be part of LM creation, but... */
296 static void
lm_uw(lm_t * lm,float64 uw)297 lm_uw(lm_t * lm, float64 uw)
298 {
299 int32 i, loguw, loguw_, loguniform, p1, p2;
300
301 /* Interpolate unigram probs with uniform PDF, with weight uw */
302 loguw = logs3(lm->logmath, uw);
303 loguw_ = logs3(lm->logmath, 1.0 - uw);
304 loguniform = logs3(lm->logmath, 1.0 / (lm->n_ug - 1)); /* Skipping S3_START_WORD */
305
306 for (i = 0; i < lm->n_ug; i++) {
307 if (strcmp(lm->wordstr[i], S3_START_WORD) != 0) {
308 p1 = lm->ug[i].prob.l + loguw;
309 p2 = loguniform + loguw_;
310 lm->ug[i].prob.l = logmath_add(lm->logmath, p1, p2);
311 }
312 }
313 }
314
315
316 static void
lm2logs3(lm_t * lm,float64 uw)317 lm2logs3(lm_t * lm, float64 uw)
318 {
319 int32 i;
320
321 for (i = 0; i < lm->n_ug; i++) {
322 lm->ug[i].prob.l = logmath_log10_to_log(lm->logmath, lm->ug[i].prob.f);
323
324 /* This prevent underflow if the backoff value is too small
325 It happens sometimes in cmu-lmtk V3's lm_combine.
326 */
327
328 if (lm->ug[i].bowt.f < MIN_PROB_F)
329 lm->ug[i].bowt.f = MIN_PROB_F;
330
331 lm->ug[i].bowt.l = logmath_log10_to_log(lm->logmath, lm->ug[i].bowt.f);
332 }
333
334 lm_uw(lm, uw);
335
336 for (i = 0; i < lm->n_bgprob; i++)
337 lm->bgprob[i].l = logmath_log10_to_log(lm->logmath, lm->bgprob[i].f);
338
339 if (lm->n_tg > 0) {
340 for (i = 0; i < lm->n_tgprob; i++)
341 lm->tgprob[i].l = logmath_log10_to_log(lm->logmath, lm->tgprob[i].f);
342 for (i = 0; i < lm->n_tgbowt; i++) {
343
344 if (lm->tgbowt[i].f < MIN_PROB_F)
345 lm->tgbowt[i].f = MIN_PROB_F;
346
347 lm->tgbowt[i].l = logmath_log10_to_log(lm->logmath, lm->tgbowt[i].f);
348 }
349 }
350 }
351
352
353 void
lm_set_param(lm_t * lm,float64 lw,float64 wip)354 lm_set_param(lm_t * lm, float64 lw, float64 wip)
355 {
356 int32 i, iwip;
357 float64 f;
358
359 if (lw <= 0.0)
360 E_FATAL("lw = %e\n", lw);
361 if (wip <= 0.0)
362 E_FATAL("wip = %e\n", wip);
363 #if 0 /* No lang weight on wip */
364 iwip = logs3(lm->logmath, wip) * lw;
365 #endif
366 iwip = logs3(lm->logmath, wip);
367
368 f = lw / lm->lw;
369
370 for (i = 0; i < lm->n_ug; i++) {
371 lm->ug[i].prob.l =
372 (int32) ((lm->ug[i].prob.l - lm->wip) * f) + iwip;
373 lm->ug[i].bowt.l = (int32) (lm->ug[i].bowt.l * f);
374 }
375
376 for (i = 0; i < lm->n_bgprob; i++)
377 lm->bgprob[i].l = (int32) ((lm->bgprob[i].l - lm->wip) * f) + iwip;
378
379 if (lm->n_tg > 0) {
380 for (i = 0; i < lm->n_tgprob; i++)
381 lm->tgprob[i].l =
382 (int32) ((lm->tgprob[i].l - lm->wip) * f) + iwip;
383 for (i = 0; i < lm->n_tgbowt; i++)
384 lm->tgbowt[i].l = (int32) (lm->tgbowt[i].l * f);
385 }
386
387 lm->lw = (float32) lw;
388 lm->wip = iwip;
389 }
390
391
392 int32
lm_add_wordlist(lm_t * lm,dict_t * dict,const char * filename)393 lm_add_wordlist(lm_t * lm, /**< In/Out: a modified LM structure */
394 dict_t * dict, /**< In: a dictionary */
395 const char *filename /**< In: a file that contains a
396 list of word one wants to
397 add*/
398 )
399 {
400 FILE *fp;
401 char string[1024];
402 char word[1024];
403 int32 n;
404
405 fp = NULL;
406 if ((fp = fopen(filename, "r")) == NULL) {
407 E_ERROR("Cannot open file %s\n", filename);
408 return LM_FAIL;
409 }
410
411 while (fgets(string, sizeof(string), fp) != NULL) {
412 n = sscanf(string, "%s", word);
413 if (n != 1) {
414 E_INFO
415 ("Detecting more than 1 word in one line. Only using the first word. \n");
416 return LM_FAIL;
417 }
418 E_INFO("%s\n", word);
419 if (lm_add_word_to_ug(lm, dict, word) == LM_FAIL)
420 E_INFO("Fail to add word %s into the unigram\n", word);
421 }
422
423 if (lm == NULL) {
424 E_ERROR("LM pointer is NULL. lm_add_wordlist failed.\n");
425 return LM_FAIL;
426 }
427
428 fclose(fp);
429 return LM_SUCCESS;
430 }
431
432 /*
433 INCOMPLETE
434 */
435 int32
lm_add_word_to_ug(lm_t * lm,dict_t * dict,const char * newword)436 lm_add_word_to_ug(lm_t * lm, /**<In/Out: a modified LM structure */
437 dict_t * dict, /**< In: an initialized dictionary structure */
438 const char *newword /**< In: a new word */
439 )
440 {
441 s3wid_t w;
442 s3lmwid_t lwid;
443 void *id;
444 int32 classid = BAD_LMCLASSID;
445
446 /** ARCHAN 20060320
447 Add a word into the unigram.
448 look up the dictionary and see whether it exists in the dictionary
449 Looks alike with wid.c's logic at this point.
450
451 We also avoid the addition of classes at this point because that
452 could complicated things quite a lot */
453
454 /** Reallocate the size of lm->ug, lm->wordstr
455 Update the value lm->n_ug, lm->max_ug;
456 */
457
458 if (hash_table_lookup(lm->HT, newword, &id) == 0) {
459 E_WARN("The word %s already exists in the language model \n",
460 newword);
461 return LM_FAIL;
462 }
463
464 lm->n_ug = lm->n_ug + 1;
465 lm->max_ug = lm->n_ug;
466
467 E_INFO("lm->n_ug %d\n", lm->n_ug);
468 lm->ug = (ug_t *) ckd_realloc(lm->ug, (lm->n_ug + 1) * sizeof(ug_t)); /* Yes, +2 look at NewUnigramModel(n_ug+1) */
469 lm->wordstr =
470 (char **) ckd_realloc(lm->wordstr, (lm->n_ug) * sizeof(char *));
471
472 /** Reallocate the size of lm->membg
473 and lm->tginfo
474 */
475
476 if (!lm->is32bits) {
477 lm->membg =
478 (membg_t *) ckd_realloc(lm->membg,
479 (lm->n_ug) * sizeof(membg_t));
480 lm->tginfo =
481 (tginfo_t **) ckd_realloc(lm->tginfo,
482 (lm->n_ug) * sizeof(tginfo_t *));
483 lm->tginfo[lm->n_ug - 1] = NULL;
484 }
485 else {
486 lm->membg32 =
487 (membg32_t *) ckd_realloc(lm->membg32,
488 (lm->n_ug) * sizeof(membg32_t));
489 lm->tginfo32 =
490 (tginfo32_t **) ckd_realloc(lm->tginfo32,
491 (lm->n_ug) * sizeof(tginfo32_t *));
492 lm->tginfo32[lm->n_ug - 1] = NULL;
493 }
494
495
496 E_WARN("Invoke incomplete lm_add_word_to_ug\n");
497
498 /** Insert the entry into lm->ug and lm->wordstr */
499
500 /*
501 This part is not compeleted, prob.f should be the second best
502 unigram probability. This is a fairly standard that was used by
503 Dragon and also recommended by Roni.
504 */
505
506 lm->ug[lm->n_ug].prob.f = -99.0;
507 lm->ug[lm->n_ug].bowt.f = -99.0;
508 lm->ug[lm->n_ug].dictwid = lm->n_ug; /* See the comment in ug_t, this is not exactly correct
509 externally application needs to set it again.
510 */
511
512 /* Supposingly, the bigram should follow the unigram order.
513 Because, we have no bigram inserted in this case, the
514 unigram.firstbg will just follow the previous one. */
515
516 lm->ug[lm->n_ug].firstbg = lm->ug[lm->n_ug - 1].firstbg;
517
518 lm->wordstr[lm->n_ug - 1] = (char *) ckd_salloc(newword);
519
520 hash_table_enter(lm->HT, lm->wordstr[lm->n_ug - 1], (void *)(long)(lm->n_ug - 1));
521
522 if (dict != NULL) {
523 /** If dictionary is initialized and used in this context */
524 /** Insert the mapping from LM WID to dictionary Word ID */
525 w = dict_wordid(dict, newword);
526
527 if (lm->lmclass)
528 classid = lm_get_classid(lm, newword);
529
530 lwid = lm->dict2lmwid[w];
531
532 E_INFO("%d\n", lwid);
533
534 if (IS_S3WID(w)) {
535 if ((lm->lmclass) && (classid != BAD_LMCLASSID)) {
536 E_ERROR("%s is both a word and an LM class name\n",
537 lm_wordstr(lm, lm->n_ug - 1));
538 return LM_FAIL;
539 }
540 else {
541 if (dict_filler_word(dict, w))
542 E_ERROR("Filler dictionary word '%s' found in LM\n",
543 lm_wordstr(lm, lm->n_ug - 1));
544
545 if (w != dict_basewid(dict, w)) {
546 E_ERROR
547 ("LM word '%s' is an alternative pronunciation in dictionary\n",
548 lm_wordstr(lm, lm->n_ug - 1));
549
550 w = dict_basewid(dict, w);
551 lm_lmwid2dictwid(lm, lm->n_ug - 1) = w;
552 }
553
554 for (; IS_S3WID(w); w = dict_nextalt(dict, w))
555 lm->dict2lmwid[w] = (s3lmwid32_t) (lm->n_ug - 1);
556 }
557 }
558 else {
559 E_ERROR
560 ("Thew new word is not in the dictionary. We will not do anything in this case\n");
561 return LM_FAIL;
562 }
563
564 }
565 return LM_SUCCESS;
566 }
567
568 lm_t *
lm_read(const char * file,const char * lmname,cmd_ln_t * config,logmath_t * logmath)569 lm_read(const char *file, const char *lmname, cmd_ln_t *config, logmath_t *logmath)
570 {
571 return lm_read_advance(file,
572 lmname,
573 cmd_ln_float32_r(config, "-lw"),
574 cmd_ln_float32_r(config, "-wip"),
575 cmd_ln_float32_r(config, "-uw"), 0, NULL, 1, logmath);
576 }
577
578 lm_t *
lm_read_advance(const char * file,const char * lmname,float64 lw,float64 wip,float64 uw,int32 ndict,const char * fmt,int32 applyWeight,logmath_t * logmath)579 lm_read_advance(const char *file, const char *lmname, float64 lw,
580 float64 wip, float64 uw, int32 ndict, const char *fmt,
581 int32 applyWeight, logmath_t *logmath)
582 {
583 return lm_read_advance2(file, lmname, lw, wip, uw, ndict, fmt, applyWeight, 0, logmath);
584 }
585
586 lm_t *
lm_read_advance2(const char * file,const char * lmname,float64 lw,float64 wip,float64 uw,int32 ndict,const char * fmt,int32 applyWeight,int lminmemory,logmath_t * logmath)587 lm_read_advance2(const char *file, const char *lmname, float64 lw,
588 float64 wip, float64 uw, int32 ndict, const char *fmt,
589 int32 applyWeight, int lminmemory, logmath_t *logmath)
590 {
591 int32 i, u;
592 lm_t *lm;
593 int32 err_no;
594
595 if (!file)
596 E_FATAL("No LM file\n");
597 if (lw <= 0.0)
598 E_FATAL("lw = %e\n", lw);
599 if (wip <= 0.0)
600 E_FATAL("wip = %e\n", wip);
601 if ((uw < 0.0) || (uw > 1.0))
602 E_FATAL("uw = %e\n", uw);
603
604 /* HACK: At this part, one should check whether the LM name is being used already */
605
606 E_INFO("LM read('%s', lw= %.2f, wip= %.2f, uw= %.2f)\n", file, lw, wip,
607 uw);
608 E_INFO("Reading LM file %s (LM name \"%s\")\n", file, lmname);
609
610 /* First it will try to decide whether the file a .DMP file */
611 /* ARCHAN: We should provide function pointer implementation at here. */
612 if (fmt == NULL) {
613 /**Automatically decide the LM format */
614 lm = lm_read_dump(file, lminmemory, logmath);
615 if (lm == NULL) {
616 E_INFO("In lm_read, LM is not a DMP file. Trying to read it as a txt file\n");
617 if (lminmemory == 0) {
618 E_WARN("On-disk LM not supported for text files, reading it into memory.\n");
619 lminmemory = 1;
620 }
621 lm = lm_read_txt(file, lminmemory, &err_no, 0, logmath); /* Not forcing 32bit LM */
622 if (lm == NULL) {
623 if (err_no == LM_OFFSET_TOO_LARGE) {
624 E_INFO
625 ("In lm read, LM is not a DMP, it is likely to be a ARPA format file. But the LM hits the limit of legacy 16 bit format. Force LM reading to 32bit now\n");
626
627 /* This only happens when both TXT & DMP format reading have problems */
628 lm = lm_read_txt(file, lminmemory, &err_no, 1, logmath); /* Now force 32bit LM */
629 if (lm == NULL) {
630 E_INFO
631 ("Panic: In lm_read, LM is not DMP format, it is likely to be ARPA format and hits legacy 16 bit format problem. But when forcing to 32bit LM, problem still couldn't be solved.\n");
632 return NULL;
633 }
634 }
635 else {
636 E_INFO("Lm is both not DMP and TXT format\n");
637 return NULL;
638 }
639 }
640 }
641 }
642 else if (!strcmp(fmt, "TXT")) {
643 lm = lm_read_txt(file, lminmemory, &err_no, 0, logmath); /* Not forcing 32bit LM */
644 if (lm == NULL) {
645 if (err_no == LM_OFFSET_TOO_LARGE) {
646 E_INFO
647 ("In lm read, LM is not a DMP, it is likely to be a ARPA format file. But the LM hits the limit of legacy 16 bit format. Force LM reading to 32bit now\n");
648
649 /* This only happens when both TXT & DMP format reading have problems */
650 lm = lm_read_txt(file, lminmemory, &err_no, 1, logmath); /* Now force 32bit LM */
651 if (lm == NULL) {
652 E_INFO
653 ("Panic: In lm_read, LM is not DMP format, it is likely to be ARPA format and hits legacy 16 bit format problem. But when forcing to 32bit LM, problem still couldn't be solved.\n");
654 return NULL;
655 }
656 }
657 else {
658 E_INFO("LM is not in TXT format\n");
659 return NULL;
660 }
661 }
662
663 }
664 else if (!strcmp(fmt, "DMP")) {
665 lm = lm_read_dump(file, lminmemory, logmath);
666 if (lm == NULL) {
667 E_INFO
668 ("In lm_read, a DMP format reader is called, but lm cannot be read, Diagnosis: LM is corrupted or not enough memory.\n");
669 return NULL;
670 }
671 }
672 else if (!strcmp(fmt, "TXT32")) {
673 lm = lm_read_txt(file, lminmemory, &err_no, 1, logmath);
674 if (lm == NULL) {
675 E_INFO("In lm_read, failed to read lm in txt format. .\n");
676 return NULL;
677 }
678 }
679 else {
680 E_INFO("Unknown format (%s) is specified\n", fmt);
681 return NULL;
682 }
683
684
685 lm->name = ckd_salloc(lmname);
686 lm->inputenc = IND_BADENCODING;
687 lm->outputenc = IND_BADENCODING;
688
689 lm->is32bits = lm_is32bits(lm);
690
691 E_INFO("The LM routine is operating at %d bits mode\n",
692 lm->is32bits ? 32 : 16);
693
694 /* Initialize the fast trigram cache, with all entries invalid */
695 if (lm->n_tg > 0) {
696 if (lm->is32bits) {
697 lm->tgcache32 =
698 (lm_tgcache_entry32_t *) ckd_calloc(LM_TGCACHE_SIZE,
699 sizeof
700 (lm_tgcache_entry32_t));
701 for (i = 0; i < LM_TGCACHE_SIZE; i++)
702 lm->tgcache32[i].lwid[0] = (s3lmwid32_t) BAD_LMWID(lm);
703 }
704 else {
705 lm->tgcache =
706 (lm_tgcache_entry_t *) ckd_calloc(LM_TGCACHE_SIZE,
707 sizeof(lm_tgcache_entry_t));
708 for (i = 0; i < LM_TGCACHE_SIZE; i++)
709 lm->tgcache[i].lwid[0] = (s3lmwid_t) BAD_LMWID(lm);
710 }
711 }
712
713 if (applyWeight) {
714 lm2logs3(lm, uw); /* Applying unigram weight; convert to logs3 values */
715
716 /* Apply the new lw and wip values */
717 lm->lw = 1.0; /* The initial settings for lw and wip */
718 lm->wip = 0; /* logs3(1.0) */
719 lm_set_param(lm, lw, wip);
720 }
721
722
723 assert(lm);
724 /* Set the size of dictionary */
725 lm->dict_size = ndict;
726 /* E_INFO("lm->dict %d\n",lm->dict_size); */
727 for (u = 0; u < lm->n_ug; u++)
728 lm->ug[u].dictwid = BAD_S3WID;
729
730
731 return lm;
732 }
733
734 /*
735 This convert every string in the lm from lmp->inputenc to
736 lm->outputenc. This function assumes the caller has checked the
737 encoding schemes appropriateness.
738
739 (Caution!) At 20051115, the method is specific and only support hex
740 to value conversion. The code also hasn't considered that output
741 encoding requires a longer length of string than the input encoding.
742 */
743 static void
lm_convert_encoding(lm_t * lmp)744 lm_convert_encoding(lm_t * lmp)
745 {
746 int i;
747
748 E_INFO("Encoding Conversion\n");
749 for (i = 0; i < lmp->n_ug; i++) {
750 #if 0
751 E_INFO("%s\n", lmp->wordstr[i]);
752 #endif
753
754 if (ishex(lmp->wordstr[i])) {
755 hextocode(lmp->wordstr[i]);
756 }
757
758 #if 0
759 E_INFO("%s\n", lmp->wordstr[i]);
760 #endif
761 }
762 }
763
764 int32
lm_write_advance(lm_t * lmp,const char * outputfn,const char * filename,const char * fmt,const char * inputenc,char * outputenc)765 lm_write_advance(lm_t * lmp, const char *outputfn, const char *filename,
766 const char *fmt, const char *inputenc, char *outputenc)
767 {
768 /* This might be duplicated with the caller checking but was done for extra safety. */
769
770 assert(encoding_resolve(inputenc, outputenc));
771
772 lmp->inputenc = encoding_str2ind(inputenc);
773 lmp->outputenc = encoding_str2ind(outputenc);
774
775 if (lmp->inputenc != lmp->outputenc) {
776 E_INFO("Did I come here?\n");
777 lm_convert_encoding(lmp);
778 }
779
780 if (!strcmp(fmt, "TXT")) {
781
782 return lm_write_arpa_text(lmp, outputfn, inputenc, outputenc);
783
784 }
785 else if (!strcmp(fmt, "DMP")) {
786
787 /* set mtime to be zero because sphinx3 has no mechanism to check
788 whether the file is generated earlier (at least for now.) */
789
790 if (lm_is32bits(lmp)) {
791 E_INFO
792 ("16 bit DMP format is specified but LM is decided to be 32 bit mode. (May be it has segment size which is large than 64k or programmer forced it).\n",
793 LM_LEGACY_CONSTANT);
794 E_INFO("Now use 32 bits format.\n");
795 return lm3g_dump(outputfn, lmp, filename, 0, 32);
796 }
797 else {
798 return lm3g_dump(outputfn, lmp, filename, 0, 16);
799 }
800
801
802 }
803 else if (!strcmp(fmt, "DMP32")) {
804
805 /* set mtime to be zero because sphinx3 has no mechanism to check
806 whether the file is generated earlier (at least for now.) */
807
808 return lm3g_dump(outputfn, lmp, filename, 0, 32);
809
810 }
811 else if (!strcmp(fmt, "FST")) {
812
813 E_WARN("Invoke un-tested ATT-FSM writer\n");
814 return lm_write_att_fsm(lmp, outputfn);
815
816 }
817 else {
818
819 E_INFO("Unknown format (%s) is specified\n", fmt);
820 return LM_FAIL;
821 }
822 }
823
824 int32
lm_write(lm_t * lmp,const char * outputfn,const char * filename,const char * fmt)825 lm_write(lm_t * lmp, const char *outputfn, const char *filename, const char *fmt)
826 {
827 return lm_write_advance(lmp, outputfn, filename, fmt, "iso8859-1",
828 "iso8859-1");
829 }
830
831
832 /*
833 * Free stale bigram and trigram info, those not used since last reset.
834 */
835 void
lm_cache_reset(lm_t * lm)836 lm_cache_reset(lm_t * lm)
837 {
838 int32 i, n_bgfree, n_tgfree;
839 tginfo_t *tginfo, *next_tginfo, *prev_tginfo;
840 tginfo32_t *tginfo32, *next_tginfo32, *prev_tginfo32;
841 int32 is32bits;
842
843 n_bgfree = n_tgfree = 0;
844
845
846 /* ARCHAN: RAH only short-circult this function only */
847 if (lm->isLM_IN_MEMORY) /* RAH We are going to short circuit this if we are running with the lm in memory */
848 return;
849
850 is32bits = lm->is32bits;
851
852 if ((lm->n_bg > 0) && (!lm->bg)) { /* Disk-based; free "stale" bigrams */
853
854 if (is32bits) {
855 for (i = 0; i < lm->n_ug; i++) {
856 if (lm->membg32[i].bg32 && (!lm->membg32[i].used)) {
857 lm->n_bg_inmem -=
858 lm->ug[i + 1].firstbg - lm->ug[i].firstbg;
859
860 ckd_free(lm->membg32[i].bg32);
861 lm->membg32[i].bg32 = NULL;
862 n_bgfree++;
863 }
864
865 lm->membg32[i].used = 0;
866 }
867 }
868 else {
869 for (i = 0; i < lm->n_ug; i++) {
870 if (lm->membg[i].bg && (!lm->membg[i].used)) {
871 lm->n_bg_inmem -=
872 lm->ug[i + 1].firstbg - lm->ug[i].firstbg;
873
874 ckd_free(lm->membg[i].bg);
875 lm->membg[i].bg = NULL;
876 n_bgfree++;
877 }
878
879 lm->membg[i].used = 0;
880 }
881 }
882 }
883
884 if (lm->n_tg > 0) {
885 if (is32bits) {
886 for (i = 0; i < lm->n_ug; i++) {
887 prev_tginfo32 = NULL;
888 for (tginfo32 = lm->tginfo32[i]; tginfo32;
889 tginfo32 = next_tginfo32) {
890 next_tginfo32 = tginfo32->next;
891
892 if (!tginfo32->used) {
893 if ((!lm->tg32) && tginfo32->tg32) {
894 lm->n_tg_inmem -= tginfo32->n_tg;
895 ckd_free(tginfo32->tg32);
896 n_tgfree++;
897 }
898
899 ckd_free(tginfo32);
900 if (prev_tginfo32)
901 prev_tginfo32->next = next_tginfo32;
902 else
903 lm->tginfo32[i] = next_tginfo32;
904 }
905 else {
906 tginfo32->used = 0;
907 prev_tginfo32 = tginfo32;
908 }
909 }
910 }
911 }
912 else {
913 for (i = 0; i < lm->n_ug; i++) {
914 prev_tginfo = NULL;
915 for (tginfo = lm->tginfo[i]; tginfo; tginfo = next_tginfo) {
916 next_tginfo = tginfo->next;
917
918 if (!tginfo->used) {
919 if ((!lm->tg) && tginfo->tg) {
920 lm->n_tg_inmem -= tginfo->n_tg;
921 ckd_free(tginfo->tg);
922 n_tgfree++;
923 }
924
925 free(tginfo);
926 if (prev_tginfo)
927 prev_tginfo->next = next_tginfo;
928 else
929 lm->tginfo[i] = next_tginfo;
930 }
931 else {
932 tginfo->used = 0;
933 prev_tginfo = tginfo;
934 }
935 }
936 }
937 }
938 }
939
940 if ((n_tgfree > 0) || (n_bgfree > 0)) {
941 E_INFO("%d tg frees, %d in mem; %d bg frees, %d in mem\n",
942 n_tgfree, lm->n_tg_inmem, n_bgfree, lm->n_bg_inmem);
943 }
944 }
945
946
947 void
lm_cache_stats_dump(lm_t * lm)948 lm_cache_stats_dump(lm_t * lm)
949 {
950 E_INFO
951 ("%9d tg(), %9d tgcache, %8d bo; %5d fills, %8d in mem (%.1f%%)\n",
952 lm->n_tg_score, lm->n_tgcache_hit, lm->n_tg_bo, lm->n_tg_fill,
953 lm->n_tg_inmem, (lm->n_tg_inmem * 100.0) / (lm->n_tg + 1));
954 E_INFO("%8d bg(), %8d bo; %5d fills, %8d in mem (%.1f%%)\n",
955 lm->n_bg_score, lm->n_bg_bo, lm->n_bg_fill, lm->n_bg_inmem,
956 (lm->n_bg_inmem * 100.0) / (lm->n_bg + 1));
957
958 lm->n_tgcache_hit = 0;
959 lm->n_tg_fill = 0;
960 lm->n_tg_score = 0;
961 lm->n_tg_bo = 0;
962 lm->n_bg_fill = 0;
963 lm->n_bg_score = 0;
964 lm->n_bg_bo = 0;
965 }
966
967
968 int32
lm_ug_score(lm_t * lm,s3lmwid32_t lwid,s3wid_t wid)969 lm_ug_score(lm_t * lm, s3lmwid32_t lwid, s3wid_t wid)
970 {
971 if (NOT_LMWID(lm, lwid) || (lwid >= lm->n_ug))
972 E_FATAL("Bad argument (%d) to lm_ug_score\n", lwid);
973
974 lm->access_type = 1;
975
976 if (lm->inclass_ugscore)
977 return (lm->ug[lwid].prob.l + lm->inclass_ugscore[wid]);
978 else
979 return (lm->ug[lwid].prob.l);
980 }
981
982 int32
lm_ug_exists(lm_t * lm,s3lmwid32_t lwid)983 lm_ug_exists(lm_t * lm, s3lmwid32_t lwid)
984 {
985 if (NOT_LMWID(lm, lwid) || (lwid >= lm->n_ug))
986 return 0;
987 else
988 return 1;
989 }
990
991
992 int32
lm_uglist(lm_t * lm,ug_t ** ugptr)993 lm_uglist(lm_t * lm, ug_t ** ugptr)
994 {
995 *ugptr = lm->ug;
996 return (lm->n_ug);
997 }
998
999
1000 /* This create a mapping from either the unigram or words in a class*/
1001 int32
lm_ug_wordprob(lm_t * lm,dict_t * dict,int32 th,wordprob_t * wp)1002 lm_ug_wordprob(lm_t * lm, dict_t * dict, int32 th, wordprob_t * wp)
1003 {
1004 int32 i, j, n, p;
1005 s3wid_t w, dictid;
1006 lmclass_t *lmclass;
1007 lmclass_word_t *lm_cw;
1008 n = lm->n_ug;
1009
1010 for (i = 0, j = 0; i < n; i++) {
1011 w = lm->ug[i].dictwid;
1012 if (IS_S3WID(w)) { /*Is w>0? Then it can be either wid or class id */
1013 if (w < LM_CLASSID_BASE) { /*It is just a word */
1014 if ((p = lm->ug[i].prob.l) >= th) {
1015 wp[j].wid = w;
1016 wp[j].prob = p;
1017 j++;
1018 }
1019 }
1020 else { /* It is a class */
1021 lmclass = LM_CLASSID_TO_CLASS(lm, w); /* Get the class */
1022 lm_cw = lmclass_firstword(lmclass);
1023 while (lmclass_isword(lm_cw)) {
1024 dictid = lmclass_getwid(lm_cw);
1025
1026 /*E_INFO("Lookup dict_id using dict_basewid %d\n",dictid); */
1027 if (IS_S3WID(dictid)) {
1028 if (dictid != dict_basewid(dict, dictid)) {
1029 dictid = dict_basewid(dict, dictid);
1030 }
1031 if ((p =
1032 lm->ug[i].prob.l +
1033 lm->inclass_ugscore[dictid]) >= th) {
1034 wp[j].wid = dictid;
1035 wp[j].prob = lm->ug[i].prob.l;
1036 j++;
1037 }
1038 }
1039 else {
1040 E_INFO("Word %s cannot be found \n",
1041 lmclass_getword(lm_cw));
1042 }
1043
1044 lm_cw = lmclass_nextword(lmclass, lm_cw);
1045
1046 }
1047 }
1048 }
1049 }
1050
1051 return j;
1052 }
1053
1054
1055 /*
1056 * Load bigrams for the given unigram (LMWID) lw1 from disk into memory
1057 */
1058 static void
load_bg(lm_t * lm,s3lmwid32_t lw1)1059 load_bg(lm_t * lm, s3lmwid32_t lw1)
1060 {
1061 int32 i, n, b;
1062 bg_t *bg = NULL;
1063 bg32_t *bg32 = NULL;
1064
1065 int32 mem_sz;
1066 int32 is32bits;
1067
1068 b = lm->ug[lw1].firstbg; /* Absolute first bg index for ug lw1 */
1069 n = lm->ug[lw1 + 1].firstbg - b; /* Not including guard/sentinel */
1070
1071 is32bits = lm->is32bits;
1072 mem_sz = is32bits ? sizeof(bg32_t) : sizeof(bg_t);
1073
1074 if (lm->isLM_IN_MEMORY) { /* RAH, if LM_IN_MEMORY, then we don't need to go get it. */
1075 if (is32bits)
1076 bg32 = lm->membg32[lw1].bg32 = &lm->bg32[b];
1077 else
1078 bg = lm->membg[lw1].bg = &lm->bg[b];
1079 }
1080 else {
1081 if (is32bits)
1082 bg32 = lm->membg32[lw1].bg32 =
1083 (bg32_t *) ckd_calloc(n + 1, mem_sz);
1084 else
1085 bg = lm->membg[lw1].bg = (bg_t *) ckd_calloc(n + 1, mem_sz);
1086
1087 if (fseek(lm->fp, lm->bgoff + b * mem_sz, SEEK_SET) < 0)
1088 E_FATAL_SYSTEM("fseek failed\n");
1089
1090
1091 /* Need to read n+1 because obtaining tg count for one bg also depends on next bg */
1092 if (is32bits) {
1093 if (fread(bg32, mem_sz, n + 1, lm->fp) != (size_t) (n + 1))
1094 E_FATAL("fread failed\n");
1095 if (lm->byteswap) {
1096 for (i = 0; i <= n; i++)
1097 swap_bg32(&(bg32[i]));
1098 }
1099 }
1100 else {
1101 if (fread(bg, mem_sz, n + 1, lm->fp) != (size_t) (n + 1))
1102 E_FATAL("fread failed\n");
1103 if (lm->byteswap) {
1104 for (i = 0; i <= n; i++)
1105 swap_bg(&(bg[i]));
1106 }
1107 }
1108 }
1109 lm->n_bg_fill++;
1110 lm->n_bg_inmem += n;
1111 }
1112
1113
1114 #define BINARY_SEARCH_THRESH 16
1115
1116 /* Locate a specific bigram within a bigram list */
1117 int32
find_bg(bg_t * bg,int32 n,s3lmwid32_t w)1118 find_bg(bg_t * bg, int32 n, s3lmwid32_t w)
1119 {
1120 int32 i, b, e;
1121
1122 /* Binary search until segment size < threshold */
1123 b = 0;
1124 e = n;
1125 while (e - b > BINARY_SEARCH_THRESH) {
1126 i = (b + e) >> 1;
1127 if (bg[i].wid < w)
1128 b = i + 1;
1129 else if (bg[i].wid > w)
1130 e = i;
1131 else
1132 return i;
1133 }
1134
1135 /* Linear search within narrowed segment */
1136 for (i = b; (i < e) && (bg[i].wid != w); i++);
1137 return ((i < e) ? i : -1);
1138 }
1139
1140 /* Locate a specific bigram within a bigram list */
1141 int32
find_bg32(bg32_t * bg,int32 n,s3lmwid32_t w)1142 find_bg32(bg32_t * bg, int32 n, s3lmwid32_t w)
1143 {
1144 int32 i, b, e;
1145
1146 /* Binary search until segment size < threshold */
1147 b = 0;
1148 e = n;
1149 while (e - b > BINARY_SEARCH_THRESH) {
1150 i = (b + e) >> 1;
1151 if (bg[i].wid < w)
1152 b = i + 1;
1153 else if (bg[i].wid > w)
1154 e = i;
1155 else
1156 return i;
1157 }
1158
1159 /* Linear search within narrowed segment */
1160 for (i = b; (i < e) && (bg[i].wid != w); i++);
1161 return ((i < e) ? i : -1);
1162 }
1163
1164
1165 /*** Begin lm_bglist*/
1166 int32
lm_bglist(lm_t * lm,s3lmwid32_t w1,bg_t ** bgptr,int32 * bowt)1167 lm_bglist(lm_t * lm, s3lmwid32_t w1, bg_t ** bgptr, int32 * bowt)
1168 {
1169 int32 n;
1170
1171 if (NOT_LMWID(lm, w1) || (w1 >= lm->n_ug))
1172 E_FATAL("Bad w1 argument (%d) to lm_bglist\n", w1);
1173
1174 n = (lm->n_bg > 0) ? lm->ug[w1 + 1].firstbg - lm->ug[w1].firstbg : 0;
1175
1176 if (n > 0) {
1177 if (!lm->membg[w1].bg)
1178 load_bg(lm, w1);
1179 lm->membg[w1].used = 1;
1180
1181 *bgptr = lm->membg[w1].bg;
1182 *bowt = lm->ug[w1].bowt.l;
1183 }
1184 else {
1185 *bgptr = NULL;
1186 *bowt = 0;
1187 }
1188
1189 return (n);
1190 }
1191
1192 int32
lm_bg32list(lm_t * lm,s3lmwid32_t w1,bg32_t ** bgptr,int32 * bowt)1193 lm_bg32list(lm_t * lm, s3lmwid32_t w1, bg32_t ** bgptr, int32 * bowt)
1194 {
1195 int32 n;
1196
1197 if (NOT_LMWID(lm, w1) || (w1 >= lm->n_ug))
1198 E_FATAL("Bad w1 argument (%d) to lm_bglist\n", w1);
1199
1200 n = (lm->n_bg > 0) ? lm->ug[w1 + 1].firstbg - lm->ug[w1].firstbg : 0;
1201
1202 if (n > 0) {
1203 if (!lm->membg32[w1].bg32)
1204 load_bg(lm, w1);
1205 lm->membg32[w1].used = 1;
1206
1207 *bgptr = lm->membg32[w1].bg32;
1208 *bowt = lm->ug[w1].bowt.l;
1209 }
1210 else {
1211 *bgptr = NULL;
1212 *bowt = 0;
1213 }
1214
1215 return (n);
1216 }
1217
1218 /*** End lm_bglist*/
1219
1220 /*
1221 * This function look-ups the bigram score of p(lw2|lw1)
1222 * The information for lw2 and w2 are repeated because the legacy
1223 * implementation(since s3.2) of vithist used only LM wid rather
1224 * than dictionary wid.
1225 */
1226
1227 int32
lm_bg_score(lm_t * lm,s3lmwid32_t lw1,s3lmwid32_t lw2,s3wid_t w2)1228 lm_bg_score(lm_t * lm, s3lmwid32_t lw1, s3lmwid32_t lw2, s3wid_t w2)
1229 {
1230 int32 i, n, score;
1231 bg_t *bg = NULL;
1232 bg32_t *bg32 = NULL;
1233 int32 is32bits;
1234
1235 is32bits = lm->is32bits;
1236
1237 if ((lm->n_bg == 0) || (NOT_LMWID(lm, lw1)))
1238 return (lm_ug_score(lm, lw2, w2));
1239
1240 lm->n_bg_score++;
1241
1242 if (NOT_LMWID(lm, lw2) || (lw2 >= lm->n_ug))
1243 E_FATAL("Bad lw2 argument (%d) to lm_bg_score\n", lw2);
1244
1245 n = lm->ug[lw1 + 1].firstbg - lm->ug[lw1].firstbg;
1246
1247 if (n > 0) {
1248 if (is32bits) {
1249 if (!lm->membg32[lw1].bg32)
1250 load_bg(lm, lw1);
1251 lm->membg32[lw1].used = 1;
1252 bg32 = lm->membg32[lw1].bg32;
1253 i = find_bg32(bg32, n, lw2);
1254 }
1255 else {
1256 if (!lm->membg[lw1].bg)
1257 load_bg(lm, lw1);
1258 lm->membg[lw1].used = 1;
1259 bg = lm->membg[lw1].bg;
1260 i = find_bg(bg, n, lw2);
1261 }
1262 }
1263 else
1264 i = -1;
1265
1266 if (i >= 0) {
1267 if (is32bits)
1268 score = lm->bgprob[bg32[i].probid].l;
1269 else
1270 score = lm->bgprob[bg[i].probid].l;
1271
1272 if (lm->inclass_ugscore) { /*Only add within class prob if class information exists.
1273 Is actually ok to just add the score because if the word
1274 is not within-class. The returning scores will be 0. I just
1275 love to safe-guard it :-).
1276 */
1277 score += lm->inclass_ugscore[w2];
1278 }
1279
1280 lm->access_type = 2;
1281 }
1282 else {
1283 lm->n_bg_bo++;
1284 lm->access_type = 1;
1285 score = lm->ug[lw1].bowt.l + lm->ug[lw2].prob.l;
1286 }
1287
1288 return (score);
1289 }
1290
1291 int32
lm_bg_exists(lm_t * lm,s3lmwid32_t lw1,s3lmwid32_t lw2)1292 lm_bg_exists(lm_t * lm, s3lmwid32_t lw1, s3lmwid32_t lw2)
1293 {
1294 int32 i, n, score;
1295 bg_t *bg = NULL;
1296 bg32_t *bg32 = NULL;
1297 int32 is32bits;
1298
1299 is32bits = lm->is32bits;
1300
1301 if ((lm->n_bg == 0) || (NOT_LMWID(lm, lw1)))
1302 return 0;
1303
1304 if (NOT_LMWID(lm, lw2) || (lw2 >= lm->n_ug))
1305 return 0;
1306
1307 n = lm->ug[lw1 + 1].firstbg - lm->ug[lw1].firstbg;
1308
1309 if (n > 0) {
1310 if (is32bits) {
1311 if (!lm->membg32[lw1].bg32)
1312 load_bg(lm, lw1);
1313 lm->membg32[lw1].used = 1;
1314 bg32 = lm->membg32[lw1].bg32;
1315 i = find_bg32(bg32, n, lw2);
1316 }
1317 else {
1318 if (!lm->membg[lw1].bg)
1319 load_bg(lm, lw1);
1320 lm->membg[lw1].used = 1;
1321 bg = lm->membg[lw1].bg;
1322
1323 i = find_bg(bg, n, lw2);
1324 }
1325 }
1326 else
1327 i = -1;
1328
1329 if (i >= 0)
1330 return 1;
1331 else
1332 return 0;
1333
1334
1335 return (score);
1336 }
1337
1338
/*
 * Create the trigram-info node for bigram (lw1, lw2) and make its trigrams
 * available.
 *
 * A new tginfo (or tginfo32 for a 32-bit LM) node is allocated and pushed
 * at the head of the list lm->tginfo[lw2] (lm->tginfo32[lw2]).  Its bowt and
 * n_tg fields are filled from the bigram entry, or set to 0 when bigram
 * (lw1, lw2) does not exist.  For an in-memory LM the node's trigram pointer
 * aliases the resident trigram array; otherwise a buffer is allocated and
 * read from lm->fp (byte-swapped when lm->byteswap is set).  I/O failures
 * are fatal.  Updates the n_tg_fill and n_tg_inmem statistics.
 *
 * Note: the node's "used" field is not set here; the callers visible in
 * this file set it right after the call.
 */
static void
load_tg(lm_t * lm, s3lmwid32_t lw1, s3lmwid32_t lw2)
{
    int32 i, n, b;
    int32 t = -1;               /* Let's make sure that if t isn't initialized after the
                                 * "if" statement below, it makes things go bad */
    bg_t *bg = NULL;
    bg32_t *bg32 = NULL;
    tg_t *tg = NULL;
    tg32_t *tg32 = NULL;
    tginfo_t *tginfo = NULL;
    tginfo32_t *tginfo32 = NULL;
    int32 mem_sz_tg, mem_sz_tginfo;
    int32 is32bits;

    /* Element sizes depend on whether the LM uses 16- or 32-bit entries */
    is32bits = lm->is32bits;
    mem_sz_tg = is32bits ? sizeof(tg32_t) : sizeof(tg_t);
    mem_sz_tginfo = is32bits ? sizeof(tginfo32_t) : sizeof(tginfo_t);

    /* First allocate space for tg information for bg lw1,lw2 and link the
     * new node at the head of the per-lw2 list */

    if (is32bits) {
        tginfo32 = (tginfo32_t *) ckd_malloc(mem_sz_tginfo);
        tginfo32->w1 = lw1;
        tginfo32->tg32 = NULL;
        tginfo32->next = lm->tginfo32[lw2];
        lm->tginfo32[lw2] = tginfo32;
    }
    else {
        tginfo = (tginfo_t *) ckd_malloc(mem_sz_tginfo);
        tginfo->w1 = lw1;
        tginfo->tg = NULL;
        tginfo->next = lm->tginfo[lw2];
        lm->tginfo[lw2] = tginfo;
    }

    /* Locate bigram lw1,lw2 */

    b = lm->ug[lw1].firstbg;            /* Absolute index of first bg of lw1 */
    n = lm->ug[lw1 + 1].firstbg - b;    /* Number of bigrams for lw1 */


    /* Make sure bigrams for lw1, if any, loaded into memory */
    if (n > 0) {
        if (is32bits) {
            if (!lm->membg32[lw1].bg32)
                load_bg(lm, lw1);
            lm->membg32[lw1].used = 1;
            bg32 = lm->membg32[lw1].bg32;
        }
        else {
            if (!lm->membg[lw1].bg)
                load_bg(lm, lw1);
            lm->membg[lw1].used = 1;
            bg = lm->membg[lw1].bg;
        }
    }

    /* At this point, n = #bigrams for lw1; i becomes the position of lw2
     * within that list when the bigram exists */
    if (n > 0
        && (i =
            is32bits ? find_bg32(bg32, n, lw2) : find_bg(bg, n,
                                                         lw2)) >= 0) {

        /*      if(i<0){
           E_INFO("What is the value of i %d, lw2 %d\n",i,lw2);
           } */

        /* Always true here: the enclosing condition already requires i >= 0 */
        if (i >= 0) {
            if (is32bits)
                tginfo32->bowt = lm->tgbowt[bg32[i].bowtid].l;
            else
                tginfo->bowt = lm->tgbowt[bg[i].bowtid].l;


            /* Find t = Absolute first trigram index for bigram lw1,lw2:
             * segment base plus the offset stored in the bigram entry */
            b += i;             /* b = Absolute index of bigram lw1,lw2 on disk */
            t = lm->tg_segbase[b >> lm->log_bg_seg_sz];
            t += is32bits ? bg32[i].firsttg : bg[i].firsttg;

            /*      E_INFO("%d %d\n",lm->tg_segbase[b >> lm->log_bg_seg_sz],t); */
            /* Find #tg for bigram w1,w2: first trigram index of the next
             * bigram entry (i + 1) minus t */
            n = lm->tg_segbase[(b + 1) >> lm->log_bg_seg_sz];
            n += is32bits ? bg32[i + 1].firsttg : bg[i + 1].firsttg;
            n -= t;

            if (is32bits)
                tginfo32->n_tg = n;
            else
                tginfo->n_tg = n;

        }

    }
    else {                      /* No bigram w1,w2 */

        if (is32bits) {
            tginfo32->bowt = 0;
            n = tginfo32->n_tg = 0;
        }
        else {
            tginfo->bowt = 0;
            n = tginfo->n_tg = 0;
        }
    }

    /* "t" is still -1 when the bigram was not found, so if you use it
     * beyond this point, make sure it's been properly assigned.
     */
    /* assert (t != -1); */

    /* At this point, n = #trigrams for lw1,lw2.  Read them in */

    if (lm->isLM_IN_MEMORY) {
        /* RAH, already have this in memory: alias the resident array */
        if (n > 0) {
            assert(t != -1);
            if (is32bits)
                tg32 = tginfo32->tg32 = &lm->tg32[t];
            else
                tg = tginfo->tg = &lm->tg[t];
        }
    }
    else {
        if (n > 0) {

            /* Allocate a buffer for the n trigrams owned by this node */
            if (is32bits)
                tg32 = tginfo32->tg32 =
                    (tg32_t *) ckd_calloc(n, mem_sz_tg);
            else
                tg = tginfo->tg = (tg_t *) ckd_calloc(n, mem_sz_tg);


            /* Seek to trigram t within the trigram section of the file */
            if (fseek(lm->fp, lm->tgoff + t * mem_sz_tg, SEEK_SET) < 0)
                E_FATAL_SYSTEM("fseek failed\n");


            if (is32bits) {
                if (fread(tg32, mem_sz_tg, n, lm->fp) != (size_t) n)
                    E_FATAL("fread(tg32, %d at %d) failed\n", n,
                            lm->tgoff);
                if (lm->byteswap) {
                    for (i = 0; i < n; i++) {
                        SWAP_INT32(&(tg32[i].wid));
                        SWAP_INT32(&(tg32[i].probid));
                    }
                }
            }
            else {
                if (fread(tg, mem_sz_tg, n, lm->fp) != (size_t) n)
                    E_FATAL("fread(tg, %d at %d) failed\n", n, lm->tgoff);
                if (lm->byteswap) {
                    for (i = 0; i < n; i++) {
                        SWAP_INT16(&(tg[i].wid));
                        SWAP_INT16(&(tg[i].probid));
                    }
                }
            }
        }
    }
    /* Statistics: one more fill, n more trigrams accounted as in memory */
    lm->n_tg_fill++;
    lm->n_tg_inmem += n;
}
1502
1503
1504 /* Similar to find_bg */
1505 int32
find_tg(tg_t * tg,int32 n,s3lmwid32_t w)1506 find_tg(tg_t * tg, int32 n, s3lmwid32_t w)
1507 {
1508 int32 i, b, e;
1509
1510 b = 0;
1511 e = n;
1512 while (e - b > BINARY_SEARCH_THRESH) {
1513 i = (b + e) >> 1;
1514 if (tg[i].wid < w)
1515 b = i + 1;
1516 else if (tg[i].wid > w)
1517 e = i;
1518 else
1519 return i;
1520 }
1521
1522 for (i = b; (i < e) && (tg[i].wid != w); i++);
1523 return ((i < e) ? i : -1);
1524 }
1525
1526 int32
find_tg32(tg32_t * tg,int32 n,s3lmwid32_t w)1527 find_tg32(tg32_t * tg, int32 n, s3lmwid32_t w)
1528 {
1529 int32 i, b, e;
1530
1531 b = 0;
1532 e = n;
1533 while (e - b > BINARY_SEARCH_THRESH) {
1534 i = (b + e) >> 1;
1535 if (tg[i].wid < w)
1536 b = i + 1;
1537 else if (tg[i].wid > w)
1538 e = i;
1539 else
1540 return i;
1541 }
1542
1543 for (i = b; (i < e) && (tg[i].wid != w); i++);
1544 return ((i < e) ? i : -1);
1545 }
1546
1547
1548 int32
lm_tglist(lm_t * lm,s3lmwid32_t lw1,s3lmwid32_t lw2,tg_t ** tgptr,int32 * bowt)1549 lm_tglist(lm_t * lm, s3lmwid32_t lw1, s3lmwid32_t lw2, tg_t ** tgptr,
1550 int32 * bowt)
1551 {
1552 tginfo_t *tginfo, *prev_tginfo;
1553
1554 if (lm->n_tg <= 0) {
1555 *tgptr = NULL;
1556 *bowt = 0;
1557 return 0;
1558 }
1559
1560 if (NOT_LMWID(lm, lw1) || (lw1 >= lm->n_ug))
1561 E_FATAL("Bad lw1 argument (%d) to lm_tglist\n", lw1);
1562 if (NOT_LMWID(lm, lw2) || (lw2 >= lm->n_ug))
1563 E_FATAL("Bad lw2 argument (%d) to lm_tglist\n", lw2);
1564
1565 prev_tginfo = NULL;
1566 for (tginfo = lm->tginfo[lw2]; tginfo; tginfo = tginfo->next) {
1567 if (tginfo->w1 == lw1)
1568 break;
1569 prev_tginfo = tginfo;
1570 }
1571
1572 if (!tginfo) {
1573 load_tg(lm, lw1, lw2);
1574 tginfo = lm->tginfo[lw2];
1575 }
1576 else if (prev_tginfo) {
1577 prev_tginfo->next = tginfo->next;
1578 tginfo->next = lm->tginfo[lw2];
1579 lm->tginfo[lw2] = tginfo;
1580 }
1581 tginfo->used = 1;
1582
1583 *tgptr = tginfo->tg;
1584 *bowt = tginfo->bowt;
1585
1586 return (tginfo->n_tg);
1587 }
1588
1589 int32
lm_tg32list(lm_t * lm,s3lmwid32_t lw1,s3lmwid32_t lw2,tg32_t ** tgptr,int32 * bowt)1590 lm_tg32list(lm_t * lm, s3lmwid32_t lw1, s3lmwid32_t lw2, tg32_t ** tgptr,
1591 int32 * bowt)
1592 {
1593 tginfo32_t *tginfo32, *prev_tginfo32;
1594
1595 if (lm->n_tg <= 0) {
1596 *tgptr = NULL;
1597 *bowt = 0;
1598 return 0;
1599 }
1600
1601 if (NOT_LMWID(lm, lw1) || (lw1 >= lm->n_ug))
1602 E_FATAL("Bad lw1 argument (%d) to lm_tglist\n", lw1);
1603 if (NOT_LMWID(lm, lw2) || (lw2 >= lm->n_ug))
1604 E_FATAL("Bad lw2 argument (%d) to lm_tglist\n", lw2);
1605
1606 prev_tginfo32 = NULL;
1607 for (tginfo32 = lm->tginfo32[lw2]; tginfo32; tginfo32 = tginfo32->next) {
1608 if (tginfo32->w1 == lw1)
1609 break;
1610 prev_tginfo32 = tginfo32;
1611 }
1612
1613 if (!tginfo32) {
1614 load_tg(lm, lw1, lw2);
1615 tginfo32 = lm->tginfo32[lw2];
1616 }
1617 else if (prev_tginfo32) {
1618 prev_tginfo32->next = tginfo32->next;
1619 tginfo32->next = lm->tginfo32[lw2];
1620 lm->tginfo32[lw2] = tginfo32;
1621 }
1622 tginfo32->used = 1;
1623
1624 *tgptr = tginfo32->tg32;
1625 *bowt = tginfo32->bowt;
1626
1627 return (tginfo32->n_tg);
1628 }
1629
1630 /*
1631 * This function look-ups the trigram score of p(lw3|lw2,lw1)
1632 * and compute the in-class ug probability of w3.
1633 * The information for lw3 and w3 are repeated because the legacy
1634 * implementation(since s3.2) of vithist used only LM wid rather
1635 * than dictionary wid.
1636 *
1637 */
1638
1639 int32
lm_tg_score(lm_t * lm,s3lmwid32_t lw1,s3lmwid32_t lw2,s3lmwid32_t lw3,s3wid_t w3)1640 lm_tg_score(lm_t * lm, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3,
1641 s3wid_t w3)
1642 {
1643 int32 i, h, n, score;
1644 tg_t *tg;
1645 tginfo_t *tginfo, *prev_tginfo;
1646 tg32_t *tg32;
1647 tginfo32_t *tginfo32, *prev_tginfo32;
1648 int32 is32bits;
1649
1650 tg = NULL;
1651 tginfo = prev_tginfo = NULL;
1652
1653 tg32 = NULL;
1654 tginfo32 = prev_tginfo32 = NULL;
1655
1656
1657 is32bits = lm->is32bits;
1658
1659 /* E_INFO("lw1 %d, lw2 %d, lw3 %d is32bits %d BAD_LMWID %d\n",lw1,lw2,lw3,is32bits, BAD_LMWID(lm)); */
1660
1661 if ((lm->n_tg == 0) || (NOT_LMWID(lm, lw1)))
1662 return (lm_bg_score(lm, lw2, lw3, w3));
1663
1664 lm->n_tg_score++;
1665
1666 /* E_INFO("lw1 %d, lw2 %d, lw3 %d is32bits %d BAD_LMWID %d\n",lw1,lw2,lw3,is32bits, BAD_LMWID(lm)); */
1667
1668 if (NOT_LMWID(lm, lw1) || (lw1 >= lm->n_ug))
1669 E_FATAL("Bad lw1 argument (%d) to lm_tg_score\n", lw1);
1670 if (NOT_LMWID(lm, lw2) || (lw2 >= lm->n_ug))
1671 E_FATAL("Bad lw2 argument (%d) to lm_tg_score\n", lw2);
1672 if (NOT_LMWID(lm, lw3) || (lw3 >= lm->n_ug))
1673 E_FATAL("Bad lw3 argument (%d) to lm_tg_score\n", lw3);
1674
1675 /* Lookup tgcache first; compute hash(lw1, lw2, lw3) */
1676 h = ((lw1 & 0x000003ff) << 21) + ((lw2 & 0x000003ff) << 11) +
1677 (lw3 & 0x000007ff);
1678 h %= LM_TGCACHE_SIZE;
1679
1680
1681 if (is32bits) {
1682 if ((lm->tgcache32[h].lwid[0] == lw1) &&
1683 (lm->tgcache32[h].lwid[1] == lw2) &&
1684 (lm->tgcache32[h].lwid[2] == lw3)) {
1685
1686 lm->n_tgcache_hit++;
1687 return lm->tgcache32[h].lscr;
1688 }
1689
1690 prev_tginfo32 = NULL;
1691 for (tginfo32 = lm->tginfo32[lw2]; tginfo32;
1692 tginfo32 = tginfo32->next) {
1693 if (tginfo32->w1 == lw1)
1694 break;
1695 prev_tginfo32 = tginfo32;
1696 }
1697
1698 }
1699 else {
1700 if ((lm->tgcache[h].lwid[0] == lw1) &&
1701 (lm->tgcache[h].lwid[1] == lw2) &&
1702 (lm->tgcache[h].lwid[2] == lw3)) {
1703
1704 lm->n_tgcache_hit++;
1705 return lm->tgcache[h].lscr;
1706 }
1707
1708 prev_tginfo = NULL;
1709 for (tginfo = lm->tginfo[lw2]; tginfo; tginfo = tginfo->next) {
1710 if (tginfo->w1 == lw1)
1711 break;
1712 prev_tginfo = tginfo;
1713 }
1714 }
1715
1716 if (is32bits) {
1717 if (!tginfo32) {
1718 load_tg(lm, lw1, lw2);
1719 tginfo32 = lm->tginfo32[lw2];
1720 }
1721 else if (prev_tginfo32) {
1722 prev_tginfo32->next = tginfo32->next;
1723 tginfo32->next = lm->tginfo32[lw2];
1724 lm->tginfo32[lw2] = tginfo32;
1725 }
1726 tginfo32->used = 1;
1727 }
1728 else {
1729 if (!tginfo) {
1730 load_tg(lm, lw1, lw2);
1731 tginfo = lm->tginfo[lw2];
1732 }
1733 else if (prev_tginfo) {
1734 prev_tginfo->next = tginfo->next;
1735 tginfo->next = lm->tginfo[lw2];
1736 lm->tginfo[lw2] = tginfo;
1737 }
1738 tginfo->used = 1;
1739 }
1740
1741
1742 /* Trigrams for w1,w2 now in memory; look for w1,w2,w3 */
1743 if (is32bits) {
1744 n = tginfo32->n_tg;
1745 tg32 = tginfo32->tg32;
1746 assert(tginfo32);
1747 }
1748 else {
1749 n = tginfo->n_tg;
1750 tg = tginfo->tg;
1751 assert(tginfo);
1752 }
1753
1754 if (is32bits)
1755 i = find_tg32(tg32, n, lw3);
1756 else
1757 i = find_tg(tg, n, lw3);
1758
1759 if (i >= 0) {
1760 if (is32bits)
1761 score = lm->tgprob[tg32[i].probid].l;
1762 else
1763 score = lm->tgprob[tg[i].probid].l;
1764
1765 if (lm->inclass_ugscore) { /*Only add within class prob if class information exists.
1766 Is actually ok to just add the score because if the word
1767 is not within-class. The returning scores will be 0.
1768 */
1769 score += lm->inclass_ugscore[w3];
1770 }
1771 lm->access_type = 3;
1772 }
1773 else {
1774 lm->n_tg_bo++;
1775 score = is32bits ? tginfo32->bowt : tginfo->bowt;
1776 score += lm_bg_score(lm, lw2, lw3, w3);
1777
1778 }
1779
1780 if (is32bits) {
1781 lm->tgcache32[h].lwid[0] = lw1;
1782 lm->tgcache32[h].lwid[1] = lw2;
1783 lm->tgcache32[h].lwid[2] = lw3;
1784 lm->tgcache32[h].lscr = score;
1785 }
1786 else {
1787 lm->tgcache[h].lwid[0] = lw1;
1788 lm->tgcache[h].lwid[1] = lw2;
1789 lm->tgcache[h].lwid[2] = lw3;
1790 lm->tgcache[h].lscr = score;
1791 }
1792
1793
1794 #if 0
1795 printf(" %5d %5d -> %8d\n", lw1, lw2, score);
1796 /* ENABLE this when you suspect the lm routine produce abnormal scores */
1797 if (score > 0) {
1798 E_INFO
1799 ("score %d >0 lm->ug[lw1].bowt.l %d lm_ug[lw2].prob.l %d, lw1 %d lw2 %d i, %d\n",
1800 score, lm->ug[lw1].bowt.l, lm->ug[lw2].bowt.l, lw1, lw2, i);
1801 }
1802 #endif
1803
1804 return (score);
1805 }
1806
1807 int32
lm_tg_exists(lm_t * lm,s3lmwid32_t lw1,s3lmwid32_t lw2,s3lmwid32_t lw3)1808 lm_tg_exists(lm_t * lm, s3lmwid32_t lw1, s3lmwid32_t lw2, s3lmwid32_t lw3)
1809 {
1810 int32 i, n;
1811 tg_t *tg;
1812 tginfo_t *tginfo, *prev_tginfo;
1813 tg32_t *tg32;
1814 tginfo32_t *tginfo32, *prev_tginfo32;
1815
1816
1817 int32 is32bits;
1818
1819 tg = NULL;
1820 tginfo = prev_tginfo = NULL;
1821 tg32 = NULL;
1822 tginfo32 = prev_tginfo32 = NULL;
1823 is32bits = lm->is32bits;
1824
1825 if ((lm->n_tg == 0) || (NOT_LMWID(lm, lw1)))
1826 return 0;
1827
1828 if (NOT_LMWID(lm, lw1) || (lw1 >= lm->n_ug))
1829 return 0;
1830 if (NOT_LMWID(lm, lw2) || (lw2 >= lm->n_ug))
1831 return 0;
1832 if (NOT_LMWID(lm, lw3) || (lw3 >= lm->n_ug))
1833 return 0;
1834
1835 if (is32bits) {
1836 prev_tginfo32 = NULL;
1837 for (tginfo32 = lm->tginfo32[lw2]; tginfo32;
1838 tginfo32 = tginfo32->next) {
1839 if (tginfo32->w1 == lw1)
1840 break;
1841 prev_tginfo32 = tginfo32;
1842 }
1843 }
1844 else {
1845 prev_tginfo = NULL;
1846 for (tginfo = lm->tginfo[lw2]; tginfo; tginfo = tginfo->next) {
1847 if (tginfo->w1 == lw1)
1848 break;
1849 prev_tginfo = tginfo;
1850 }
1851 }
1852
1853 if (is32bits) {
1854 if (!tginfo32) {
1855 load_tg(lm, lw1, lw2);
1856 tginfo32 = lm->tginfo32[lw2];
1857 }
1858 else if (prev_tginfo32) {
1859 prev_tginfo32->next = tginfo32->next;
1860 tginfo32->next = lm->tginfo32[lw2];
1861 lm->tginfo32[lw2] = tginfo32;
1862 }
1863 tginfo32->used = 1;
1864 /* Trigrams for w1,w2 now in memory; look for w1,w2,w3 */
1865 n = tginfo32->n_tg;
1866 tg32 = tginfo32->tg32;
1867 assert(tginfo32);
1868 }
1869 else {
1870 if (!tginfo) {
1871 load_tg(lm, lw1, lw2);
1872 tginfo = lm->tginfo[lw2];
1873 }
1874 else if (prev_tginfo) {
1875 prev_tginfo->next = tginfo->next;
1876 tginfo->next = lm->tginfo[lw2];
1877 lm->tginfo[lw2] = tginfo;
1878 }
1879 tginfo->used = 1;
1880 /* Trigrams for w1,w2 now in memory; look for w1,w2,w3 */
1881 n = tginfo->n_tg;
1882 tg = tginfo->tg;
1883 assert(tginfo);
1884 }
1885
1886 if (is32bits)
1887 i = find_tg32(tg32, n, lw3);
1888 else
1889 i = find_tg(tg, n, lw3);
1890
1891 if (i >= 0)
1892 return 1;
1893 else
1894 return 0;
1895 }
1896
1897
1898 s3lmwid32_t
lm_wid(lm_t * lm,const char * word)1899 lm_wid(lm_t * lm, const char *word)
1900 {
1901 int32 i;
1902
1903 for (i = 0; i < lm->n_ug; i++)
1904 if (strcmp(lm->wordstr[i], word) == 0)
1905 return ((s3lmwid32_t) i);
1906
1907 return BAD_LMWID(lm);
1908 }
1909
1910 void
lm_free(lm_t * lm)1911 lm_free(lm_t * lm)
1912 {
1913 int i;
1914 tginfo_t *tginfo;
1915 tginfo32_t *tginfo32;
1916
1917 if (lm->fp)
1918 fclose(lm->fp);
1919
1920 ckd_free((void *) lm->ug);
1921
1922 for (i = 0; i < lm->n_ug; i++)
1923 ckd_free((void *) lm->wordstr[i]); /* */
1924 ckd_free((void *) lm->wordstr);
1925
1926 if (lm->n_bg > 0) {
1927 if (lm->bg || lm->bg32) { /* Memory-based; free all bg */
1928 if (lm->bg)
1929 ckd_free(lm->bg);
1930 if (lm->bg32)
1931 ckd_free(lm->bg32);
1932
1933 if (lm->membg)
1934 ckd_free(lm->membg);
1935 if (lm->membg32)
1936 ckd_free(lm->membg32);
1937 }
1938 else { /* Disk-based; free in-memory bg */
1939 if (lm->membg) {
1940 for (i = 0; i < lm->n_ug; ++i)
1941 ckd_free(lm->membg[i].bg);
1942 ckd_free(lm->membg);
1943 }
1944 if (lm->membg32) {
1945 for (i = 0; i < lm->n_ug; ++i)
1946 ckd_free(lm->membg32[i].bg32);
1947 ckd_free(lm->membg32);
1948 }
1949 }
1950
1951 ckd_free(lm->bgprob);
1952 }
1953
1954 if (lm->n_tg > 0) {
1955 if (lm->tg)
1956 ckd_free((void *) lm->tg);
1957 if (lm->tg32)
1958 ckd_free((void *) lm->tg32);
1959
1960 if (lm->tginfo) {
1961 for (i = 0; i < lm->n_ug; i++) {
1962 if (lm->tginfo[i] != NULL) {
1963 /* Free the whole linked list of tginfo. */
1964 while (lm->tginfo[i]) {
1965 tginfo = lm->tginfo[i];
1966 lm->tginfo[i] = tginfo->next;
1967 if (!lm->isLM_IN_MEMORY)
1968 ckd_free(tginfo->tg);
1969 ckd_free((void *) tginfo);
1970 }
1971 }
1972 }
1973 ckd_free((void *) lm->tginfo);
1974 }
1975 if (lm->tginfo32) {
1976 for (i = 0; i < lm->n_ug; i++) {
1977 if (lm->tginfo32[i] != NULL) {
1978 while (lm->tginfo32[i]) {
1979 tginfo32 = lm->tginfo32[i];
1980 lm->tginfo32[i] = tginfo32->next;
1981 if (!lm->isLM_IN_MEMORY)
1982 ckd_free(tginfo32->tg32);
1983 ckd_free((void *) tginfo32);
1984 }
1985 }
1986 }
1987 ckd_free((void *) lm->tginfo32);
1988 }
1989
1990
1991
1992 if (lm->tgcache)
1993 ckd_free((void *) lm->tgcache);
1994 if (lm->tgcache32)
1995 ckd_free((void *) lm->tgcache32);
1996
1997 ckd_free((void *) lm->tg_segbase);
1998 ckd_free((void *) lm->tgprob);
1999 ckd_free((void *) lm->tgbowt);
2000 }
2001
2002 if (lm->lmclass) {
2003 for (i = 0; i < lm->n_lmclass; ++i)
2004 lmclass_free(lm->lmclass[i]);
2005 ckd_free(lm->lmclass);
2006 }
2007
2008 if (lm->inclass_ugscore) {
2009 ckd_free(lm->inclass_ugscore);
2010 }
2011
2012 if (lm->HT) {
2013 hash_table_free(lm->HT);
2014 }
2015
2016 if (lm->dict2lmwid) {
2017 ckd_free(lm->dict2lmwid);
2018 }
2019
2020 if (lm->name)
2021 ckd_free(lm->name);
2022
2023 ckd_free((void *) lm);
2024 }
2025
2026 static void
copy_bgt_to_bg32t(bg_t * b16,bg32_t * b32)2027 copy_bgt_to_bg32t(bg_t * b16, bg32_t * b32)
2028 {
2029 b32->wid = (s3lmwid32_t) b16->wid;
2030 b32->probid = (uint32) b16->probid;
2031 b32->bowtid = (uint32) b16->bowtid;
2032 b32->firsttg = (uint32) b16->firsttg;
2033 }
2034
2035 void
copy_bg_to_bg32(lm_t * lm)2036 copy_bg_to_bg32(lm_t * lm)
2037 {
2038 int i;
2039 assert(lm->bg32 == NULL);
2040 lm->bg32 = (bg32_t *) ckd_calloc(lm->n_bg + 1, sizeof(bg32_t));
2041
2042 for (i = 0; i <= lm->n_bg; i++)
2043 copy_bgt_to_bg32t(&(lm->bg[i]), &(lm->bg32[i]));
2044 }
2045
2046 static void
copy_bg32t_to_bgt(bg32_t * b32,bg_t * b16)2047 copy_bg32t_to_bgt(bg32_t * b32, bg_t * b16)
2048 {
2049 assert(b32->wid <= LM_LEGACY_CONSTANT);
2050 b16->wid = (s3lmwid_t) b32->wid;
2051 b16->probid = (uint16) b32->probid;
2052 b16->bowtid = (uint16) b32->bowtid;
2053 b16->firsttg = (uint16) b32->firsttg;
2054 }
2055
2056 void
copy_bg32_to_bg(lm_t * lm)2057 copy_bg32_to_bg(lm_t * lm)
2058 {
2059 int i;
2060 assert(lm->bg == NULL);
2061 lm->bg = (bg_t *) ckd_calloc(lm->n_bg + 1, sizeof(bg_t));
2062
2063 for (i = 0; i <= lm->n_bg; i++)
2064 copy_bg32t_to_bgt(&(lm->bg32[i]), &(lm->bg[i]));
2065
2066 }
2067
2068 void
swap_bg(bg_t * b16)2069 swap_bg(bg_t * b16)
2070 {
2071 SWAP_INT16(&(b16->wid));
2072 SWAP_INT16(&(b16->probid));
2073 SWAP_INT16(&(b16->bowtid));
2074 SWAP_INT16(&(b16->firsttg));
2075 }
2076
2077 void
swap_bg32(bg32_t * b32)2078 swap_bg32(bg32_t * b32)
2079 {
2080 SWAP_INT32(&(b32->wid));
2081 SWAP_INT32(&(b32->probid));
2082 SWAP_INT32(&(b32->bowtid));
2083 SWAP_INT32(&(b32->firsttg));
2084 }
2085
2086 static void
copy_tgt_to_tg32t(tg_t * t16,tg32_t * t32)2087 copy_tgt_to_tg32t(tg_t * t16, tg32_t * t32)
2088 {
2089 t32->wid = (s3lmwid32_t) t16->wid;
2090 t32->probid = (uint32) t16->probid;
2091 }
2092
2093
2094
2095 void
copy_tg_to_tg32(lm_t * lm)2096 copy_tg_to_tg32(lm_t * lm)
2097 {
2098 int i;
2099 assert(lm->tg32 == NULL);
2100 lm->tg32 = (tg32_t *) ckd_calloc(lm->n_tg, sizeof(tg32_t));
2101
2102 for (i = 0; i < lm->n_tg; i++)
2103 copy_tgt_to_tg32t(&(lm->tg[i]), &(lm->tg32[i]));
2104
2105 }
2106
2107 static void
copy_tg32t_to_tgt(tg32_t * t32,tg_t * t16)2108 copy_tg32t_to_tgt(tg32_t * t32, tg_t * t16)
2109 {
2110 t16->wid = (s3lmwid_t) t32->wid;
2111 t16->probid = (uint32) t32->probid;
2112 }
2113
2114
2115 void
copy_tg32_to_tg(lm_t * lm)2116 copy_tg32_to_tg(lm_t * lm)
2117 {
2118 int i;
2119 assert(lm->tg == NULL);
2120 lm->tg = (tg_t *) ckd_calloc(lm->n_tg, sizeof(tg_t));
2121
2122 for (i = 0; i < lm->n_tg; i++)
2123 copy_tg32t_to_tgt(&(lm->tg32[i]), &(lm->tg[i]));
2124
2125 }
2126
2127 void
swap_tg(tg_t * t16)2128 swap_tg(tg_t * t16)
2129 {
2130 SWAP_INT16(&(t16->wid));
2131 SWAP_INT16(&(t16->probid));
2132 }
2133
2134 void
swap_tg32(tg32_t * t32)2135 swap_tg32(tg32_t * t32)
2136 {
2137 SWAP_INT32(&(t32->wid));
2138 SWAP_INT32(&(t32->probid));
2139 }
2140
2141
2142 int32
lm_rawscore(lm_t * lm,int32 score)2143 lm_rawscore(lm_t * lm, int32 score)
2144 {
2145
2146 score -= lm->wip;
2147 score /= (int32) lm->lw;
2148
2149 return score;
2150 }
2151
2152 void
lm_convert_structure(lm_t * model,int32 is32bits)2153 lm_convert_structure(lm_t * model, int32 is32bits)
2154 {
2155 /* Convert the data structure */
2156 if (is32bits) { /* Convert from 16 bits to 32 bits */
2157 if (model->n_bg > 0) {
2158 if (model->bg32 == NULL) {
2159 assert(model->bg != NULL);
2160 copy_bg_to_bg32(model);
2161 }
2162 }
2163 if (model->n_tg > 0) {
2164 if (model->tg32 == NULL) {
2165 assert(model->tg != NULL);
2166 copy_tg_to_tg32(model);
2167 }
2168 }
2169 }
2170 else { /* Convert from 32 bits to 16 bits */
2171 if (model->n_bg > 0) {
2172 if (model->bg == NULL) {
2173 assert(model->bg32 != NULL);
2174 copy_bg32_to_bg(model);
2175 }
2176 }
2177 if (model->n_tg > 0) {
2178 if (model->tg == NULL) {
2179 assert(model->tg32 != NULL);
2180 copy_tg32_to_tg(model);
2181 }
2182 }
2183 }
2184
2185 if (is32bits) {
2186 if (model->bg > 0)
2187 assert(model->bg32 != NULL);
2188 if (model->tg > 0)
2189 assert(model->tg32 != NULL);
2190 }
2191 else {
2192 if (model->bg > 0)
2193 assert(model->bg != NULL);
2194 if (model->tg > 0)
2195 assert(model->tg != NULL);
2196 }
2197
2198 }
2199
2200
2201
2202 #if (_LM_TEST_)
/*
 * Test helper (compiled only when _LM_TEST_ is set): score one sentence.
 *
 * Splits line into words, wraps them with S3_START_WORD / S3_FINISH_WORD,
 * and sums the trigram scores of each word given its two predecessors,
 * printing every per-word score.  Returns the total, or 0 (after logging
 * an error) if a word of the sentence is not in the LM.  A missing start or
 * finish word, or a negative return from str2words, is fatal.
 *
 * NOTE(review): lm_tg_score() as defined in this file takes five arguments
 * (the last being the dictionary word id w3), but it is called with four
 * below; this test code looks out of date with the current signature.
 */
static int32
sentence_lmscore(lm_t * lm, const char *line)
{
    char *word[1024];
    s3lmwid32_t w[1024];
    int32 nwd, score, tgscr;
    int32 i, j;

    /* At most 1020 words, leaving room in w[] for the two context slots
     * and the finish word */
    if ((nwd = str2words(line, word, 1020)) < 0)
        E_FATAL("Increase word[] and w[] arrays size\n");

    /* w[0] is an invalid id so the first lookups have no real w1 context */
    w[0] = BAD_LMWID(lm);
    w[1] = lm_wid(lm, S3_START_WORD);
    if (NOT_LMWID(lm, w[1]))
        E_FATAL("Unknown word: %s\n", S3_START_WORD);

    for (i = 0; i < nwd; i++) {
        w[i + 2] = lm_wid(lm, word[i]);
        if (NOT_LMWID(lm, w[i + 2])) {
            E_ERROR("Unknown word: %s\n", word[i]);
            return 0;
        }
    }

    /* i == nwd here: append the finish word after the sentence words */
    w[i + 2] = lm_wid(lm, S3_FINISH_WORD);
    if (NOT_LMWID(lm, w[i + 2]))
        E_FATAL("Unknown word: %s\n", S3_FINISH_WORD);

    /* Score each sentence word and the finish word (nwd + 1 lookups) */
    score = 0;
    for (i = 0, j = 2; i <= nwd; i++, j++) {
        tgscr = lm_tg_score(lm, w[j - 2], w[j - 1], w[j]);
        score += tgscr;
        printf("\t%10d %s\n", tgscr, lm->wordstr[w[j]]);
    }

    return (score);
}
2240
2241
/*
 * Test driver (compiled only when _LM_TEST_ is set): load the LM dump file
 * named on the command line, then read sentences from stdin and print the
 * LM score of each until end of input.
 */
int
main(int32 argc, char *argv[])
{
    char line[4096];
    int32 score, k;
    lm_t *lm;

    if (argc < 2)
        E_FATAL("Usage: %s <LMdumpfile>\n", argv[0]);

    logs3_init(1.0001, 1, 1);
    lm = lm_read(argv[1], 9.5, 0.2);

    if (1) {                    /* Short cut this so we can test for memory leaks */
        for (;;) {
            printf("> ");
            if (fgets(line, sizeof(line), stdin) == NULL)
                break;

            score = sentence_lmscore(lm, line);

            /* Strip the trailing newline kept by fgets; guard against an
             * empty string so line[-1] is never read */
            k = strlen(line);
            if ((k > 0) && (line[k - 1] == '\n'))
                line[k - 1] = '\0';
            printf("LMScr(%s) = %d\n", line, score);
        }
    }                           /*  */
    lm_free(lm);
    exit(0);
}
2271 #endif
2272