1 /* ==================================================================== 2 * Copyright (c) 1999-2004 Carnegie Mellon University. All rights 3 * reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in 14 * the documentation and/or other materials provided with the 15 * distribution. 16 * 17 * This work was supported in part by funding from the Defense Advanced 18 * Research Projects Agency and the National Science Foundation of the 19 * United States of America, and the CMU Sphinx Speech Consortium. 20 * 21 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 22 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 23 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 25 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 * 33 * ==================================================================== 34 * 35 */ 36 /* 37 * lm_3g.c -- Darpa Trigram LM module (adapted to Sphinx 3) 38 * 39 * ********************************************** 40 * CMU ARPA Speech Project 41 * 42 * Copyright (c) 1997 Carnegie Mellon University. 43 * ALL RIGHTS RESERVED. 44 * ********************************************** 45 * 46 * HISTORY 47 * $Log: lm_3g.c,v $ 48 * Revision 1.3 2006/03/01 20:05:09 arthchan2003 49 * Pretty format the lm dumping and make numbers in 4 decimals only. 50 * 51 * Revision 1.2 2006/02/23 04:08:36 arthchan2003 52 * Merged from branch SPHINX3_5_2_RCI_IRII_BRANCH 53 * 1, Added lm_3g.c - a TXT-based LM routines. 54 * 2, Added lm_3g_dmp.c - a DMP-based LM routines. 55 * 3, (Contributed by LIUM) Added lm_attfsm.c - convert lm to FSM 56 * 4, Added lmset.c - a wrapper for the lmset_t structure. 57 * 58 * Revision 1.1.2.4 2006/01/16 19:58:25 arthchan2003 59 * Small change to make function public. 60 * 61 * Revision 1.1.2.3 2005/11/17 06:21:05 arthchan2003 62 * 1, Change all lm_write_arpa_* functions' s3wid_t to s3lmwid_t. 2, Make the writing code to be changed more easily. 63 * 64 * Revision 1.1.2.2 2005/07/28 20:04:20 dhdfu 65 * Make LM writing not crash for bigram LM 66 * 67 * Revision 1.1.2.1 2005/07/13 01:39:55 arthchan2003 68 * Added lm_3g.c, which take cares of reading and writing of LM into the lm_t (3-gram specific lm structure.) 69 * 70 */ 71 72 #include <string.h> 73 74 #include "pio.h" 75 #include "lm.h" 76 #include "bio.h" 77 #include "logs3.h" 78 #include "wid.h" 79 80 /*#define LEGACY_MAX_SORTED_ENTRIES 65534*/ 81 82 /* 83 20060321 ARCHAN: Why MAX_SORTED_ENTRIES = 200000? 84 85 Generally, MAX_SORTED_ENTRIES relates the quantization schemes of 86 the log probabilities values and back-off weights. In the 87 Sphinx/CMUCamLMtk case. This is usually represented as 88 -x.xxxx. That is to say maximally 100000 could be involved. It is 89 also possible that value such as -99.0 and other special values is 90 involved. Therefore, doubling the size is reasonable measure. 91 92 When we use better precision, say 5 decimal places, we should 93 revise the validity of the above schemes. 94 */ 95 #define MAX_SORTED_ENTRIES 200000 96 #define QUANTIZATION_MULTIPLIER 10000 97 #define QUANTIZATION_DIVISOR 0.0001 98 99 100 #define FIRST_BG(m,u) ((m)->ug[u].firstbg) 101 #define TSEG_BASE(m,b) ((m)->tg_segbase[(b)>>LOG2_BG_SEG_SZ]) 102 103 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->bg[b].firsttg)) 104 #define FIRST_TG32(m,b) (TSEG_BASE((m),(b))+((m)->bg32[b].firsttg)) 105 106 107 static int32 108 wstr2wid(lm_t * model, /**< an LM */ 109 char *w /**< a pre-allocated word string */ 110 ) 111 { 112 void *val; 113 114 if (hash_table_lookup(model->HT, w, &val) != 0) 115 return NO_WORD; 116 return ((int32)(long)val); 117 } 118 119 /* 120 * Initialize sorted list with the 0-th entry = MIN_PROB_F, which may be needed 121 * to replace spurious values in the Darpa LM file. 122 */ 123 static void 124 init_sorted_list(sorted_list_t * l /**< a sorted list */ 125 ) 126 { 127 l->list = 128 (sorted_entry_t *) ckd_calloc(MAX_SORTED_ENTRIES, 129 sizeof(sorted_entry_t)); 130 l->list[0].val.f = MIN_PROB_F; 131 l->list[0].lower = 0; 132 l->list[0].higher = 0; 133 l->free = 1; 134 } 135 136 /** 137 * Free a sorted list 138 */ 139 static void 140 free_sorted_list(sorted_list_t * l /**< a sorted list */ 141 ) 142 { 143 free(l->list); 144 } 145 146 static lmlog_t * 147 vals_in_sorted_list(sorted_list_t * l) 148 { 149 lmlog_t *vals; 150 int32 i; 151 152 vals = (lmlog_t *) ckd_calloc(l->free, sizeof(lmlog_t)); 153 for (i = 0; i < l->free; i++) 154 vals[i].f = l->list[i].val.f; 155 return (vals); 156 } 157 158 static int32 159 sorted_id(sorted_list_t * l, float *val) 160 { 161 int32 i = 0; 162 163 for (;;) { 164 if (*val == l->list[i].val.f) 165 return (i); 166 if (*val < l->list[i].val.f) { 167 if (l->list[i].lower == 0) { 168 169 if (l->free >= MAX_SORTED_ENTRIES) 170 E_INFO("sorted list overflow\n"); 171 172 l->list[i].lower = l->free; 173 (l->free)++; 174 i = l->list[i].lower; 175 l->list[i].val.f = *val; 176 return (i); 177 } 178 else 179 i = l->list[i].lower; 180 } 181 else { 182 if (l->list[i].higher == 0) { 183 184 if (l->free >= MAX_SORTED_ENTRIES) 185 E_INFO("sorted list overflow\n"); 186 187 l->list[i].higher = l->free; 188 (l->free)++; 189 i = l->list[i].higher; 190 l->list[i].val.f = *val; 191 return (i); 192 } 193 else 194 i = l->list[i].higher; 195 } 196 } 197 } 198 199 /** 200 * Read and return #unigrams, #bigrams, #trigrams as stated in input file. 201 * 202 * @warning The internal buffer has size 256. 203 * 204 * @return LM_NO_DATA_MARK if there is "\\data\\" mark. LM_UNKNOWN_NG if an 205 * unknown K of K-gram appears. LM_BAD_LM_COUNT if a bad LM counts. LM_SUCCESS 206 * if the whole reading succeeds. 207 * 208 */ 209 static int 210 ReadNgramCounts(FILE * fp, int32 * n_ug, /**< Out: number of unigram read */ 211 int32 * n_bg, /**< Out: number of bigram read */ 212 int32 * n_tg /**< Out: number of trigram read */ 213 ) 214 { 215 char string[256]; 216 int32 ngram, ngram_cnt; 217 218 /* skip file until past the '\data\' marker */ 219 220 221 do { 222 fgets(string, sizeof(string), fp); 223 } while ((strcmp(string, "\\data\\\n") != 0) && (!feof(fp))); 224 225 226 if (strcmp(string, "\\data\\\n") != 0) { 227 228 E_WARN("No \\data\\ mark in LM file\n"); 229 return LM_NO_DATA_MARK; 230 } 231 232 233 *n_ug = *n_bg = *n_tg = 0; 234 while (fgets(string, sizeof(string), fp) != NULL) { 235 if (sscanf(string, "ngram %d=%d", &ngram, &ngram_cnt) != 2) 236 break; 237 238 switch (ngram) { 239 case 1: 240 *n_ug = ngram_cnt; 241 break; 242 case 2: 243 *n_bg = ngram_cnt; 244 break; 245 case 3: 246 *n_tg = ngram_cnt; 247 break; 248 default: 249 E_WARN("Unknown ngram (%d)\n", ngram); 250 return LM_UNKNOWN_NG; 251 break; 252 } 253 } 254 255 /* Position file to just after the unigrams header '\1-grams:\' */ 256 while ((strcmp(string, "\\1-grams:\n") != 0) && (!feof(fp))) 257 fgets(string, sizeof(string), fp); 258 259 /* Check counts; NOTE: #trigrams *CAN* be 0 */ 260 if ((*n_ug <= 0) || (*n_bg <= 0) || (*n_tg < 0)) { 261 E_WARN("Bad or missing ngram count\n"); 262 return LM_BAD_LM_COUNT; 263 } 264 return LM_SUCCESS; 265 } 266 267 /** 268 Allocate a new unigram table. Initially all dictionary words 269 are defined as NO_WORD, the probabilities and backoff weights 270 are -99. 271 272 @return a pointer of unigram if succeed, NULL if failed. 273 */ 274 ug_t * 275 NewUnigramTable(int32 n_ug) 276 { 277 ug_t *table; 278 int32 i; 279 table = NULL; 280 281 table = (ug_t *) ckd_calloc(n_ug, sizeof(ug_t)); 282 if (table == NULL) { 283 E_WARN("Fail to allocate the unigram table\n"); 284 return NULL; 285 } 286 for (i = 0; i < n_ug; i++) { 287 table[i].dictwid = NO_WORD; 288 table[i].prob.f = -99.0; 289 table[i].bowt.f = -99.0; 290 } 291 return table; 292 } 293 294 /** 295 Allocate a new model. That includes the tables of unigram, bigram 296 and trigram. 297 298 FIXME: should have more comments. 299 */ 300 static lm_t * 301 NewModel(int32 n_ug, int32 n_bg, int32 n_tg, int32 version) 302 { 303 lm_t *model; 304 305 model = (lm_t *) ckd_calloc(1, sizeof(lm_t)); 306 307 lm_null_struct(model); 308 /* 309 * Allocate one extra unigram and bigram entry: sentinels to terminate 310 * followers (bigrams and trigrams, respectively) of previous entry. 311 */ 312 model->ug = NewUnigramTable(n_ug + 1); 313 314 model->version = version; 315 if (model->version == LMTXT_VERSION) 316 model->is32bits = n_ug > LM_LEGACY_CONSTANT; 317 else if (model->version == LMFORCED_TXT32VERSION) 318 model->is32bits = 1; 319 320 if (model->is32bits) { 321 model->bg32 = (bg32_t *) ckd_calloc(n_bg + 1, sizeof(bg32_t)); 322 if (n_tg > 0) 323 model->tg32 = (tg32_t *) ckd_calloc(n_tg + 1, sizeof(tg32_t)); 324 325 } 326 else { 327 model->bg = (bg_t *) ckd_calloc(n_bg + 1, sizeof(bg_t)); 328 if (n_tg > 0) 329 model->tg = (tg_t *) ckd_calloc(n_tg + 1, sizeof(tg_t)); 330 } 331 332 if (n_tg > 0) { 333 model->tg_segbase = 334 (int32 *) ckd_calloc((n_bg + 1) / BG_SEG_SZ + 1, 335 sizeof(int32)); 336 337 E_INFO("%8d = tseg_base entries allocated\n", 338 (n_bg + 1) / BG_SEG_SZ + 1); 339 340 } 341 342 /*This part will not be compiled */ 343 model->max_ug = model->n_ug = n_ug; 344 model->n_bg = n_bg; 345 model->n_tg = n_tg; 346 347 model->HT = hash_table_new(model->n_ug, HASH_CASE_YES); 348 model->log_bg_seg_sz = LOG2_BG_SEG_SZ; /* Default */ 349 350 return model; 351 } 352 353 /* 354 * Read in the unigrams from given file into the LM structure model. On 355 * entry to this procedure, the file pointer is positioned just after the 356 * header line '\1-grams:'. 357 * 358 * @return LM_BAD_LM_COUNT, if there is a unigram which is large than what 359 * specify by the header. LM_SUCCESS if reading succeeds. 360 */ 361 static int32 362 ReadUnigrams(FILE * fp, lm_t * model /**< An LM where unigram will be filled in */ 363 ) 364 { 365 char string[256]; 366 char name[128]; 367 int32 wcnt, i; 368 float p1, bo_wt; 369 s3lmwid32_t startwid, endwid; 370 371 E_INFO("Reading unigrams\n"); 372 373 wcnt = 0; 374 while ((fgets(string, sizeof(string), fp) != NULL) && 375 (strcmp(string, "\\2-grams:\n") != 0)) { 376 if (sscanf(string, "%f %s %f", &p1, name, &bo_wt) != 3) { 377 if (string[0] != '\n') 378 E_WARN("Format error; unigram ignored:%s", string); 379 continue; 380 } 381 382 if (wcnt >= model->n_ug) { 383 E_WARN("Too many unigrams\n"); 384 return LM_BAD_LM_COUNT; 385 } 386 387 /* Associate name with word id */ 388 /* This is again not local */ 389 model->wordstr[wcnt] = (char *) ckd_salloc(name); 390 hash_table_enter(model->HT, model->wordstr[wcnt], (void *)(long)wcnt); 391 model->ug[wcnt].prob.f = p1; 392 model->ug[wcnt].bowt.f = bo_wt; 393 model->ug[wcnt].dictwid = wcnt; 394 wcnt++; 395 } 396 397 if (model->n_ug != wcnt) { 398 E_WARN("lm_t.n_ug(%d) != #unigrams read(%d)\n", model->n_ug, wcnt); 399 model->n_ug = wcnt; 400 } 401 402 startwid = endwid = BAD_LMWID(model); 403 404 for (i = 0; i < model->n_ug; i++) { 405 if (strcmp(model->wordstr[i], S3_START_WORD) == 0) 406 startwid = i; 407 else if (strcmp(model->wordstr[i], S3_FINISH_WORD) == 0) 408 endwid = i; 409 } 410 411 /* Force ugprob(<s>) = MIN_PROB_F */ 412 if (IS_LMWID(model, startwid)) { 413 model->ug[startwid].prob.f = MIN_PROB_F; 414 model->startlwid = startwid; 415 } 416 417 /* Force bowt(</s>) = MIN_PROB_F */ 418 if (IS_LMWID(model, endwid)) { 419 model->ug[endwid].bowt.f = MIN_PROB_F; 420 model->finishlwid = endwid; 421 } 422 423 return LM_SUCCESS; 424 } 425 426 /* 427 * Read bigrams from given file into given model structure. File may be arpabo 428 * or arpabo-id format, depending on idfmt = 0 or 1. 429 * 430 * @return LM_UNKNOWN_WORDS if a word appears but it couldn't be found 431 * in the unigram. LM_BAD_BIGRAM if the word id is bad or out of bound 432 * LM_TOO_MANY if there are too many n-grams than specified in the header. 433 * LM_SUCCESS if the reading succeeds. 434 */ 435 static int32 436 ReadBigrams(FILE * fp, lm_t * model, int32 idfmt) 437 { 438 char string[1024], word1[256], word2[256]; 439 int32 w1, w2, prev_w1, bgcount, p; 440 bg_t *bgptr; 441 bg32_t *bgptr32; 442 float p2, bo_wt; 443 int32 n_fld, n; 444 int32 is32bits; 445 446 E_INFO("Reading bigrams\n"); 447 448 bgcount = 0; 449 bgptr = model->bg; 450 bgptr32 = model->bg32; 451 452 prev_w1 = -1; 453 n_fld = (model->n_tg > 0) ? 4 : 3; 454 455 bo_wt = 0.0; 456 is32bits = model->is32bits; 457 while (fgets(string, sizeof(string), fp) != NULL) { 458 if (!idfmt) 459 n = sscanf(string, "%f %s %s %f", &p2, word1, word2, &bo_wt); 460 else 461 n = sscanf(string, "%f %d %d %f", &p2, &w1, &w2, &bo_wt); 462 if (n < n_fld) { 463 if (string[0] != '\n') 464 break; 465 continue; 466 } 467 468 if (!idfmt) { 469 if ((w1 = wstr2wid(model, word1)) == NO_WORD) { 470 E_WARN("Unknown word: %s\n", word1); 471 return LM_UNKNOWN_WORDS; 472 } 473 if ((w2 = wstr2wid(model, word2)) == NO_WORD) { 474 E_WARN("Unknown word: %s\n", word2); 475 return LM_UNKNOWN_WORDS; 476 } 477 } 478 else { 479 if ((w1 >= model->n_ug) || (w2 >= model->n_ug) || (w1 < 0) 480 || (w2 < 0)) { 481 E_WARN("Bad bigram: %s", string); 482 return LM_BAD_BIGRAM; 483 } 484 } 485 486 /* HACK!! to quantize probs to 4 decimal digits */ 487 p = p2 * QUANTIZATION_MULTIPLIER; 488 p2 = p * QUANTIZATION_DIVISOR; 489 p = bo_wt * QUANTIZATION_MULTIPLIER; 490 bo_wt = p * QUANTIZATION_DIVISOR; 491 492 if (bgcount >= model->n_bg) { 493 E_WARN("Too many bigrams\n"); 494 return LM_TOO_MANY_NGRAM; 495 } 496 497 if (is32bits) { 498 bgptr32->wid = w2; 499 bgptr32->probid = sorted_id(&(model->sorted_prob2), &p2); 500 if (model->n_tg > 0) 501 bgptr32->bowtid = 502 sorted_id(&(model->sorted_bowt2), &bo_wt); 503 504 } 505 else { 506 bgptr->wid = w2; 507 bgptr->probid = sorted_id(&(model->sorted_prob2), &p2); 508 if (model->n_tg > 0) 509 bgptr->bowtid = sorted_id(&(model->sorted_bowt2), &bo_wt); 510 } 511 512 if (w1 != prev_w1) { 513 if (w1 < prev_w1) 514 E_INFO("Bigrams not in unigram order\n"); 515 516 for (prev_w1++; prev_w1 <= w1; prev_w1++) 517 model->ug[prev_w1].firstbg = bgcount; 518 prev_w1 = w1; 519 } 520 521 bgcount++; 522 523 if (is32bits) 524 bgptr32++; 525 else 526 bgptr++; 527 528 if ((bgcount & 0x0000ffff) == 0) { 529 E_INFO_NOFN("Processing bigram .\n"); 530 } 531 } 532 if ((strcmp(string, "\\end\\\n") != 0) 533 && (strcmp(string, "\\3-grams:\n") != 0)) { 534 E_WARN("Bad bigram: %s\n", string); 535 return LM_BAD_BIGRAM; 536 } 537 538 for (prev_w1++; prev_w1 <= model->n_ug; prev_w1++) 539 model->ug[prev_w1].firstbg = bgcount; 540 541 return LM_SUCCESS; 542 } 543 544 /* 545 * Reading Trigrams 546 * 547 * Very similar to ReadBigrams (in the sense that they are both 548 * written in C.) 549 * 550 * @return LM_UNKNOWN_WORDS when an unknown word couldn't be found. 551 * LM_BAD_TRIGRAM when a bad trigram is found. LM_NO_MINUS_1GRAM if 552 * the corresponding bigram of a trigram couldn't be 553 * found. LM_OFFSET_TOO_LARGE when in 16 bit mode, the trigram count 554 * of a segment is too huge. LM_BAD_TRIGRAM when a bad trigram is 555 * found. LM_SUCCESS when the whole reading is ok. 556 */ 557 static int 558 ReadTrigrams(FILE * fp, lm_t * model, int32 idfmt) 559 { 560 char string[1024], word1[256], word2[256], word3[256]; 561 int32 i, n, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg, 562 p; 563 int32 seg, prev_seg, prev_seg_lastbg; 564 tg_t *tgptr; 565 tg32_t *tgptr32; 566 bg_t *bgptr; 567 bg32_t *bgptr32; 568 float p3; 569 int32 is32bits; 570 571 E_INFO("Reading trigrams\n"); 572 573 is32bits = model->is32bits; 574 tgcount = 0; 575 tgptr = model->tg; 576 tgptr32 = model->tg32; 577 prev_w1 = -1; 578 prev_w2 = -1; 579 prev_bg = -1; 580 prev_seg = -1; 581 582 while (fgets(string, sizeof(string), fp) != NULL) { 583 if (!idfmt) 584 n = sscanf(string, "%f %s %s %s", &p3, word1, word2, word3); 585 else 586 n = sscanf(string, "%f %d %d %d", &p3, &w1, &w2, &w3); 587 if (n != 4) { 588 if (string[0] != '\n') 589 break; 590 continue; 591 } 592 593 if (!idfmt) { 594 if ((w1 = wstr2wid(model, word1)) == NO_WORD) { 595 E_WARN("Unknown word: %s\n", word1); 596 return LM_UNKNOWN_WORDS; 597 } 598 if ((w2 = wstr2wid(model, word2)) == NO_WORD) { 599 E_WARN("Unknown word: %s\n", word2); 600 return LM_UNKNOWN_WORDS; 601 } 602 if ((w3 = wstr2wid(model, word3)) == NO_WORD) { 603 E_WARN("Unknown word: %s\n", word3); 604 return LM_UNKNOWN_WORDS; 605 } 606 } 607 else { 608 if ((w1 >= model->n_ug) || (w2 >= model->n_ug) 609 || (w3 >= model->n_ug) || (w1 < 0) || (w2 < 0) || (w3 < 0)) { 610 E_WARN("Bad trigram: %s\n", string); 611 return LM_BAD_TRIGRAM; 612 } 613 } 614 615 /* HACK!! to quantize probs to 4 decimal digits */ 616 p = p3 * QUANTIZATION_MULTIPLIER; 617 p3 = p * QUANTIZATION_DIVISOR; 618 619 if (tgcount >= model->n_tg) { 620 E_WARN("Too many trigrams\n"); 621 return LM_TOO_MANY_NGRAM; 622 } 623 624 if (is32bits) { 625 tgptr32->wid = w3; 626 tgptr32->probid = sorted_id(&model->sorted_prob3, &p3); 627 628 } 629 else { 630 tgptr->wid = w3; 631 tgptr->probid = sorted_id(&model->sorted_prob3, &p3); 632 } 633 634 if ((w1 != prev_w1) || (w2 != prev_w2)) { 635 /* Trigram for a new bigram; update tg info for all previous bigrams */ 636 if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) 637 E_INFO("Trigrams not in bigram order\n"); 638 639 640 bg = (w1 != prev_w1) ? model->ug[w1].firstbg : prev_bg + 1; 641 endbg = model->ug[w1 + 1].firstbg; 642 643 if (is32bits) { 644 bgptr32 = model->bg32 + bg; 645 for (; (bg < endbg) && (bgptr32->wid != w2); 646 bg++, bgptr32++); 647 } 648 else { 649 bgptr = model->bg + bg; 650 for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++); 651 } 652 653 if (bg >= endbg) { 654 E_WARN("Missing bigram for trigram: %s", string); 655 return LM_NO_MINUS_1GRAM; 656 } 657 658 /* bg = bigram entry index for <w1,w2>. Update tseg_base */ 659 seg = bg >> LOG2_BG_SEG_SZ; 660 for (i = prev_seg + 1; i <= seg; i++) 661 model->tg_segbase[i] = tgcount; 662 663 /* E_INFO("bg %d, seg %d, prev_seg %d, tgcount %d, tg_segbase[prev_seg] %d, tgoff %d\n",bg,seg,prev_seg,tgcount,model->tg_segbase[prev_seg],tgcount - model->tg_segbase[prev_seg]); */ 664 /* Update trigrams pointers for all bigrams until bg */ 665 if (prev_seg < seg) { 666 int32 tgoff = 0; 667 668 if (prev_seg >= 0) { 669 tgoff = tgcount - model->tg_segbase[prev_seg]; 670 671 /* E_INFO("Offset %d tgcount %d, seg_base %d from tseg_base > %d, prev_seg %d seg %d\n", 672 tgoff, tgcount, model->tg_segbase[prev_seg],LM_LEGACY_CONSTANT,prev_seg,seg); */ 673 674 if (!is32bits) { 675 if (tgoff > LM_LEGACY_CONSTANT) { 676 E_WARN 677 ("Offset %d tgcount %d, seg_base %d from tseg_base > %d\n", 678 tgoff, tgcount, 679 model->tg_segbase[prev_seg], 680 LM_LEGACY_CONSTANT); 681 return LM_OFFSET_TOO_LARGE; 682 } 683 } 684 } 685 686 prev_seg_lastbg = ((prev_seg + 1) << LOG2_BG_SEG_SZ) - 1; 687 688 if (is32bits) { 689 bgptr32 = model->bg32 + prev_bg; 690 for (++prev_bg, ++bgptr32; prev_bg <= prev_seg_lastbg; 691 prev_bg++, bgptr32++) 692 bgptr32->firsttg = tgoff; 693 for (; prev_bg <= bg; prev_bg++, bgptr32++) 694 bgptr32->firsttg = 0; 695 } 696 else { 697 bgptr = model->bg + prev_bg; 698 for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg; 699 prev_bg++, bgptr++) 700 bgptr->firsttg = tgoff; 701 for (; prev_bg <= bg; prev_bg++, bgptr++) 702 bgptr->firsttg = 0; 703 } 704 705 } 706 else { 707 int32 tgoff; 708 709 tgoff = tgcount - model->tg_segbase[prev_seg]; 710 711 if (!is32bits) { 712 if (tgoff > LM_LEGACY_CONSTANT) { 713 E_WARN 714 ("Offset %d tgcount %d, seg_base %d from tseg_base > %d\n", 715 tgoff, tgcount, model->tg_segbase[prev_seg], 716 LM_LEGACY_CONSTANT); 717 return LM_OFFSET_TOO_LARGE; 718 } 719 } 720 721 if (is32bits) { 722 bgptr32 = model->bg32 + prev_bg; 723 for (++prev_bg, ++bgptr32; prev_bg <= bg; 724 prev_bg++, bgptr32++) 725 bgptr32->firsttg = tgoff; 726 } 727 else { 728 bgptr = model->bg + prev_bg; 729 for (++prev_bg, ++bgptr; prev_bg <= bg; 730 prev_bg++, bgptr++) 731 bgptr->firsttg = tgoff; 732 } 733 } 734 735 prev_w1 = w1; 736 prev_w2 = w2; 737 prev_bg = bg; 738 prev_seg = seg; 739 } 740 741 tgcount++; 742 /* E_INFO("\ntg_count %d: This line: %s, w1 %d, prev_w1 %d, w2 %d, prev_2 %d, w3 %d\n\n", 743 tgcount,string, 744 w1,prev_w1, 745 w2,prev_w2, 746 w3 747 ); */ 748 749 if (is32bits) { 750 tgptr32++; 751 } 752 else 753 tgptr++; 754 755 if ((tgcount & 0x0000ffff) == 0) { 756 E_INFO_NOFN("Processing trigram .\n"); 757 } 758 } 759 if (strcmp(string, "\\end\\\n") != 0) { 760 E_WARN("Bad trigram: %s\n", string); 761 return LM_BAD_TRIGRAM; 762 } 763 764 for (prev_bg++; prev_bg <= model->n_bg; prev_bg++) { 765 if ((prev_bg & (BG_SEG_SZ - 1)) == 0) 766 model->tg_segbase[prev_bg >> LOG2_BG_SEG_SZ] = tgcount; 767 768 if (!is32bits) { 769 if ((tgcount - model->tg_segbase[prev_bg >> LOG2_BG_SEG_SZ]) > 770 LM_LEGACY_CONSTANT) { 771 E_WARN 772 ("Offset %d tgcount %d, seg_base %d from tseg_base > %d\n", 773 model->tg_segbase[prev_seg >> LOG2_BG_SEG_SZ], 774 tgcount, 775 model->tg_segbase[prev_seg >> LOG2_BG_SEG_SZ], 776 LM_LEGACY_CONSTANT); 777 return LM_OFFSET_TOO_LARGE; 778 } 779 } 780 781 if (is32bits) { 782 model->bg32[prev_bg].firsttg = 783 tgcount - model->tg_segbase[prev_bg >> LOG2_BG_SEG_SZ]; 784 } 785 else { 786 model->bg[prev_bg].firsttg = 787 tgcount - model->tg_segbase[prev_bg >> LOG2_BG_SEG_SZ]; 788 } 789 } 790 791 return LM_SUCCESS; 792 } 793 794 /** 795 This is a function to read an ARPA-based LM. 796 797 (200060708) At this point, it is not memory leak-free. 798 799 @return an initialized LM if everything is alright. NULL, if something goes 800 wrong. 801 802 */ 803 lm_t * 804 lm_read_txt(const char *filename, /**< Input: The file name*/ 805 int32 lminmemory, /**< Input: Whether lm is in memory */ 806 int32 * err_no, /**< Input/Output: Depends on the problem that LM 807 reading encounters, it could be errors 808 from -2 (LM_OFFSET_TOO_LARGE) to 809 -15 (LM_CANNOT_ALLOCATE). Please checkout 810 lm.h for details. 811 */ 812 int32 isforced32bit, /** Input: normally, we should let lm_read_txt 813 to decide whether a file is 32 bit or not. 814 When the lm_read_txt couldn't decide that before 815 reading or if more specificially when we hit 816 the LM segment size problems. Then this bit 817 will alter the reading behavior to 32 bit. 818 */ 819 logmath_t *logmath 820 ) 821 { 822 lm_t *model; 823 FILE *fp = NULL; 824 int32 usingPipe = FALSE; 825 int32 n_unigram; 826 int32 n_bigram; 827 int32 n_trigram; 828 int32 idfmt = 0; 829 int32 _errmsg; 830 831 E_INFO("Reading LM file %s\n", filename); 832 833 fp = fopen_comp(filename, "r", &usingPipe); 834 if (fp == NULL) { 835 E_WARN("failed to read filename for LM\n"); 836 *err_no = LM_FILE_NOT_FOUND; 837 return NULL; 838 } 839 840 _errmsg = ReadNgramCounts(fp, &n_unigram, &n_bigram, &n_trigram); 841 if (_errmsg != LM_SUCCESS) { 842 E_WARN("Couldnt' read the ngram count\n"); 843 *err_no = _errmsg; 844 fclose(fp); 845 return NULL; 846 } 847 848 E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram); 849 /* HACK! This should be something provided by the dictionary What is dict_size? */ 850 851 model = NewModel(n_unigram, n_bigram, n_trigram, 852 isforced32bit ? 853 LMFORCED_TXT32VERSION : LMTXT_VERSION); 854 if (model == NULL) { 855 E_WARN 856 ("Cannot allocate tables for new lm with ug %d, bg %d, tg %d\n", 857 n_unigram, n_bigram, n_trigram); 858 *err_no = LM_CANNOT_ALLOCATE; 859 fclose(fp); 860 return NULL; 861 } 862 863 model->n_ng = 1; 864 model->logmath = logmath; 865 866 model->isLM_IN_MEMORY = lminmemory; 867 868 model->is32bits = lm_is32bits(model); 869 if (model->is32bits) 870 E_INFO("Is 32 bits %d, lm->version %d\n", model->is32bits, 871 model->version); 872 873 /* ARCHAN. Should do checking as well. I was lazy. 874 */ 875 876 if (model->n_bg > 0) { 877 model->n_ng = 2; 878 if (model->is32bits) { 879 model->membg32 = 880 (membg32_t *) ckd_calloc(model->n_ug, sizeof(membg32_t)); 881 882 } 883 else { 884 model->membg = 885 (membg_t *) ckd_calloc(model->n_ug, sizeof(membg_t)); 886 887 } 888 } 889 890 if (model->n_tg > 0) { 891 model->n_ng = 3; 892 893 if (model->is32bits) { 894 model->tginfo32 = 895 (tginfo32_t **) ckd_calloc(model->n_ug, 896 sizeof(tginfo32_t *)); 897 898 } 899 else { 900 model->tginfo = 901 (tginfo_t **) ckd_calloc(model->n_ug, sizeof(tginfo_t *)); 902 } 903 } 904 905 906 /* Have to put it somewhere in lm as a kind of buffer */ 907 model->wordstr = (char **) ckd_calloc(n_unigram, sizeof(char *)); 908 909 /* control the lm dumping mechanism */ 910 911 _errmsg = ReadUnigrams(fp, model); 912 if (_errmsg != LM_SUCCESS) { 913 *err_no = _errmsg; 914 fclose(fp); 915 return NULL; 916 } 917 918 E_INFO("%8d = #unigrams created\n", model->n_ug); 919 920 init_sorted_list(&(model->sorted_prob2)); 921 if (model->n_tg > 0) 922 init_sorted_list(&(model->sorted_bowt2)); 923 924 _errmsg = ReadBigrams(fp, model, idfmt); 925 if (_errmsg != LM_SUCCESS) { 926 *err_no = _errmsg; 927 fclose(fp); 928 return NULL; 929 } 930 931 model->n_bg = FIRST_BG(model, model->n_ug); 932 model->n_bgprob = model->sorted_prob2.free; 933 model->bgprob = vals_in_sorted_list(&(model->sorted_prob2)); 934 free_sorted_list(&(model->sorted_prob2)); 935 936 E_INFO("%8d = #bigrams created\n", model->n_bg); 937 E_INFO("%8d = #prob2 entries\n", model->n_bgprob); 938 939 if (model->n_tg > 0) { 940 /* Create trigram bo-wts array */ 941 model->n_tgbowt = model->sorted_bowt2.free; 942 model->tgbowt = vals_in_sorted_list(&(model->sorted_bowt2)); 943 free_sorted_list(&(model->sorted_bowt2)); 944 E_INFO("%8d = #bo_wt2 entries\n", model->n_tgbowt); 945 946 init_sorted_list(&(model->sorted_prob3)); 947 948 _errmsg = ReadTrigrams(fp, model, idfmt); 949 if (_errmsg != LM_SUCCESS) { 950 *err_no = _errmsg; 951 fclose(fp); 952 return NULL; 953 } 954 955 model->n_tg = 956 model->is32bits ? FIRST_TG32(model, 957 model->n_bg) : FIRST_TG(model, 958 model-> 959 n_bg); 960 model->n_tgprob = model->sorted_prob3.free; 961 model->tgprob = vals_in_sorted_list(&(model->sorted_prob3)); 962 E_INFO("%8d = #trigrams created\n", model->n_tg); 963 E_INFO("%8d = #prob3 entries\n", model->n_tgprob); 964 965 free_sorted_list(&model->sorted_prob3); 966 } 967 968 fclose(fp); 969 *err_no = LM_SUCCESS; 970 return model; 971 } 972 973 974 static char const *txtheader[] = { 975 "#############################################################################", 976 "#Copyright (c) 1999-2004 Carnegie Mellon University. All rights reserved", 977 "#############################################################################", 978 "#=============================================================================", 979 "#=============== This file was produced by the CMU Sphinx 3.X ==============", 980 "#=============================================================================", 981 "#############################################################################", 982 "This file is in the ARPA-standard format introduced by Doug Paul.", 983 "", 984 "p(wd3|wd1,wd2)= if(trigram exists) p_3(wd1,wd2,wd3)", 985 " else if(bigram w1,w2 exists) bo_wt_2(w1,w2)*p(wd3|wd2)", 986 " else p(wd3|w2)", 987 "", 988 "p(wd2|wd1)= if(bigram exists) p_2(wd1,wd2)", 989 " else bo_wt_1(wd1)*p_1(wd2)", 990 "", 991 "All probs and back-off weights (bo_wt) are given in log10 form.", 992 "\\", 993 "Data formats:", 994 "", 995 "Beginning of data mark: \\data\\", 996 NULL 997 }; 998 999 /** 1000 Write an arpa LM header (Sphinx 3.x -specific) 1001 */ 1002 static void 1003 lm_write_arpa_header(lm_t * lmp, /**< The LM pointer */ 1004 FILE * fp /**< The file pointer */ 1005 ) 1006 { 1007 /* Print header */ 1008 int32 i, j; 1009 for (i = 0; txtheader[i] != NULL; i++) 1010 fprintf(fp, "%s\n", txtheader[i]); 1011 1012 for (i = 1; i <= lmp->n_ng; i++) { 1013 fprintf(fp, "ngram %d=nr # number of %d-grams\n", i, i); 1014 } 1015 fprintf(fp, "\n"); 1016 for (i = 1; i <= lmp->n_ng; i++) { 1017 fprintf(fp, "\\%d-grams:\n", i); 1018 fprintf(fp, "p_%d ", i); 1019 for (j = 1; j <= i; j++) { 1020 fprintf(fp, "wd_%d ", j); 1021 } 1022 if (i == lmp->n_ng) { 1023 fprintf(fp, "\n"); 1024 } 1025 else { 1026 fprintf(fp, "bo_wt_%d\n", i); 1027 } 1028 } 1029 1030 fprintf(fp, "\n"); 1031 fprintf(fp, "end of data mark: \\end\\\n"); 1032 fprintf(fp, "\n"); 1033 } 1034 1035 /** 1036 Write the n-gram counts for ARPA file format. 1037 */ 1038 static void 1039 lm_write_arpa_count(lm_t * lmp, /**< The LM pointer */ 1040 FILE * fp /**< The file pointer */ 1041 ) 1042 { 1043 fprintf(fp, "\\data\\\n"); 1044 1045 fprintf(fp, "ngram %d=%d\n", 1, lmp->n_ug); 1046 if (lmp->n_bg) 1047 fprintf(fp, "ngram %d=%d\n", 2, lmp->n_bg); 1048 if (lmp->n_tg) 1049 fprintf(fp, "ngram %d=%d\n", 3, lmp->n_tg); 1050 1051 fprintf(fp, "\n"); 1052 1053 } 1054 1055 /** 1056 Write unigram 1057 */ 1058 1059 static void 1060 lm_write_arpa_unigram(lm_t * lmp, /**< The LM pointer */ 1061 FILE * fp /**< The file pointer */ 1062 ) 1063 { 1064 int32 i; 1065 fprintf(fp, "\\1-grams:\n"); 1066 for (i = 0; i < lmp->n_ug; i++) { 1067 fprintf(fp, "%.4f ", lmp->ug[i].prob.f); 1068 fprintf(fp, "%s", lmp->wordstr[i]); 1069 fprintf(fp, " "); 1070 fprintf(fp, "%.4f\n", lmp->ug[i].bowt.f); 1071 } 1072 fprintf(fp, "\n"); 1073 } 1074 1075 /** 1076 Write bigram in arpa format 1077 */ 1078 1079 static void 1080 lm_write_arpa_bigram(lm_t * lmp, FILE * fp) 1081 { 1082 int32 i, j; 1083 int32 n, b; 1084 s3lmwid32_t lw1, lw2; 1085 int32 is32bit; 1086 uint32 probid; 1087 uint32 bowtid; 1088 1089 fprintf(fp, "\\2-grams:\n"); 1090 1091 /* Decide whether we are working on 32bits 1092 either lmp->bg is NULL or number of ug need more than 16 bits to represent. 1093 */ 1094 is32bit = lm_is32bits(lmp); 1095 1096 for (i = 0; i <= lmp->n_ug - 1; i++) { 1097 b = lmp->ug[i].firstbg; 1098 n = lmp->ug[i + 1].firstbg; 1099 1100 for (j = b; j < n; j++) { 1101 lw1 = i; 1102 if (is32bit) { 1103 assert(lmp->bg32 != NULL); 1104 lw2 = lmp->bg32[j].wid; 1105 probid = lmp->bg32[j].probid; 1106 bowtid = lmp->bg32[j].bowtid; 1107 1108 } 1109 else { 1110 assert(lmp->bg != NULL); 1111 lw2 = (s3lmwid32_t) lmp->bg[j].wid; 1112 probid = (int32) lmp->bg[j].probid; 1113 bowtid = (int32) lmp->bg[j].bowtid; 1114 } 1115 1116 fprintf(fp, "%.4f ", lmp->bgprob[probid].f); 1117 fprintf(fp, "%s", lmp->wordstr[lw1]); 1118 fprintf(fp, " "); 1119 fprintf(fp, "%s", lmp->wordstr[lw2]); 1120 1121 if (lmp->tgbowt) { 1122 fprintf(fp, " "); 1123 fprintf(fp, "%.4f\n", lmp->tgbowt[bowtid].f); 1124 } 1125 else 1126 fprintf(fp, "\n"); 1127 1128 } 1129 } 1130 1131 fprintf(fp, "\n"); 1132 } 1133 1134 /** 1135 Write trigram in arpa format 1136 */ 1137 1138 static void 1139 lm_write_arpa_trigram(lm_t * lmp, /**< The pointer of LM */ 1140 FILE * fp /**< A FILE pointer */ 1141 ) 1142 { 1143 int32 i, j, k; 1144 int32 b_bg, n_bg; 1145 int32 b_tg, n_tg; 1146 s3lmwid32_t lw1, lw2, lw3; 1147 uint32 probid; 1148 int32 is32bit; 1149 1150 is32bit = lm_is32bits(lmp); 1151 1152 fprintf(fp, "\\3-grams:\n"); 1153 for (i = 0; i <= lmp->n_ug - 1; i++) { 1154 b_bg = lmp->ug[i].firstbg; 1155 n_bg = lmp->ug[i + 1].firstbg; 1156 1157 for (j = b_bg; j <= n_bg - 1; j++) { 1158 1159 if (is32bit) { 1160 assert(lmp->bg32); 1161 b_tg = 1162 lmp->tg_segbase[j >> lmp->log_bg_seg_sz] + 1163 lmp->bg32[j].firsttg; 1164 n_tg = 1165 lmp->tg_segbase[(j + 1) >> lmp->log_bg_seg_sz] + 1166 lmp->bg32[j + 1].firsttg; 1167 } 1168 else { 1169 assert(lmp->bg); 1170 b_tg = 1171 lmp->tg_segbase[j >> lmp->log_bg_seg_sz] + 1172 lmp->bg[j].firsttg; 1173 n_tg = 1174 lmp->tg_segbase[(j + 1) >> lmp->log_bg_seg_sz] + 1175 lmp->bg[j + 1].firsttg; 1176 } 1177 1178 for (k = b_tg; k < n_tg; k++) { 1179 1180 if (is32bit) { 1181 assert(lmp->bg32 && lmp->tg32); 1182 lw1 = i; 1183 lw2 = lmp->bg32[j].wid; 1184 lw3 = lmp->tg32[k].wid; 1185 probid = lmp->tg32[k].probid; 1186 } 1187 else { 1188 assert(lmp->bg && lmp->tg); 1189 lw1 = i; 1190 lw2 = (s3lmwid32_t) lmp->bg[j].wid; 1191 lw3 = (s3lmwid32_t) lmp->tg[k].wid; 1192 probid = (uint32) lmp->tg[k].probid; 1193 } 1194 1195 fprintf(fp, "%.4f ", lmp->tgprob[probid].f); 1196 fprintf(fp, "%s", lmp->wordstr[lw1]); 1197 fprintf(fp, " "); 1198 fprintf(fp, "%s", lmp->wordstr[lw2]); 1199 fprintf(fp, " "); 1200 fprintf(fp, "%s", lmp->wordstr[lw3]); 1201 fprintf(fp, "\n"); 1202 1203 } 1204 } 1205 } 1206 1207 } 1208 1209 /** 1210 Write the end for an arpa file format 1211 */ 1212 static void 1213 lm_write_arpa_end(lm_t * lmp, /**< The LM pointer */ 1214 FILE * fp /**< The FILE Pointer */ 1215 ) 1216 { 1217 fprintf(fp, "\\end\\\n"); 1218 } 1219 1220 /** 1221 * Write an LM with ARPA file format 1222 */ 1223 int32 1224 lm_write_arpa_text(lm_t * lmp, const char *outputfn) 1225 { 1226 FILE *fp; 1227 int is32bit; 1228 1229 E_INFO("Dumping LM to %s\n", outputfn); 1230 if ((fp = fopen(outputfn, "w")) == NULL) { 1231 E_ERROR("Cannot create file %s\n", outputfn); 1232 return LM_FAIL; 1233 } 1234 1235 is32bit = lm_is32bits(lmp); 1236 lm_write_arpa_header(lmp, fp); 1237 lm_write_arpa_count(lmp, fp); 1238 lm_write_arpa_unigram(lmp, fp); 1239 1240 lm_convert_structure(lmp, is32bit); 1241 1242 if (lmp->n_bg > 0) 1243 lm_write_arpa_bigram(lmp, fp); 1244 if (lmp->n_tg > 0) 1245 lm_write_arpa_trigram(lmp, fp); 1246 lm_write_arpa_end(lmp, fp); 1247 1248 fclose(fp); 1249 return LM_SUCCESS; 1250 } 1251