1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 /* ==================================================================== 3 * Copyright (c) 1999-2004 Carnegie Mellon University. All rights 4 * reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 18 * This work was supported in part by funding from the Defense Advanced 19 * Research Projects Agency and the National Science Foundation of the 20 * United States of America, and the CMU Sphinx Speech Consortium. 21 * 22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * 34 * ==================================================================== 35 * 36 */ 37 /* 38 * lm.h - Disk/memory based word-trigram backoff LM 39 * 40 * ********************************************** 41 * CMU ARPA Speech Project 42 * 43 * Copyright (c) 1997 Carnegie Mellon University. 44 * ALL RIGHTS RESERVED. 45 * ********************************************** 46 * 47 * HISTORY 48 * $Log: lm.h,v $ 49 * Revision 1.16 2006/03/02 22:10:36 arthchan2003 50 * Add *g_write into the code. 51 * 52 * Revision 1.15 2006/02/28 22:26:51 egouvea 53 * Moved definition of lm_wid() outside of the #if 0/#endif block, so 54 * it's declared. 55 * 56 * Revision 1.14 2006/02/24 13:38:08 arthchan2003 57 * Added lm_read, it is a simple version of lm_read_advance. 58 * 59 * Revision 1.13 2006/02/23 04:16:29 arthchan2003 60 * Merged from SPHINX3_5_2_RCI_IRII_BRANCH: 61 * Splited the original lm.c into five parts, 62 * a, lm.c - a controller of other subroutines. 63 * b, lm_3g.c - implement TXT-based lm operations 64 * c, lm_3g_dmp.c - implement DMP-based lm operations 65 * d, lm_attfsm.c - implement FSM-based lm operations 66 * e, lmset.c - implement sets of lm. 67 * 68 * Revision 1.12.4.3 2006/01/16 19:56:37 arthchan2003 69 * 1, lm_rawscore doesn't need a language weight, 2, Support dumping the LM in FST format. This code used Yannick Esteve's and LIUM code. 70 * 71 * Revision 1.12.4.2 2005/11/17 06:15:22 arthchan2003 72 * Added input-encoding and output-encoding into the lm structure. 73 * 74 * Revision 1.12.4.1 2005/07/13 01:46:22 arthchan2003 75 * 1, Fixed dox-doc, 2, Added more documentation for major functions such as lm_read and lm_write. 76 * 77 * Revision 1.12 2005/06/21 22:24:02 arthchan2003 78 * Log. In this change, I introduced a new interface for lm ,which is 79 * call lmset_t. lmset_t wraps up multiple lm, n_lm, n_alloclm into the 80 * same structure and handle LM initialization (lm_init) switching, 81 * (lmset_curlm_widx), delete LM (lmset_delete_lm). The internal 82 * structure is called lmarray and is an array of pointers of lm. The 83 * current lm is always maintained and pointed by a pointer called cur_lm 84 * . This substantially clarify the structure of the code. At this 85 * check-in, not every core function of lmset is completed. 86 * e.g. lmset_add_lm because that required testing of several LM reading 87 * routines and could be quite time-consuming. 88 * 89 * Log. Another notable change is the fact dict2lmwid map is started to 90 * be part of the LM. The reason of this is clearly described inside the 91 * code. Don't want to repeat here. 92 * 93 * Log. The new interface has been already used broadly in both Sphinx 94 * 3.0 and sphinx 3.x family of tools. 95 * 96 * Revision 1.5 2005/06/18 03:22:28 archan 97 * Add lmset_init. A wrapper function of various LM initialization and initialize an lmset It is now used in decode, livepretend, dag and astar. 98 * 99 * Revision 1.4 2005/06/17 23:44:40 archan 100 * Sphinx3 to s3.generic, 1, Support -lmname in decode and livepretend. 2, Wrap up the initialization of dict2lmwid to lm initialization. 3, add Dave's trick in LM switching in mode 4 of the search. 101 * 102 * Revision 1.3 2005/06/13 04:02:59 archan 103 * Fixed most doxygen-style documentation under libs3decoder. 104 * 105 * Revision 1.2 2005/05/10 21:21:54 archan 106 * Three functionalities added but not tested. Code on 1) addition/deletion of LM in mode 4. 2) reading text-based LM 3) Converting txt-based LM to dmp-based LM. 107 * 108 * Revision 1.1 2005/05/04 06:08:07 archan 109 * Refactor all lm routines except fillpen.c into ./libs3decoder/liblm/ . This will be equivalent to ./lib/liblm in future. 110 * 111 * Revision 1.6 2005/05/04 04:02:24 archan 112 * Implementation of lm addition, deletion in (mode 4) time-switching tree implementation of search. Not yet tested. Just want to keep up my own momentum. 113 * 114 * Revision 1.5 2005/04/21 23:50:26 archan 115 * Some more refactoring on the how reporting of structures inside kbcore_t is done, it is now 50% nice. Also added class-based LM test case into test-decode.sh.in. At this moment, everything in search mode 5 is already done. It is time to test the idea whether the search can really be used. 116 * 117 * Revision 1.4 2005/04/20 03:37:59 archan 118 * LM code changes: functions are added to set, add and delete LM from the lmset, change the legacy lmset data structure to contain n_lm and n_alloc_lm. 119 * 120 * Revision 1.3 2005/03/30 01:22:47 archan 121 * Fixed mistakes in last updates. Add 122 * 123 * 124 * 20.Apr.2001 RAH (rhoughton@mediasite.com, ricky.houghton@cs.cmu.edu) 125 * Adding lm_free() to free allocated memory 126 * 127 * 24-Jun-97 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 128 * Added lm_t.access_type; made lm_wid externally visible. 129 * 130 * 24-Jun-97 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 131 * Added lm_t.log_bg_seg_sz and lm_t.bg_seg_sz. 132 * 133 * 13-Feb-97 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 134 * Created from original S3 version. 135 */ 136 137 138 #ifndef _S3_LM_H_ 139 #define _S3_LM_H_ 140 141 #include <stdio.h> 142 143 #include <logmath.h> 144 #include <hash_table.h> 145 #include <cmd_ln.h> 146 147 #ifdef __cplusplus 148 extern "C" { 149 #endif 150 #if 0 151 } /* Fool Emacs into not indenting things. */ 152 #endif 153 154 #define LM_DICTWID_BADMAP -16000 /** An illegal mapping */ 155 #define LM_CLASSID_BASE 0x01000000 /** Interpreted as LMclass ID */ 156 157 /** Upper limit of the words of Sphinx 3.X */ 158 #define LM_LEGACY_CONSTANT BAD_S3LMWID /**< =65535 (~65k), this is introduced 159 since 1996 when Ravi first wrote Sphinx 3.0. It 160 was with us since. 161 */ 162 163 #define LM_SPHINX_CONSTANT BAD_S3LMWID32 /**< (4 billion), ARCHAN: this is introduced by in Sphinx 3.6 164 during the time of Release Candidate I (2006 March). The caveat of using 165 this constant is that it is much hard to detect byte-swapping problem. 166 in general. Also, if the world has more than 10000 cities, each has 1 million 167 roads name. We are stuck in this case. I assume this will happen in 168 year3001. 169 */ 170 171 172 #define LM_CLASSID_TO_CLASS(m,i) ((m)->lmclass[(i)-LM_CLASSID_BASE]) 173 174 #define MIN_PROB_F -99.0 /**< The minimum value of probabilities and 175 backoff weights. When changing, notice 176 that both s2 and s3 may transform this 177 number to very small integer (say -2e-31) 178 This will easily cause integer wrap 179 around. -99 is chosen for that reason. 180 */ 181 182 #define LM_ALLOC_BLOCK 16 /** The number of LMs to allocate at a time. 183 */ 184 185 /** 186 Sucess and error message. 187 */ 188 #define LM_SUCCESS 1 /**< Constant that indicates an operation succeed 189 */ 190 #define LM_FAIL 0 /**< Constant that define an operation failed. */ 191 #define LM_NOT_FOUND -1 /**< Constant which indicate an LM couldn't be 192 found */ 193 #define LM_OFFSET_TOO_LARGE -2 /**< Constant where the 16 bit LM was 194 used, but th tgcount is larger than 195 LM_LEGACY_CONSTANT (65535). This 196 breaks addressing scheme in the 197 current LM. 198 */ 199 #define LM_NO_DATA_MARK -3 /**< When reading text-based LM, 200 return thisif we see no data 201 mark */ 202 #define LM_UNKNOWN_NG -4 /**< When reading the header of LM, if 203 there is unknown K for K-gram */ 204 #define LM_BAD_LM_COUNT -5 /**< When reading LM, if count is bad, 205 return this msg */ 206 #define LM_UNKNOWN_WORDS -6 /**< When an unknown word is found 207 during LM readin, return this 208 message */ 209 #define LM_BAD_BIGRAM -7 /**< A bad bigram, it could be word 210 ids larger than # of unigram, it 211 could be word id smaller than 0. 212 It could also be bigram out of 213 bound. 214 */ 215 #define LM_BAD_TRIGRAM -8 /**< A bad trigram, it could be word 216 ids larger than # of unigram, it 217 could be word id smaller than 0. 218 It could also be bigram out of 219 bound. 220 */ 221 #define LM_BAD_QUADGRAM -9 /**< (RESERVED BUT NOT USED) A bad 222 quadgram (4-gram), it could be word 223 ids larger than # of unigram, it 224 could be word id smaller than 0. 225 It could also be bigram out of 226 bound. 227 */ 228 #define LM_BAD_QUINGRAM -10 /**< (RESERVED BUT NOT USED) A bad 229 quingram (5-gram), it could be 230 word ids larger than # of unigram, 231 it could be word id smaller than 232 0. It could also be bigram out of 233 bound. BTW, there is no need to 234 remind me the mixed use of 235 quadgram and quingram is stupid 236 English. I read Manning and 237 Schultze. 238 */ 239 #define LM_BAD_NGRAM -11 /**< (RESERVED BUT NOT USED) A bad 240 n-gram. generalization of message 241 -7 to -10. In our case, we don't 242 make the message as specific as 243 possible. 244 */ 245 #define LM_TOO_MANY_NGRAM -12 /**< When reading LM, if the number of 246 n-grams is more than the number 247 specified header. return this 248 header */ 249 #define LM_NO_MINUS_1GRAM -13 /**< When reading n-gram, if the 250 corresponding (n-1)-gram doesn't 251 exists, return this message. */ 252 #define LM_FILE_NOT_FOUND -14 /**< When couldn't find the LM file, 253 return this message */ 254 #define LM_CANNOT_ALLOCATE -15 /**< When cannot allocate tables in LM 255 return this message */ 256 257 /** Versioning of LM */ 258 #define LMDMP_VERSIONNULL 0 /**< VERSION 0 is oldest, in the past, we 259 used to use the version number to 260 store the number of unigram, you will 261 see logic that said vn > LMDMP_VERSIONNULL 262 */ 263 264 #define LMDMP_VERSION_TG_16BIT -1 /**< VERSION 1 is the simplest DMP file which 265 is trigram or lower which used 16 bits in 266 bigram and trigram.*/ 267 268 #define LMDMP_VERSION_TG_16BIT_V2 -2 /**< VERSION 2 means legacy VERSION 1 DMP file 269 which has log_bg_seg_sz != 9*/ 270 271 #define LMDMP_VERSION_TG_32BIT -3 /**< VERSION 3 is the 32 bit 272 extension of VERSION 1 but 273 the bigram and trigram are 274 represented by 32 bits data 275 structure */ 276 277 #define LMTXT_VERSION 1000 /**< VERSION 1000 is the text-based LM */ 278 #define LMFST_VERSION 1001 /**< VERSION 1001 is the FST-based LM */ 279 #define LMFORCED_TXT32VERSION 1002 /**< VERSION 1002 is the internal version of 280 text-based LM. The difference betwwen 281 1002 and 1000 is that 1002 will assume 282 LM is 32bits. This fact is used in 283 lm_is32bits(lm) 284 */ 285 286 287 #define NO_WORD -1 288 289 #include "s3types.h" 290 #include "lmclass.h" 291 #include "dict.h" 292 293 /* 294 * ARCHAN 20050503: comment copied from Sphinx 2 295 * Bigram probs and bo-wts, and trigram probs are kept in separate tables 296 * rather than within the bigram_t and trigram_t structures. These tables 297 * hold unique prob and bo-wt values, and can be < 64K long (see lm_3g.h). 298 * The following tree structure is used to construct these tables of unique 299 * values. Whenever a new value is read from the LM file, the sorted tree 300 * structure is searched to see if the value already exists, and inserted 301 * if not found. 302 */ 303 304 /** \file lm.h 305 \brief Language model 306 307 This is the header file for language model support in Sphinx 3. 308 Sphinx 3 supports language model in 4 formats. The four formats are 309 310 ARPA format: First appear in Sphinx 2. We port it to Sphinx 3 in 311 3.X (X=6) 312 313 DMP : Sphinx 3 slow and fast used it, so does later in Sphinx 3.X 314 (X>4) 315 316 DMP32 : We start to break the limit of number of words of 317 65535. This is the first LM file format in Sphinx 3.X that could 318 capture 4 billion words in the language model 319 320 FST: In AT&T format, we start to support in 3.X (X=6). 321 322 At 20060302 323 we can only read and used ARPA, DMP-based format in the decoder. 324 we can write ARPA, DMP, DMP32 and FST file format. 325 */ 326 327 /** \struct lmlog_t 328 \brief Log quantities represented in either floating or integer format 329 */ 330 typedef union { 331 float32 f; /**< The floating point component */ 332 int32 l; /**< The integer component */ 333 } lmlog_t; 334 335 336 337 /** \struct sorted_entry_t 338 \brief single entry used in the linked list structure of lm reading 339 */ 340 341 typedef struct sorted_entry_s { 342 lmlog_t val; /**< value being kept in this node */ 343 uint32 lower; /**< index of another entry. All descendants down 344 this path have their val < this node's val. 345 0 => no son exists (0 is root index) */ 346 uint32 higher; /**< index of another entry. All descendants down 347 this path have their val > this node's val 348 0 => no son exists (0 is root index) */ 349 } sorted_entry_t; 350 351 /** \struct sorted_list_t 352 * 353 * \brief The sorted list used lm reading. list is a (64K long) array. The first entry is the root of the tree and is created during initialization. 354 */ 355 typedef struct { 356 sorted_entry_t *list; /**< Beginnig of the list */ 357 int32 free; /**< first free element in list */ 358 } sorted_list_t; 359 360 /** \struct ug_t 361 * \brief A unigram structure 362 * Please see 363 */ 364 typedef struct { 365 s3wid_t dictwid; /**< Dictionary word id, or BAD_S3WID if unknown. However, the LM 366 module merely sets this field to BAD_S3WID. It is upto the 367 application to fill in this field (HACK!!), so that this 368 module can be independent of a dictionary. */ 369 lmlog_t prob; /**< Unigram probability */ 370 lmlog_t bowt; 371 int32 firstbg; /**< 1st bigram entry on disk */ 372 } ug_t; 373 374 /** \struct bg_t 375 * \brief A bigram structure 376 */ 377 378 typedef struct { 379 s3lmwid_t wid; /**< LM wid (index into lm_t.ug) */ 380 uint16 probid; /**< Index into array of actualy bigram probs*/ 381 uint16 bowtid; /**< Index into array of actualy bigram backoff wts */ 382 uint16 firsttg; /**< 1st trigram entry on disk (see tg_segbase below) */ 383 } bg_t; 384 385 386 /** \struct bg32_t 387 * \brief A bigram structure which has 32 bits. 388 */ 389 typedef struct { 390 s3lmwid32_t wid; /**< LM wid (index into lm_t.ug) */ 391 uint32 probid; /**< Index into array of actualy bigram probs*/ 392 uint32 bowtid; /**< Index into array of actualy bigram backoff wts */ 393 uint32 firsttg; /**< 1st trigram entry on disk (see tg_segbase below) */ 394 } bg32_t; 395 396 397 /** \struct tg_t 398 * \brief A trigram structure 399 */ 400 401 typedef struct { 402 s3lmwid_t wid; /**< LM wid (index into lm_t.ug) */ 403 uint16 probid; /**< Index into array of actualy trigram probs*/ 404 } tg_t; 405 406 407 /** \struct tg32_t 408 * \brief A 32 bits version of tg_t 409 */ 410 411 typedef struct { 412 s3lmwid32_t wid; /**< LM wid (index into lm_t.ug) */ 413 uint32 probid; /**< Index into array of actualy trigram probs*/ 414 } tg32_t; 415 416 417 /** \struct membg_t 418 * \brief Management of in-memory bigrams. Not used if all bigrams in memory. 419 */ 420 typedef struct { 421 bg_t *bg; /**< Bigrams for a specific unigram; see lm_t.membg */ 422 int32 used; /**< Whether used since last lm_reset. If not used, at the next 423 lm_reset bg are freed */ 424 } membg_t; 425 426 /** \struct membg32_t 427 * 428 * \brief A 32 bits version of membg_t 429 */ 430 typedef struct { 431 bg32_t *bg32; /**< Bigrams for a specific unigram; see lm_t.membg */ 432 int32 used; /**< Whether used since last lm_reset. If not used, at the next 433 lm_reset bg are freed */ 434 } membg32_t; 435 436 437 /** 438 * \struct tginfo_t 439 * \brief trigram cache that enhance locating trigram for a given bigram (w_1,w_2) 440 * 441 * The following trigram information cache eliminates most traversals of 1g->2g->3g 442 * tree to locate trigrams for a given bigram (w1,w2). The organization is optimized 443 * for locality of access. All bigrams (*,w2) for a given w2, for which trigrams have 444 * been accessed "recently", form a linear linked list, pointed to by lm_t.tginfo[w2]. 445 * If disk-based, all trigrams for the given bg loaded upon request. Cached info (and 446 * tg if disk-based) freed at lm_reset if not used since last such reset. 447 */ 448 typedef struct tginfo_s { 449 s3lmwid_t w1; /**< w1 component of bigram w1,w2. All bigrams with 450 same w2 linked together. */ 451 int32 n_tg; /**< #tg for parent bigram w1,w2 */ 452 tg_t *tg; /**< Trigrams for w1,w2 */ 453 int32 bowt; /**< tg bowt for w1,w2 */ 454 int32 used; /**< whether used since last lm_reset */ 455 struct tginfo_s *next; /**< Next w1 with same parent w2 */ 456 } tginfo_t; 457 458 /** 459 * \struct tginfo32_t 460 * \brief 32 bit version of tginfo 461 * 462 */ 463 typedef struct tginfo32_s { 464 s3lmwid32_t w1; /**< w1 component of bigram w1,w2. All bigrams with 465 same w2 linked together. */ 466 int32 n_tg; /**< #tg for parent bigram w1,w2 */ 467 tg32_t *tg32; /**< Trigrams for w1,w2 */ 468 int32 bowt; /**< tg bowt for w1,w2 */ 469 int32 used; /**< whether used since last lm_reset */ 470 struct tginfo32_s *next; /**< Next w1 with same parent w2 */ 471 } tginfo32_t; 472 473 474 /* 475 * \struct lm_tgcache_entry_t 476 * Entries in a fast and dirty cache for trigram lookups. See lm_t.tgcache. 477 */ 478 typedef struct { 479 s3lmwid_t lwid[3]; /**< 0 = oldest, 2 = newest (i.e., P(2|0,1)) */ 480 int32 lscr; /**< LM score for above trigram */ 481 } lm_tgcache_entry_t; 482 483 484 /* 485 * \struct lm_tgcache_entry32_t 486 * \brief 32 bit version of lm_tg_cache_entry 487 */ 488 typedef struct { 489 s3lmwid32_t lwid[3]; /**< 0 = oldest, 2 = newest (i.e., P(2|0,1)) */ 490 int32 lscr; /**< LM score for above trigram */ 491 } lm_tgcache_entry32_t; 492 493 494 495 /* 496 * A note on lm/dict/dict2lm. -ARCHAN 20050616 497 * 498 * In older versions of sphinx3 (<s3.4). dict2lm is a separate object 499 * from lm and dict. A kb actually owns a dict2lm so programer will 500 * read the lm. This seprates the initalization of lm and dict2lm and 501 * it makes a lot of sense if there is **only one** lm and **only one 502 * dict2lm. 503 * 504 * However, when multiple LMs and switching of them is required. 505 * Then, the problem of the above architecture starts to show up. For 506 * example, 507 * lmset=lm_read_ctl (); 508 * for(i=0;i<kb->n_lm;i++){ 509 * dict2lmwid[i]=wid_dict_lm_map 510 * } 511 * At the same time, one will also have an array of lms (lmset[i]) for 512 * corresponding dict2lm[i]! 513 * 514 * Of course, having multiple arrays of things will somedays caused 515 * problems. 516 * 517 * The resolution is that we observed that the dict2lm map mostly 518 * changed when the lm needs to change. Also, the fact that the 519 * dictionary pronounciation itself seldom changes. That is partially 520 * caused by the fact we don't have too much research on So at the 521 * end, that is why it makes sense to let the lm to own a dict2lm. 522 * 523 * What if we also allow the dictionary to change? That is a tough 524 * question. In that case perhaps, we should still inventory of sets 525 * of lm and dict2lm and allow lm to store a pointer of dict2lm. Once 526 * there are changes in dict, programmer will be responsible to update 527 * dict2lm. (Storing pointers will allow programmers not to update 528 * everything but just lms corresponding to a particular dict.) I 529 * guess in that case it will be sign of having a wrapper that control 530 * both lm and dict together. 531 */ 532 533 /* 534 * Comments by RKM 535 * To conserve space, bg/tg probs/ptrs kept in many tables. Since the number of 536 * distinct prob values << #bg/#tg, these table indices can be easily fit into 537 * 16 bits. bgprob and bgbowt are such indices. The firsttg entry for a bigram 538 * is harder. It is supposed to be the index of the first trigram entry for each 539 * bigram. But #tg can be >> 2^16. Hence the following segmentation scheme: 540 * Partition bigrams into segments of lm_t.bg_seg_sz consecutive entries, such that 541 * #trigrams in each segment <= 2**16 (the corresponding trigram segment). The 542 * bigram_t.firsttg value is then a 16-bit relative index within the trigram 543 * segment. A separate table--lm_t.tg_segbase--has the absolute index of the 544 * 1st trigram for each segment. 545 */ 546 547 /* Default values for lm_t.log_bg_seg.sz */ 548 #define LOG2_BG_SEG_SZ 9 549 #define BG_SEG_SZ (1 << (LOG2_BG_SEG_SZ)) 550 #define LM_TGCACHE_SIZE 100003 /* A prime no. (hopefully it IS one!) */ 551 552 /* 20040211 ARCHAN: Yes! Indeed it is a prime */ 553 554 /** \struct lm_t 555 * \brief The language model. 556 * All unigrams are read into memory on initialization. 557 * Bigrams and trigrams read in on demand. 558 */ 559 typedef struct lm_s { 560 char *name ; /**< The name of the LM */ 561 int32 n_ug; /**< #unigrams in LM */ 562 int32 n_bg; /**< #bigrams in entire LM */ 563 int32 n_tg; /**< #trigrams in entire LM */ 564 int32 max_ug; /**< To which n_ug can grow with dynamic addition of words */ 565 566 int32 n_ng; /**< if unigram, n_ng=1, if bigram n_bg=2 and so one */ 567 568 char **wordstr; /**< The LM word list (in unigram order) */ 569 570 571 uint32 log_bg_seg_sz;/**< See big comment above */ 572 uint32 bg_seg_sz; 573 574 ug_t *ug; /**< Unigrams */ 575 576 /* 20040225 ARCHAN : Data structure to maintain dictionary information */ 577 /* Data structure for dictionary to LM words look up mapping */ 578 /* 20060306 ARCHAN: Change this to a 32 bits data structure */ 579 s3lmwid32_t *dict2lmwid; /**< a mapping from dictionary word to LM word */ 580 s3lmwid32_t startlwid; /**< S3_START_WORD id, if it exists */ 581 s3lmwid32_t finishlwid; /**< S3_FINISH_WORD id, if it exists */ 582 583 bg_t *bg; /**< NULL iff disk-based */ 584 tg_t *tg; /**< NULL iff disk-based */ 585 membg_t *membg; /**< membg[w1] = bigrams for lm wid w1 (used iff disk-based) */ 586 tginfo_t **tginfo; /**< tginfo[w2] = fast trigram access info for bigrams (*,w2) */ 587 588 589 lm_tgcache_entry_t *tgcache; /**< <w0,w1,w2> hashed to an entry into 590 this array. Only the last trigram 591 mapping to any * given hash entry is 592 kept in that entry. (The cache 593 doesn't have to be super-efficient.) 594 */ 595 596 597 /**************************/ 598 599 600 bg32_t *bg32; /**< Bigram 32 bits, NULL iff disk-based */ 601 tg32_t *tg32; /**< Trigram 32 bits NULL iff disk-based */ 602 membg32_t *membg32; /**< membg 32bits membg[w1] = bigrams for lm wid w1 (used iff disk-based) */ 603 tginfo32_t **tginfo32; /**< tginfo 32bits tginfo[w2] = fast trigram access info for bigrams (*,w2) */ 604 605 lm_tgcache_entry32_t *tgcache32; /** tgcache 32 bits */ 606 607 /**************************/ 608 609 lmlog_t *bgprob; /**< Table of actual bigram probs */ 610 lmlog_t *tgprob; /**< Table of actual trigram probs */ 611 lmlog_t *tgbowt; /**< Table of actual trigram backoff weights */ 612 int32 *tg_segbase; /**< tg_segbase[i>>lm_t.log_bg_seg_sz] = index of 1st 613 trigram for bigram segment (i>>lm_t.log_bg_seg_sz) */ 614 int32 n_bgprob; 615 int32 n_tgprob; 616 int32 n_tgbowt; 617 618 FILE *fp; 619 int32 byteswap; /**< Whether this file is in the WRONG byte order */ 620 int32 bgoff; /**< BG offsets into DMP file (used iff disk-based) */ 621 int32 tgoff; /**< TG offsets into DMP file (used iff disk-based) */ 622 623 float32 lw; /**< Language weight currently in effect for this LM */ 624 int32 wip; /**< logs3(word insertion penalty) in effect for this LM */ 625 626 627 /* Statistics */ 628 int32 n_bg_fill; /**< #bg fill operations */ 629 int32 n_bg_inmem; /**< #bg in memory */ 630 int32 n_bg_score; /**< #bg_score operations */ 631 int32 n_bg_bo; /**< #bg_score ops backed off to ug */ 632 int32 n_tg_fill; /**< Similar stats for trigrams */ 633 int32 n_tg_inmem; /**< #tg in memory */ 634 int32 n_tg_score; /**< #tg_score operations */ 635 int32 n_tg_bo; /**< #tg_score ops backed off to bg */ 636 int32 n_tgcache_hit; /**< # of trigram cache hit ops backed off to bg */ 637 638 int32 access_type; /**< Updated on every lm_{tg,bg,ug}_score call to reflect the kind of 639 n-gram accessed: 3 for 3-gram, 2 for 2-gram and 1 for 1-gram */ 640 641 642 int32 isLM_IN_MEMORY; /**< Whether LM in in memory, it is a property, potentially it means 643 the code could allow you some model to be disk-based, some are not. */ 644 645 int32 dict_size; /**< Only used in class-based LM, because class-based LM is addressed in 646 the dictionary space. */ 647 hash_table_t *HT; /**< hash table for word-string->word-id map */ 648 649 650 /* Data structure that maintains the class information */ 651 lmclass_t **lmclass; /**< LM class for this LM */ 652 int32 n_lmclass; /**< # LM class */ 653 int32 *inclass_ugscore; /**< An array of inter-class unigram probability */ 654 655 656 int32 inputenc ; /**< Input encoding method */ 657 int32 outputenc ; /**< Output encoding method */ 658 int32 version; /**< The version number of LM, in particular, this is the version that recently 659 read in. 660 */ 661 int32 is32bits; /**< Whether the current LM is 32 bits or not. Derived from version and n_ug*/ 662 663 /* Arrays of unique bigram probs and bo-wts, and trigram probs */ 664 sorted_list_t sorted_prob2; /**< Temporary Variable: Sorted list */ 665 sorted_list_t sorted_bowt2; /**< Temporary Variable: Sorted list */ 666 sorted_list_t sorted_prob3; /**< Temporary Variable: Sorted list */ 667 int32 max_sorted_entries; /**< Temporary Variable: 2x the maximum size of the MAX_SORTED_ENTRIES*/ 668 669 logmath_t *logmath; 670 } lm_t; 671 672 673 674 /** \struct lmset_t 675 \brief Structure for multiple LM, provide operations for addition/deletion/read 676 Structure for multiple, named LMs, started from s2 677 */ 678 typedef struct lmset_s { 679 lm_t **lmarray; /**< 1 dimensional array of pointers of lm_t */ 680 lm_t *cur_lm; /**< TEMPORARY VARIABLE: The current LM */ 681 682 int32 cur_lm_idx; /**< TEMPORARY VARIABLE : The current LM index */ 683 int32 n_lm; /**< number of LM */ 684 int32 n_alloc_lm; /**< number of allocated LM */ 685 } lmset_t; 686 687 /** Access macros; not meant for arbitrary use */ 688 #define lm_lmwid2dictwid(lm,u) ((lm)->ug[u].dictwid) 689 #define lm_n_ug(lm) ((lm)->n_ug) 690 #define lm_n_bg(lm) ((lm)->n_bg) 691 #define lm_n_tg(lm) ((lm)->n_tg) 692 #define lm_wordstr(lm,u) ((lm)->wordstr[u]) 693 #define lm_startwid(lm) ((lm)->startlwid) 694 #define lm_finishwid(lm) ((lm)->finishlwid) 695 #define lm_access_type(lm) ((lm)->access_type) 696 697 698 /** \struct wordprob_t 699 \brief Generic structure that could be used at any n-gram level 700 */ 701 typedef struct { 702 s3wid_t wid; /**< NOTE: dictionary wid; may be BAD_S3WID if not available */ 703 int32 prob; /**< The probability */ 704 } wordprob_t; 705 706 707 /** A wrapper function of controlling the behavior of LM initialization 708 * 709 * (ARCHAN 20050617) lmset_init controls the behavior how the lmset 710 * which is an array of lm was initialized by different command-line 711 * arguments. lmfile and lmctlfile are mutually exclusive. Each 712 * will invoke one reading functions. 713 * 714 * In the case of -lmfile is specified. A lmset with one single lm 715 * (or lmset->n_lm=1) will be returned. The single lm's name will be 716 * called lmname. 717 * 718 * In the case of -lmctlfile is specified. A lmset with multiple lms 719 * will be returned. The number of lm will depend on the number of 720 * lm specified by -lmctlfile. For the format, please read the 721 * current format of -lmctlfile in lm.c 722 * 723 * ctl_lm is the equivalent of -ctl for lm. When -ctl_lm is not 724 * specified in command-line (ctl_lm is NULL). Then either lm with 725 * name lmname will be used as the default lm. If lmname is NULL, then 726 * the first lm will be named as the "default" 727 * 728 * lmdumpdir is currently not used. It is there for backward 729 * compatibility purpose. 730 * 731 * lw,wip,uw are language weight, word insertion pernalty and 732 * unigram weight. Their values are crucial to computation of the 733 * language model score. Therefore, the programmer is urged to 734 * carefully set these three values and also be careful of the 735 * order. 736 * 737 * dict is assumed to be a pre-initialized dict_t structure which is 738 * used in deriving the mapping between the dictionary word and the 739 * lm words 740 * 741 * ARCHAN 20050711 -lminmemory is the only global variable that 742 * control the code and we haven't explicitly specify it. Currently, 743 * if the LM is DMP, both -lminmeory=0 or -lminmeory=1 could be used. 744 * if the LM is txt-base, only -lminmemory=1 is accepted. (This will 745 * be changed in future.) 746 * 747 * 748 * ARCHAN 20050705: A survival guide for this part of the code. Our 749 * language mode code is unnecessarily complicated and is mainly 750 * caused by the fact the way we specified class-based LM and 751 * multiple LM are inter-dependent. For example, one could specify a 752 * multiple LMs file (i.e. lmctlfile) and have no classes. However, 753 * if one would like to specify class information even with a single 754 * LM, one need to use a multiple LM file format (i.e. lmctlfile). 755 * 756 * This difficulty is well-observed in the period of Sphinx 757 * 3.4-3.6. That might imply that a new LM format is needed if we 758 * want to sustain this part of the development. 759 * 760 */ 761 S3DECODER_EXPORT 762 lmset_t* lmset_init(const char* lmfile, /**< The lm file name, lmfile and lmctlfile are mutally exclusive */ 763 const char* lmctlfile, /**< The file that specified multiple LMs and class information, lmfile and lmctlfile are mutually exclusive */ 764 const char* ctl_lm, /**< The control file that describes which lm to use for a particular utterance*/ 765 const char* lmname, /**< The LM name to use if ctl_lm is not specified */ 766 const char* lmdumpdir, /**< Currently not used */ 767 float32 lw, /**< Language model weight */ 768 float32 wip, /**< Word insertion penalty */ 769 float32 uw, /**< Unigram weight */ 770 dict_t *dict, /**< A pre-initialized dict_t structure */ 771 logmath_t *logmath 772 ); 773 774 775 /* It is still a sore point: To have two interfaces for two different 776 type of input. Some of the code is still duplicated. Changing 777 one doesn't the other one will be changed 778 */ 779 780 /** 781 * Read a single LM into the lmset. 782 */ 783 lmset_t* lmset_read_lm(const char *lmfile, /**< In: The LM file */ 784 dict_t *dict, /**< In: A pre-initialized dictionary file*/ 785 const char *lmname, /**< In: The LM name */ 786 float64 lw, /**< The language weight */ 787 float64 wip, /**< The word insertion penalty */ 788 float64 uw, /**< The unigram weight */ 789 const char *lmdumpdir, /**< In: LM dump dir */ 790 logmath_t *logmath 791 ); 792 793 /** 794 * Read the LM control file. **Usually**, it is also a class-based LM, 795 */ 796 797 lmset_t* lmset_read_ctl(const char * ctlfile,/**< Control file name */ 798 dict_t* dict, /**< In: Dictionary */ 799 float64 lw, /**< In: Language weight */ 800 float64 wip, /**< In: Word insertion penalty */ 801 float64 uw, /**< In: Unigram weight */ 802 const char* lmdumpdir, /**< In: LMdumpdir */ 803 logmath_t *logmath 804 ); 805 806 /** 807 * Get an LM by index. 808 */ 809 lm_t* lmset_get_lm_widx(lmset_t *lms, /**< In: The set of LM */ 810 int32 lmidx /**< In: LM index */ 811 ); 812 813 /** 814 * Get an LM by name 815 * @return a pointer of the LM with name lmname 816 */ 817 lm_t* lmset_get_lm_wname(lmset_t *lms, /**< In: The set of LM */ 818 const char *lmname /**< In: The LM name */ 819 ); 820 821 /** 822 * Set the current LM with index 823 */ 824 void lmset_set_curlm_widx(lmset_t *lms, /**< In: The set of LM */ 825 int32 lmidx /**< In: LM index */ 826 ); 827 828 /** 829 * Set the current LM with name 830 */ 831 S3DECODER_EXPORT 832 void lmset_set_curlm_wname(lmset_t *lms, /**< In: The set of LM */ 833 const char *lmname /**< In: The LM name */ 834 ); 835 836 /** 837 * Convert name to index 838 */ 839 int32 lmset_name_to_idx(lmset_t *lms, /**< In: The set of LM */ 840 const char *lmname /**< In: The LM name */ 841 ); 842 843 /** 844 * Convert index to name 845 * @return a pointer of the name string. No memory is allocated. 846 */ 847 848 char* lmset_idx_to_name(lmset_t *lms, /**< In: The set of LM */ 849 int32 lmidx /**< In: LM index */ 850 ); 851 852 853 /** 854 * Add a new lm into the lmset. Notice that lms->n_lm will be added by 1 855 */ 856 857 void lmset_add_lm(lmset_t *lms, /**< In/Out : The set of LM */ 858 lm_t *lm, /**< In : The input LM */ 859 const char* lmname /**< In: The lm name */ 860 ); 861 862 /** 863 * Delete a LM with lmname. Notice that lms->n_lm will be subtracted by 1 864 */ 865 866 void lmset_delete_lm(lmset_t *lms, /**< In/Out : The set of LM */ 867 const char *lmname /**< The lm name */ 868 ); 869 870 /** 871 * Free the lmset data structure 872 */ 873 S3DECODER_EXPORT 874 void lmset_free(lmset_t *lms /**< In: The set of LM */ 875 ); 876 877 /** 878 * Return trigram followers for given two words. Both w1 and w2 must be valid. 879 * Return value: #trigrams in returned list. 880 */ 881 int32 lm_tglist (lm_t *lmp, /**< In: LM being queried */ 882 s3lmwid32_t w1, /**< In: LM word id of the first of a 2-word history */ 883 s3lmwid32_t w2, /**< In: LM word id of the second of the 2-word history */ 884 tg_t **tg, /**< Out: *tg = array of trigrams for <w1,w2> */ 885 int32 *bowt /**< Out: *bowt = backoff-weight for <w1, w2> */ 886 ); 887 888 int32 lm_tg32list (lm_t *lmp, /**< In: LM being queried */ 889 s3lmwid32_t w1, /**< In: LM word id of the first of a 2-word history */ 890 s3lmwid32_t w2, /**< In: LM word id of the second of the 2-word history */ 891 tg32_t **tg, /**< Out: *tg = array of trigrams for <w1,w2> */ 892 int32 *bowt /**< Out: *bowt = backoff-weight for <w1, w2> */ 893 ); 894 895 /** 896 * Return the bigram followers for the given word w. 897 * Return value: #bigrams in returned list. 898 */ 899 int32 lm_bglist (lm_t *lmp, /**< In: LM being queried */ 900 s3lmwid32_t w, /**< In: LM word id of the 1-word history */ 901 bg_t **bg, /**< Out: *bg = array of bigrams for w */ 902 int32 *bowt /**< Out: *bowt = backoff-weight for w */ 903 ); 904 905 int32 lm_bg32list (lm_t *lmp, /**< In: LM being queried */ 906 s3lmwid32_t w, /**< In: LM word id of the 1-word history */ 907 bg32_t **bg, /**< Out: *bg = array of bigrams for w */ 908 int32 *bowt /**< Out: *bowt = backoff-weight for w */ 909 ); 910 911 912 #if 0 /*Obsolete and it will cause conflict the code, so comment for now*/ 913 /* 914 * Somewhat like lm_bglist, but fill up a wordprob_t array from the bigram list found, instead 915 * of simply returning the bglist. The wordprob array contains dictionary word IDs. But note 916 * that only the base IDs are entered; the caller is responsible for filling out the alternative 917 * pronunciations. 918 * Return value: #entries filled in the wordprob array. 919 */ 920 int32 lm_bg_wordprob(lm_t *lm, /**< In: LM being queried */ 921 s3lmwid32_t w, /**< In: LM word ID of the 1-word history */ 922 int32 th, /**< In: If a prob (logs3, langwt-ed) < th, ignore it */ 923 wordprob_t *wp, /**< In/Out: Array to be filled; caller must have 924 allocated this array */ 925 int32 *bowt /**< Out: *bowt = backoff-weight associated with w */ 926 ); 927 928 #endif 929 930 /* Return LM word ID for the given string, or BAD_LMWID(lm) if not available */ 931 s3lmwid32_t lm_wid (lm_t *lm, const char *wd); 932 933 /** 934 Set all pointers to NULL in the lm 935 */ 936 void lm_null_struct(lm_t* lm 937 ); 938 939 /** 940 * Like lm_bg_wordprob, but for unigrams. 941 * Return value: #entries filled in the wordprob array. 942 */ 943 int32 lm_ug_wordprob(lm_t *lm, /**< In: LM being queried */ 944 dict_t *dict, /**< In : The dictionary */ 945 int32 th, 946 wordprob_t *wp /**< In/out: Array to be filled */ 947 ); 948 949 /** Return the unigrams in LM. Return value: #unigrams in returned list. */ 950 int32 lm_uglist (lm_t *lmp, /**< In: LM being queried */ 951 ug_t **ug /**< Out: *ug = unigram array */ 952 ); 953 954 955 956 /* 20040227: This also account the in-class probability of wid*/ 957 /** Return unigram score for the given word */ 958 int32 lm_ug_score (lm_t *lmp, /**< In: LM begin queried */ 959 s3lmwid32_t lwid, /**< LM ID for the word */ 960 s3wid_t wid /**< Dict ID for the word */ 961 ); 962 963 964 int32 lm_ug_exists(lm_t* lm , /**< LM */ 965 s3lmwid32_t lwid /**< LM ID for the word */ 966 ); 967 968 /* 969 * Return bigram score for the given two word sequence. If w1 is BAD_LMWID(lm), return 970 * lm_ug_score (w2). 971 * 20040227: This also account for the in-class probability of w2. 972 */ 973 int32 lm_bg_score (lm_t *lmp, /**< In: LM begin queried */ 974 s3lmwid32_t lw1, 975 s3lmwid32_t lw2, 976 s3wid_t w2); 977 978 979 /** 980 Whether a certain bigram exists. 981 */ 982 int32 lm_bg_exists (lm_t *lm, /**< In: LM */ 983 s3lmwid32_t lw1, 984 s3lmwid32_t lw2 985 ); 986 987 /** 988 * Return trigram score for the given three word sequence. If w1 is BAD_LMWID(lm), return 989 * lm_bg_score (w2, w3). If both lw1 and lw2 are BAD_LMWID(lm), return lm_ug_score (lw3). 990 * 991 * 20040227: This also account for the in-class probability of w3. 992 */ 993 int32 lm_tg_score (lm_t *lmp, /**< In: LM begin queried */ 994 s3lmwid32_t lw1, 995 s3lmwid32_t lw2, 996 s3lmwid32_t lw3, 997 s3wid_t w3); 998 999 1000 /** 1001 Whether a certain trigram exists. 1002 */ 1003 int32 lm_tg_exists (lm_t *lm, /**< In: LM */ 1004 s3lmwid32_t lw1, 1005 s3lmwid32_t lw2, 1006 s3lmwid32_t lw3 1007 ); 1008 1009 /** 1010 * Set the language-weight and insertion penalty parameters for the LM, after revoking 1011 * any earlier set of such parameters. 1012 * 1013 * WARNING!! This function doesn't prevent underflow of values. Make sure you call 1014 * safe lm2logs3 before it. 1015 */ 1016 void lm_set_param (lm_t *lm, /**< In: the LM */ 1017 float64 lw, /**< In: the langauage weight */ 1018 float64 wip /**< In: the word insertion penalty */ 1019 ); 1020 1021 1022 S3DECODER_EXPORT 1023 int32 lm_rawscore (lm_t *lm, /**< In: the LM */ 1024 int32 score 1025 ); 1026 1027 1028 1029 /** LM cache related */ 1030 S3DECODER_EXPORT 1031 void lm_cache_reset (lm_t *lmp /**< In: the LM */ 1032 ); 1033 1034 /** LM cache statistic dumping */ 1035 S3DECODER_EXPORT 1036 void lm_cache_stats_dump (lm_t *lmp /**< In: the LM */ 1037 ); 1038 1039 /** 1040 * A simple version of reading in a LM 1041 * 1042 * lm_read is a simple version of lm_read_advance. It will assume 1043 * language weight, word insertion penalty and unigram weight to be 1044 * automatically applied. There is also no class-based LM (so 1045 * ndict=0). Format is set to NULL, so the program will determine 1046 * it automatically. 1047 */ 1048 lm_t * lm_read ( 1049 const char *file, /**< In: LM file being read */ 1050 const char *lmname, /**<In: LM name*/ 1051 cmd_ln_t *config, 1052 logmath_t *logmath); 1053 1054 /** 1055 * Read an LM file, it will automatically decide whether the file is 1056 * a DUMP file or a txt file. Then call lm_read_txt and lm_read_dump 1057 * (non-public functions) correspondingly. Currently the code is 1058 * not aware about OOV. 1059 * 1060 * lw, wip, uw and ndict are mainly used for recognition purpose. 1061 * When lm_read is used for other purpose, one could just used dummy 1062 * setting. recommended one is lw=1.0,wip=0.1,uw=1.0 and 1063 * ndict=0. These are very useful when lm_read is just used as 1064 * reading the LM. 1065 * 1066 * If applyweight is 0, lw,wip, uw will not be apply the LM at all. 1067 * This will allow users to just call the LM routine without 1068 * initializing other modules (such as logs3_init). 1069 * 1070 * If applyweight is 1, then logs3_init must be called before lm_read. 1071 * This is usually the case when kb_init is called before the code. 1072 * 1073 * fmt now could be either "TXT", "DMP" and "TXT32" or just 1074 * NULL. If it is NULL, the LM format will be automatically 1075 * determined. If it is specified as "TXT" or "DMP", the 1076 * corresponding lm reader will be called. In such a case, it is 1077 * important for the users to know what he/she is doing. 1078 * (Unfortunately, this is mostly not true. ) 1079 * In the case of "TXT32", a text LM will be forced to 32bit mode. 1080 * 1081 * ndict is the dictionary size of the application. This is needed 1082 * because class-based LM are addressed in the dictionary wid-space 1083 * instead of lm wid-space. If class-based LM is not used, just set 1084 * this to zero. 1085 * 1086 * Note: there are two defense mechanisms of lm_read_advance. 1087 * First of all, if no fmt is specified, it will start to read 1088 * the lm in the order of DMP->TXT. Second, if txt format 1089 * is specified but LM is found to hit the 16bit legacy segments 1090 * limit, it will automatically switch to read TXT32 LM 1091 * 1092 * @return pointer to LM structure created. 1093 */ 1094 lm_t *lm_read_advance (const char *file, /**< In: LM file being read */ 1095 const char *lmname, /**<In: LM name*/ 1096 float64 lw, /**< In: Language weight */ 1097 float64 wip, /**< In: Word insertion penalty */ 1098 float64 uw, /**< In: Unigram weight (interpolation with uniform distr.) */ 1099 int32 ndict, /**< In: Number of dictionary entry. We need that because 1100 class-based LM is addressed in dictionary word ID space. 1101 */ 1102 const char* fmt, /**< In: file format of the LM, it is 1103 now either "TXT", "DMP" and NULL, 1104 if NULL, file format is 1105 automaticaly determined */ 1106 int32 applyweight, /**< In: whether lw,wip, uw should be 1107 applied to the lm or not */ 1108 logmath_t *logmath 1109 ); 1110 1111 S3DECODER_EXPORT 1112 lm_t *lm_read_advance2(const char *file, /**< In: LM file being read */ 1113 const char *lmname, /**<In: LM name*/ 1114 float64 lw, /**< In: Language weight */ 1115 float64 wip, /**< In: Word insertion penalty */ 1116 float64 uw, /**< In: Unigram weight (interpolation with uniform distr.) */ 1117 int32 ndict, /**< In: Number of dictionary entry. We need that because 1118 class-based LM is addressed in dictionary word ID space. 1119 */ 1120 const char* fmt, /**< In: file format of the LM, it is 1121 now either "TXT", "DMP" and NULL, 1122 if NULL, file format is 1123 automaticaly determined */ 1124 int32 applyweight, /**< In: whether lw,wip, uw should be 1125 applied to the lm or not */ 1126 int lminmemory, /**< In: Whether LM is read into memory */ 1127 logmath_t *logmath 1128 ); 1129 /** 1130 Simple writing of an LM file, the input and output encoding will 1131 assume to be iso8859-1. Call lm_write. To convert encoding, please use 1132 lm_write_advance. 1133 */ 1134 S3DECODER_EXPORT 1135 int32 lm_write(lm_t *model, /** In: the pointer LM we want to output */ 1136 const char *outputfile, /**< In: the output file name */ 1137 const char *filename, /**< In: the LM file name */ 1138 const char *fmt /**< In: LM file format, it is now either "TXT" or "DMP" */ 1139 ); 1140 1141 /** 1142 Writing of an LM file with advanced options such as encoding support. 1143 Called by lm_write. 1144 1145 fmt now could be TXT, DMP, FST 1146 1147 inputenc and outputenc could now be iso8859-1, gb2312-hex, gb2312. 1148 Not every pair of conversion works. 1149 1150 Current input/output encodings support list. 1151 0: iso8859-1 1152 1: gb2312-hex 1153 2: gb2312 1154 1155 -: do nothing 1156 n: doesn't make sense or not compatible 1157 x: not supported yet 1158 y: supported 1159 1160 i\o 0 1 2 1161 0 - n n 1162 1 n - y 1163 2 n x - 1164 1165 When we have 4 encoding types: This document should be 1166 implemented as a data structure. 1167 1168 This conversion table is copied from encoding.c, please take a 1169 look the latest support in encoding.c 1170 */ 1171 1172 int32 lm_write_advance(lm_t *model, /**< In: the pointer LM we want to output */ 1173 const char *outputfile, /**< In: the output file name */ 1174 const char *filename, /**< In: the LM file name */ 1175 const char *fmt, /**< In: LM file format, it is now either "TXT", "DMP", "FST" */ 1176 const char* inputenc, /**< In: Input encoding type */ 1177 char* outputenc /**< Out: Output encoding type */ 1178 ); 1179 1180 /* RAH, added code for freeing allocated memory 1181 */ 1182 /** 1183 Deallocate the language model. 1184 */ 1185 S3DECODER_EXPORT 1186 void lm_free (lm_t *lm /**< In: a LM structure */ 1187 ); 1188 1189 /** 1190 Add word list to the LM 1191 For each word in the file, call lm_add_wordlist. 1192 The file is assume to have a format like this: 1193 <word1> 1194 <word2> 1195 <word3> 1196 <word4> 1197 1198 If the lmwid2dictid mapping is not updated, or the dictionary 1199 itself is not used in the context. Just specify dict=NULL; 1200 1201 */ 1202 int32 lm_add_wordlist(lm_t *lm, /**< In/Out: a modified LM structure */ 1203 dict_t *dict, /**< In: an initialized dictionary structure 1204 Used to update 1205 */ 1206 const char* filename /**< In: a file that contains a 1207 list of word one wants to 1208 add*/ 1209 ); 1210 1211 /** 1212 Add a word to the LM 1213 1214 look up the dictionary and see whether it exists in the dictionary 1215 Looks alike with wid.c's logic at this point. 1216 1217 (Incomplete!) Not fully tested in the situation for on-line 1218 recognition. 1219 1220 We also avoid the addition of classes at this point because that 1221 could complicated things quite a lot. 1222 */ 1223 int32 lm_add_word_to_ug(lm_t *lm, /**< In/Out: a modified LM structure */ 1224 dict_t *dict, /**< In: an initialized dictionary structure 1225 Used to update lmwid2dictid mapping. 1226 */ 1227 const char* newword /**<In: a pointer of a new word */ 1228 ); 1229 /** 1230 Get class ID given a LM. 1231 */ 1232 int32 lm_get_classid (lm_t *model, /**< In: LM file being queried*/ 1233 const char *name /**< In: The name of the class */ 1234 ); 1235 1236 /** 1237 * Explicity convert structure from 16bit -> 32bit or 32bit to 16bit. 1238 */ 1239 void lm_convert_structure(lm_t *model, /**< In: LM file being used */ 1240 int32 is32bits 1241 ); 1242 1243 /** 1244 Check whether the model is operating at 32 bits 1245 */ 1246 int32 lm_is32bits(lm_t* model); 1247 1248 /** 1249 Write of UG structure 1250 */ 1251 void ug_write(FILE* fp, /**< A file pointer */ 1252 ug_t* ug /**< A pointer of the ug_t structure */ 1253 ); 1254 /** 1255 Write of BG structure 1256 */ 1257 void bg_write(FILE* fp, /**< A file pointer */ 1258 bg_t* bg /**< A pointer of the bg_t structure */ 1259 ); 1260 1261 /** 1262 Write of BG (32bits) structure 1263 */ 1264 void bg32_write(FILE* fp, /**< A file pointer */ 1265 bg32_t* bg /**< A pointer of the bg32_t structure */ 1266 ); 1267 1268 /** 1269 Write of TG structure 1270 */ 1271 1272 void tg_write(FILE* fp, /**< A file pointer */ 1273 tg_t* tg /**< A pointer of the tg_t structure */ 1274 ); 1275 1276 /** 1277 Write of TG (32bits) structure 1278 */ 1279 1280 void tg32_write(FILE* fp, /**< A file pointer */ 1281 tg32_t* tg /**< A pointer of the tg32_t structure */ 1282 ); 1283 1284 1285 /** 1286 Convert the 16 bit bigram structure to 32 bit 1287 */ 1288 void copy_bg_to_bg32(lm_t *lm /**< LM */ 1289 ); 1290 1291 /** 1292 Convert the 32 bit bigram structure to 16 bit 1293 */ 1294 1295 void copy_bg32_to_bg(lm_t *lm /**< LM */ 1296 ); 1297 1298 /** 1299 Convert the 16 bit trigram structure to 32 bit 1300 */ 1301 void copy_tg_to_tg32(lm_t *lm /**< LM */ 1302 ); 1303 1304 /** 1305 Convert the 32 bit trigram structure to 16 bit 1306 */ 1307 1308 void copy_tg32_to_tg(lm_t *lm /**< LM */ 1309 ); 1310 1311 /** 1312 Swap 16 bits bigram 1313 */ 1314 void swap_bg(bg_t* bg); 1315 1316 1317 /** 1318 Swap 32 bits bigram 1319 */ 1320 void swap_bg32(bg32_t* bg); 1321 1322 /** 1323 Swap 16 bits trigram 1324 */ 1325 void swap_tg(tg_t* tg); 1326 1327 1328 /** 1329 Swap 32 bits trigram 1330 */ 1331 void swap_tg32(tg32_t* tg); 1332 1333 int32 find_bg (bg_t *bg, /**< In: The bigram */ 1334 int32 n, 1335 s3lmwid32_t w 1336 ); 1337 1338 int32 find_bg32 (bg32_t *bg, /**< In: The bigram */ 1339 int32 n, 1340 s3lmwid32_t w 1341 ); 1342 1343 1344 int32 find_tg (tg_t *tg, /**< In: The trigram */ 1345 int32 n, s3lmwid32_t w); 1346 1347 int32 find_tg32 (tg32_t *tg, /**< In: The trigram */ 1348 int32 n, s3lmwid32_t w); 1349 1350 /* Macro versions of access functions */ 1351 #define LM_TGPROB(lm,tgptr) ((lm)->tgprob[(tgptr)->probid].l) 1352 #define LM_BGPROB(lm,bgptr) ((lm)->bgprob[(bgptr)->probid].l) 1353 #define LM_UGPROB(lm,ugptr) ((ugptr)->prob.l) 1354 #define LM_RAWSCORE(lm,score) ((score - (lm)->wip) / ((lm)->lw)) 1355 #define LM_DICTWID(lm,lmwid) ((lm)->ug[(lmwid)].dictwid) 1356 1357 /** 1358 Create a new unigram table 1359 */ 1360 ug_t *NewUnigramTable (int32 n_ug /**< Number of unigram */ 1361 ); 1362 1363 1364 #if 0 1365 { /* Stop indent from complaining */ 1366 #endif 1367 #ifdef __cplusplus 1368 } 1369 #endif 1370 1371 #endif 1372