1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * lm.h - Disk/memory based word-trigram backoff LM
39  *
40  * **********************************************
41  * CMU ARPA Speech Project
42  *
43  * Copyright (c) 1997 Carnegie Mellon University.
44  * ALL RIGHTS RESERVED.
45  * **********************************************
46  *
47  * HISTORY
48  * $Log: lm.h,v $
49  * Revision 1.16  2006/03/02 22:10:36  arthchan2003
50  * Add *g_write into the code.
51  *
52  * Revision 1.15  2006/02/28 22:26:51  egouvea
53  * Moved definition of lm_wid() outside of the #if 0/#endif block, so
54  * it's declared.
55  *
56  * Revision 1.14  2006/02/24 13:38:08  arthchan2003
57  * Added lm_read, it is a simple version of lm_read_advance.
58  *
59  * Revision 1.13  2006/02/23 04:16:29  arthchan2003
60  * Merged from SPHINX3_5_2_RCI_IRII_BRANCH:
61  * Splited the original lm.c into five parts,
62  * a, lm.c - a controller of other subroutines.
63  * b, lm_3g.c - implement TXT-based lm operations
64  * c, lm_3g_dmp.c - implement DMP-based lm operations
65  * d, lm_attfsm.c - implement FSM-based lm operations
66  * e, lmset.c - implement sets of lm.
67  *
68  * Revision 1.12.4.3  2006/01/16 19:56:37  arthchan2003
69  * 1, lm_rawscore doesn't need a language weight, 2, Support dumping the LM in FST format.  This code used Yannick Esteve's and LIUM code.
70  *
71  * Revision 1.12.4.2  2005/11/17 06:15:22  arthchan2003
72  * Added input-encoding and output-encoding into the lm structure.
73  *
74  * Revision 1.12.4.1  2005/07/13 01:46:22  arthchan2003
75  * 1, Fixed dox-doc, 2, Added more documentation for major functions such as lm_read and lm_write.
76  *
77  * Revision 1.12  2005/06/21 22:24:02  arthchan2003
78  * Log. In this change, I introduced a new interface for lm ,which is
79  * call lmset_t. lmset_t wraps up multiple lm, n_lm, n_alloclm into the
80  * same structure and handle LM initialization (lm_init) switching,
81  * (lmset_curlm_widx), delete LM (lmset_delete_lm).  The internal
82  * structure is called lmarray and is an array of pointers of lm.  The
83  * current lm is always maintained and pointed by a pointer called cur_lm
84  * . This substantially clarify the structure of the code.  At this
85  * check-in, not every core function of lmset is completed.
86  * e.g. lmset_add_lm because that required testing of several LM reading
87  * routines and could be quite time-consuming.
88  *
89  * Log. Another notable change is the fact dict2lmwid map is started to
90  * be part of the LM. The reason of this is clearly described inside the
91  * code. Don't want to repeat here.
92  *
93  * Log. The new interface has been already used broadly in both Sphinx
94  * 3.0 and sphinx 3.x family of tools.
95  *
96  * Revision 1.5  2005/06/18 03:22:28  archan
97  * Add lmset_init. A wrapper function of various LM initialization and initialize an lmset It is now used in decode, livepretend, dag and astar.
98  *
99  * Revision 1.4  2005/06/17 23:44:40  archan
100  * Sphinx3 to s3.generic, 1, Support -lmname in decode and livepretend.  2, Wrap up the initialization of dict2lmwid to lm initialization. 3, add Dave's trick in LM switching in mode 4 of the search.
101  *
102  * Revision 1.3  2005/06/13 04:02:59  archan
103  * Fixed most doxygen-style documentation under libs3decoder.
104  *
105  * Revision 1.2  2005/05/10 21:21:54  archan
106  * Three functionalities added but not tested. Code on 1) addition/deletion of LM in mode 4. 2) reading text-based LM 3) Converting txt-based LM to dmp-based LM.
107  *
108  * Revision 1.1  2005/05/04 06:08:07  archan
109  * Refactor all lm routines except fillpen.c into ./libs3decoder/liblm/ . This will be equivalent to ./lib/liblm in future.
110  *
111  * Revision 1.6  2005/05/04 04:02:24  archan
112  * Implementation of lm addition, deletion in (mode 4) time-switching tree implementation of search.  Not yet tested. Just want to keep up my own momentum.
113  *
114  * Revision 1.5  2005/04/21 23:50:26  archan
115  * Some more refactoring on the how reporting of structures inside kbcore_t is done, it is now 50% nice. Also added class-based LM test case into test-decode.sh.in.  At this moment, everything in search mode 5 is already done.  It is time to test the idea whether the search can really be used.
116  *
117  * Revision 1.4  2005/04/20 03:37:59  archan
118  * LM code changes: functions are added to set, add and delete LM from the lmset, change the legacy lmset data structure to contain n_lm and n_alloc_lm.
119  *
120  * Revision 1.3  2005/03/30 01:22:47  archan
121  * Fixed mistakes in last updates. Add
122  *
123  *
124  * 20.Apr.2001  RAH (rhoughton@mediasite.com, ricky.houghton@cs.cmu.edu)
125  *              Adding lm_free() to free allocated memory
126  *
127  * 24-Jun-97	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
128  * 		Added lm_t.access_type; made lm_wid externally visible.
129  *
130  * 24-Jun-97	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
131  * 		Added lm_t.log_bg_seg_sz and lm_t.bg_seg_sz.
132  *
133  * 13-Feb-97	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
134  *              Created from original S3 version.
135  */
136 
137 
138 #ifndef _S3_LM_H_
139 #define _S3_LM_H_
140 
141 #include <stdio.h>
142 
143 #include <logmath.h>
144 #include <hash_table.h>
145 #include <cmd_ln.h>
146 
147 #ifdef __cplusplus
148 extern "C" {
149 #endif
150 #if 0
151 } /* Fool Emacs into not indenting things. */
152 #endif
153 
154 #define LM_DICTWID_BADMAP	-16000		/**< Value denoting an illegal mapping */
155 #define LM_CLASSID_BASE		0x01000000	/**< Interpreted as LMclass ID; LM_CLASSID_TO_CLASS subtracts this base to index lmclass */
156 
157 /** Upper limits on the number of words in Sphinx 3.X */
158 #define LM_LEGACY_CONSTANT      BAD_S3LMWID          /**< =65535 (~65k). Introduced in
159 							1996 when Ravi first wrote Sphinx 3.0 and
160 							kept ever since.
161 						     */
162 
163 #define LM_SPHINX_CONSTANT      BAD_S3LMWID32      /**< (4 billion), ARCHAN: introduced in Sphinx 3.6
164 						      at the time of Release Candidate I (March 2006). The caveat of using
165 						      this constant is that byte-swapping problems become much harder
166 						      to detect in general. Also, if the world had more than 10000 cities, each
167 						      with 1 million road names, we would be stuck. I assume this will
168 						      happen in year 3001.
169 						   */
170 
171 
172 #define LM_CLASSID_TO_CLASS(m,i)	((m)->lmclass[(i)-LM_CLASSID_BASE])	/**< Entry of (m)->lmclass selected by class ID i, i.e. index (i - LM_CLASSID_BASE) */
173 
174 #define MIN_PROB_F       -99.0  /**< The minimum value of probabilities and
175                                    backoff weights. When changing it, note
176                                    that both s2 and s3 may transform this
177                                    number into a very small integer (the original note says -2e-31; presumably about -2^31 is meant).
178                                    This can easily cause integer wrap
179                                    around.  -99 is chosen for that reason.
180                                 */
181 
182 #define LM_ALLOC_BLOCK      16  /**< The number of LMs to allocate at a time.
183                                  */
184 
185 /**
186    Success and error codes.
187  */
188 #define LM_SUCCESS           1  /**< Constant that indicates an operation succeeded
189                                  */
190 #define LM_FAIL              0  /**< Constant that indicates an operation failed.  */
191 #define LM_NOT_FOUND        -1  /**< Constant that indicates an LM couldn't be
192                                    found */
193 #define LM_OFFSET_TOO_LARGE -2  /**< Constant for the case where the 16 bit LM
194                                    was used, but the tgcount is larger than
195                                    LM_LEGACY_CONSTANT (65535). This
196                                    breaks the addressing scheme in the
197                                    current LM.
198                                 */
199 #define LM_NO_DATA_MARK     -3  /**< When reading a text-based LM,
200                                    return this if we see no data
201                                    mark  */
202 #define LM_UNKNOWN_NG       -4  /**< When reading the header of an LM, if
203                                    there is an unknown K for K-gram */
204 #define LM_BAD_LM_COUNT     -5  /**< When reading an LM, if a count is bad,
205                                    return this message */
206 #define LM_UNKNOWN_WORDS    -6  /**< When an unknown word is found
207                                    while reading in an LM, return this
208                                    message */
209 #define LM_BAD_BIGRAM       -7  /**< A bad bigram: it could have a word
210                                    id larger than the # of unigrams, or
211                                    a word id smaller than 0.
212                                    It could also be a bigram out of
213                                    bounds.
214                                 */
215 #define LM_BAD_TRIGRAM      -8  /**< A bad trigram: it could have a word
216                                    id larger than the # of unigrams, or
217                                    a word id smaller than 0.
218                                    It could also be a bigram out of
219                                    bounds.
220                                 */
221 #define LM_BAD_QUADGRAM     -9  /**< (RESERVED BUT NOT USED) A bad
222                                    quadgram (4-gram): it could have a word
223                                    id larger than the # of unigrams, or
224                                    a word id smaller than 0.
225                                    It could also be a bigram out of
226                                    bounds.
227                                 */
228 #define LM_BAD_QUINGRAM     -10  /**< (RESERVED BUT NOT USED) A bad
229                                     quingram (5-gram): it could have a
230                                     word id larger than the # of unigrams,
231                                     or a word id smaller than
232                                     0.  It could also be a bigram out of
233                                     bounds.  BTW, there is no need to
234                                     remind me that the mixed use of
235                                     quadgram and quingram is stupid
236                                     English.  I read Manning and
237                                     Schutze.
238                                  */
239 #define LM_BAD_NGRAM       -11  /**< (RESERVED BUT NOT USED) A bad
240                                    n-gram: a generalization of messages
241                                    -7 to -10. In our case, we don't
242                                    make the message as specific as
243                                    possible.
244                                  */
245 #define LM_TOO_MANY_NGRAM  -12  /**< When reading an LM, if the number of
246                                    n-grams is more than the number
247                                    specified in the header, return this
248                                    message */
249 #define LM_NO_MINUS_1GRAM  -13  /**< When reading an n-gram, if the
250                                    corresponding (n-1)-gram doesn't
251                                    exist, return this message. */
252 #define LM_FILE_NOT_FOUND  -14  /**< When the LM file cannot be found,
253                                    return this message */
254 #define LM_CANNOT_ALLOCATE -15  /**< When tables in the LM cannot be allocated,
255                                    return this message */
256 
257 /** Versioning of LM */
258 #define LMDMP_VERSIONNULL 0   /**< VERSION 0 is the oldest. In the past, the
259 				 version number was used to
260 				 store the number of unigrams, so you will
261 				 see logic that says vn > LMDMP_VERSIONNULL
262 			      */
263 
264 #define LMDMP_VERSION_TG_16BIT -1 /**< VERSION 1 is the simplest DMP file: a
265 				     trigram or lower LM which uses 16 bits in
266 				     bigrams and trigrams.*/
267 
268 #define LMDMP_VERSION_TG_16BIT_V2 -2 /**< VERSION 2 means a legacy VERSION 1 DMP file
269 					which has log_bg_seg_sz != 9*/
270 
271 #define LMDMP_VERSION_TG_32BIT -3 /**< VERSION 3 is the 32 bit
272 				     extension of VERSION 1, in which
273 				     the bigrams and trigrams are
274 				     represented by 32 bit data
275 				     structures */
276 
277 #define LMTXT_VERSION         1000 /**< VERSION 1000 is the text-based LM */
278 #define LMFST_VERSION         1001 /**< VERSION 1001 is the FST-based LM */
279 #define LMFORCED_TXT32VERSION 1002 /**< VERSION 1002 is the internal version of the
280                                       text-based LM. The difference between
281                                       1002 and 1000 is that 1002 will assume
282                                       the LM is 32 bits.  This fact is used in
283                                       lm_is32bits(lm)
284                                    */
285 
286 
287 #define NO_WORD	-1	/**< NOTE(review): presumably marks the absence of a word id; its users are not visible in this header -- confirm */
288 
289 #include "s3types.h"
290 #include "lmclass.h"
291 #include "dict.h"
292 
293 /*
294  * ARCHAN 20050503: comment copied from Sphinx 2
295  * Bigram probs and bo-wts, and trigram probs are kept in separate tables
296  * rather than within the bigram_t and trigram_t structures.  These tables
297  * hold unique prob and bo-wt values, and can be < 64K long (see lm_3g.h).
298  * The following tree structure is used to construct these tables of unique
299  * values.  Whenever a new value is read from the LM file, the sorted tree
300  * structure is searched to see if the value already exists, and inserted
301  * if not found.
302  */
303 
304 /** \file lm.h
305     \brief Language model
306 
307     This is the header file for language model support in Sphinx 3.
308     Sphinx 3 supports language models in 4 formats. The four formats are
309 
310     ARPA format: First appeared in Sphinx 2. We ported it to Sphinx 3 in
311     3.X (X=6)
312 
313     DMP : Sphinx 3 slow and fast used it, and so do later versions, Sphinx 3.X
314     (X>4)
315 
316     DMP32 : With this format we start to break the limit of
317     65535 words. This is the first LM file format in Sphinx 3.X that can
318     capture 4 billion words in the language model
319 
320     FST: In AT&T format; supported starting from 3.X (X=6).
321 
322     As of 20060302,
323     we can only read and use the ARPA and DMP-based formats in the decoder.
324     We can write the ARPA, DMP, DMP32 and FST file formats.
325 */
326 
327 /** \struct lmlog_t
328     \brief A log quantity stored either as a float32 (f) or as an int32 (l); being a union, both members share the same storage
329 */
330 typedef union {
331     float32 f; /**< The floating point representation */
332     int32 l;   /**< The integer representation */
333 } lmlog_t;
334 
335 
336 
337 /** \struct sorted_entry_t
338     \brief A single node of the sorted tree of unique values that is built while reading an LM; children are referenced by index, not by pointer
339 */
340 
341 typedef struct sorted_entry_s {
342     lmlog_t val;		/**< value being kept in this node */
343     uint32 lower;	/**< index of another entry.  All descendants down
344 			   this path have their val < this node's val.
345 			   0 => no son exists (0 is the root index) */
346     uint32 higher;	/**< index of another entry.  All descendants down
347 			   this path have their val > this node's val.
348 			   0 => no son exists (0 is the root index) */
349 } sorted_entry_t;
350 
351 /** \struct sorted_list_t
352  *
353  * \brief The sorted list used while reading an LM.  list is a (64K long) array.  Its first entry is the root of the tree and is created during initialization.
354  */
355 typedef struct {
356     sorted_entry_t *list; /**< Beginning of the list  */
357     int32 free;		/**< first free element in list */
358 } sorted_list_t;
359 
360 /** \struct ug_t
361  * \brief A unigram structure
362  * (dictionary word id, probability, backoff weight and first bigram entry).
363  */
364 typedef struct {
365     s3wid_t dictwid;	/**< Dictionary word id, or BAD_S3WID if unknown.  However, the LM
366                            module merely sets this field to BAD_S3WID.  It is up to the
367                            application to fill in this field (HACK!!), so that this
368                            module can be independent of a dictionary. */
369     lmlog_t prob;       /**< Unigram probability */
370     lmlog_t bowt;       /**< Unigram backoff weight */
371     int32 firstbg;	/**< 1st bigram entry on disk */
372 } ug_t;
373 
374 /** \struct bg_t
375  * \brief A bigram structure (16-bit table indices)
376  */
377 
378 typedef struct {
379     s3lmwid_t wid;	/**< LM wid (index into lm_t.ug) */
380     uint16 probid;      /**< Index into array of actual bigram probs */
381     uint16 bowtid;      /**< Index into array of actual bigram backoff wts */
382     uint16 firsttg;     /**< 1st trigram entry on disk, as an index relative to the trigram segment (see tg_segbase below) */
383 } bg_t;
384 
385 
386 /** \struct bg32_t
387  * \brief A bigram structure whose fields are 32 bits wide (32 bit version of bg_t).
388  */
389 typedef struct {
390     s3lmwid32_t wid;	/**< LM wid (index into lm_t.ug) */
391     uint32 probid;      /**< Index into array of actual bigram probs */
392     uint32 bowtid;      /**< Index into array of actual bigram backoff wts */
393     uint32 firsttg;     /**< 1st trigram entry on disk (see tg_segbase below) */
394 } bg32_t;
395 
396 
397 /** \struct tg_t
398  * \brief A trigram structure (16-bit probability index)
399  */
400 
401 typedef struct {
402     s3lmwid_t wid;	/**< LM wid (index into lm_t.ug) */
403     uint16 probid;      /**< Index into array of actual trigram probs */
404 } tg_t;
405 
406 
407 /** \struct tg32_t
408  * \brief A 32 bit version of tg_t
409  */
410 
411 typedef struct {
412     s3lmwid32_t wid;	/**< LM wid (index into lm_t.ug) */
413     uint32 probid;      /**< Index into array of actual trigram probs */
414 } tg32_t;
415 
416 
417 /** \struct membg_t
418  *  \brief Management of in-memory bigrams.  Not used if all bigrams are in memory.
419  */
420 typedef struct {
421     bg_t *bg;		/**< Bigrams for a specific unigram; see lm_t.membg */
422     int32 used;		/**< Whether used since the last lm_reset.  If not used, bg is
423                            freed at the next lm_reset */
424 } membg_t;
425 
426 /** \struct membg32_t
427  *
428  * \brief A 32 bit version of membg_t
429  */
430 typedef struct {
431     bg32_t *bg32;		/**< Bigrams for a specific unigram; see lm_t.membg32 */
432     int32 used;		/**< Whether used since the last lm_reset.  If not used, bg32 is
433                            freed at the next lm_reset */
434 } membg32_t;
435 
436 
437 /**
438  * \struct tginfo_t
439  * \brief trigram cache that speeds up locating the trigrams for a given bigram (w_1,w_2)
440  *
441  * The following trigram information cache eliminates most traversals of the 1g->2g->3g
442  * tree to locate trigrams for a given bigram (w1,w2).  The organization is optimized
443  * for locality of access.  All bigrams (*,w2) for a given w2, for which trigrams have
444  * been accessed "recently", form a linear linked list, pointed to by lm_t.tginfo[w2].
445  * If disk-based, all trigrams for the given bg are loaded upon request.  Cached info (and
446  * tg if disk-based) is freed at lm_reset if not used since the last such reset.
447  */
448 typedef struct tginfo_s {
449     s3lmwid_t w1;		/**< w1 component of bigram w1,w2.  All bigrams with
450 				   the same w2 are linked together. */
451     int32 n_tg;			/**< #tg for parent bigram w1,w2 */
452     tg_t *tg;			/**< Trigrams for w1,w2 */
453     int32 bowt;			/**< tg bowt for w1,w2 */
454     int32 used;			/**< whether used since last lm_reset */
455     struct tginfo_s *next;	/**< Next w1 with same parent w2 */
456 } tginfo_t;
457 
458 /**
459  * \struct tginfo32_t
460  * \brief 32 bit version of tginfo_t
461  *
462  */
463 typedef struct tginfo32_s {
464     s3lmwid32_t w1;		/**< w1 component of bigram w1,w2.  All bigrams with
465 				   the same w2 are linked together. */
466     int32 n_tg;			/**< #tg for parent bigram w1,w2 */
467     tg32_t *tg32;			/**< Trigrams for w1,w2 */
468     int32 bowt;			/**< tg bowt for w1,w2 */
469     int32 used;			/**< whether used since last lm_reset */
470     struct tginfo32_s *next;	/**< Next w1 with same parent w2 */
471 } tginfo32_t;
472 
473 
474 /**
475  * \struct lm_tgcache_entry_t
476  * \brief An entry in a fast and dirty cache for trigram lookups.  See lm_t.tgcache.
477  */
478 typedef struct {
479     s3lmwid_t lwid[3];		/**< 0 = oldest, 2 = newest (i.e., P(2|0,1)) */
480     int32 lscr;			/**< LM score for above trigram */
481 } lm_tgcache_entry_t;
482 
483 
484 /**
485  * \struct lm_tgcache_entry32_t
486  * \brief 32 bit version of lm_tgcache_entry_t.  See lm_t.tgcache32.
487  */
488 typedef struct {
489     s3lmwid32_t lwid[3];		/**< 0 = oldest, 2 = newest (i.e., P(2|0,1)) */
490     int32 lscr;			/**< LM score for above trigram */
491 } lm_tgcache_entry32_t;
492 
493 
494 
495 /*
496  * A note on lm/dict/dict2lm.   -ARCHAN 20050616
497  *
498  * In older versions of sphinx3 (<s3.4), dict2lm is a separate object
499  * from lm and dict.  A kb actually owns a dict2lm, so the programmer will
500  * read the lm.  This separates the initialization of lm and dict2lm, and
501  * it makes a lot of sense if there is **only one** lm and **only one**
502  * dict2lm.
503  *
504  * However, when multiple LMs and switching between them are required,
505  * the problem with the above architecture starts to show up.  For
506  * example,
507  *  lmset=lm_read_ctl ();
508  *  for(i=0;i<kb->n_lm;i++){
509  *   dict2lmwid[i]=wid_dict_lm_map
510  *  }
511  * At the same time, one will also have an array of lms (lmset[i]) for
512  * corresponding dict2lm[i]!
513  *
514  * Of course, having multiple arrays of things will someday cause
515  * problems.
516  *
517  * The resolution comes from observing that the dict2lm map mostly
518  * changes when the lm needs to change, and that the
519  * dictionary pronunciation itself seldom changes. That is partially
520  * caused by the fact we don't have too much research on [sic: sentence left unfinished in the original]. So in the
521  * end, that is why it makes sense to let the lm own a dict2lm.
522  *
523  * What if we also allow the dictionary to change? That is a tough
524  * question.  In that case, perhaps we should still keep an inventory of sets
525  * of lm and dict2lm and allow lm to store a pointer to a dict2lm.  Once
526  * there are changes in dict, the programmer will be responsible for updating
527  * dict2lm. (Storing pointers will allow programmers not to update
528  * everything but just the lms corresponding to a particular dict.)  I
529  * guess in that case it will be a sign that we need a wrapper that controls
530  * both lm and dict together.
531  */
532 
533 /*
534  * Comments by RKM
535  * To conserve space, bg/tg probs/ptrs kept in many tables.  Since the number of
536  * distinct prob values << #bg/#tg, these table indices can be easily fit into
537  * 16 bits.  bgprob and bgbowt are such indices.  The firsttg entry for a bigram
538  * is harder.  It is supposed to be the index of the first trigram entry for each
539  * bigram.  But #tg can be >> 2^16.  Hence the following segmentation scheme:
540  * Partition bigrams into segments of lm_t.bg_seg_sz consecutive entries, such that
541  * #trigrams in each segment <= 2**16 (the corresponding trigram segment).  The
542  * bigram_t.firsttg value is then a 16-bit relative index within the trigram
543  * segment.  A separate table--lm_t.tg_segbase--has the absolute index of the
544  * 1st trigram for each segment.
545  */
546 
547 /* Default values for lm_t.log_bg_seg_sz and lm_t.bg_seg_sz, and the trigram cache size */
548 #define LOG2_BG_SEG_SZ  9
549 #define BG_SEG_SZ       (1 << (LOG2_BG_SEG_SZ))	/* = 512 bigram entries per segment */
550 #define LM_TGCACHE_SIZE		100003	/* A prime no. (hopefully it IS one!) */
551 
552 /* 20040211 ARCHAN: Yes! Indeed it is a prime */
553 
554 /** \struct lm_t
555  * \brief The language model.
556  * All unigrams are read into memory on initialization.
557  * Bigrams and trigrams are read in on demand.
558  */
559 typedef struct lm_s {
560     char *name ;        /**< The name of the LM */
561     int32 n_ug;         /**< #unigrams in LM */
562     int32 n_bg;         /**< #bigrams in entire LM */
563     int32 n_tg;         /**< #trigrams in entire LM */
564     int32 max_ug;       /**< To which n_ug can grow with dynamic addition of words */
565 
566     int32 n_ng;           /**< if unigram, n_ng=1, if bigram n_ng=2 and so on */
567 
568     char **wordstr;	/**< The LM word list (in unigram order) */
569 
570 
571     uint32 log_bg_seg_sz;/**< See big comment above; presumably log2 of bg_seg_sz (cf. LOG2_BG_SEG_SZ) -- confirm */
572     uint32 bg_seg_sz;   /**< Number of consecutive bigram entries per segment; see big comment above */
573 
574     ug_t *ug;           /**< Unigrams */
575 
576     /* 20040225 ARCHAN : Data structure to maintain dictionary information */
577     /* Data structure for dictionary to LM words look up mapping */
578     /* 20060306 ARCHAN: Change this to a 32 bits data structure */
579     s3lmwid32_t *dict2lmwid; /**< a mapping from dictionary word to LM word */
580     s3lmwid32_t startlwid;	/**< S3_START_WORD id, if it exists */
581     s3lmwid32_t finishlwid;	/**< S3_FINISH_WORD id, if it exists */
582 
583     bg_t *bg;		/**< NULL iff disk-based */
584     tg_t *tg;		/**< NULL iff disk-based */
585     membg_t *membg;	/**< membg[w1] = bigrams for lm wid w1 (used iff disk-based) */
586     tginfo_t **tginfo;	/**< tginfo[w2] = fast trigram access info for bigrams (*,w2) */
587 
588 
589     lm_tgcache_entry_t *tgcache; /**< <w0,w1,w2> is hashed to an entry in
590                                     this array.  Only the last trigram
591                                     mapping to any given hash entry is
592                                     kept in that entry.  (The cache
593                                     doesn't have to be super-efficient.)
594                                  */
595 
596 
597     /**************************/
598 
599 
600     bg32_t *bg32;		/**< Bigram 32 bits, NULL iff disk-based */
601     tg32_t *tg32;		/**< Trigram 32 bits, NULL iff disk-based */
602     membg32_t *membg32;	/**< membg 32 bits: membg32[w1] = bigrams for lm wid w1 (used iff disk-based) */
603     tginfo32_t **tginfo32;	/**< tginfo 32 bits: tginfo32[w2] = fast trigram access info for bigrams (*,w2) */
604 
605     lm_tgcache_entry32_t *tgcache32; /**< tgcache 32 bits */
606 
607     /**************************/
608 
609     lmlog_t *bgprob;    /**< Table of actual bigram probs */
610     lmlog_t *tgprob;    /**< Table of actual trigram probs */
611     lmlog_t *tgbowt;    /**< Table of actual trigram backoff weights */
612     int32 *tg_segbase;  /**< tg_segbase[i>>lm_t.log_bg_seg_sz] = index of 1st
613                            trigram for bigram segment (i>>lm_t.log_bg_seg_sz) */
614     int32 n_bgprob;     /**< presumably the number of entries in bgprob -- confirm */
615     int32 n_tgprob;     /**< presumably the number of entries in tgprob -- confirm */
616     int32 n_tgbowt;     /**< presumably the number of entries in tgbowt -- confirm */
617 
618     FILE *fp;           /**< NOTE(review): presumably the open LM dump file that bgoff/tgoff refer to -- confirm */
619     int32 byteswap;     /**< Whether this file is in the WRONG byte order */
620     int32 bgoff;        /**< BG offsets into DMP file (used iff disk-based) */
621     int32 tgoff;        /**< TG offsets into DMP file (used iff disk-based) */
622 
623     float32 lw;		/**< Language weight currently in effect for this LM */
624     int32 wip;          /**< logs3(word insertion penalty) in effect for this LM */
625 
626 
627     /* Statistics */
628     int32 n_bg_fill;    /**< #bg fill operations */
629     int32 n_bg_inmem;   /**< #bg in memory */
630     int32 n_bg_score;   /**< #bg_score operations */
631     int32 n_bg_bo;	/**< #bg_score ops backed off to ug */
632     int32 n_tg_fill;	/**< Similar stats for trigrams */
633     int32 n_tg_inmem;   /**< #tg in memory */
634     int32 n_tg_score;   /**< #tg_score operations */
635     int32 n_tg_bo;      /**< #tg_score ops backed off to bg */
636     int32 n_tgcache_hit;  /**< # of trigram cache hits (NOTE(review): the original text also said "ops backed off to bg", apparently copied from n_tg_bo) */
637 
638     int32 access_type;	/**< Updated on every lm_{tg,bg,ug}_score call to reflect the kind of
639                            n-gram accessed: 3 for 3-gram, 2 for 2-gram and 1 for 1-gram */
640 
641 
642     int32 isLM_IN_MEMORY;  /**< Whether the LM is in memory.  It is a per-LM property, so potentially
643                               the code could allow some models to be disk-based while others are not. */
644 
645     int32 dict_size;  /**< Only used in class-based LM, because a class-based LM is addressed in
646                          the dictionary space. */
647     hash_table_t *HT;		/**<  hash table for word-string->word-id map */
648 
649 
650     /* Data structure that maintains the class information */
651     lmclass_t **lmclass;   /**< LM classes for this LM */
652     int32 n_lmclass;      /**< # of LM classes */
653     int32 *inclass_ugscore; /**< An array of inter-class unigram probabilities */
654 
655 
656     int32 inputenc ; /**< Input encoding method */
657     int32 outputenc ; /**< Output encoding method */
658     int32 version;  /**< The version number of the LM; in particular, this is the version that was
659                        most recently read in.
660                     */
661     int32 is32bits; /**< Whether the current LM is 32 bits or not. Derived from version and n_ug */
662 
663     /* Arrays of unique bigram probs and bo-wts, and trigram probs */
664     sorted_list_t sorted_prob2; /**< Temporary Variable: Sorted list */
665     sorted_list_t sorted_bowt2; /**< Temporary Variable: Sorted list */
666     sorted_list_t sorted_prob3; /**< Temporary Variable: Sorted list */
667     int32 max_sorted_entries; /**< Temporary Variable: 2x the maximum size of the MAX_SORTED_ENTRIES*/
668 
669     logmath_t *logmath; /**< NOTE(review): logmath_t handle (type from logmath.h); its use is not visible in this header */
670 } lm_t;
671 
672 
673 
674 /** \struct lmset_t
675     \brief Structure for multiple LMs; provides operations for addition/deletion/read.
676     A structure for multiple, named LMs, which originated in s2
677 */
678 typedef struct lmset_s {
679     lm_t **lmarray;  /**< 1 dimensional array of pointers to lm_t */
680     lm_t *cur_lm; /**< TEMPORARY VARIABLE: The current LM */
681 
682     int32 cur_lm_idx; /**< TEMPORARY VARIABLE : The current LM index */
683     int32 n_lm;       /**< number of LMs */
684     int32 n_alloc_lm; /**< number of allocated LMs */
685 } lmset_t;
686 
687 /** Access macros for lm_t fields; not meant for arbitrary use */
688 #define lm_lmwid2dictwid(lm,u)	((lm)->ug[u].dictwid)	/* dictionary wid stored in unigram u */
689 #define lm_n_ug(lm)		((lm)->n_ug)	/* #unigrams */
690 #define lm_n_bg(lm)		((lm)->n_bg)	/* #bigrams */
691 #define lm_n_tg(lm)		((lm)->n_tg)	/* #trigrams */
692 #define lm_wordstr(lm,u)	((lm)->wordstr[u])	/* word string of unigram u */
693 #define lm_startwid(lm)		((lm)->startlwid)	/* LM wid of the start word */
694 #define lm_finishwid(lm)	((lm)->finishlwid)	/* LM wid of the finish word */
695 #define lm_access_type(lm)	((lm)->access_type)	/* kind of n-gram accessed by the last score call */
696 
697 
698 /** \struct wordprob_t
699     \brief Generic structure that could be used at any n-gram level
700 */
701 typedef struct {
702     s3wid_t wid;	/**< NOTE: dictionary wid; may be BAD_S3WID if not available */
703     int32 prob;         /**< The probability */
704 } wordprob_t;
705 
706 
707 /** A wrapper function of controlling the behavior of LM initialization
708  *
709  * (ARCHAN 20050617) lmset_init controls the behavior how the lmset
710  * which is an array of lm was initialized by different command-line
711  * arguments.  lmfile and lmctlfile are mutually exclusive.  Each
712  * will invoke one reading functions.
713  *
714  * In the case of -lmfile is specified.  A lmset with one single lm
715  * (or lmset->n_lm=1) will be returned. The single lm's name will be
716  * called lmname.
717  *
718  * In the case of -lmctlfile is specified. A lmset with multiple lms
719  * will be returned. The number of lm will depend on the number of
720  * lm specified by -lmctlfile.  For the format, please read the
721  * current format of -lmctlfile in lm.c
722  *
723  * ctl_lm is the equivalent of -ctl for lm.  When -ctl_lm is not
724  * specified in command-line (ctl_lm is NULL). Then either lm with
725  * name lmname will be used as the default lm.  If lmname is NULL, then
726  * the first lm will be named as the "default"
727  *
728  * lmdumpdir is currently not used. It is there for backward
729  * compatibility purpose.
730  *
731  * lw,wip,uw are language weight, word insertion pernalty and
732  * unigram weight. Their values are crucial to computation of the
733  * language model score. Therefore, the programmer is urged to
734  * carefully set these three values and also be careful of the
735  * order.
736  *
737  * dict is assumed to be a pre-initialized dict_t structure which is
738  * used in deriving the mapping between the dictionary word and the
739  * lm words
740  *
741  * ARCHAN 20050711 -lminmemory is the only global variable that
742  * control the code and we haven't explicitly specify it.  Currently,
743  * if the LM is DMP, both -lminmeory=0 or -lminmeory=1 could be used.
744  * if the LM is txt-base, only -lminmemory=1 is accepted. (This will
745  * be changed in future.)
746  *
747  *
748  * ARCHAN 20050705: A survival guide for this part of the code.  Our
749  * language mode code is unnecessarily complicated and is mainly
750  * caused by the fact the way we specified class-based LM and
751  * multiple LM are inter-dependent. For example, one could specify a
752  * multiple LMs file (i.e. lmctlfile) and have no classes.  However,
753  * if one would like to specify class information even with a single
754  * LM, one need to use a multiple LM file format (i.e. lmctlfile).
755  *
756  * This difficulty is well-observed in the period of Sphinx
757  * 3.4-3.6. That might imply that a new LM format is needed if we
758  * want to sustain this part of the development.
759  *
760  */
761 S3DECODER_EXPORT
762 lmset_t* lmset_init(const char* lmfile,  /**< The lm file name, lmfile and lmctlfile are mutally exclusive */
763 		    const char* lmctlfile, /**< The file that specified multiple LMs and class information, lmfile and lmctlfile are mutually exclusive */
764 		    const char* ctl_lm,    /**< The control file that describes which lm to use for a particular utterance*/
765 		    const char* lmname,    /**< The LM name to use if ctl_lm is not specified  */
766 		    const char* lmdumpdir, /**< Currently not used */
767 		    float32 lw,      /**< Language model weight */
768 		    float32 wip,     /**< Word insertion penalty */
769 		    float32 uw,      /**< Unigram weight */
770 		    dict_t *dict,     /**< A pre-initialized dict_t structure */
771 		    logmath_t *logmath
772     );
773 
774 
/* It is still a sore point that there are two interfaces for two
   different types of input.  Some of the code is still duplicated, and
   changing one interface does not mean the other one will be changed too.
*/
779 
780 /**
781  *  Read a single LM into the lmset.
782  */
783 lmset_t* lmset_read_lm(const char *lmfile, /**< In: The LM file */
784 		       dict_t *dict,       /**< In: A pre-initialized dictionary file*/
785 		       const char *lmname, /**< In: The LM name */
786 		       float64 lw,         /**< The language weight */
787 		       float64 wip,        /**< The word insertion penalty */
788 		       float64 uw,          /**< The unigram weight */
789 		       const char *lmdumpdir, /**< In: LM dump dir */
790 		       logmath_t *logmath
791     );
792 
793 /**
794  * Read the LM control file. **Usually**, it is also a class-based LM,
795  */
796 
797 lmset_t* lmset_read_ctl(const char * ctlfile,/**< Control file name */
798 			dict_t* dict,  /**< In: Dictionary */
799 			float64 lw,	/**< In: Language weight */
800 			float64 wip,	/**< In: Word insertion penalty */
801 			float64 uw,    /**< In: Unigram weight */
802 			const char* lmdumpdir, /**< In: LMdumpdir */
803 			logmath_t *logmath
804     );
805 
806 /**
807  * Get an LM by index.
808  */
809 lm_t* lmset_get_lm_widx(lmset_t *lms,  /**< In: The set of LM */
810 			int32 lmidx    /**< In: LM index */
811     );
812 
813 /**
814  * Get an LM by name
815  * @return a pointer of the LM with name lmname
816  */
817 lm_t* lmset_get_lm_wname(lmset_t *lms,  /**< In: The set of LM */
818 			 const char *lmname   /**< In: The LM name */
819     );
820 
821 /**
822  * Set the current LM with index
823  */
824 void lmset_set_curlm_widx(lmset_t *lms, /**< In: The set of LM */
825 			  int32 lmidx   /**< In: LM index */
826     );
827 
828 /**
829  * Set the current LM with name
830  */
831 S3DECODER_EXPORT
832 void lmset_set_curlm_wname(lmset_t *lms, /**< In: The set of LM */
833 			   const char *lmname   /**< In: The LM name */
834     );
835 
836 /**
837  * Convert name to index
838  */
839 int32 lmset_name_to_idx(lmset_t *lms, /**< In: The set of LM */
840 			const char *lmname /**< In: The LM name */
841     );
842 
843 /**
844  * Convert index to name
845  * @return a pointer of the name string.  No memory is allocated.
846  */
847 
848 char* lmset_idx_to_name(lmset_t *lms, /**< In: The set of LM */
849 			int32 lmidx /**< In: LM index */
850     );
851 
852 
853 /**
854  * Add a new lm into the lmset. Notice that lms->n_lm will be added by 1
855  */
856 
857 void lmset_add_lm(lmset_t *lms,  /**< In/Out : The set of LM */
858 		  lm_t *lm,      /**< In : The input LM */
859 		  const char* lmname /**< In: The lm name */
860     );
861 
862 /**
863  * Delete a LM with lmname. Notice that lms->n_lm will be subtracted by 1
864  */
865 
866 void lmset_delete_lm(lmset_t *lms, /**< In/Out : The set of LM */
867 		     const char *lmname /**< The lm name */
868     );
869 
870 /**
871  * Free the lmset data structure
872  */
873 S3DECODER_EXPORT
874 void lmset_free(lmset_t *lms /**< In: The set of LM */
875     );
876 
877 /**
878  * Return trigram followers for given two words.  Both w1 and w2 must be valid.
879  * Return value: #trigrams in returned list.
880  */
881 int32 lm_tglist (lm_t *lmp,	/**< In: LM being queried */
882 		 s3lmwid32_t w1,	/**< In: LM word id of the first of a 2-word history */
883 		 s3lmwid32_t w2,	/**< In: LM word id of the second of the 2-word history */
884 		 tg_t **tg,	/**< Out: *tg = array of trigrams for <w1,w2> */
885 		 int32 *bowt	/**< Out: *bowt = backoff-weight for <w1, w2> */
886     );
887 
888 int32 lm_tg32list (lm_t *lmp,	/**< In: LM being queried */
889 		   s3lmwid32_t w1,	/**< In: LM word id of the first of a 2-word history */
890 		   s3lmwid32_t w2,	/**< In: LM word id of the second of the 2-word history */
891 		   tg32_t **tg,	/**< Out: *tg = array of trigrams for <w1,w2> */
892 		   int32 *bowt	/**< Out: *bowt = backoff-weight for <w1, w2> */
893     );
894 
895 /**
896  * Return the bigram followers for the given word w.
897  * Return value: #bigrams in returned list.
898  */
899 int32 lm_bglist (lm_t *lmp,	/**< In: LM being queried */
900 		 s3lmwid32_t w,	/**< In: LM word id of the 1-word history */
901 		 bg_t **bg,	/**< Out: *bg = array of bigrams for w */
902 		 int32 *bowt	/**< Out: *bowt = backoff-weight for w */
903     );
904 
905 int32 lm_bg32list (lm_t *lmp,	/**< In: LM being queried */
906 		   s3lmwid32_t w,	/**< In: LM word id of the 1-word history */
907 		   bg32_t **bg,	/**< Out: *bg = array of bigrams for w */
908 		   int32 *bowt	/**< Out: *bowt = backoff-weight for w */
909     );
910 
911 
#if 0 /* Obsolete, and it would conflict with the code, so it is disabled for now */
/*
 * Somewhat like lm_bglist, but fills a wordprob_t array from the bigram
 * list found instead of simply returning the bglist.  The wordprob array
 * contains dictionary word IDs.  Note that only the base IDs are entered;
 * the caller is responsible for filling out the alternative
 * pronunciations.
 *
 * lm:   In: LM being queried
 * w:    In: LM word ID of the 1-word history
 * th:   In: if a prob (logs3, langwt-ed) < th, ignore it
 * wp:   In/Out: array to be filled; the caller must have allocated it
 * bowt: Out: *bowt = backoff weight associated with w
 *
 * Return value: number of entries filled in the wordprob array.
 */
int32 lm_bg_wordprob(lm_t *lm,
                     s3lmwid32_t w,
                     int32 th,
                     wordprob_t *wp,
                     int32 *bowt);

#endif
929 
930 /* Return LM word ID for the given string, or BAD_LMWID(lm) if not available */
931 s3lmwid32_t lm_wid (lm_t *lm, const char *wd);
932 
933 /**
934    Set all pointers to NULL in the lm
935 */
936 void lm_null_struct(lm_t* lm
937     );
938 
939 /**
940  * Like lm_bg_wordprob, but for unigrams.
941  * Return value:  #entries filled in the wordprob array.
942  */
943 int32 lm_ug_wordprob(lm_t *lm, /**< In: LM being queried */
944 		     dict_t *dict, /**< In : The dictionary */
945 		     int32 th,
946 		     wordprob_t *wp /**< In/out: Array to be filled */
947     );
948 
949 /** Return the unigrams in LM.  Return value: #unigrams in returned list. */
950 int32 lm_uglist (lm_t *lmp,	/**< In: LM being queried */
951 		 ug_t **ug	/**< Out: *ug = unigram array */
952     );
953 
954 
955 
956 /* 20040227: This also account the in-class probability of wid*/
957 /** Return unigram score for the given word */
958 int32 lm_ug_score (lm_t *lmp,  /**< In: LM begin queried */
959 		   s3lmwid32_t lwid, /**< LM ID for the word */
960 		   s3wid_t wid     /**< Dict ID for the word */
961     );
962 
963 
964 int32 lm_ug_exists(lm_t* lm ,  /**< LM */
965 		   s3lmwid32_t lwid /**< LM ID for the word */
966     );
967 
968 /*
969  * Return bigram score for the given two word sequence.  If w1 is BAD_LMWID(lm), return
970  * lm_ug_score (w2).
971  * 20040227: This also account for the in-class probability of w2.
972  */
973 int32 lm_bg_score (lm_t *lmp, /**< In: LM begin queried */
974 		   s3lmwid32_t lw1,
975 		   s3lmwid32_t lw2,
976 		   s3wid_t w2);
977 
978 
979 /**
980    Whether a certain bigram exists.
981 */
982 int32 lm_bg_exists (lm_t *lm,  /**< In: LM */
983 		    s3lmwid32_t lw1,
984 		    s3lmwid32_t lw2
985     );
986 
987 /**
988  * Return trigram score for the given three word sequence.  If w1 is BAD_LMWID(lm), return
989  * lm_bg_score (w2, w3).  If both lw1 and lw2 are BAD_LMWID(lm), return lm_ug_score (lw3).
990  *
991  * 20040227: This also account for the in-class probability of w3.
992  */
993 int32 lm_tg_score (lm_t *lmp,  /**< In: LM begin queried */
994 		   s3lmwid32_t lw1,
995 		   s3lmwid32_t lw2,
996 		   s3lmwid32_t lw3,
997 		   s3wid_t w3);
998 
999 
1000 /**
1001    Whether a certain trigram exists.
1002 */
1003 int32 lm_tg_exists (lm_t *lm,  /**< In: LM */
1004 		    s3lmwid32_t lw1,
1005 		    s3lmwid32_t lw2,
1006 		    s3lmwid32_t lw3
1007     );
1008 
1009 /**
1010  * Set the language-weight and insertion penalty parameters for the LM, after revoking
1011  * any earlier set of such parameters.
1012  *
1013  * WARNING!! This function doesn't prevent underflow of values.  Make sure you call
1014  * safe lm2logs3 before it.
1015  */
1016 void lm_set_param (lm_t *lm,  /**< In: the LM */
1017 		   float64 lw,  /**< In: the langauage weight */
1018 		   float64 wip  /**< In: the word insertion penalty */
1019     );
1020 
1021 
1022 S3DECODER_EXPORT
1023 int32 lm_rawscore (lm_t *lm,  /**< In: the LM */
1024 		   int32 score
1025     );
1026 
1027 
1028 
1029 /** LM cache related */
1030 S3DECODER_EXPORT
1031 void lm_cache_reset (lm_t *lmp /**< In: the LM */
1032     );
1033 
1034 /** LM cache statistic dumping */
1035 S3DECODER_EXPORT
1036 void lm_cache_stats_dump (lm_t *lmp /**< In: the LM */
1037     );
1038 
1039 /**
1040  * A simple version of reading in a LM
1041  *
1042  * lm_read is a simple version of lm_read_advance.  It will assume
1043  * language weight, word insertion penalty and unigram weight to be
1044  * automatically applied.  There is also no class-based LM (so
1045  * ndict=0).  Format is set to NULL, so the program will determine
1046  * it automatically.
1047  */
1048 lm_t * lm_read (
1049     const char *file,	/**< In: LM file being read */
1050     const char *lmname,  /**<In: LM name*/
1051     cmd_ln_t *config,
1052     logmath_t *logmath);
1053 
1054 /**
1055  * Read an LM file, it will automatically decide whether the file is
1056  * a DUMP file or a txt file. Then call lm_read_txt and lm_read_dump
1057  * (non-public functions) correspondingly.  Currently the code is
1058  * not aware about OOV.
1059  *
1060  * lw, wip, uw and ndict are mainly used for recognition purpose.
1061  * When lm_read is used for other purpose, one could just used dummy
1062  * setting.  recommended one is lw=1.0,wip=0.1,uw=1.0 and
1063  * ndict=0. These are very useful when lm_read is just used as
1064  * reading the LM.
1065  *
1066  * If applyweight is 0, lw,wip, uw will not be apply the LM at all.
1067  * This will allow users to just call the LM routine without
1068  * initializing other modules (such as logs3_init).
1069  *
1070  * If applyweight is 1, then logs3_init must be called before lm_read.
1071  * This is usually the case when kb_init is called before the code.
1072  *
1073  * fmt now could be either "TXT", "DMP" and "TXT32" or just
1074  * NULL. If it is NULL, the LM format will be automatically
1075  * determined.  If it is specified as "TXT" or "DMP", the
1076  * corresponding lm reader will be called. In such a case, it is
1077  * important for the users to know what he/she is doing.
1078  * (Unfortunately, this is mostly not true. )
1079  * In the case of "TXT32", a text LM will be forced to 32bit mode.
1080  *
1081  * ndict is the dictionary size of the application.  This is needed
1082  * because class-based LM are addressed in the dictionary wid-space
1083  * instead of lm wid-space. If class-based LM is not used, just set
1084  * this to zero.
1085  *
1086  * Note: there are two defense mechanisms of lm_read_advance.
1087  * First of all, if no fmt is specified, it will start to read
1088  * the lm in the order of DMP->TXT. Second, if txt format
1089  * is specified but LM is found to hit the 16bit legacy segments
1090  * limit, it will automatically switch to read TXT32 LM
1091  *
1092  * @return pointer to LM structure created.
1093  */
1094 lm_t *lm_read_advance (const char *file,	/**< In: LM file being read */
1095 		       const char *lmname,   /**<In: LM name*/
1096 		       float64 lw,	/**< In: Language weight */
1097 		       float64 wip,	/**< In: Word insertion penalty */
1098 		       float64 uw,	/**< In: Unigram weight (interpolation with uniform distr.) */
1099 		       int32 ndict,    /**< In: Number of dictionary entry.  We need that because
1100 					  class-based LM is addressed in dictionary word ID space.
1101 				       */
1102 		       const char* fmt,       /**< In: file format of the LM, it is
1103 					   now either "TXT", "DMP" and NULL,
1104 					   if NULL, file format is
1105 					   automaticaly determined */
1106 		       int32 applyweight,      /**< In: whether lw,wip, uw should be
1107 						 applied to the lm or not */
1108 		       logmath_t *logmath
1109     );
1110 
1111 S3DECODER_EXPORT
1112 lm_t *lm_read_advance2(const char *file,	/**< In: LM file being read */
1113 		       const char *lmname,   /**<In: LM name*/
1114 		       float64 lw,	/**< In: Language weight */
1115 		       float64 wip,	/**< In: Word insertion penalty */
1116 		       float64 uw,	/**< In: Unigram weight (interpolation with uniform distr.) */
1117 		       int32 ndict,    /**< In: Number of dictionary entry.  We need that because
1118 					  class-based LM is addressed in dictionary word ID space.
1119 				       */
1120 		       const char* fmt,       /**< In: file format of the LM, it is
1121 					   now either "TXT", "DMP" and NULL,
1122 					   if NULL, file format is
1123 					   automaticaly determined */
1124 		       int32 applyweight,      /**< In: whether lw,wip, uw should be
1125                                                   applied to the lm or not */
1126                        int lminmemory, /**< In: Whether LM is read into memory */
1127 		       logmath_t *logmath
1128     );
1129 /**
1130    Simple writing of an LM file, the input and output encoding will
1131    assume to be iso8859-1. Call lm_write. To convert encoding, please use
1132    lm_write_advance.
1133 */
1134 S3DECODER_EXPORT
1135 int32 lm_write(lm_t *model, /** In: the pointer LM we want to output */
1136 	       const char *outputfile, /**< In: the output file name */
1137 	       const char *filename, /**< In: the LM file name  */
1138 	       const char *fmt   /**< In: LM file format, it is now either "TXT" or "DMP" */
1139     );
1140 
1141 /**
1142    Writing of an LM file with advanced options such as encoding support.
1143    Called by lm_write.
1144 
1145    fmt now could be TXT, DMP, FST
1146 
1147    inputenc and outputenc could now be iso8859-1, gb2312-hex, gb2312.
1148    Not every pair of conversion works.
1149 
1150    Current input/output encodings support list.
1151    0: iso8859-1
1152    1: gb2312-hex
1153    2: gb2312
1154 
1155    -: do nothing
1156    n: doesn't make sense or not compatible
1157    x: not supported yet
1158    y: supported
1159 
1160    i\o 0 1 2
1161    0 - n n
1162    1 n - y
1163    2 n x -
1164 
1165    When we have 4 encoding types: This document should be
1166    implemented as a data structure.
1167 
1168    This conversion table is copied from encoding.c, please take a
1169    look the latest support in encoding.c
1170 */
1171 
1172 int32 lm_write_advance(lm_t *model, /**< In: the pointer LM we want to output */
1173 		       const char *outputfile, /**< In: the output file name */
1174 		       const char *filename, /**< In: the LM file name  */
1175 		       const char *fmt,   /**< In: LM file format, it is now either "TXT", "DMP", "FST" */
1176 		       const char* inputenc, /**< In: Input encoding type */
1177 		       char* outputenc /**< Out: Output encoding type */
1178     );
1179 
1180 /* RAH, added code for freeing allocated memory
1181  */
1182 /**
1183    Deallocate the language model.
1184 */
1185 S3DECODER_EXPORT
1186 void lm_free (lm_t *lm /**< In: a LM structure */
1187     );
1188 
1189 /**
1190    Add word list to the LM
1191    For each word in the file, call lm_add_wordlist.
1192    The file is assume to have a format like this:
1193    <word1>
1194    <word2>
1195    <word3>
1196    <word4>
1197 
1198    If the lmwid2dictid mapping is not updated, or the dictionary
1199    itself is not used in the context.  Just specify dict=NULL;
1200 
1201 */
1202 int32 lm_add_wordlist(lm_t *lm, /**< In/Out: a modified LM structure */
1203 		      dict_t *dict, /**< In: an initialized dictionary structure
1204 				       Used to update
1205 				    */
1206 		      const char* filename /**< In: a file that contains a
1207 					list of word one wants to
1208 					add*/
1209     );
1210 
1211 /**
1212    Add a word to the LM
1213 
1214    look up the dictionary and see whether it exists in the dictionary
1215    Looks alike with wid.c's logic at this point.
1216 
1217    (Incomplete!) Not fully tested in the situation for on-line
1218    recognition.
1219 
1220    We also avoid the addition of classes at this point because that
1221    could complicated things quite a lot.
1222 */
1223 int32 lm_add_word_to_ug(lm_t *lm, /**< In/Out: a modified LM structure */
1224 			dict_t *dict, /**< In: an initialized dictionary structure
1225 					 Used to update lmwid2dictid mapping.
1226 				      */
1227 			const char* newword /**<In: a pointer of a new word */
1228     );
1229 /**
1230     Get class ID given a LM.
1231 */
1232 int32 lm_get_classid (lm_t *model, /**< In: LM file being queried*/
1233 		      const char *name   /**< In: The name of the class */
1234     );
1235 
1236 /**
1237  * Explicity convert structure from 16bit -> 32bit or 32bit to 16bit.
1238  */
1239 void lm_convert_structure(lm_t *model, /**< In: LM file being used */
1240 			  int32 is32bits
1241     );
1242 
1243 /**
1244    Check whether the model is operating at 32 bits
1245 */
1246 int32 lm_is32bits(lm_t* model);
1247 
1248 /**
1249    Write of UG structure
1250 */
1251 void ug_write(FILE* fp,  /**< A file pointer */
1252 	      ug_t* ug   /**< A pointer of the ug_t structure */
1253     );
1254 /**
1255    Write of BG structure
1256 */
1257 void bg_write(FILE* fp, /**< A file pointer */
1258 	      bg_t* bg  /**< A pointer of the bg_t structure */
1259     );
1260 
1261 /**
1262    Write of BG (32bits) structure
1263 */
1264 void bg32_write(FILE* fp, /**< A file pointer */
1265 		bg32_t* bg  /**< A pointer of the bg32_t structure */
1266     );
1267 
1268 /**
1269    Write of TG structure
1270 */
1271 
1272 void tg_write(FILE* fp, /**< A file pointer */
1273 	      tg_t* tg  /**< A pointer of the tg_t structure */
1274     );
1275 
1276 /**
1277    Write of TG (32bits) structure
1278 */
1279 
1280 void tg32_write(FILE* fp, /**< A file pointer */
1281 		tg32_t* tg  /**< A pointer of the tg32_t structure */
1282     );
1283 
1284 
1285 /**
1286    Convert the 16 bit bigram structure to 32 bit
1287 */
1288 void copy_bg_to_bg32(lm_t *lm /**< LM */
1289     );
1290 
1291 /**
1292    Convert the 32 bit bigram structure to 16 bit
1293 */
1294 
1295 void copy_bg32_to_bg(lm_t *lm /**< LM */
1296     );
1297 
1298 /**
1299    Convert the 16 bit trigram structure to 32 bit
1300 */
1301 void copy_tg_to_tg32(lm_t *lm /**< LM */
1302     );
1303 
1304 /**
1305    Convert the 32 bit trigram structure to 16 bit
1306 */
1307 
1308 void copy_tg32_to_tg(lm_t *lm /**< LM */
1309     );
1310 
1311 /**
1312    Swap 16 bits bigram
1313 */
1314 void swap_bg(bg_t* bg);
1315 
1316 
1317 /**
1318    Swap 32 bits bigram
1319 */
1320 void swap_bg32(bg32_t* bg);
1321 
1322 /**
1323    Swap 16 bits trigram
1324 */
1325 void swap_tg(tg_t* tg);
1326 
1327 
1328 /**
1329    Swap 32 bits trigram
1330 */
1331 void swap_tg32(tg32_t* tg);
1332 
1333 int32 find_bg (bg_t *bg,  /**< In: The bigram */
1334 	       int32 n,
1335 	       s3lmwid32_t w
1336     );
1337 
1338 int32 find_bg32 (bg32_t *bg,  /**< In: The bigram */
1339 		 int32 n,
1340 		 s3lmwid32_t w
1341     );
1342 
1343 
1344 int32 find_tg (tg_t *tg, /**< In: The trigram */
1345 	       int32 n, s3lmwid32_t w);
1346 
1347 int32 find_tg32 (tg32_t *tg, /**< In: The trigram */
1348 		 int32 n, s3lmwid32_t w);
1349 
/*
 * Macro versions of access functions.
 *
 * LM_TGPROB / LM_BGPROB: look up the .l member of the tgprob / bgprob
 *     entry selected by the n-gram's probid.
 * LM_UGPROB: the .l member of the unigram's own prob field (lm is unused).
 * LM_RAWSCORE: (score - wip) / lw, using the LM's wip and lw members.
 * LM_DICTWID: the dictwid member of unigram entry lmwid.
 *
 * Every macro argument is parenthesized in the expansion so that an
 * arbitrary expression can be passed; in particular LM_RAWSCORE used to
 * expand `score` bare, which mis-parsed arguments containing operators of
 * lower precedence than `-` (e.g. shifts or ?:).
 */
#define LM_TGPROB(lm,tgptr)	((lm)->tgprob[(tgptr)->probid].l)
#define LM_BGPROB(lm,bgptr)	((lm)->bgprob[(bgptr)->probid].l)
#define LM_UGPROB(lm,ugptr)	((ugptr)->prob.l)
#define LM_RAWSCORE(lm,score)	(((score) - (lm)->wip) / ((lm)->lw))
#define LM_DICTWID(lm,lmwid)     ((lm)->ug[(lmwid)].dictwid)
1356 
1357 /**
1358     Create a new unigram table
1359 */
1360 ug_t *NewUnigramTable (int32 n_ug /**< Number of unigram */
1361     );
1362 
1363 
1364 #if 0
1365 { /* Stop indent from complaining */
1366 #endif
1367 #ifdef __cplusplus
1368 }
1369 #endif
1370 
1371 #endif
1372