1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * dict2pid.h -- Triphones for dictionary
39  *
40  * **********************************************
41  * CMU ARPA Speech Project
42  *
43  * Copyright (c) 1999 Carnegie Mellon University.
44  * ALL RIGHTS RESERVED.
45  * **********************************************
46  *
47  * HISTORY
48  * $Log$
49  * Revision 1.1  2006/04/05  20:27:30  dhdfu
50  * A Great Reorganzation of header files and executables
51  *
52  * Revision 1.9  2006/02/22 21:05:16  arthchan2003
53  * Merged from branch SPHINX3_5_2_RCI_IRII_BRANCH:
54  *
55  * 1, Added logic to handle bothe composite and non composite left
56  * triphone.  Composite left triphone's logic (the original one) is
57  * tested thoroughly. The non-composite triphone (or full triphone) is
58  * found to have bugs.  The latter is fended off from the users in the
59  * library level.
60  *
61  * 2, Fixed dox-doc.
62  *
63  * Revision 1.8.4.5  2005/11/17 06:13:49  arthchan2003
64  * Use compressed right context in expansion in triphones.
65  *
66  * Revision 1.8.4.4  2005/10/17 04:48:45  arthchan2003
67  * Free resource correctly in dict2pid.
68  *
69  * Revision 1.8.4.3  2005/10/07 19:03:38  arthchan2003
70  * Added xwdssid_t structure.  Also added compression routines.
71  *
72  * Revision 1.8.4.2  2005/09/25 19:13:31  arthchan2003
73  * Added optional full triphone expansion support when building context phone mapping.
74  *
75  * Revision 1.8.4.1  2005/07/17 05:20:30  arthchan2003
76  * Fixed dox-doc.
77  *
78  * Revision 1.8  2005/06/21 21:03:49  arthchan2003
79  * 1, Introduced a reporting routine. 2, Fixed doyxgen documentation, 3, Added  keyword.
80  *
81  * Revision 1.5  2005/06/13 04:02:57  archan
82  * Fixed most doxygen-style documentation under libs3decoder.
83  *
84  * Revision 1.4  2005/04/21 23:50:26  archan
85  * Some more refactoring on the how reporting of structures inside kbcore_t is done, it is now 50% nice. Also added class-based LM test case into test-decode.sh.in.  At this moment, everything in search mode 5 is already done.  It is time to test the idea whether the search can really be used.
86  *
87  * Revision 1.3  2005/03/30 01:22:46  archan
88  * Fixed mistakes in last updates. Add
89  *
90  *
91  * 14-Sep-1999	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
92  * 		Added dict2pid_comsseq2sen_active().
93  *
94  * 04-May-1999	M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
95  * 		Started.
96  */
97 
98 
99 #ifndef _S3_DICT2PID_H_
100 #define _S3_DICT2PID_H_
101 
102 
103 #include <stdio.h>
104 
105 #include <logmath.h>
106 #include "s3types.h"
107 #include "mdef.h"
108 #include "dict.h"
109 #include "ctxt_table.h"
110 
111 /** \file dict2pid.h
112  * \brief Building triphones for a dictionary.
113  *
114  * This is one of the more complicated parts of a cross-word
115  * triphone model decoder.  The first and last phones of each word
116  * get their left and right contexts, respectively, from other
117  * words.  For single-phone words, both its contexts are from other
118  * words, simultaneously.  As these words are not known beforehand,
119  * life gets complicated.  In this implementation, when we do not
120  * wish to distinguish between distinct contexts, we use a COMPOSITE
121  * triphone (a bit like BBN's fast-match implementation), by
122  * clubbing together all possible contexts.
123  *
124  * There are 3 cases:
125  *
126  *   1. Internal phones, and boundary phones without any specific
127  * context, in each word.  The boundary phones are modelled using
128  * composite phones, internal ones using ordinary phones.
129  *
 *   2. The first phone of a multi-phone word, for a specific
 * history (i.e., in a 2g/3g/4g...  tree) has known left and right
 * contexts.  The possible left contexts are limited to the possible
 * last phones of the history.  So it can be modelled separately,
 * efficiently, as an ordinary triphone.
135  *
136  *   3. The one phone in a single-phone word, for a specific history
137  * (i.e., in a 2g/3g/4g...  tree) has a known left context, but
138  * unknown right context.  It is modelled using a composite
139  * triphone.  (Note that right contexts are always composite, left
140  * contexts are composite only in the unigram tree.)
141  *
 * A composite triphone is formed as follows.  (NOTE: this assumes
 * that all CIphones/triphones have the same HMM topology,
 * specifically, no. of states.)  A composite triphone represents a
 * situation where either the left or the right context (or both)
 * for a given base phone is unknown.  That is, it represents the
 * set of all possible ordinary triphones derivable from the
 * unknown context(s).  Let us call this set S.  It is modelled using
 * the same HMM topology as the ordinary triphones, but with
 * COMPOSITE states.  A composite state (in a given position in
 * the HMM state topology) is the set of states (senones) at that
 * position derived from S.
153  *
154  * Actually, we generally deal with COMPOSITE SENONE-SEQUENCES
155  * rather than COMPOSITE PHONES.  The former are compressed forms of
156  * the latter, by virtue of state sharing among phones.  (See
157  * mdef.h.)
158  *
 * In 3.6, the composite triphone will only be built when -composite
 * 1 (default) is specified.  Otherwise, full triphone
 * expansion will be carried out at run time.
162  */
163 
164 #ifdef __cplusplus
165 extern "C" {
166 #endif
167 #if 0
168 } /* Fool Emacs into not indenting things. */
169 #endif
170 
/**
   \struct dict2pid_t
   \brief Tables mapping dictionary words to senone-sequence IDs (ssids): word-internal
   triphones, cross-word (boundary) triphones and, optionally, composite triphones.

   The cross-word tables come in two index orders ([base][rc][lc] versus [base][lc][rc]);
   check the order documented on each member before indexing.
*/

typedef struct {
    s3ssid_t **internal;	/**< Indexed [word][phone-position].  For internal phone
				   positions (not first, not last), the ssid; for the first
				   and last positions, the composite ssid.
				   If -composite is 0, the first and last positions
				   (phone-position 0 and pronlen-1) will be equal to BAD_SSID
				   (NOTE(review): presumably the BAD_S3SSID sentinel from
				   s3types.h -- confirm).
				*/

    /* Note the index order of the tables below: ldiph_lc is [base][rc][lc], while
       rdiph_rc and lrdiph_rc are [base][lc][rc]. */

    s3ssid_t ***ldiph_lc;	/**< For multi-phone words, [base][rc][lc] -> ssid; filled out for
				   word-initial base x rc combinations in current vocabulary */


    s3ssid_t ***rdiph_rc;	/**< For multi-phone words, [base][lc][rc] -> ssid; filled out for
				   base x lc combinations in current vocabulary.
				   NOTE(review): the original comment said "word-initial" here,
				   which looks copied from ldiph_lc; a right-context table
				   presumably covers word-final phones -- confirm in dict2pid.c */

    xwdssid_t **rssid;          /**< Right context state sequence id table.
                                   First dimension: base phone,
                                   Second dimension: left context.
                                */


    s3ssid_t ***lrdiph_rc;      /**< For single-phone words, [base][lc][rc] -> ssid; filled out for
                                   base x lc combinations in current vocabulary */

    xwdssid_t **lrssid;          /**< Left-Right context state sequence id table.
                                    First dimension: base phone,
                                    Second dimension: left context.
                                 */


    int32 is_composite;         /**< Whether composite triphones are built.  If yes, the
                                   structure is in composite triphone mode, and single_lc,
                                   comstate, comsseq and comwt will be initialized.  Otherwise
                                   the structure is in normal triphone mode and those members
                                   will be left NULL.
                                */

    s3ssid_t **single_lc;	/**< For single phone words, [base][lc] -> composite ssid; filled
				   out for single phone words in current vocabulary */

    s3senid_t **comstate;	/**< comstate[i] = BAD_S3SENID terminated set of senone IDs in
				   the i-th composite state */
    s3senid_t **comsseq;	/**< comsseq[i] = sequence of composite state IDs in i-th
				   composite phone (composite sseq). */
    int32 *comwt;		/**< Weight associated with each composite state (logs3 value).
				   Final composite state score is weighted by this amount */
    int32 n_comstate;		/**< Number of composite states */
    int32 n_comsseq;		/**< Number of composite senone sequences */
    int32 n_ci;   /**< Number of CI phones (the original comment was truncated;
                     NOTE(review): presumably the CI phone count of the model
                     definition -- confirm) */
    int32 n_dictsize; /**< Dictionary size */

} dict2pid_t;
230 
/** Member accessors for a dict2pid_t pointer; not designed for arbitrary use. */
#define dict2pid_internal(d, w, p)  ((d)->internal[(w)][(p)])  /**< entry of the internal table at [w][p] */
#define dict2pid_n_comstate(d)      ((d)->n_comstate)          /**< number of composite states */
#define dict2pid_n_comsseq(d)       ((d)->n_comsseq)           /**< number of composite state sequences */
#define dict2pid_is_composite(d)    ((d)->is_composite)        /**< whether d is in composite triphone mode */

/* Named constants 1 and 0 (presumably the two values of the is_composite flag). */
#define IS_COMPOSITE  1
#define NOT_COMPOSITE 0
239 
240 /** Build the dict2pid structure for the given model/dictionary */
241 dict2pid_t *dict2pid_build (mdef_t *mdef,  /**< A  model definition*/
242 			    dict_t *dict,   /**< An initialized dictionary */
243 			    int32 is_composite, /**< Whether composite triphones will be built */
244 			    logmath_t *logmath
245     );
246 
247 
248 /** Free the memory dict2pid structure */
249 void dict2pid_free(dict2pid_t *d2p /**< In: the d2p */
250     );
251 /**
252  * Compute composite senone scores from ordinary senone scores (max of component senones)
253  */
254 void dict2pid_comsenscr (dict2pid_t *d2p,        /**< In: a dict2pid_t structure */
255 			 int32 *senscr,		/**< In: Ordinary senone scores */
256 			 int32 *comsenscr	/**< Out: Composite senone scores */
257     );
258 
259 /**
260  * Mark active senones as indicated by the input array of composite senone-sequence active flags.
261  * Caller responsible for allocating and clearing sen[] before calling this function.
262  */
263 void dict2pid_comsseq2sen_active (dict2pid_t *d2p,      /**< In: a dict2pid_t structure */
264 				  mdef_t *mdef,         /**< In: a mdef_t structure */
265 				  uint8 *comssid,	/**< In: Active flag for each comssid */
266 				  uint8 *sen		/**< In/Out: Active flags set for senones
267 							   indicated by the active comssid */
268     );
269 /** For debugging */
270 void dict2pid_dump (FILE *fp,        /**< In: a file pointer */
271 		    dict2pid_t *d2p, /**< In: a dict2pid_t structure */
272 		    mdef_t *mdef,    /**< In: a mdef_t structure*/
273 		    dict_t *dict     /**< In: a dictionary structure */
274     );
275 
276 /** Report a dict2pid data structure */
277 void dict2pid_report(dict2pid_t *d2p /**< In: a dict2pid_t structure */
278     );
279 
280 /**
281    Get number of rc
282 */
283 int32 get_rc_nssid(dict2pid_t *d2p,  /**< In: a dict2pid */
284 		   s3wid_t w,        /**< In: a wid */
285 		   dict_t *dict      /**< In: a dictionary */
286     );
287 
288 /**
289    Get RC map
290 */
291 s3cipid_t* dict2pid_get_rcmap(dict2pid_t *d2p,  /**< In: a dict2pid */
292 			      s3wid_t w,        /**< In: a wid */
293 			      dict_t *dict      /**< In: a dictionary */
294     );
295 
296 #if 0
297 { /* Stop indent from complaining */
298 #endif
299 #ifdef __cplusplus
300 }
301 #endif
302 
303 
304 #endif
305