1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 /* ==================================================================== 3 * Copyright (c) 1999-2004 Carnegie Mellon University. All rights 4 * reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 18 * This work was supported in part by funding from the Defense Advanced 19 * Research Projects Agency and the National Science Foundation of the 20 * United States of America, and the CMU Sphinx Speech Consortium. 21 * 22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * 34 * ==================================================================== 35 * 36 */ 37 /* 38 * dict2pid.h -- Triphones for dictionary 39 * 40 * ********************************************** 41 * CMU ARPA Speech Project 42 * 43 * Copyright (c) 1999 Carnegie Mellon University. 44 * ALL RIGHTS RESERVED. 45 * ********************************************** 46 * 47 * HISTORY 48 * $Log$ 49 * Revision 1.1 2006/04/05 20:27:30 dhdfu 50 * A Great Reorganzation of header files and executables 51 * 52 * Revision 1.9 2006/02/22 21:05:16 arthchan2003 53 * Merged from branch SPHINX3_5_2_RCI_IRII_BRANCH: 54 * 55 * 1, Added logic to handle bothe composite and non composite left 56 * triphone. Composite left triphone's logic (the original one) is 57 * tested thoroughly. The non-composite triphone (or full triphone) is 58 * found to have bugs. The latter is fended off from the users in the 59 * library level. 60 * 61 * 2, Fixed dox-doc. 62 * 63 * Revision 1.8.4.5 2005/11/17 06:13:49 arthchan2003 64 * Use compressed right context in expansion in triphones. 65 * 66 * Revision 1.8.4.4 2005/10/17 04:48:45 arthchan2003 67 * Free resource correctly in dict2pid. 68 * 69 * Revision 1.8.4.3 2005/10/07 19:03:38 arthchan2003 70 * Added xwdssid_t structure. Also added compression routines. 71 * 72 * Revision 1.8.4.2 2005/09/25 19:13:31 arthchan2003 73 * Added optional full triphone expansion support when building context phone mapping. 74 * 75 * Revision 1.8.4.1 2005/07/17 05:20:30 arthchan2003 76 * Fixed dox-doc. 77 * 78 * Revision 1.8 2005/06/21 21:03:49 arthchan2003 79 * 1, Introduced a reporting routine. 2, Fixed doyxgen documentation, 3, Added keyword. 80 * 81 * Revision 1.5 2005/06/13 04:02:57 archan 82 * Fixed most doxygen-style documentation under libs3decoder. 83 * 84 * Revision 1.4 2005/04/21 23:50:26 archan 85 * Some more refactoring on the how reporting of structures inside kbcore_t is done, it is now 50% nice. Also added class-based LM test case into test-decode.sh.in. At this moment, everything in search mode 5 is already done. It is time to test the idea whether the search can really be used. 86 * 87 * Revision 1.3 2005/03/30 01:22:46 archan 88 * Fixed mistakes in last updates. Add 89 * 90 * 91 * 14-Sep-1999 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 92 * Added dict2pid_comsseq2sen_active(). 93 * 94 * 04-May-1999 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University 95 * Started. 96 */ 97 98 99 #ifndef _S3_DICT2PID_H_ 100 #define _S3_DICT2PID_H_ 101 102 103 #include <stdio.h> 104 105 #include <logmath.h> 106 #include "s3types.h" 107 #include "mdef.h" 108 #include "dict.h" 109 #include "ctxt_table.h" 110 111 /** \file dict2pid.h 112 * \brief Building triphones for a dictionary. 113 * 114 * This is one of the more complicated parts of a cross-word 115 * triphone model decoder. The first and last phones of each word 116 * get their left and right contexts, respectively, from other 117 * words. For single-phone words, both its contexts are from other 118 * words, simultaneously. As these words are not known beforehand, 119 * life gets complicated. In this implementation, when we do not 120 * wish to distinguish between distinct contexts, we use a COMPOSITE 121 * triphone (a bit like BBN's fast-match implementation), by 122 * clubbing together all possible contexts. 123 * 124 * There are 3 cases: 125 * 126 * 1. Internal phones, and boundary phones without any specific 127 * context, in each word. The boundary phones are modelled using 128 * composite phones, internal ones using ordinary phones. 129 * 130 * 2. The first phone of a multi-phone word, for a specific 131 *history (i.e., in a 2g/3g/4g... tree) has known left and right 132 *contexts. The possible left contexts are limited to the possible 133 *last phones of the history. So it can be modelled separately, 134 *efficiently, as an ordinary triphone. 135 * 136 * 3. The one phone in a single-phone word, for a specific history 137 * (i.e., in a 2g/3g/4g... tree) has a known left context, but 138 * unknown right context. It is modelled using a composite 139 * triphone. (Note that right contexts are always composite, left 140 * contexts are composite only in the unigram tree.) 141 * 142 * A composite triphone is formed as follows. (NOTE: this assumes 143 * that all CIphones/triphones have the same HMM topology, 144 * specifically, no. of states.) A composite triphone represents a 145 * situation where either the left or the right context (or both) 146 * for a given base phone is unknown. That is, it represents the 147 * set of all possible ordinary triphones derivable from * the 148 * unkown context(s). Let us call this set S. It is modelled using 149 * the same HMM topology * as the ordinary triphones, but with 150 * COMPOSITE states. A composite state (in a given position * in 151 * the HMM state topology) is the set of states (senones) at that 152 * position derived from S. 153 * 154 * Actually, we generally deal with COMPOSITE SENONE-SEQUENCES 155 * rather than COMPOSITE PHONES. The former are compressed forms of 156 * the latter, by virtue of state sharing among phones. (See 157 * mdef.h.) 158 * 159 * In 3.6, the composite triphone will only be build when -composite 160 * 1 (default) is specified. Other than that, full triphone 161 * expansion will be carried out in run-time 162 */ 163 164 #ifdef __cplusplus 165 extern "C" { 166 #endif 167 #if 0 168 } /* Fool Emacs into not indenting things. */ 169 #endif 170 171 /** 172 \struct dict2pid_t 173 \brief Building composite triphone (as well as word internal triphones) with the dictionary. 174 */ 175 176 typedef struct { 177 s3ssid_t **internal; /**< For internal phone positions (not first, not last), the 178 ssid; for first and last positions, the composite ssid. 179 ([word][phone-position]) 180 if -composite is 0, then internal[0] and internal[pronlen-1] will 181 equal to BAD_SSID; 182 */ 183 184 /*Notice the order of the arguments */ 185 186 s3ssid_t ***ldiph_lc; /**< For multi-phone words, [base][rc][lc] -> ssid; filled out for 187 word-initial base x rc combinations in current vocabulary */ 188 189 190 s3ssid_t ***rdiph_rc; /**< For multi-phone words, [base][lc][rc] -> ssid; filled out for 191 word-initial base x lc combinations in current vocabulary */ 192 193 xwdssid_t **rssid; /**< Right context state sequence id table 194 First dimension: base phone, 195 Second dimension: left context. 196 */ 197 198 199 s3ssid_t ***lrdiph_rc; /**< For single-phone words, [base][lc][rc] -> ssid; filled out for 200 word-initial base x lc combinations in current vocabulary */ 201 202 xwdssid_t **lrssid; /**< Left-Right context state sequence id table 203 First dimension: base phone, 204 Second dimension: left context. 205 206 */ 207 208 209 int32 is_composite; /**< Whether we will build composite triphone. If yes, the 210 structure will be in composite triphone mode, single_lc, 211 comstate, comsseq and comwt will be initialized. Otherwise, the code 212 will be in normal triphone mode. The parameters will be left NULL. 213 */ 214 215 s3ssid_t **single_lc; /**< For single phone words, [base][lc] -> composite ssid; filled 216 out for single phone words in current vocabulary */ 217 218 s3senid_t **comstate; /**< comstate[i] = BAD_S3SENID terminated set of senone IDs in 219 the i-th composite state */ 220 s3senid_t **comsseq; /**< comsseq[i] = sequence of composite state IDs in i-th 221 composite phone (composite sseq). */ 222 int32 *comwt; /**< Weight associated with each composite state (logs3 value). 223 Final composite state score weighted by this amount */ 224 int32 n_comstate; /**< #Composite states */ 225 int32 n_comsseq; /**< #Composite senone sequences */ 226 int32 n_ci; /**< Number of CI phone in */ 227 int32 n_dictsize; /**< Dictionary size */ 228 229 } dict2pid_t; 230 231 /** Access macros; not designed for arbitrary use */ 232 #define dict2pid_internal(d,w,p) ((d)->internal[w][p]) /**< return internal dict2pid*/ 233 #define dict2pid_n_comstate(d) ((d)->n_comstate) /**< return number of composite state*/ 234 #define dict2pid_n_comsseq(d) ((d)->n_comsseq) /**< return number of composite state sequence*/ 235 #define dict2pid_is_composite(d) ((d)->is_composite) /**< return whether dict2pid is in composite triphone mode or not*/ 236 237 #define IS_COMPOSITE 1 238 #define NOT_COMPOSITE 0 239 240 /** Build the dict2pid structure for the given model/dictionary */ 241 dict2pid_t *dict2pid_build (mdef_t *mdef, /**< A model definition*/ 242 dict_t *dict, /**< An initialized dictionary */ 243 int32 is_composite, /**< Whether composite triphones will be built */ 244 logmath_t *logmath 245 ); 246 247 248 /** Free the memory dict2pid structure */ 249 void dict2pid_free(dict2pid_t *d2p /**< In: the d2p */ 250 ); 251 /** 252 * Compute composite senone scores from ordinary senone scores (max of component senones) 253 */ 254 void dict2pid_comsenscr (dict2pid_t *d2p, /**< In: a dict2pid_t structure */ 255 int32 *senscr, /**< In: Ordinary senone scores */ 256 int32 *comsenscr /**< Out: Composite senone scores */ 257 ); 258 259 /** 260 * Mark active senones as indicated by the input array of composite senone-sequence active flags. 261 * Caller responsible for allocating and clearing sen[] before calling this function. 262 */ 263 void dict2pid_comsseq2sen_active (dict2pid_t *d2p, /**< In: a dict2pid_t structure */ 264 mdef_t *mdef, /**< In: a mdef_t structure */ 265 uint8 *comssid, /**< In: Active flag for each comssid */ 266 uint8 *sen /**< In/Out: Active flags set for senones 267 indicated by the active comssid */ 268 ); 269 /** For debugging */ 270 void dict2pid_dump (FILE *fp, /**< In: a file pointer */ 271 dict2pid_t *d2p, /**< In: a dict2pid_t structure */ 272 mdef_t *mdef, /**< In: a mdef_t structure*/ 273 dict_t *dict /**< In: a dictionary structure */ 274 ); 275 276 /** Report a dict2pid data structure */ 277 void dict2pid_report(dict2pid_t *d2p /**< In: a dict2pid_t structure */ 278 ); 279 280 /** 281 Get number of rc 282 */ 283 int32 get_rc_nssid(dict2pid_t *d2p, /**< In: a dict2pid */ 284 s3wid_t w, /**< In: a wid */ 285 dict_t *dict /**< In: a dictionary */ 286 ); 287 288 /** 289 Get RC map 290 */ 291 s3cipid_t* dict2pid_get_rcmap(dict2pid_t *d2p, /**< In: a dict2pid */ 292 s3wid_t w, /**< In: a wid */ 293 dict_t *dict /**< In: a dictionary */ 294 ); 295 296 #if 0 297 { /* Stop indent from complaining */ 298 #endif 299 #ifdef __cplusplus 300 } 301 #endif 302 303 304 #endif 305