1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 /* ==================================================================== 3 * Copyright (c) 1999-2004 Carnegie Mellon University. All rights 4 * reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 18 * This work was supported in part by funding from the Defense Advanced 19 * Research Projects Agency and the National Science Foundation of the 20 * United States of America, and the CMU Sphinx Speech Consortium. 21 * 22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * 34 * ==================================================================== 35 * 36 */ 37 /************************************************* 38 * CMU ARPA Speech Project 39 * 40 * Copyright (c) 2000 Carnegie Mellon University. 41 * ALL RIGHTS RESERVED. 42 ************************************************* 43 * 44 * May 14, 2004 45 * Created by Yitao Sun (yitao@cs.cmu.edu) based on the live.h created by 46 * Rita Singh. The Live Decode API is the new top level API for Sphinx3. 47 * The goal of the Live Decode API is to provide a well documented and 48 * comprehensive API to control all aspects of the Sphinx3 speech decoder 49 * engine. 50 * 51 * The return values, for example, hypothesis segments and string, unlike 52 * the rest of Sphinx3, are read-only, maintained internally, and clobbered 53 * by subsequent calls. 54 */ 55 56 /* 57 revision 1.9 58 date: 2004/09/03 21:45:26; author: yitao; state: Exp; lines: +2 -2 59 60 cleaning up remote_decode API by moving list operations into a list API 61 ---------------------------- 62 revision 1.8 63 date: 2004/09/03 16:50:56; author: yitao; state: Exp; lines: +108 -37 64 65 66 modified comments to suit the use of doc++- 67 ---------------------------- 68 revision 1.7 69 date: 2004/08/27 05:22:43; author: yitao; state: Exp; lines: +75 -105 70 71 72 removed remote-decode API from the linux compile. added doc++ comments for live_decod 73 e.h- 74 ---------------------------- 75 revision 1.6 76 date: 2004/08/25 20:44:31; author: yitao; state: Exp; lines: +13 -15 77 78 79 1. added code to record uttid in live-decode 80 2. added more code to flesh out remote-decode. not compiling yet. 81 ---------------------------- 82 revision 1.5 83 date: 2004/08/23 20:41:38; author: yitao; state: Exp; lines: +1 -11 84 85 basic implementation for remote-decode API. not compiling yet. 86 ---------------------------- 87 revision 1.4 88 date: 2004/08/19 19:12:50; author: yitao; state: Exp; lines: +1 -1 89 90 incompleted files remote-decode API. 91 ---------------------------- 92 revision 1.3 93 date: 2004/08/09 21:40:36; author: yitao; state: Exp; lines: +11 -20 94 95 1. fixed some bugs in Live-Decode API. changed kb.c, kb.h, utt.c, live_decode.c, liv 96 e_decode.h. 97 2. changed some filenames in src/programs/. now there are 2 sets of livedecode and l 98 ivepretend: one that uses the old API (livedecode and livepretend), and one that uses 99 the new API (livedecode2 and livepretend2). 100 3. modified Makefile.am to reflect the filename changes above. 101 ---------------------------- 102 revision 1.2 103 date: 2004/08/08 23:34:50; author: arthchan2003; state: Exp; lines: +1 -1 104 temporary fixes of live_decode.c and live_decode.h 105 ---------------------------- 106 revision 1.1 107 date: 2004/08/06 15:07:38; author: yitao; state: Exp; 108 *** empty log message *** 109 ============================================================================= 110 111 */ 112 113 #include <cmd_ln.h> 114 #include <fe.h> 115 #include "s3types.h" 116 #include "sphinx3_export.h" 117 #include "kb.h" 118 #include "kbcore.h" 119 #include "dag.h" 120 #include "search.h" 121 122 #ifndef __S3_DECODE_H 123 #define __S3_DECODE_H 124 125 /** \file live_decode_API.h 126 * \brief header for live mode decoding API 127 */ 128 #ifdef __cplusplus 129 extern "C" { 130 #endif 131 #if 0 132 } /* Fool Emacs into not indenting things. */ 133 #endif 134 135 S3DECODER_EXPORT 136 extern arg_t S3_DECODE_ARG_DEFS[]; 137 138 #define S3_DECODE_SUCCESS 0 139 #define S3_DECODE_ERROR_OUT_OF_MEMORY -0x01 140 #define S3_DECODE_ERROR_NULL_POINTER -0x02 141 #define S3_DECODE_ERROR_INVALID_STATE -0x04 142 #define S3_DECODE_ERROR_INTERNAL -0x08 143 144 #define S3_DECODE_STATE_IDLE 0 145 #define S3_DECODE_STATE_DECODING 1 146 #define S3_DECODE_STATE_FINISHED 2 147 148 /** Wrapper structure for live-mode recognition 149 */ 150 S3DECODER_EXPORT 151 typedef struct 152 { 153 /** 154 * Knowledge base. 155 */ 156 kb_t kb; 157 158 /** 159 * Pointer to the knowledge base core. 160 */ 161 kbcore_t *kbcore; 162 163 /** 164 * Parameter: intervals at which wbeam is used for phone transitions. 165 */ 166 int32 phones_skip; 167 168 /** 169 * Number of frames decoded. 170 */ 171 int32 num_frames_decoded; 172 173 /** 174 * Number of frames entered. 175 */ 176 int32 num_frames_entered; 177 178 /** 179 * Current state of the live decoder. 180 */ 181 int32 state; 182 183 /** 184 * UTTID (obviously NOT) filled in by knowledge-base. 185 */ 186 char *uttid; 187 188 /** 189 * The frame number at which the hypothesis is recorded. 190 */ 191 int32 hyp_frame_num; 192 193 /** 194 * Hypothesis string. Result (or partial result) of the recognition is 195 * stored as a complete string. 196 */ 197 char *hyp_str; 198 199 /** 200 * Hypothesis word segments. Result (or partial result) of the recognition 201 * is stored as word segments. Null-terminated array. 202 */ 203 hyp_t **hyp_segs; 204 205 /** 206 * Boolean indicates whether we will internally swap the samples. 207 */ 208 int32 swap; 209 210 /** 211 * Boolean indicates whether a partial hypothesis will be dumped. 212 */ 213 int32 phypdump; 214 215 /** 216 * Extension for the raw director 217 */ 218 const char* rawext; 219 220 } s3_decode_t; 221 222 223 /** Initializes a Sphinx3 decoder object (re-entrant). Internal 224 modules, eg. search algorithms, language model, accoustic model, 225 etc, are read from file and initialized. The decoder internal 226 variables are set to a starting state. 227 228 This version of the Sphinx3 decoder assumes the user has 229 externally parsed arguments using <I>cmd_ln_parse_r()</I> or 230 <I>cmd_ln_parse_file_r()</I>. The user is responsible for calling 231 <I>cmd_ln_free_r()</I> when he/she is done with the decoder. 232 233 @param decoder Pointer to the decoder. 234 @param config Pointer to the command-line object 235 returned by <i>cmd_ln_parse_r()</i>. 236 @return 0 for success. -1 for failure. 237 */ 238 S3DECODER_EXPORT 239 int s3_decode_init(s3_decode_t *_decode, cmd_ln_t *_config); 240 241 /** Wraps up the Sphinx3 decoder. All internal modules are closed or unloaded. 242 Internal variables are freed and/or set to a finishing state. This 243 function should be called once the user is finished with the Sphinx3 244 decoder. 245 246 @param decoder Pointer to the decoder. 247 @see s3_decode_init 248 */ 249 S3DECODER_EXPORT 250 void s3_decode_close(s3_decode_t *_decode); 251 252 /** Marks the start of the current utterance. An utterance is a session of 253 speech decoding that starts with a call to <I>s3_decode_begin_utt()</I> and 254 ends with a call to <I>{@link s3_decode_end_utt s3_decode_end_utt()}</I>. 255 In the duration of an utterance, speech data is processed with either 256 <I>{@link s3_decode_process_raw s3_decode_process_raw()}</I> or 257 <I>{@link s3_decode_process_ceps s3_decode_process_ceps()}</I>. Decoding 258 results (hypothesis) can be retrieved any time after the start of an 259 utterance using <I>{@link s3_decode_hypothesis s3_decode_hypothesis()}</I>. 260 All previous results will be clobbered at the start of a new utterance. 261 262 At the moment, there is an undocumented time limit to the length of an 263 utterance. (Yitao: there is?) 264 265 @param decoder Pointer to the decoder. 266 @param uttid Utterance ID string. If <I>null</I>, a somewhat unique 267 utterance id will be generated instead. 268 @return 0 for success. -1 for failure. 269 @see s3_decode_end_utt 270 @see s3_decode_process 271 @see s3_decode_hypothesis 272 */ 273 S3DECODER_EXPORT 274 int s3_decode_begin_utt(s3_decode_t *_decode, char *_uttid); 275 276 /** Marks the end of the current utterance. The Sphinx3 decoder can no longer 277 process speech data until the start of the next utterance. Any hypothesis 278 retrieved prior to the end of the utterance is called a partial hypothesis. 279 Any hypothesis retrieved after the end of the utterance is called the final 280 hypothesis. See <I>{@link s3_decode_hypothesis s3_decode_hypothesis()}</I> 281 on how to retrieve hypothesis. 282 283 @param decoder Pointer to the decoder 284 @see s3_decode_begin_utt 285 @see s3_decode_process 286 @see s3_decode_hypothesis 287 */ 288 S3DECODER_EXPORT 289 void s3_decode_end_utt(s3_decode_t *_decode); 290 291 /** Process a buffer of cepstrum frames for the current utterance. This 292 function has to be called in the duration of an utterance. That is, in 293 between calls to <I>{@link s3_decode_begin_utt s3_decode_begin_utt()}</I> 294 and <I>{@link s3_decode_end_utt s3_decode_end_utt()}</I>. 295 296 One common issue with Sphinx3 decoder is the mismatch of parameters to 297 the signal processor and accoustic model. Please double check with the 298 accoustic model training scripts and your signal processing front-end to 299 make sure the cepstrals are generated consistently. 300 301 @param decoder Pointer to the decoder. 302 @param frames Buffer of audio feature frames. 303 @param num_frames Number of frames in the buffer. 304 @return 0 for success. -1 for failure. 305 @see s3_decode_begin_utt 306 @see s3_decode_end_utt 307 @see s3_decode_process_ceps 308 */ 309 S3DECODER_EXPORT 310 int s3_decode_process(s3_decode_t *_decode, 311 float32 **_frames, 312 int32 _num_frames); 313 314 /** Retrieve partial or final decoding results (hypothesis). Any 315 hypothesis retrieved prior to the end of the utterance is called a 316 partial hypothesis. Any hypothesis retrieved after the end of the 317 utterance is called the final hypothesis. The hypothesis can be 318 returned in a plain READ-ONLY string and/or an array of READ-ONLY word 319 segments. In the plain string result, all filler and end words are 320 filtered out as well as the pronouciation information. What is left is a 321 very readable string representation of the decoding result. There is no 322 such filtering in the word segment result. 323 324 Here is an example on how to use the result returned by 325 <I>s3_decode_hypothesis</I>: 326 327 <PRE> 328 s3_decode_t d; 329 char *str, *uttid; 330 hyp_t **segs; 331 332 ... 333 334 s3_decode_hypothesis(&d, &uttid, &str, &segs); 335 printf("Decoded string: %s\n", str); 336 for (; *segs; segs++) { 337 printf("Word-segment id: %i\n", (*segs)->id); 338 } 339 </PRE> 340 341 @param decoder Pointer to the decoder. 342 @param hyp_str Return pointer to a READ-ONLY string. If <I>null</I>, 343 the string is not returned. 344 @param hyp_segs Return pointer to a null-terminated array of word 345 segments. If <I>null</I>, the array is not returned. 346 @return 0 for success. -1 for failure. 347 */ 348 S3DECODER_EXPORT 349 int s3_decode_hypothesis(s3_decode_t *_decode, char **_uttid, 350 char **_hyp_str, hyp_t ***_hyp_segs); 351 352 /** Retrieve a word graph of final hypothesis. You must call 353 * s3_decode_end_utt() before this. See {@link dag.h} and {@link 354 * astar.h} for information on what to do with this structure. 355 * 356 * @param decoder Pointer to the decoder. 357 * @return A dag_t structure, or NULL on failure. This pointer 358 * becomes invalid after a call to s3_decode_begin_utt(). 359 */ 360 S3DECODER_EXPORT 361 dag_t *s3_decode_word_graph(s3_decode_t *_decode); 362 363 /** Set LM 364 @param _decode Pointer to the decode 365 @param lmname the language model name 366 @see s3_decode_read_lm s3_decode_delete_lm 367 */ 368 S3DECODER_EXPORT 369 void s3_decode_set_lm(s3_decode_t *_decode, const char *lmname); 370 371 /** Delete LM 372 @param _decode Pointer to the live mode decode 373 @param lmname the language model name 374 @see s3_decode_set_lm s3_decode_read_lm 375 */ 376 S3DECODER_EXPORT 377 void s3_decode_delete_lm(s3_decode_t *_decode, const char *lmname); 378 379 380 /** Read LM from a file. 381 @param _decode Pointer to the decoder. 382 @param lmfile LM file name. 383 @param lmname LM name associated with this file. 384 @see s3_decode_set_lm 385 */ 386 S3DECODER_EXPORT 387 void s3_decode_read_lm(s3_decode_t *_decode, 388 const char *lmfile, 389 const char *lmname); 390 391 #if 0 392 { /* Stop indent from complaining */ 393 #endif 394 #ifdef __cplusplus 395 } 396 #endif 397 398 #endif 399 400 401 402 403