1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2004 Carnegie Mellon University.	All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*************************************************
38  * CMU ARPA Speech Project
39  *
40  * Copyright (c) 2000 Carnegie Mellon University.
41  * ALL RIGHTS RESERVED.
42  *************************************************
43  *
44  *  May 14, 2004
45  *    Created by Yitao Sun (yitao@cs.cmu.edu) based on the live.h created by
46  *    Rita Singh.  The Live Decode API is the new top level API for Sphinx3.
47  *    The goal of the Live Decode API is to provide a well documented and
48  *    comprehensive API to control all aspects of the Sphinx3 speech decoder
49  *    engine.
50  *
51  *    The return values, for example, hypothesis segments and string, unlike
52  *    the rest of Sphinx3, are read-only, maintained internally, and clobbered
53  *    by subsequent calls.
54  */
55 
56 /*
57   revision 1.9
58   date: 2004/09/03 21:45:26;  author: yitao;  state: Exp;  lines: +2 -2
59 
60   cleaning up remote_decode API by moving list operations into a list API
61   ----------------------------
62   revision 1.8
63   date: 2004/09/03 16:50:56;  author: yitao;  state: Exp;  lines: +108 -37
64 
65 
66   modified comments to suit the use of doc++-
67   ----------------------------
68   revision 1.7
69   date: 2004/08/27 05:22:43;  author: yitao;  state: Exp;  lines: +75 -105
70 
71 
  removed remote-decode API from the linux compile.  added doc++ comments for
  live_decode.h-
74   ----------------------------
75   revision 1.6
76   date: 2004/08/25 20:44:31;  author: yitao;  state: Exp;  lines: +13 -15
77 
78 
79   1.  added code to record uttid in live-decode
80   2.  added more code to flesh out remote-decode.  not compiling yet.
81   ----------------------------
82   revision 1.5
83   date: 2004/08/23 20:41:38;  author: yitao;  state: Exp;  lines: +1 -11
84 
85   basic implementation for remote-decode API.  not compiling yet.
86   ----------------------------
87   revision 1.4
88   date: 2004/08/19 19:12:50;  author: yitao;  state: Exp;  lines: +1 -1
89 
90   incompleted files remote-decode API.
91   ----------------------------
92   revision 1.3
93   date: 2004/08/09 21:40:36;  author: yitao;  state: Exp;  lines: +11 -20
94 
  1.  fixed some bugs in Live-Decode API.  changed kb.c, kb.h, utt.c, live_decode.c,
      live_decode.h.
  2.  changed some filenames in src/programs/.  now there are 2 sets of livedecode and
      livepretend: one that uses the old API (livedecode and livepretend), and one that
      uses the new API (livedecode2 and livepretend2).
100   3.  modified Makefile.am to reflect the filename changes above.
101   ----------------------------
102   revision 1.2
103   date: 2004/08/08 23:34:50;  author: arthchan2003;  state: Exp;  lines: +1 -1
104   temporary fixes of live_decode.c and live_decode.h
105   ----------------------------
106   revision 1.1
107   date: 2004/08/06 15:07:38;  author: yitao;  state: Exp;
108   *** empty log message ***
109   =============================================================================
110 
111 */
112 
113 #include <cmd_ln.h>
114 #include <fe.h>
115 #include "s3types.h"
116 #include "sphinx3_export.h"
117 #include "kb.h"
118 #include "kbcore.h"
119 #include "dag.h"
120 #include "search.h"
121 
122 #ifndef __S3_DECODE_H
123 #define __S3_DECODE_H
124 
125 /** \file live_decode_API.h
126  * \brief header for live mode decoding API
127  */
128 #ifdef __cplusplus
129 extern "C" {
130 #endif
131 #if 0
132 } /* Fool Emacs into not indenting things. */
133 #endif
134 
135 S3DECODER_EXPORT
136 extern arg_t S3_DECODE_ARG_DEFS[];
137 
/**
 * Status codes of the decoder API.  Zero means success; every error
 * code is negative.
 *
 * The negative values are parenthesized so that each macro expands to a
 * single primary expression regardless of the surrounding operators.
 *
 * NOTE(review): presumably these are the values returned by the
 * int-returning s3_decode_*() functions -- confirm against the
 * implementation.
 */
#define S3_DECODE_SUCCESS			0
#define S3_DECODE_ERROR_OUT_OF_MEMORY		(-0x01)
#define S3_DECODE_ERROR_NULL_POINTER		(-0x02)
#define S3_DECODE_ERROR_INVALID_STATE		(-0x04)
#define S3_DECODE_ERROR_INTERNAL		(-0x08)

/**
 * Decoder state values.
 *
 * NOTE(review): presumably stored in the `state` field of s3_decode_t
 * -- confirm against the implementation.
 */
#define S3_DECODE_STATE_IDLE			0
#define S3_DECODE_STATE_DECODING		1
#define S3_DECODE_STATE_FINISHED		2
147 
148 /** Wrapper structure for live-mode recognition
149  */
150 S3DECODER_EXPORT
151 typedef struct
152 {
153     /**
154      * Knowledge base.
155      */
156     kb_t kb;
157 
158     /**
159      * Pointer to the knowledge base core.
160      */
161     kbcore_t *kbcore;
162 
163     /**
164      * Parameter: intervals at which wbeam is used for phone transitions.
165      */
166     int32 phones_skip;
167 
168     /**
169      * Number of frames decoded.
170      */
171     int32 num_frames_decoded;
172 
173     /**
174      * Number of frames entered.
175      */
176     int32 num_frames_entered;
177 
178     /**
179      * Current state of the live decoder.
180      */
181     int32 state;
182 
183     /**
184      * UTTID (obviously NOT) filled in by knowledge-base.
185      */
186     char *uttid;
187 
188     /**
189      * The frame number at which the hypothesis is recorded.
190      */
191     int32 hyp_frame_num;
192 
193     /**
194      * Hypothesis string.  Result (or partial result) of the recognition is
195      * stored as a complete string.
196      */
197     char *hyp_str;
198 
199     /**
200      * Hypothesis word segments.  Result (or partial result) of the recognition
201      * is stored as word segments.  Null-terminated array.
202      */
203     hyp_t **hyp_segs;
204 
205     /**
206      * Boolean indicates whether we will internally swap the samples.
207      */
208     int32 swap;
209 
210     /**
211      * Boolean indicates whether a partial hypothesis will be dumped.
212      */
213     int32 phypdump;
214 
215     /**
216      * Extension for the raw director
217      */
218     const char* rawext;
219 
220 } s3_decode_t;
221 
222 
223 /** Initializes a Sphinx3 decoder object (re-entrant).  Internal
224     modules, eg. search algorithms, language model, accoustic model,
225     etc, are read from file and initialized.  The decoder internal
226     variables are set to a starting state.
227 
228     This version of the Sphinx3 decoder assumes the user has
229     externally parsed arguments using <I>cmd_ln_parse_r()</I> or
230     <I>cmd_ln_parse_file_r()</I>.  The user is responsible for calling
231     <I>cmd_ln_free_r()</I> when he/she is done with the decoder.
232 
233     @param decoder Pointer to the decoder.
234     @param config Pointer to the command-line object
235                   returned by <i>cmd_ln_parse_r()</i>.
236     @return 0 for success.  -1 for failure.
237 */
238 S3DECODER_EXPORT
239 int s3_decode_init(s3_decode_t *_decode, cmd_ln_t *_config);
240 
241 /** Wraps up the Sphinx3 decoder.  All internal modules are closed or unloaded.
242     Internal variables are freed and/or set to a finishing state.  This
243     function should be called once the user is finished with the Sphinx3
244     decoder.
245 
246     @param decoder Pointer to the decoder.
247     @see s3_decode_init
248 */
249 S3DECODER_EXPORT
250 void s3_decode_close(s3_decode_t *_decode);
251 
252 /** Marks the start of the current utterance.  An utterance is a session of
253     speech decoding that starts with a call to <I>s3_decode_begin_utt()</I> and
254     ends with a call to <I>{@link s3_decode_end_utt s3_decode_end_utt()}</I>.
255     In the duration of an utterance, speech data is processed with either
256     <I>{@link s3_decode_process_raw s3_decode_process_raw()}</I> or
257     <I>{@link s3_decode_process_ceps s3_decode_process_ceps()}</I>.  Decoding
258     results (hypothesis) can be retrieved any time after the start of an
259     utterance using <I>{@link s3_decode_hypothesis s3_decode_hypothesis()}</I>.
260     All previous results will be clobbered at the start of a new utterance.
261 
262     At the moment, there is an undocumented time limit to the length of an
263     utterance.  (Yitao: there is?)
264 
265     @param decoder Pointer to the decoder.
266     @param uttid Utterance ID string.  If <I>null</I>, a somewhat unique
267     utterance id will be generated instead.
268     @return 0 for success.  -1 for failure.
269     @see s3_decode_end_utt
270     @see s3_decode_process
271     @see s3_decode_hypothesis
272 */
273 S3DECODER_EXPORT
274 int s3_decode_begin_utt(s3_decode_t *_decode, char *_uttid);
275 
276 /** Marks the end of the current utterance.  The Sphinx3 decoder  can no longer
277     process speech data until the start of the next utterance.  Any hypothesis
278     retrieved prior to the end of the utterance is called a partial hypothesis.
279     Any hypothesis retrieved after the end of the utterance is called the final
280     hypothesis.  See <I>{@link s3_decode_hypothesis s3_decode_hypothesis()}</I>
281     on how to retrieve hypothesis.
282 
283     @param decoder Pointer to the decoder
284     @see s3_decode_begin_utt
285     @see s3_decode_process
286     @see s3_decode_hypothesis
287 */
288 S3DECODER_EXPORT
289 void s3_decode_end_utt(s3_decode_t *_decode);
290 
291 /** Process a buffer of cepstrum frames for the current utterance.  This
292     function has to be called in the duration of an utterance.  That is, in
293     between calls to <I>{@link s3_decode_begin_utt s3_decode_begin_utt()}</I>
294     and <I>{@link s3_decode_end_utt s3_decode_end_utt()}</I>.
295 
296     One common issue with Sphinx3 decoder is the mismatch of parameters to
297     the signal processor and accoustic model.  Please double check with the
298     accoustic model training scripts and your signal processing front-end to
299     make sure the cepstrals are generated consistently.
300 
301     @param decoder Pointer to the decoder.
302     @param frames Buffer of audio feature frames.
303     @param num_frames Number of frames in the buffer.
304     @return 0 for success.  -1 for failure.
305     @see s3_decode_begin_utt
306     @see s3_decode_end_utt
307     @see s3_decode_process_ceps
308 */
309 S3DECODER_EXPORT
310 int s3_decode_process(s3_decode_t *_decode,
311                       float32 **_frames,
312                       int32 _num_frames);
313 
314 /** Retrieve partial or final decoding results (hypothesis).  Any
315     hypothesis retrieved prior to the end of the utterance is called a
316     partial hypothesis.  Any hypothesis retrieved after the end of the
317     utterance is called the final hypothesis.  The hypothesis can be
318     returned in a plain READ-ONLY string and/or an array of READ-ONLY word
319     segments.  In the plain string result, all filler and end words are
320     filtered out as well as the pronouciation information.  What is left is a
321     very readable string representation of the decoding result.  There is no
322     such filtering in the word segment result.
323 
324     Here is an example on how to use the result returned by
325     <I>s3_decode_hypothesis</I>:
326 
327     <PRE>
328     s3_decode_t d;
329     char *str, *uttid;
330     hyp_t **segs;
331 
332     ...
333 
334     s3_decode_hypothesis(&d, &uttid, &str, &segs);
335     printf("Decoded string: %s\n", str);
336     for (; *segs; segs++) {
337     printf("Word-segment id: %i\n", (*segs)->id);
338     }
339     </PRE>
340 
341     @param decoder Pointer to the decoder.
342     @param hyp_str Return pointer to a READ-ONLY string.  If <I>null</I>,
343     the string is not returned.
344     @param hyp_segs Return pointer to a null-terminated array of word
345     segments.  If <I>null</I>, the array is not returned.
346     @return 0 for success.  -1 for failure.
347 */
348 S3DECODER_EXPORT
349 int s3_decode_hypothesis(s3_decode_t *_decode, char **_uttid,
350                          char **_hyp_str, hyp_t ***_hyp_segs);
351 
352 /** Retrieve a word graph of final hypothesis.  You must call
353  * s3_decode_end_utt() before this.  See {@link dag.h} and {@link
354  * astar.h} for information on what to do with this structure.
355  *
356  * @param decoder Pointer to the decoder.
357  * @return A dag_t structure, or NULL on failure.  This pointer
358  * becomes invalid after a call to s3_decode_begin_utt().
359  */
360 S3DECODER_EXPORT
361 dag_t *s3_decode_word_graph(s3_decode_t *_decode);
362 
363 /** Set LM
364     @param _decode Pointer to the decode
365     @param lmname the language model name
366     @see s3_decode_read_lm s3_decode_delete_lm
367 */
368 S3DECODER_EXPORT
369 void s3_decode_set_lm(s3_decode_t *_decode, const char *lmname);
370 
371 /** Delete LM
372     @param _decode Pointer to the live mode decode
373     @param lmname the language model name
374     @see s3_decode_set_lm s3_decode_read_lm
375 */
376 S3DECODER_EXPORT
377 void s3_decode_delete_lm(s3_decode_t *_decode, const char *lmname);
378 
379 
380 /** Read LM from a file.
381     @param _decode Pointer to the decoder.
382     @param lmfile LM file name.
383     @param lmname LM name associated with this file.
384     @see s3_decode_set_lm
385 */
386 S3DECODER_EXPORT
387 void s3_decode_read_lm(s3_decode_t *_decode,
388                        const char *lmfile,
389                        const char *lmname);
390 
391 #if 0
392 { /* Stop indent from complaining */
393 #endif
394 #ifdef __cplusplus
395 }
396 #endif
397 
398 #endif
399 
400 
401 
402 
403