1 /**
2  * @file   recog.h
3  *
4  * <JA>
5  * @brief  �����������������
6  *
7  * ǧ��������Υ������������Ԥ��ޤ������������ϡ�
8  * Recog ��ȥåץ��������Ȥ��ơ����Ѥ��벻����ǥ롤�����ǥ롤
9  * �������Ȥ߹�碌��ǧ����������������ʣ�������ޤ���
10  *
11  * �����Υ��������ϡ��б����� jconf ������깽¤�Ρ������
12  * ���Ѥ��륵�֥��������ؤΥݥ���������ޤ���PROCESS_AM �ϲ�����ǥ롤
13  * PROCESS_LM �ϸ����ǥ뤴�Ȥ��������ޤ���
14  *
15  * MFCCCalc �ϡ�
16  * ������ǥ뤪��� GMM ���׵ᤵ���ѥ�᡼�������פ�Ĵ�٤��Τ���
17  * ��������������Τ�ɬ�פʤ�����������ޤ���Ʊ���MFCC�������
18  * ����¾�Υե��ȥ���ɽ���������IJ�����ǥ뤪���GMM�ɤ����Ǥ�
19  * Ʊ�� MFCCCalc ����ͭ����ޤ���
20  *
21  * </JA>
22  *
23  * <EN>
24  * @brief  Engine instance definitions
25  *
26  * This file defines the engine instance and all its sub instances.
27  * The top instance is Recog, and it consists of several
28  * sub instances for LM, AM, and recognition process instances.
29  *
30  * Each sub-instance keeps pointer to corresponding jconf setting
31  * part, and also has pointers to other instances to use.
32  * PROCESS_AM will be generated for each acoustic model, and PROCESS_LM
33  * will be for each language model.
34  *
35  * MFCCCalc will be generated for each required MFCC frontend type
36  * by inspecting all AMs and the GMM.  The AMs and GMMs that require
37  * exactly the same MFCC frontend will share the same MFCCCalc instance.
38  *
39  * </EN>
40  *
41  * <pre>
42  * Recog
43  *    +- *JCONF
44  *    +- input related work area
45  *    +- MFCCCalc[] (linked list) (generated from HMM + GMM)
46  *    +- PROCESS_AM[] (linked list)
47  *       +- *pointer to JCONF_AM
48  *       +- *pointer to MFCCCalc
49  *       +- hmminfo, hmm_gs
50  *       +- hmmwrk
51  *       +- multipath, ccd_flag, cmn_loaded
52  *    +- PROCESS_LM[] (linked list)
53  *       +- *pointer to JCONF_LM
54  *       +- *pointer to PROCESS_AM
55  *       +- lmtype, lmvar
56  *       +- winfo
57  *       +- ngram or grammars
58  *       +- lmfunc
59  *    +- RecogProcess process[] (linked list)
60  *       +- *pointer to JCONF_SEARCH
61  *       +- *pointer to PROCESS_AM
62  *       +- *pointer to PROCESS_LM
63  *       +- lmtype, lmvar
64  *       +- misc. param
65  *    +- GMMCalc
66  *       +- *JCONF_AM for GMM
67  *       +- *pointer to MFCCCalc
68  * </pre>
69  *
70  * @author Akinobu Lee
71  * @date   Fri Feb 16 13:42:28 2007
72  *
73  * $Revision: 1.7 $
74  *
75  */
76 /*
77  * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
78  * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
79  * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
80  * All rights reserved
81  */
82 
83 /*
84 */
85 
86 #ifndef __J_RECOG_H__
87 #define __J_RECOG_H__
88 
89 #include <sent/stddefs.h>
90 #include <sent/hmm.h>
91 #include <sent/vocabulary.h>
92 #include <sent/ngram2.h>
93 #include <sent/dfa.h>
94 #include <julius/wchmm.h>
95 #include <julius/search.h>
96 #include <julius/callback.h>
97 #include <julius/jconf.h>
98 
99 /*
100   How tokens are managed:
101    o  tlist[][] is a token stocker.  It holds all tokens in a sequential
102       buffer.  They are malloced first on startup, and referred to by ID during
103       the Viterbi procedure.  In word-pair mode, each token also has a link to
104       another token to allow a node to have more than 1 token.
105 
106    o  token[n] holds the current ID number of a token associated to a
107       lexicon tree node 'n'.
108 
109   */
110 /**
111  * Work area for the first pass
112  * (token storage, per-node active tokens and short-pause segmentation state).
113  */
114 typedef struct __FSBeam__ {
115   /* token stocker */
116   TOKEN2 *tlist[2];     ///< Token space to hold all token entities.
117   TOKENID *tindex[2];   ///< Token index corresponding to @a tlist for sort
118   int maxtnum;          ///< Allocated number of tokens (will grow)
119   int expand_step;      ///< Number of tokens to be increased per expansion
120   boolean expanded;     ///< TRUE if tlist[] and tindex[] were expanded at the last create_token()
121   int tnum[2];          ///< Current number of tokens used in @a tlist
122   int n_start;          ///< Start index of in-beam nodes on @a tindex
123   int n_end;            ///< End index of in-beam nodes on @a tindex
124   int tl;               ///< Current work area id (0 or 1, swapped for each frame)
125   int tn;               ///< Next work area id (0 or 1, swapped for each frame)
126 
127   /* Active token list */
128   TOKENID *token;       ///< Active token list that holds currently assigned tokens for each tree node
129 #ifdef UNIGRAM_FACTORING
130   /* for wordend processing with 1-gram factoring */
131   LOGPROB wordend_best_score; ///< Best score of word-end nodes
132   int wordend_best_node;        ///< Node id of the best word-end node
133   TRELLIS_ATOM *wordend_best_tre; ///< Trellis word corresponding to the above
134   WORD_ID wordend_best_last_cword;      ///< Last context-aware word of the above
135 #endif
136 
137   int totalnodenum;     ///< Allocated number of nodes in @a token
138   TRELLIS_ATOM bos;     ///< Special token for beginning-of-sentence
139   boolean nodes_malloced; ///< Flag to check if tokens are already allocated
140   LOGPROB lm_weight;           ///< Language score weight (local copy)
141   LOGPROB lm_penalty;          ///< Word insertion penalty (local copy)
142   LOGPROB lm_penalty_trans; ///< Additional insertion penalty for transparent words (local copy)
143   LOGPROB penalty1; ///< Word insertion penalty for DFA (local copy)
144 #if defined(WPAIR) && defined(WPAIR_KEEP_NLIMIT)
145   boolean wpair_keep_nlimit; ///< Keeps only N tokens on word-pair approx. (local copy from jconf)
146 #endif
147   /* for short-pause segmentation */
148   boolean in_sparea;         ///< TRUE when we are in a pause area now
149   int tmp_sparea_start;         ///< Memorize where the current pause area begins
150 #ifdef SP_BREAK_RESUME_WORD_BEGIN
151   WORD_ID tmp_sp_break_last_word; ///< Keep the max word hypothesis at the beginning of this segment as the starting word of the next segment
152 #else
153   WORD_ID last_tre_word;        ///< Keep the max word hypothesis at the end of this segment as the starting word of the next segment
154 #endif
155   boolean first_sparea;  ///< TRUE when we are in the first pause area
156   int sp_duration;   ///< Number of current successive sp frames
157 #ifdef SPSEGMENT_NAIST
158   boolean after_trigger;        ///< TRUE if speech already triggered
159   int trigger_duration;         ///< Current speech duration at uptrigger detection
160   boolean want_rewind;          ///< TRUE if process wants mfcc rewinding
161   int rewind_frame;             ///< Place to rewind to
162   boolean want_rewind_reprocess; ///< TRUE if re-processing is required after rewind
163 #endif
164   char *pausemodelnames;        ///< Pause model name string used to detect a segment
165   char **pausemodel;            ///< Each pause model name used to detect a segment
166   int pausemodelnum;            ///< Number of entries in @a pausemodel
167 } FSBeam;
168 
169 
170 /**
171  * Work area for realtime processing of 1st pass
172  * (analysis window buffer and samples carried over between segments).
173  */
174 typedef struct __RealBeam__ {
175   /* input parameter */
176   int maxframelen;              ///< Maximum allowed input frame length
177 
178   SP16 *window;         ///< Window buffer for MFCC calculation
179   int windowlen;                ///< Buffer length of @a window
180   int windownum;                ///< Number of samples currently left in @a window
181 
182   /* for short-pause segmentation */
183   boolean last_is_segmented; ///< TRUE if last pass was a segmented input
184   SP16 *rest_Speech; ///< Speech samples left unprocessed by segmentation at the previous segment
185   int rest_alloc_len;   ///< Allocated length of @a rest_Speech
186   int rest_len;         ///< Current stored length of @a rest_Speech
187 
188 } RealBeam;
189 
190 /**
191  * Work area for the 2nd pass
192  * (hypothesis counters, confidence-measure buffers and word Viterbi buffers).
193  */
194 typedef struct __StackDecode__ {
195   int hypo_len_count[MAXSEQNUM+1];      ///< Count of popped hypotheses for each length
196   int maximum_filled_length; ///< Current least beam-filled depth
197 #ifdef SCAN_BEAM
198   LOGPROB *framemaxscore; ///< Maximum score of each frame on 2nd pass for score enveloping
199 #endif
200   NODE *stocker_root; ///< Node stocker for recycle
201   int popctr;           ///< Num of popped hypotheses from stack
202   int genectr;          ///< Num of generated hypotheses
203   int pushctr;          ///< Num of hypotheses actually pushed to stack
204   int finishnum;        ///< Num of found sentence hypotheses
205   NODE *current;                ///< Current node for debug
206 
207 #ifdef CONFIDENCE_MEASURE
208   LOGPROB cm_alpha;             ///< alpha scaling value from jconf
209 # ifdef CM_MULTIPLE_ALPHA
210   LOGPROB *cmsumlist;        ///< Sum of cm score for each alpha coef.
211   int cmsumlistlen;             ///< Allocated length of @a cmsumlist.
212 # endif
213 # ifdef CM_SEARCH
214   LOGPROB cm_tmpbestscore; ///< Temporal best score for summing up scores
215 #  ifndef CM_MULTIPLE_ALPHA
216   LOGPROB cm_tmpsum;            ///< Sum of CM score
217 #  endif
218   int l_stacksize;              ///< Local stack size for CM
219   int l_stacknum;               ///< Num of hypo. in local stack for CM
220   NODE *l_start;        ///< Top node of local stack for CM
221   NODE *l_bottom;       ///< Bottom node of local stack for CM
222 # endif
223 # ifdef CM_NBEST
224   LOGPROB *sentcm;       ///< Confidence score of each sentence (no initializer here: C forbids one on a struct member)
225   LOGPROB *wordcm;       ///< Confidence score of each word voted from @a sentcm (no initializer here, same reason)
226   int sentnum;          ///< Allocated length of @a sentcm
227 # endif
228 #endif /* CONFIDENCE_MEASURE */
229 
230   LOGPROB *wordtrellis[2]; ///< Buffer to compute viterbi path of a word
231   LOGPROB *g;           ///< Buffer to hold source viterbi scores
232   HMM_Logical **phmmseq;        ///< Phoneme sequence to be computed
233   int phmmlen_max;              ///< Maximum length of @a phmmseq.
234   boolean *has_sp;              ///< Mark which phonemes allow short pause for multi-path mode
235 #ifdef GRAPHOUT_PRECISE_BOUNDARY
236   short *wend_token_frame[2]; ///< Propagating token of word-end frame to detect corresponding end-of-words at word head
237   LOGPROB *wend_token_gscore[2]; ///< Propagating token of scores at word-end to detect corresponding end-of-words at word head
238   short *wef;           ///< Work area for word-end frame tokens for v2
239   LOGPROB *wes;         ///< Work area for word-end score tokens for v2
240 #endif
241 
242 } StackDecode;
243 
244 /**
245  * User LM function entry point
246  * (three function pointers, each returning a LOGPROB).
247  */
248 typedef struct {
249   LOGPROB (*uniprob)(WORD_INFO *, WORD_ID, LOGPROB); ///< Pointer to function returning word occurrence probability
250   LOGPROB (*biprob)(WORD_INFO *, WORD_ID, WORD_ID, LOGPROB); ///< Pointer to function returning a word probability given a word context (corresponds to bi-gram)
251   LOGPROB (*lmprob)(WORD_INFO *, WORD_ID *, int, WORD_ID, LOGPROB); ///< Pointer to function returning LM probability
252 } LMFunc;
253 
254 /**
255  * Work area for GMM calculation
256  * (accumulated scores, Gaussian pruning buffers and, with GMM_VAD, trigger state).
257  */
258 typedef struct __gmm_calc__{
259   LOGPROB *gmm_score;   ///< Current accumulated scores for each GMM
260   boolean *is_voice;            ///< TRUE if corresponding model designates speech, FALSE if noise
261   int framecount;               ///< Current frame count
262 
263   short OP_nstream;             ///< Number of input streams for GMM
264   VECT *OP_vec_stream[MAXSTREAMNUM]; ///< Input vector for each stream at that frame
265   short OP_veclen_stream[MAXSTREAMNUM]; ///< Vector length for each stream
266 
267   LOGPROB *OP_calced_score; ///< Work area for Gaussian pruning on GMM: scores
268   int *OP_calced_id; ///< Work area for Gaussian pruning on GMM: id
269   int OP_calced_num; ///< Work area for Gaussian pruning on GMM: number of above
270   int OP_calced_maxnum; ///< Work area for Gaussian pruning on GMM: size of allocated area
271   int OP_gprune_num; ///< Number of Gaussians to be computed in Gaussian pruning
272   VECT *OP_vec;         ///< Local workarea to hold the input vector of current frame
273   short OP_veclen;              ///< Local workarea to hold the length of above
274   HTK_HMM_Data *max_d;  ///< Hold model of the maximum score
275   int max_i;                    ///< Index of @a max_d
276 #ifdef CONFIDENCE_MEASURE
277   LOGPROB gmm_max_cm;   ///< Hold maximum score
278 #endif
279 #ifdef GMM_VAD
280   LOGPROB *rates;   ///< Voice rate of recent N frames (cycle buffer)
281   int nframe;                   ///< Length of @a rates
282   boolean filled;               ///< NOTE(review): undocumented; presumably TRUE once @a rates has been filled - confirm
283   int framep;                   ///< Current frame pointer
284 
285   boolean in_voice;             ///< TRUE if currently in voice area
286   boolean up_trigger;           ///< TRUE when an up trigger is detected
287   boolean down_trigger;         ///< TRUE when a down trigger is detected
288   boolean after_trigger;        ///< TRUE when currently we are processing speech segment
289   boolean want_rewind;          ///< TRUE if GMM wants rewinding its MFCC
290   boolean want_rewind_reprocess; ///< TRUE if GMM wants re-processing after rewind
291   int rewind_frame;             ///< Frame to rewind
292   int duration;                 ///< Current GMM duration work
293 #endif
294 } GMMCalc;
295 
296 /**
297  * Alignment result, valid when forced alignment was done
298  * (one entry per unit type; entries are chained through @a next).
299  */
300 typedef struct __sentence_align__ {
301   int num;                    ///< Number of units
302   short unittype;             ///< Unit type (one of PER_*)
303   WORD_ID *w;                 ///< Word sequence by id (PER_WORD)
304   HMM_Logical **ph;     ///< Phone sequence (PER_PHONEME, PER_STATE)
305   short *loc; ///< Sequence of state location in a phone (PER_STATE)
306   boolean *is_iwsp;           ///< TRUE if PER_STATE and this is the inter-word pause state at multipath mode
307   int *begin_frame;           ///< List of beginning frames
308   int *end_frame;             ///< List of ending frames
309   LOGPROB *avgscore;          ///< Score averaged by frames
310   LOGPROB allscore;           ///< Re-computed acoustic score
311   struct __sentence_align__ *next; ///< Data chain pointer
312 } SentenceAlign;
313 
314 /**
315  * Output result structure
316  * (one recognized sentence: word sequence, scores and optional alignment).
317  */
318 typedef struct __sentence__ {
319   WORD_ID word[MAXSEQNUM];      ///< Sequence of word ID
320   int word_num;                 ///< Number of words in the sentence
321   LOGPROB score;                ///< Likelihood (LM+AM)
322   LOGPROB confidence[MAXSEQNUM]; ///< Word confidence scores
323   LOGPROB score_lm;             ///< Language model likelihood (scaled) for N-gram
324   LOGPROB score_am;             ///< Acoustic model likelihood for N-gram
325   int gram_id;                  ///< The grammar ID this sentence belongs to for DFA
326   SentenceAlign *align;         ///< Alignment result chain (see SentenceAlign); presumably unset when no forced alignment was done - confirm
327 
328 } Sentence;
329 
330 /**
331  * A/D-in work area
332  * (device function entry points, trigger parameters and sample buffers).
333  */
334 typedef struct __adin__ {
335   /* functions (those declared with empty "()" carry no prototype, so their arguments are not type-checked) */
336   /// Pointer to function for device initialization (call once on startup)
337   boolean (*ad_standby)(int, void *);
338   /// Pointer to function to open audio stream for capturing
339   boolean (*ad_begin)();
340   /// Pointer to function to close audio stream capturing
341   boolean (*ad_end)();
342   /// Pointer to function to begin / restart recording
343   boolean (*ad_resume)();
344   /// Pointer to function to pause recording
345   boolean (*ad_pause)();
346   /// Pointer to function to terminate current recording immediately
347   boolean (*ad_terminate)();
348   /// Pointer to function to read samples
349   int (*ad_read)(SP16 *, int);
350 
351   /* configuration parameters */
352   int thres;            ///< Input Level threshold (0-32767)
353   int noise_zerocross;  ///< Computed threshold of zerocross num in the cycle buffer
354   int nc_max;           ///< Computed number of fragments for tail margin
355   boolean adin_cut_on;  ///< TRUE if do input segmentation by silence
356   boolean silence_cut_default; ///< Device-dependent default value of @a adin_cut_on
357   boolean strip_flag;   ///< TRUE if skip invalid zero samples
358   boolean enable_thread;        ///< TRUE if input device needs threading
359   boolean need_zmean;   ///< TRUE if perform zmeansource
360 
361   /* work area */
362   int c_length; ///< Computed length of cycle buffer for zero-cross, actually equals to head margin length
363   int c_offset; ///< Static data DC offset (obsolete, should be 0)
364   SP16 *swapbuf;                ///< Buffer for re-triggering in tail margin
365   int sbsize;    ///< Size of @a swapbuf
366   int sblen;    ///< Current length of @a swapbuf
367   int rest_tail;                ///< Samples not processed yet in swap buffer
368 
369   ZEROCROSS zc;                 ///< Work area for zero-cross computation
370 
371 #ifdef HAVE_PTHREAD
372   /* Variables related to POSIX threading */
373   pthread_t adin_thread;	///< Thread information
374   pthread_mutex_t mutex;        ///< Lock primitive
375   SP16 *speech;         ///< Unprocessed samples recorded by A/D-in thread
376   int speechlen;                ///< Current length of @a speech
377   /**
378    * Semaphore to start/stop recognition.
379    *
380    * If TRUE, A/D-in thread will store incoming samples to @a speech and
381    * main thread will detect and process them.
382    * If FALSE, A/D-in thread will still get input and check trigger the same
383    * way as in the TRUE case, but does not store them to @a speech.
384    *
385    */
386   boolean transfer_online;
387   /**
388    * TRUE if buffer overflow occurred in adin thread.
389    *
390    */
391   boolean adinthread_buffer_overflowed;
392   /**
393    * TRUE if adin thread ended
394    *
395    */
396   boolean adinthread_ended;
397 
398   boolean ignore_speech_while_recog; ///< TRUE if ignore speech input between call, while waiting recognition process
399 
400 #endif
401 
402   /* Input data buffer */
403   SP16 *buffer; ///< Temporary buffer to hold input samples
404   int bpmax;            ///< Maximum length of @a buffer
405   int bp;                       ///< Current point to store the next data
406   int current_len;              ///< Current length of stored samples
407   SP16 *cbuf;           ///< Buffer for flushing cycle buffer just after detecting trigger
408   boolean down_sample; ///< TRUE if perform down sampling from 48kHz to 16kHz
409   SP16 *buffer48; ///< Another temporary buffer to hold 48kHz inputs
410   int io_rate; ///< Frequency rate (should always be 3 for 48/16 conversion)
411 
412   boolean is_valid_data;        ///< TRUE if we are now triggered
413   int nc;               ///< Count of current tail silence segments
414   boolean end_of_stream;        ///< TRUE if we have reached the end of stream
415   boolean need_init;    ///< If TRUE, initialize buffer on startup
416 
417   DS_BUFFER *ds;           ///< Filter buffer for 48-to-16 conversion
418 
419   boolean rehash; ///< TRUE if rehash is wanted at rewinding on decoder-based VAD
420 
421   boolean input_side_segment;   ///< TRUE if segmentation requested by ad_read
422 
423   unsigned int total_captured_len;      ///< NOTE(review): undocumented; presumably total captured length in samples - confirm
424   unsigned int last_trigger_sample;     ///< NOTE(review): undocumented; presumably sample position of the last trigger - confirm
425 
426 } ADIn;
427 
428 /**
429  * Recognition result output structure.  You may want to use it together with
430  * the model data to get fully detailed results.
431  *
432  */
433 typedef struct __Output__ {
434   /**
435    * Result status code:
436    * 1: recognition in progress
437    * 0: recognition succeeded (at least one candidate has been found)
438    * -1: search failed, no candidate has been found
439    * -2: input rejected by short input
440    * -3: input rejected by GMM
441    */
442   int status;
443 
444   int num_frame;                ///< Number of frames of the recognized part
445   int length_msec;              ///< Length of the recognized part
446 
447   Sentence *sent;               ///< List of (N-best) recognition result sentences
448   int sentnum;                  ///< Number of sentences
449 
450   WordGraph *wg1;               ///< List of word graph generated on 1st pass
451   int wg1_num;                  ///< Num of words in @a wg1
452 
453   WordGraph *wg;                ///< List of word graph
454 
455   CN_CLUSTER *confnet;          ///< List of confusion network clusters
456 
457   Sentence pass1;               ///< Recognition result on the 1st pass
458 
459 } Output;
460 
461 
462 /**********************************************************************/
463 /**********************************************************************/
464 /**********************************************************************/
465 
466 /**
467  * Instance for a parameter vector computation
468  * (settings, CMN / front-end work areas and per-input state; chained via @a next).
469  */
470 typedef struct __mfcc_calc__ {
471 
472   /**
473    * Unique id
474    *
475    */
476   short id;
477 
478   /**
479    * Parameter setting (entity in JCONF_AM)
480    *
481    */
482   Value *para;
483 
484   /**
485    * TRUE if the para came from "-htkconf"
486    *
487    */
488   boolean htk_loaded;
489   /**
490    * TRUE if the para came from binhmm embedded header
491    *
492    */
493   boolean hmm_loaded;
494 
495   /**
496    * Check input parameter type with header of the hmmdefs
497    * (-notypecheck to unset)
498    */
499   boolean paramtype_check_flag;
500 
501   /**
502    * Parameter extraction work area
503    *
504    */
505   MFCCWork *wrk;
506 
507   /**
508    * Parameter vector sequence to be recognized
509    *
510    */
511   HTK_Param *param;
512 
513   /**
514    * Rest parameter for next segment for short-pause segmentation
515    */
516   HTK_Param *rest_param;
517 
518   /**
519    * Work area and setting for cepstral mean normalization
520    *
521    */
522   struct {
523     /**
524      * CMN: load initial cepstral mean from file at startup (-cmnload)
525      */
526     char *load_filename;
527     /**
528      * CMN: update cepstral mean while recognition
529      * (-cmnnoupdate to unset)
530      */
531     boolean update;
532     /**
533      * CMN: save cepstral mean to file at end of every recognition (-cmnsave)
534      */
535     char *save_filename;
536     /**
537      * CMN: MAP weight for initial cepstral mean (-cmnmapweight)
538      */
539     float map_weight;
540 
541     /**
542      * TRUE if CMN parameter loaded from file at boot up
543      */
544     boolean loaded;
545 
546     /**
547      * realtime CMN work area
548      *
549      */
550     CMNWork *wrk;
551 
552   } cmn;
553 
554   /**
555    * Work area for front-end processing
556    *
557    */
558   struct {
559     /**
560      * Estimated noise spectrum
561      */
562     float *ssbuf;
563 
564     /**
565      * Length of @a ssbuf
566      */
567     int sslen;
568 
569     /**
570      * Alpha coefficient for spectral subtraction
571      *
572      */
573     float ss_alpha;
574 
575     /**
576      * Flooring coefficient for spectral subtraction
577      *
578      */
579     float ss_floor;
580 
581     /**
582      * SS: compute noise spectrum from head silence on file input (-sscalc)
583      */
584     boolean sscalc;
585 
586     /**
587      * With "-sscalc", specify noise length at input head in msec (-sscalclen)
588      */
589     int sscalc_len;
590 
591     /**
592      * Load noise spectrum data from file (-ssload), that was made by "mkss".
593      */
594     char *ssload_filename;
595 
596     /**
597      * Parameter extraction work area for spectral subtraction
598      *
599      */
600     MFCCWork *mfccwrk_ss;
601 
602   } frontend;
603 
604   /**
605    * Work area for energy normalization on real time processing
606    *
607    */
608   ENERGYWork ewrk;
609 
610   /**
611    * Delta MFCC cycle buffer
612    *
613    */
614   DeltaBuf *db;
615   /**
616    * Accel MFCC cycle buffer
617    *
618    */
619   DeltaBuf *ab;
620   /**
621    * Working buffer holding the MFCC vector currently being computed
622    *
623    */
624   VECT *tmpmfcc;
625 
626   /**
627    * FALSE indicates that the current frame (f) is not valid and should
628    * not be used for recognition
629    *
630    */
631   boolean valid;
632 
633   /**
634    * Current frame
635    *
636    */
637   int f;
638 
639   /**
640    * Processed frame length when segmented
641    *
642    */
643   int last_time;
644 
645   /**
646    * Re-start frame if segmented
647    *
648    */
649   int sparea_start;
650 
651   /**
652    * TRUE if a parent instance has decided segmented
653    *
654    */
655   boolean segmented;
656 
657   /**
658    * TRUE if an input function has decided segmented
659    *
660    */
661   boolean segmented_by_input;
662 
663   /**
664    * Id of a plugin module if MFCC should be obtained via plugin
665    *
666    */
667   int plugin_source;
668 
669   /**
670    * Function entry points for plugin input
671    * (those declared with empty "()" carry no prototype).
672    */
673   struct {
674     /// Pointer to function for device initialization (call once on startup)
675     boolean (*fv_standby)();
676     /// Pointer to function to open the input stream for capturing
677     boolean (*fv_begin)();
678     /// Pointer to function to read input into a VECT buffer (the int argument is presumably its length - confirm)
679     int (*fv_read)(VECT *, int);
680     /// Pointer to function to close the input stream
681     boolean (*fv_end)();
682     /// Pointer to function to begin / restart recording
683     boolean (*fv_resume)();
684     /// Pointer to function to pause recording
685     boolean (*fv_pause)();
686     /// Pointer to function to terminate current recording immediately
687     boolean (*fv_terminate)();
688   } func;
689 
690 #ifdef POWER_REJECT
691   float avg_power;      ///< NOTE(review): undocumented; presumably average input power used for power-based rejection - confirm
692 #endif
693 
694   /**
695    * Pointer to next
696    *
697    */
698   struct __mfcc_calc__ *next;
699 
700 } MFCCCalc;
701 
702 /**
703  * Instance for an AM
704  * (its configuration, MFCC instance, HMM definitions and work area; chained via @a next).
705  */
706 typedef struct __process_am__ {
707 
708   /**
709    * Configuration parameters
710    *
711    */
712   JCONF_AM *config;
713 
714   /**
715    * Corresponding input parameter vector instance
716    *
717    */
718   MFCCCalc *mfcc;
719 
720   /**
721    * Main phoneme HMM
722    */
723   HTK_HMM_INFO *hmminfo;
724 
725   /**
726    * HMM for Gaussian Selection
727    */
728   HTK_HMM_INFO *hmm_gs;
729 
730   /**
731    * Work area and outprob cache for HMM output probability computation
732    */
733   HMMWork hmmwrk;
734 
735   /**
736    * Pointer to next
737    *
738    */
739   struct __process_am__ *next;
740 
741 } PROCESS_AM;
742 
743 /**
744  * Instance for an LM
745  * (its configuration, dictionary, N-gram or grammars, and user LM functions; chained via @a next).
746  */
747 typedef struct __process_lm__ {
748 
749   /**
750    * Configuration parameters
751    *
752    */
753   JCONF_LM *config;
754 
755   /**
756    * Corresponding AM
757    *
758    */
759   PROCESS_AM *am;
760 
761 
762   /**
763    * The LM type of this model holder: will be set from the Jconf used for loading
764    *
765    */
766   int lmtype;
767 
768   /**
769    * The LM variation type of this model holder: will be set from the
770    * Jconf used for loading
771    *
772    */
773   int lmvar;
774 
775   /**
776    * Main Word dictionary for all LM types
777    */
778   WORD_INFO *winfo;
779 
780   /**
781    * Main N-gram language model (do not use with grammars)
782    */
783   NGRAM_INFO *ngram;
784 
785   /**
786    * List of all loaded grammars (do not use with ngram)
787    */
788   MULTIGRAM *grammars;
789 
790   /**
791    * Current maximum value of assigned grammar ID.
792    * A new grammar ID will be assigned to each new grammar.
793    *
794    */
795   int gram_maxid;
796 
797   /**
798    * Global DFA for recognition.  This will be generated from @a grammars,
799    * concatenating each DFA into one.
800    */
801   DFA_INFO *dfa;
802 
803   /**
804    * TRUE if modified in multigram_update()
805    *
806    */
807   boolean global_modified;
808 
809   /**
810    * LM User function entry point
811    *
812    */
813   LMFunc lmfunc;
814 
815   /**
816    * Pointer to next
817    *
818    */
819   struct __process_lm__ *next;
820 
821 } PROCESS_LM;
822 
823 /**
824  * Instance for a decoding, i.e. a set of LM, AM and parameters
825  * (holds the work areas and results of both passes; chained via @a next).
826  */
827 typedef struct __recogprocess__ {
828 
829   /**
830    * TRUE if this instance is alive, or FALSE when temporarily disabled.
831    *
832    */
833   boolean live;
834 
835   /**
836    * 1 if this instance should be made alive in the next recognition,
837    * -1 if should become dead in the next recognition,
838    * or 0 to leave unchanged.
839    *
840    */
841   short active;
842 
843   /**
844    * Search configuration data
845    *
846    */
847   JCONF_SEARCH *config;
848 
849   /**
850    * Acoustic model instance to use
851    *
852    */
853   PROCESS_AM *am;
854 
855   /**
856    * Language model instance to use
857    *
858    */
859   PROCESS_LM *lm;
860 
861   /**
862    * Language model type: one of LM_UNDEF, LM_NGRAM, LM_DFA
863    *
864    */
865   int lmtype;
866 
867   /**
868    * Variation type of language model: one of LM_NGRAM, LM_DFA_GRAMMAR,
869    * LM_DFA_WORD
870    *
871    */
872   int lmvar;
873 
874   /**
875    * Whether to handle phone context dependency (local copy from jconf)
876    */
877   boolean ccd_flag;
878 
879   /**
880    * Word-conjunction HMM as tree lexicon
881    */
882   WCHMM_INFO *wchmm;
883 
884   /**
885    * Actual beam width of 1st pass (will be set on startup)
886    */
887   int trellis_beam_width;
888 
889   /**
890    * Word trellis index generated at the 1st pass
891    */
892   BACKTRELLIS *backtrellis;
893 
894   /**
895    * Work area for the first pass
896    */
897   FSBeam pass1;
898 
899   /**
900    * Work area for second pass
901    *
902    */
903   StackDecode pass2;
904 
905   /**
906    * Word sequence of best hypothesis on 1st pass
907    */
908   WORD_ID pass1_wseq[MAXSEQNUM];
909 
910   /**
911    * Number of words in @a pass1_wseq
912    */
913   int pass1_wnum;
914 
915   /**
916    * Score of @a pass1_wseq
917    */
918   LOGPROB pass1_score;
919 
920   /**
921    * Last maximum word hypothesis on the begin point for short-pause segmentation
922    */
923   WORD_ID sp_break_last_word;
924   /**
925    * Last (not transparent) context word for LM for short-pause segmentation
926    */
927   WORD_ID sp_break_last_nword;
928   /**
929    * Allow override of last context word from result of 2nd pass for short-pause segmentation
930    */
931   boolean sp_break_last_nword_allow_override;
932   /**
933    * Search start word on 2nd pass for short-pause segmentation
934    */
935   WORD_ID sp_break_2_begin_word;
936   /**
937    * Search end word on 2nd pass for short-pause segmentation
938    */
939   WORD_ID sp_break_2_end_word;
940 
941   /**
942    * Input length in frames
943    */
944   int peseqlen;
945 
946   /**
947    * GraphOut: total number of words in the generated graph
948    */
949   int graph_totalwordnum;
950 
951   /**
952    * Recognition results
953    *
954    */
955   Output result;
956 
957   /**
958    * Graphout: will be set from the value of jconf->graph.enabled
959    *
960    */
961   boolean graphout;
962 
963   /**
964    * Temporal matrix work area to hold the order relations between words
965    * for confusion network construction.
966    *
967    */
968   char *order_matrix;
969 
970   /**
971    * Number of words to be expressed in the order matrix for confusion network
972    * construction.
973    *
974    */
975   int order_matrix_count;
976 
977 #ifdef DETERMINE
978   int determine_count;          ///< NOTE(review): the DETERMINE work fields are undocumented in this header; semantics to be confirmed
979   LOGPROB determine_maxnodescore;
980   boolean determined;
981   LOGPROB determine_last_wid;   ///< NOTE(review): name suggests a word id, yet the type is LOGPROB rather than WORD_ID - confirm this is intended
982   boolean have_determine;
983 #endif
984 
985   /**
986    * TRUE if has something to output at CALLBACK_RESULT_PASS1_INTERIM.
987    *
988    */
989   boolean have_interim;
990 
991   /**
992    * User-defined data hook.  JuliusLib does not concern itself with its content.
993    *
994    */
995   void *hook;
996 
997   /**
998    * Pointer to next instance
999    *
1000    */
1001   struct __recogprocess__ *next;
1002 
1003 } RecogProcess;
1004 
1005 /**
1006  * Top-level engine instance holding the state of the whole recognition process.
1007  * Aggregates configuration, input buffers, model/process lists and callbacks.
1008  */
1009 typedef struct __Recog__ {
1010 
1011   /*******************************************/
1012   /**
1013    * User-specified configuration parameters.
1014    *
1015    */
1016   Jconf *jconf;
1017 
1018   /*******************************************/
1019   /**
1020    * A/D-in (audio input) buffers.
1021    *
1022    */
1023   ADIn *adin;
1024 
1025   /**
1026    * Work area for the real-time processing of the first pass.
1027    */
1028   RealBeam real;
1029 
1030   /**
1031    * Head of the linked list of MFCC calculation/reading instances.
1032    *
1033    */
1034   MFCCCalc *mfcclist;
1035 
1036   /**
1037    * Head of the linked list of acoustic model instances.
1038    *
1039    */
1040   PROCESS_AM *amlist;
1041 
1042   /**
1043    * Head of the linked list of language model instances.
1044    *
1045    */
1046   PROCESS_LM *lmlist;
1047 
1048   /**
1049    * Head of the linked list of recognition process instances.
1050    *
1051    */
1052   RecogProcess *process_list;
1053 
1054 
1055   /**
1056    * TRUE while the engine is processing a segment (used for short-pause
1057    * segmentation).
1058    */
1059   boolean process_segment;
1060 
1061   /*******************************************/
1062   /* inputs */
1063 
1064   /**
1065    * Input speech data buffer.
1066    */
1067   SP16 *speech;
1068 
1069   /**
1070    * Allocated length of the speech buffer.
1071    * NOTE(review): unit is presumably samples, like speechlen -- confirm.
1072    */
1073   int speechalloclen;
1074 
1075   /**
1076    * Input length in samples.
1077    */
1078   int speechlen;
1079 
1080   /**
1081    * Input length in frames.
1082    */
1083   int peseqlen;
1084 
1085   /*******************************************/
1086 
1087   /**
1088    * GMM definitions.
1089    *
1090    */
1091   HTK_HMM_INFO *gmm;
1092 
1093   /**
1094    * Pointer to the MFCC instance used for the GMM.
1095    *
1096    */
1097   MFCCCalc *gmmmfcc;
1098 
1099   /**
1100    * Work area for GMM calculation.
1101    *
1102    */
1103   GMMCalc *gc;
1104 
1105   /*******************************************/
1106   /* misc. */
1107 
1108   /**
1109    * Status flag indicating whether recognition is alive.  If TRUE, the
1110    * process is currently active, either monitoring an audio input or
1111    * recognizing the current input.  If FALSE, recognition is disabled
1112    * until an activation command arrives from the client.  While
1113    * disabled, all inputs are ignored.
1114    *
1115    * If set to FALSE in the program, Julius/Julian will stop after
1116    * the current recognition ends, and enter the disabled status.
1117    *
1118    */
1119   boolean process_active;
1120 
1121   /**
1122    * If set to TRUE, Julius/Julian stops recognition immediately, terminating
1123    * the current recognition process, and enters the disabled status.
1124    *
1125    */
1126   boolean process_want_terminate;
1127 
1128   /**
1129    * If set to TRUE, Julius/Julian stops recognition softly.  If it is
1130    * performing recognition of the 1st pass, it immediately segments the
1131    * current input, processes the 2nd pass, and outputs the result.  Then it
1132    * enters the disabled status.
1133    * NOTE(review): the name suggests a reload request; confirm this text.
1134    */
1135   boolean process_want_reload;
1136 
1137   /**
1138    * Specifies when to refresh the global lexicon if an update is received
1139    * while recognition is in progress (for DFA).
1140    * NOTE(review): the meaning of each value is not visible here -- confirm.
1141    */
1142   short gram_switch_input_method;
1143 
1144   /**
1145    * TRUE if the audio stream is now open and the engine is either listening
1146    * to the audio stream or recognizing speech.  FALSE on startup or when
1147    * paused by a module command.
1148    *
1149    */
1150   boolean process_online;
1151 
1152   /**
1153    * Function pointer to parameter vector computation for realtime 1st pass.
1154    * default: RealTimeMFCC() in realtime-1stpass.c
1155    * Takes an MFCCCalc pointer, an SP16 pointer and an int; returns boolean.
1156    */
1157   boolean (*calc_vector)(MFCCCalc *, SP16 *, int);
1158 
1159   /**
1160    * TRUE when recognition was triggered and some recognition started,
1161    * FALSE if the engine terminated with no input.
1162    *
1163    */
1164   boolean triggered;
1165 
1166   /**
1167    * Callback entry points: a SIZEOF_CALLBACK_ID x MAX_CALLBACK_HOOK table of
1168    * function pointers, declared without a parameter prototype.
1169    */
1170   void (*callback_function[SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK])();
1171   /**
1172    * Callback user data, with the same dimensions as callback_function.
1173    *
1174    */
1175   void *callback_user_data[SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK];
1176   /**
1177    * Numbers of callbacks registered (one counter per SIZEOF_CALLBACK_ID entry).
1178    *
1179    */
1180   int callback_function_num[SIZEOF_CALLBACK_ID];
1181   /**
1182    * Callback function code list
1183    * (capacity MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID entries).
1184    */
1185   int callback_list_code[MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID];
1186   /**
1187    * Callback function location list
1188    * (capacity MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID entries).
1189    */
1190   int callback_list_loc[MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID];
1191   /**
1192    * Number of callbacks.
1193    *
1194    */
1195   int callback_num;
1196 
1197   /*******************************************/
1198 
1199   /**
1200    * User-defined data hook.  JuliusLib is not concerned with its content.
1201    *
1202    */
1203   void *hook;
1204 
1205 } Recog;
1206 
1207 #endif /* __J_RECOG_H__ */
1208