1 /** 2 * @file recog.h 3 * 4 * <JA> 5 * @brief ��������� 6 * 7 * ǧ������Υ����������Ԥ��ޤ��������ϡ� 8 * Recog ��ȥåץ����Ȥ��ơ����Ѥ��벻����ǥ롤�����ǥ롤 9 * �������Ȥ߹�碌��ǧ������������ʣ�������ޤ��� 10 * 11 * �����Υ����ϡ��б����� jconf ������깽¤�Ρ������ 12 * ���Ѥ��륵�֥����ؤΥݥ�������ޤ���PROCESS_AM �ϲ�����ǥ롤 13 * PROCESS_LM �ϸ����ǥ뤴�Ȥ��������ޤ��� 14 * 15 * MFCCCalc �ϡ� 16 * ������ǥ뤪��� GMM ���ᤵ���ѥ��������פ�Ĵ�٤��Τ��� 17 * ��������������Τ�ɬ�פʤ�����������ޤ���Ʊ���MFCC������� 18 * ����¾�Υե��ȥ���ɽ���������IJ�����ǥ뤪���GMM�ɤ����Ǥ� 19 * Ʊ�� MFCCCalc ����ͭ����ޤ��� 20 * 21 * </JA> 22 * 23 * <EN> 24 * @brief Enging instance definitions 25 * 26 * This file defines the engine instance and all its sub instances. 27 * The top instance is Recog, and it consists of several 28 * sub instances for LM, AM, and recognition process instances. 29 * 30 * Each sub-instance keeps pointer to corresponding jconf setting 31 * part, and also has pointers to other instances to use. 32 * PROCESS_AM will be generated for each acoustic model, and PROCESS_LM 33 * will be for each language model. 34 * 35 * MFCCCalc will be generated for each required MFCC frontend types 36 * by inspecting all AMs and GMM. The AM's and GMMs that requires 37 * exactly the same MFCC frontend will share the same MFCC frontend. 38 * 39 * </EN> 40 * 41 * <pre> 42 * Recog 43 * +- *JCONF 44 * +- input related work area 45 * +- MFCCCalc[] (linked list) (generated from HMM + GMM) 46 * +- PROCESS_AM[] (linked list) 47 * +- *pointer to JCONF_AM 48 * +- *pointer to MFCCCalc 49 * +- hmminfo, hmm_gs 50 * +- hmmwrk 51 * +- multipath, ccd_flag, cmn_loaded 52 * +- PROCESS_LM[] (linked list) 53 * +- *pointer to JCONF_LM 54 * +- *pointer to PROCESS_AM 55 * +- lmtype, lmvar 56 * +- winfo 57 * +- ngram or grammars 58 * +- lmfunc 59 * +- RecogProcess process[] (linked list) 60 * +- *pointer to JCONF_SEARCH 61 * +- *pointer to PROCESS_AM 62 * +- *pointer to PROCESS_LM 63 * +- lmtype, lmvar 64 * +- misc. 
param
 *  +- GMMCalc
 *     +- *JCONF_AM for GMM
 *     +- *pointer to MFCCCalc
 * </pre>
 *
 * @author Akinobu Lee
 * @date   Fri Feb 16 13:42:28 2007
 *
 * $Revision: 1.7 $
 *
 */
/*
 * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
 * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
 * All rights reserved
 */

/*
 */

#ifndef __J_RECOG_H__
#define __J_RECOG_H__

#include <sent/stddefs.h>
#include <sent/hmm.h>
#include <sent/vocabulary.h>
#include <sent/ngram2.h>
#include <sent/dfa.h>
#include <julius/wchmm.h>
#include <julius/search.h>
#include <julius/callback.h>
#include <julius/jconf.h>

/*
  How tokens are managed:
   o  tlist[][] is a token stocker.  It holds all tokens in a sequential
      buffer.  They are malloced first on startup, and referred to by ID
      during the Viterbi procedure.  In word-pair mode, each token also has
      a link to another token to allow a node to have more than 1 token.

   o  token[n] holds the current ID number of a token associated to a
      lexicon tree node 'n'.

 */
/**
 * Work area for the first pass.
 *
 * Holds the double-buffered token storage (index 0/1, selected by @a tl and
 * @a tn), the per-node active token table, local copies of LM weights and
 * penalties, and the state used for short-pause segmentation.
 * Embedded by value in RecogProcess as its @a pass1 member.
 */
typedef struct __FSBeam__ {
  /* token stocker */
  TOKEN2 *tlist[2];             ///< Token space to hold all token entities.
  TOKENID *tindex[2];           ///< Token index corresponding to @a tlist for sort
  int maxtnum;                  ///< Allocated number of tokens (will grow)
  int expand_step;              ///< Number of tokens to be increased per expansion
  boolean expanded;             ///< TRUE if tlist[] and tindex[] have been expanded at the last create_token()
  int tnum[2];                  ///< Current number of tokens used in @a tlist
  int n_start;                  ///< Start index of in-beam nodes on @a tindex
  int n_end;                    ///< End index of in-beam nodes on @a tindex
  int tl;                       ///< Current work area id (0 or 1, swapped for each frame)
  int tn;                       ///< Next work area id (0 or 1, swapped for each frame)

  /* Active token list */
  TOKENID *token;               ///< Active token list that holds currently assigned tokens for each tree node
#ifdef UNIGRAM_FACTORING
  /* for wordend processing with 1-gram factoring */
  LOGPROB wordend_best_score;   ///< Best score of word-end nodes
  int wordend_best_node;        ///< Node id of the best word-end node
  TRELLIS_ATOM *wordend_best_tre; ///< Trellis word corresponding to the above
  WORD_ID wordend_best_last_cword; ///< Last context-aware word of the above
#endif

  int totalnodenum;             ///< Allocated number of nodes in @a token
  TRELLIS_ATOM bos;             ///< Special token for beginning-of-sentence
  boolean nodes_malloced;       ///< Flag to check if tokens are already allocated
  LOGPROB lm_weight;            ///< Language score weight (local copy)
  LOGPROB lm_penalty;           ///< Word insertion penalty (local copy)
  LOGPROB lm_penalty_trans;     ///< Additional insertion penalty for transparent words (local copy)
  LOGPROB penalty1;             ///< Word insertion penalty for DFA (local copy)
#if defined(WPAIR) && defined(WPAIR_KEEP_NLIMIT)
  boolean wpair_keep_nlimit;    ///< Keep only N tokens on word-pair approximation (local copy from jconf)
#endif
  /* for short-pause segmentation */
  boolean in_sparea;            ///< TRUE when we are in a pause area now
  int tmp_sparea_start;         ///< Memorize where the current pause area begins
#ifdef SP_BREAK_RESUME_WORD_BEGIN
  WORD_ID tmp_sp_break_last_word; ///< Keep the max word hypothesis at the beginning of this segment as the starting word of the next segment
#else
  WORD_ID last_tre_word;        ///< Keep the max word hypothesis at the end of this segment as the starting word of the next segment
#endif
  boolean first_sparea;         ///< TRUE when we are in the first pause area
  int sp_duration;              ///< Number of current successive sp frames
#ifdef SPSEGMENT_NAIST
  boolean after_trigger;        ///< TRUE if speech already triggered
  int trigger_duration;         ///< Current speech duration at up-trigger detection
  boolean want_rewind;          ///< TRUE if process wants mfcc rewinding
  int rewind_frame;             ///< Place to rewind to
  boolean want_rewind_reprocess; ///< TRUE if re-processing is required after rewind
#endif
  char *pausemodelnames;        ///< Pause model name string to detect segment
  char **pausemodel;            ///< Each pause model name to detect segment
  int pausemodelnum;            ///< Number of entries in @a pausemodel
} FSBeam;


/**
 * Work area for realtime processing of the 1st pass.
 *
 * Holds the sample window used for MFCC calculation and the samples carried
 * over between segments.  Embedded by value in Recog as its @a real member.
 */
typedef struct __RealBeam__ {
  /* input parameter */
  int maxframelen;              ///< Maximum allowed input frame length

  SP16 *window;                 ///< Window buffer for MFCC calculation
  int windowlen;                ///< Buffer length of @a window
  int windownum;                ///< Currently left samples in @a window

  /* for short-pause segmentation */
  boolean last_is_segmented;    ///< TRUE if last pass was a segmented input
  SP16 *rest_Speech;            ///< Speech samples left unprocessed by segmentation at previous segment
  int rest_alloc_len;           ///< Allocated length of @a rest_Speech
  int rest_len;                 ///< Current stored length of @a rest_Speech

} RealBeam;

/** 191 * Work area for the 2nd pass 192 * 193 */ 194 typedef struct __StackDecode__ { 195 int hypo_len_count[MAXSEQNUM+1]; ///< Count of popped hypothesis per each length 196 int maximum_filled_length; ///< Current least beam-filled depth 197 #ifdef SCAN_BEAM 198 LOGPROB *framemaxscore; ///< Maximum score of each frame on 2nd pass for score enveloping 199 #endif 200 NODE *stocker_root; ///< Node stocker for recycle 201 int popctr; ///< Num of popped hypotheses from stack 202 int genectr; ///< Num of generated hypotheses 203 int pushctr; ///< Num of hypotheses actually pushed to stack 204 int finishnum; ///< Num of found sentence hypothesis 205 NODE *current; ///< Current node for debug 206 207 #ifdef CONFIDENCE_MEASURE 208 LOGPROB cm_alpha; ///< alpha scaling value from jconf 209 # ifdef CM_MULTIPLE_ALPHA 210 LOGPROB *cmsumlist; ///< Sum of cm score for each alpha coef. 211 int cmsumlistlen; ///< Allocated length of cmsumlist. 212 # endif 213 # ifdef CM_SEARCH 214 LOGPROB cm_tmpbestscore; ///< Temporal best score for summing up scores 215 # ifndef CM_MULTIPLE_ALPHA 216 LOGPROB cm_tmpsum; ///< Sum of CM score 217 # endif 218 int l_stacksize; ///< Local stack size for CM 219 int l_stacknum; ///< Num of hypo. in local stack for CM 220 NODE *l_start; ///< Top node of local stack for CM 221 NODE *l_bottom; ///< bottom node of local stack for CM 222 # endif 223 # ifdef CM_NBEST 224 LOGPROB *sentcm = NULL; ///< Confidence score of each sentence 225 LOGPROB *wordcm = NULL; ///< Confidence score of each word voted from @a sentcm 226 int sentnum; ///< Allocated length of @a sentcm 227 # endif 228 #endif /* CONFIDENCE_MEASURE */ 229 230 LOGPROB *wordtrellis[2]; ///< Buffer to compute viterbi path of a word 231 LOGPROB *g; ///< Buffer to hold source viterbi scores 232 HMM_Logical **phmmseq; ///< Phoneme sequence to be computed 233 int phmmlen_max; ///< Maximum length of @a phmmseq. 
234 boolean *has_sp; ///< Mark which phoneme allow short pause for multi-path mode 235 #ifdef GRAPHOUT_PRECISE_BOUNDARY 236 short *wend_token_frame[2]; ///< Propagating token of word-end frame to detect corresponding end-of-words at word head 237 LOGPROB *wend_token_gscore[2]; ///< Propagating token of scores at word-end to detect corresponding end-of-words at word head 238 short *wef; ///< Work area for word-end frame tokens for v2 239 LOGPROB *wes; ///< Work area for word-end score tokens for v2 240 #endif 241 242 } StackDecode; 243 244 /** 245 * User LM function entry point 246 * 247 */ 248 typedef struct { 249 LOGPROB (*uniprob)(WORD_INFO *, WORD_ID, LOGPROB); ///< Pointer to function returning word occurence probability 250 LOGPROB (*biprob)(WORD_INFO *, WORD_ID, WORD_ID, LOGPROB); ///< Pointer to function returning a word probability given a word context (corresponds to bi-gram) 251 LOGPROB (*lmprob)(WORD_INFO *, WORD_ID *, int, WORD_ID, LOGPROB); ///< Pointer to function returning LM probability 252 } LMFunc; 253 254 /** 255 * Work area for GMM calculation 256 * 257 */ 258 typedef struct __gmm_calc__{ 259 LOGPROB *gmm_score; ///< Current accumurated scores for each GMM 260 boolean *is_voice; ///< True if corresponding model designates speech, FALSE if noise 261 int framecount; ///< Current frame count 262 263 short OP_nstream; ///< Number of input stream for GMM 264 VECT *OP_vec_stream[MAXSTREAMNUM]; ///< input vector for each stream at that frame 265 short OP_veclen_stream[MAXSTREAMNUM]; ///< vector length for each stream 266 267 LOGPROB *OP_calced_score; ///< Work area for Gaussian pruning on GMM: scores 268 int *OP_calced_id; ///< Work area for Gaussian pruning on GMM: id 269 int OP_calced_num; ///< Work area for Gaussian pruning on GMM: number of above 270 int OP_calced_maxnum; ///< Work area for Gaussian pruning on GMM: size of allocated area 271 int OP_gprune_num; ///< Number of Gaussians to be computed in Gaussian pruning 272 VECT *OP_vec; ///< Local 
workarea to hold the input vector of current frame 273 short OP_veclen; ///< Local workarea to hold the length of above 274 HTK_HMM_Data *max_d; ///< Hold model of the maximum score 275 int max_i; ///< Index of max_d 276 #ifdef CONFIDENCE_MEASURE 277 LOGPROB gmm_max_cm; ///< Hold maximum score 278 #endif 279 #ifdef GMM_VAD 280 LOGPROB *rates; ///< voice rate of recent N frames (cycle buffer) 281 int nframe; ///< Length of rates 282 boolean filled; 283 int framep; ///< Current frame pointer 284 285 boolean in_voice; ///< TRUE if currently in voice area 286 boolean up_trigger; ///< TRUE when detect up trigger 287 boolean down_trigger; ///< TRUE when detect down trigger 288 boolean after_trigger; ///< TRUE when currently we are processing speech segment 289 boolean want_rewind; ///< TRUE if GMM wants rewinding its MFCC 290 boolean want_rewind_reprocess; ///< TRUE if GMM wants re-processing after rewind 291 int rewind_frame; ///< Frame to rewind 292 int duration; ///< Current GMM duration work 293 #endif 294 } GMMCalc; 295 296 /** 297 * Alignment result, valid when forced alignment was done 298 * 299 */ 300 typedef struct __sentence_align__ { 301 int num; ///< Number of units 302 short unittype; ///< Unit type (one of PER_*) 303 WORD_ID *w; ///< word sequence by id (PER_WORD) 304 HMM_Logical **ph; ///< Phone sequence (PER_PHONEME, PER_STATE) 305 short *loc; ///< sequence of state location in a phone (PER_STATE) 306 boolean *is_iwsp; ///< TRUE if PER_STATE and this is the inter-word pause state at multipath mode 307 int *begin_frame; ///< List of beginning frame 308 int *end_frame; ///< List of ending frame 309 LOGPROB *avgscore; ///< Score averaged by frames 310 LOGPROB allscore; ///< Re-computed acoustic score 311 struct __sentence_align__ *next; ///< data chain pointer 312 } SentenceAlign; 313 314 /** 315 * Output result structure 316 * 317 */ 318 typedef struct __sentence__ { 319 WORD_ID word[MAXSEQNUM]; ///< Sequence of word ID 320 int word_num; ///< Number of 
words in the sentence 321 LOGPROB score; ///< Likelihood (LM+AM) 322 LOGPROB confidence[MAXSEQNUM]; ///< Word confidence scores 323 LOGPROB score_lm; ///< Language model likelihood (scaled) for N-gram 324 LOGPROB score_am; ///< Acoustic model likelihood for N-gram 325 int gram_id; ///< The grammar ID this sentence belongs to for DFA 326 SentenceAlign *align; 327 328 } Sentence; 329 330 /** 331 * A/D-in work area 332 * 333 */ 334 typedef struct __adin__ { 335 /* functions */ 336 /// Pointer to function for device initialization (call once on startup) 337 boolean (*ad_standby)(int, void *); 338 /// Pointer to function to open audio stream for capturing 339 boolean (*ad_begin)(); 340 /// Pointer to function to close audio stream capturing 341 boolean (*ad_end)(); 342 /// Pointer to function to begin / restart recording 343 boolean (*ad_resume)(); 344 /// Pointer to function to pause recording 345 boolean (*ad_pause)(); 346 /// Pointer to function to terminate current recording immediately 347 boolean (*ad_terminate)(); 348 /// Pointer to function to read samples 349 int (*ad_read)(SP16 *, int); 350 351 /* configuration parameters */ 352 int thres; ///< Input Level threshold (0-32767) 353 int noise_zerocross; ///< Computed threshold of zerocross num in the cycle buffer 354 int nc_max; ///< Computed number of fragments for tail margin 355 boolean adin_cut_on; ///< TRUE if do input segmentation by silence 356 boolean silence_cut_default; ///< Device-dependent default value of adin_cut_on() 357 boolean strip_flag; ///< TRUE if skip invalid zero samples 358 boolean enable_thread; ///< TRUE if input device needs threading 359 boolean need_zmean; ///< TRUE if perform zmeansource 360 361 /* work area */ 362 int c_length; ///< Computed length of cycle buffer for zero-cross, actually equals to head margin length 363 int c_offset; ///< Static data DC offset (obsolute, should be 0) 364 SP16 *swapbuf; ///< Buffer for re-triggering in tail margin 365 int sbsize; ///< Size of @a 
swapbuf 366 int sblen; ///< Current length of @a swapbuf 367 int rest_tail; ///< Samples not processed yet in swap buffer 368 369 ZEROCROSS zc; ///< Work area for zero-cross computation 370 371 #ifdef HAVE_PTHREAD 372 /* Variables related to POSIX threading */ 373 pthread_t adin_thread; ///< Thread information 374 pthread_mutex_t mutex; ///< Lock primitive 375 SP16 *speech; ///< Unprocessed samples recorded by A/D-in thread 376 int speechlen; ///< Current length of @a speech 377 /* 378 * Semaphore to start/stop recognition. 379 * 380 * If TRUE, A/D-in thread will store incoming samples to @a speech and 381 * main thread will detect and process them. 382 * If FALSE, A/D-in thread will still get input and check trigger as the same 383 * as TRUE case, but does not store them to @a speech. 384 * 385 */ 386 boolean transfer_online; 387 /** 388 * TRUE if buffer overflow occured in adin thread. 389 * 390 */ 391 boolean adinthread_buffer_overflowed; 392 /** 393 * TRUE if adin thread ended 394 * 395 */ 396 boolean adinthread_ended; 397 398 boolean ignore_speech_while_recog; ///< TRUE if ignore speech input between call, while waiting recognition process 399 400 #endif 401 402 /* Input data buffer */ 403 SP16 *buffer; ///< Temporary buffer to hold input samples 404 int bpmax; ///< Maximum length of @a buffer 405 int bp; ///< Current point to store the next data 406 int current_len; ///< Current length of stored samples 407 SP16 *cbuf; ///< Buffer for flushing cycle buffer just after detecting trigger 408 boolean down_sample; ///< TRUE if perform down sampling from 48kHz to 16kHz 409 SP16 *buffer48; ///< Another temporary buffer to hold 48kHz inputs 410 int io_rate; ///< frequency rate (should be 3 always for 48/16 conversion 411 412 boolean is_valid_data; ///< TRUE if we are now triggered 413 int nc; ///< count of current tail silence segments 414 boolean end_of_stream; ///< TRUE if we have reached the end of stream 415 boolean need_init; ///< if TRUE, initialize buffer on 
startup 416 417 DS_BUFFER *ds; ///< Filter buffer for 48-to-16 conversion 418 419 boolean rehash; ///< TRUE is want rehash at rewinding on decoder-based VAD 420 421 boolean input_side_segment; ///< TRUE if segmentation requested by ad_read 422 423 unsigned int total_captured_len; 424 unsigned int last_trigger_sample; 425 426 } ADIn; 427 428 /** 429 * Recognition result output structure. You may want to use with model data 430 * to get fully detailed results. 431 * 432 */ 433 typedef struct __Output__ { 434 /** 435 * 1: recognition in progress 436 * 0: recognition succeeded (at least one candidate has been found) 437 * -1: search failed, no candidate has been found 438 * -2: input rejected by short input 439 * -3: input rejected by GMM 440 * 441 */ 442 int status; 443 444 int num_frame; ///< Number of frames of the recognized part 445 int length_msec; ///< Length of the recognized part 446 447 Sentence *sent; ///< List of (N-best) recognition result sentences 448 int sentnum; ///< Number of sentences 449 450 WordGraph *wg1; ///< List of word graph generated on 1st pass 451 int wg1_num; ///< Num of words in the wg1 452 453 WordGraph *wg; ///< List of word graph 454 455 CN_CLUSTER *confnet; ///< List of confusion network clusters 456 457 Sentence pass1; ///< Recognition result on the 1st pass 458 459 } Output; 460 461 462 /**********************************************************************/ 463 /**********************************************************************/ 464 /**********************************************************************/ 465 466 /** 467 * instance for a parameter vector computation 468 * 469 */ 470 typedef struct __mfcc_calc__ { 471 472 /** 473 * Unique id 474 * 475 */ 476 short id; 477 478 /** 479 * Parameter setting (entity in JCONF_AM) 480 * 481 */ 482 Value *para; 483 484 /** 485 * TRUE if the para came from "-htkconf" 486 * 487 */ 488 boolean htk_loaded; 489 /** 490 * TRUE if the para came from binhmm embedded header 491 * 492 */ 493 
boolean hmm_loaded; 494 495 /** 496 * Check input parameter type with header of the hmmdefs 497 * (-notypecheck to unset) 498 */ 499 boolean paramtype_check_flag; 500 501 /** 502 * Parameter extraction work area 503 * 504 */ 505 MFCCWork *wrk; 506 507 /** 508 * Parameter vector sequence to be recognized 509 * 510 */ 511 HTK_Param *param; 512 513 /** 514 * Rest parameter for next segment for short-pause segmentation 515 */ 516 HTK_Param *rest_param; 517 518 /** 519 * Work area and setting for cepstral mean normalization 520 * 521 */ 522 struct { 523 /** 524 * CMN: load initial cepstral mean from file at startup (-cmnload) 525 */ 526 char *load_filename; 527 /** 528 * CMN: update cepstral mean while recognition 529 * (-cmnnoupdate to unset) 530 */ 531 boolean update; 532 /** 533 * CMN: save cepstral mean to file at end of every recognition (-cmnsave) 534 */ 535 char *save_filename; 536 /** 537 * CMN: MAP weight for initial cepstral mean on (-cmnmapweight) 538 */ 539 float map_weight; 540 541 /** 542 * TRUE if CMN parameter loaded from file at boot up 543 */ 544 boolean loaded; 545 546 /** 547 * realtime CMN work area 548 * 549 */ 550 CMNWork *wrk; 551 552 } cmn; 553 554 /** 555 * Work area for front-end processing 556 * 557 */ 558 struct { 559 /** 560 * Estimated noise spectrum 561 */ 562 float *ssbuf; 563 564 /** 565 * Length of @a ssbuf 566 */ 567 int sslen; 568 569 /** 570 * Alpha coefficient for spectral subtraction 571 * 572 */ 573 float ss_alpha; 574 575 /** 576 * Flooring coefficient for spectral subtraction 577 * 578 */ 579 float ss_floor; 580 581 /** 582 * SS: compute noise spectrum from head silence on file input (-sscalc) 583 */ 584 boolean sscalc; 585 586 /** 587 * With "-sscalc", specify noise length at input head in msec (-sscalclen) 588 */ 589 int sscalc_len; 590 591 /** 592 * Load noise spectrum data from file (-ssload), that was made by "mkss". 
593 */ 594 char *ssload_filename; 595 596 /** 597 * Parameter extraction work area for spectral subtraction 598 * 599 */ 600 MFCCWork *mfccwrk_ss; 601 602 } frontend; 603 604 /** 605 * work area for energy normalization on real time processing 606 * 607 */ 608 ENERGYWork ewrk; 609 610 /** 611 * delta MFCC cycle buffer 612 * 613 */ 614 DeltaBuf *db; 615 /** 616 * accel MFCC cycle buffer 617 * 618 */ 619 DeltaBuf *ab; 620 /** 621 * working buffer holding current computing mfcc vector 622 * 623 */ 624 VECT *tmpmfcc; 625 626 /** 627 * FALSE indicates that the current frame (f) is not valid and should 628 * not be used for recognition 629 * 630 */ 631 boolean valid; 632 633 /** 634 * Current frame 635 * 636 */ 637 int f; 638 639 /** 640 * Processed frame length when segmented 641 * 642 */ 643 int last_time; 644 645 /** 646 * Re-start frame if segmenetd 647 * 648 */ 649 int sparea_start; 650 651 /** 652 * TRUE if a parent instance has decided segmented 653 * 654 */ 655 boolean segmented; 656 657 /** 658 * TRUE if an input functionhas decided segmented 659 * 660 */ 661 boolean segmented_by_input; 662 663 /** 664 * id of an plugin module if MFCC should be obtained via plugin 665 * 666 */ 667 int plugin_source; 668 669 /** 670 * Function entry points for plugin input 671 * 672 */ 673 struct { 674 /// Pointer to function for device initialization (call once on startup) 675 boolean (*fv_standby)(); 676 /// Pointer to function to open audio stream for capturing 677 boolean (*fv_begin)(); 678 /// Pointer to function to read samples 679 int (*fv_read)(VECT *, int); 680 /// Pointer to function to close audio stream capturing 681 boolean (*fv_end)(); 682 /// Pointer to function to begin / restart recording 683 boolean (*fv_resume)(); 684 /// Pointer to function to pause recording 685 boolean (*fv_pause)(); 686 /// Pointer to function to terminate current recording immediately 687 boolean (*fv_terminate)(); 688 } func; 689 690 #ifdef POWER_REJECT 691 float avg_power; 692 #endif 693 
694 /** 695 * pointer to next 696 * 697 */ 698 struct __mfcc_calc__ *next; 699 700 } MFCCCalc; 701 702 /** 703 * instance for an AM. 704 * 705 */ 706 typedef struct __process_am__ { 707 708 /** 709 * Configuration parameters 710 * 711 */ 712 JCONF_AM *config; 713 714 /** 715 * Corresponding input parameter vector instance 716 * 717 */ 718 MFCCCalc *mfcc; 719 720 /** 721 * Main phoneme HMM 722 */ 723 HTK_HMM_INFO *hmminfo; 724 725 /** 726 * HMM for Gaussian Selection 727 */ 728 HTK_HMM_INFO *hmm_gs; 729 730 /** 731 * Work area and outprob cache for HMM output probability computation 732 */ 733 HMMWork hmmwrk; 734 735 /** 736 * pointer to next 737 * 738 */ 739 struct __process_am__ *next; 740 741 } PROCESS_AM; 742 743 /** 744 * instance for a LM. 745 * 746 */ 747 typedef struct __process_lm__ { 748 749 /** 750 * Configuration parameters 751 * 752 */ 753 JCONF_LM *config; 754 755 /** 756 * Corresponding AM 757 * 758 */ 759 PROCESS_AM *am; 760 761 762 /** 763 * the LM type of this Model holder: will be set from Jconf used for loading 764 * 765 */ 766 int lmtype; 767 768 /** 769 * the LM variation type of this Model holder: will be set from 770 * Jconf used for loading 771 * 772 */ 773 int lmvar; 774 775 /** 776 * Main Word dictionary for all LM types 777 */ 778 WORD_INFO *winfo; 779 780 /** 781 * Main N-gram language model (do not use with grammars) 782 */ 783 NGRAM_INFO *ngram; 784 785 /** 786 * List of all loaded grammars (do not use with ngram) 787 */ 788 MULTIGRAM *grammars; 789 790 /** 791 * Current maximum value of assigned grammar ID. 792 * A new grammar ID will be assigned to each new grammar. 793 * 794 */ 795 int gram_maxid; 796 797 /** 798 * Global DFA for recognition. This will be generated from @a grammars, 799 * concatinating each DFA into one. 
800 */ 801 DFA_INFO *dfa; 802 803 /** 804 * TRUE if modified in multigram_update() 805 * 806 */ 807 boolean global_modified; 808 809 /** 810 * LM User function entry point 811 * 812 */ 813 LMFunc lmfunc; 814 815 /** 816 * pointer to next 817 * 818 */ 819 struct __process_lm__ *next; 820 821 } PROCESS_LM; 822 823 /** 824 * instance for a decoding, i.e. set of LM, AM and parameters 825 * 826 */ 827 typedef struct __recogprocess__ { 828 829 /** 830 * TRUE is this instance is alive, or FALSE when temporary disabled. 831 * 832 */ 833 boolean live; 834 835 /** 836 * 1 if this instance should be made alive in the next recognition, 837 * -1 if should become dead in the next recognition, 838 * or 0 to leave unchanged. 839 * 840 */ 841 short active; 842 843 /** 844 * search configuration data 845 * 846 */ 847 JCONF_SEARCH *config; 848 849 /** 850 * acoustic model instance to use 851 * 852 */ 853 PROCESS_AM *am; 854 855 /** 856 * language model instance to use 857 * 858 */ 859 PROCESS_LM *lm; 860 861 /** 862 * Language model type: one of LM_UNDEF, LM_NGRAM, LM_DFA 863 * 864 */ 865 int lmtype; 866 867 /** 868 * Variation type of language model: one of LM_NGRAM, LM_DFA_GRAMMAR, 869 * LM_DFA_WORD 870 * 871 */ 872 int lmvar; 873 874 /** 875 * Whether handle phone context dependency (local copy from jconf) 876 */ 877 boolean ccd_flag; 878 879 /** 880 * Word-conjunction HMM as tree lexicon 881 */ 882 WCHMM_INFO *wchmm; 883 884 /** 885 * Actual beam width of 1st pass (will be set on startup) 886 */ 887 int trellis_beam_width; 888 889 /** 890 * Word trellis index generated at the 1st pass 891 */ 892 BACKTRELLIS *backtrellis; 893 894 /** 895 * Work area for the first pass 896 */ 897 FSBeam pass1; 898 899 /** 900 * Work area for second pass 901 * 902 */ 903 StackDecode pass2; 904 905 /** 906 * Word sequence of best hypothesis on 1st pass 907 */ 908 WORD_ID pass1_wseq[MAXSEQNUM]; 909 910 /** 911 * Number of words in @a pass1_wseq 912 */ 913 int pass1_wnum; 914 915 /** 916 * Score of @a 
pass1_wseq 917 */ 918 LOGPROB pass1_score; 919 920 /** 921 * Last maximum word hypothesis on the begin point for short-pause segmentation 922 */ 923 WORD_ID sp_break_last_word; 924 /** 925 * Last (not transparent) context word for LM for short-pause segmentation 926 */ 927 WORD_ID sp_break_last_nword; 928 /** 929 * Allow override of last context word from result of 2nd pass for short-pause segmentation 930 */ 931 boolean sp_break_last_nword_allow_override; 932 /** 933 * Search start word on 2nd pass for short-pause segmentation 934 */ 935 WORD_ID sp_break_2_begin_word; 936 /** 937 * Search end word on 2nd pass for short-pause segmentation 938 */ 939 WORD_ID sp_break_2_end_word; 940 941 /** 942 * Input length in frames 943 */ 944 int peseqlen; 945 946 /** 947 * GraphOut: total number of words in the generated graph 948 */ 949 int graph_totalwordnum; 950 951 /** 952 * Recognition results 953 * 954 */ 955 Output result; 956 957 /** 958 * graphout: will be set from value from jconf->graph.enabled 959 * 960 */ 961 boolean graphout; 962 963 /** 964 * Temporal matrix work area to hold the order relations between words 965 * for confusion network construction. 966 * 967 */ 968 char *order_matrix; 969 970 /** 971 * Number of words to be expressed in the order matrix for confusion network 972 * construction. 973 * 974 */ 975 int order_matrix_count; 976 977 #ifdef DETERMINE 978 int determine_count; 979 LOGPROB determine_maxnodescore; 980 boolean determined; 981 LOGPROB determine_last_wid; 982 boolean have_determine; 983 #endif 984 985 /** 986 * TRUE if has something to output at CALLBACK_RESULT_PASS1_INTERIM. 987 * 988 */ 989 boolean have_interim; 990 991 /** 992 * User-defined data hook. JuliusLib does not concern about its content. 
993 * 994 */ 995 void *hook; 996 997 /** 998 * Pointer to next instance 999 * 1000 */ 1001 struct __recogprocess__ *next; 1002 1003 } RecogProcess; 1004 1005 /** 1006 * Top level instance for the whole recognition process 1007 * 1008 */ 1009 typedef struct __Recog__ { 1010 1011 /*******************************************/ 1012 /** 1013 * User-specified configuration parameters 1014 * 1015 */ 1016 Jconf *jconf; 1017 1018 /*******************************************/ 1019 /** 1020 * A/D-in buffers 1021 * 1022 */ 1023 ADIn *adin; 1024 1025 /** 1026 * Work area for the realtime processing of first pass 1027 */ 1028 RealBeam real; 1029 1030 /** 1031 * Linked list of MFCC calculation/reading instances 1032 * 1033 */ 1034 MFCCCalc *mfcclist; 1035 1036 /** 1037 * Linked list of acoustic model instances 1038 * 1039 */ 1040 PROCESS_AM *amlist; 1041 1042 /** 1043 * Linked list of language model instances 1044 * 1045 */ 1046 PROCESS_LM *lmlist; 1047 1048 /** 1049 * Linked list of recognition process instances 1050 * 1051 */ 1052 RecogProcess *process_list; 1053 1054 1055 /** 1056 * TRUE when engine is processing a segment (for short-pause segmentation) 1057 * 1058 */ 1059 boolean process_segment; 1060 1061 /*******************************************/ 1062 /* inputs */ 1063 1064 /** 1065 * Input speech data 1066 */ 1067 SP16 *speech; 1068 1069 /** 1070 * Allocate length of speech 1071 * 1072 */ 1073 int speechalloclen; 1074 1075 /** 1076 * Input length in samples 1077 */ 1078 int speechlen; 1079 1080 /** 1081 * Input length in frames 1082 */ 1083 int peseqlen; 1084 1085 /*******************************************/ 1086 1087 /** 1088 * GMM definitions 1089 * 1090 */ 1091 HTK_HMM_INFO *gmm; 1092 1093 /** 1094 * Pointer to MFCC instance for GMM 1095 * 1096 */ 1097 MFCCCalc *gmmmfcc; 1098 1099 /** 1100 * Work area for GMM calculation 1101 * 1102 */ 1103 GMMCalc *gc; 1104 1105 /*******************************************/ 1106 /* misc. 
*/ 1107 1108 /** 1109 * Status flag indicating whether the recognition is alive or not. If 1110 * TRUE, the process is currently activated, either monitoring an 1111 * audio input or recognizing the current input. If FALSE, the recognition 1112 * is now disabled until some activation command has been arrived from 1113 * client. While disabled, all the inputs are ignored. 1114 * 1115 * If set to FALSE in the program, Julius/Julian will stop after 1116 * the current recognition ends, and enter the disabled status. 1117 * 1118 */ 1119 boolean process_active; 1120 1121 /** 1122 * If set to TRUE, Julius/Julian stops recognition immediately, terminating 1123 * the currenct recognition process, and enter into disabled status. 1124 * 1125 */ 1126 boolean process_want_terminate; 1127 1128 /** 1129 * If set to TRUE, Julius/Julian stops recognition softly. If it is 1130 * performing recognition of the 1st pass, it immediately segments the 1131 * current input, process the 2nd pass, and output the result. Then it 1132 * enters the disabled status. 1133 * 1134 */ 1135 boolean process_want_reload; 1136 1137 /** 1138 * When to refresh the global lexicon if received while recognition for 1139 * DFA 1140 * 1141 */ 1142 short gram_switch_input_method; 1143 1144 /** 1145 * TRUE if audio stream is now open and engine is either listening 1146 * audio stream or recognizing a speech. FALSE on startup or when 1147 * in pause specified by a module command. 1148 * 1149 */ 1150 boolean process_online; 1151 1152 /** 1153 * Function pointer to parameter vector computation for realtime 1st pass. 1154 * default: RealTimeMFCC() in realtime-1stpass.c 1155 * 1156 */ 1157 boolean (*calc_vector)(MFCCCalc *, SP16 *, int); 1158 1159 /** 1160 * TRUE when recognition triggered and some recognition started, 1161 * FALSE if engine terminated with no input. 
1162 * 1163 */ 1164 boolean triggered; 1165 1166 /** 1167 * Callback entry point 1168 * 1169 */ 1170 void (*callback_function[SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK])(); 1171 /** 1172 * Callback user data 1173 * 1174 */ 1175 void *callback_user_data[SIZEOF_CALLBACK_ID][MAX_CALLBACK_HOOK]; 1176 /** 1177 * Numbers of callbacks registered 1178 * 1179 */ 1180 int callback_function_num[SIZEOF_CALLBACK_ID]; 1181 /** 1182 * Callback function code list 1183 * 1184 */ 1185 int callback_list_code[MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID]; 1186 /** 1187 * Callback function location list 1188 * 1189 */ 1190 int callback_list_loc[MAX_CALLBACK_HOOK*SIZEOF_CALLBACK_ID]; 1191 /** 1192 * Number of callbacks 1193 * 1194 */ 1195 int callback_num; 1196 1197 /*******************************************/ 1198 1199 /** 1200 * User-defined data hook. JuliusLib does not concern about its content. 1201 * 1202 */ 1203 void *hook; 1204 1205 } Recog; 1206 1207 #endif /* __J_RECOG_H__ */ 1208