1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 /* ==================================================================== 3 * Copyright (c) 1999-2004 Carnegie Mellon University. All rights 4 * reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 18 * This work was supported in part by funding from the Defense Advanced 19 * Research Projects Agency and the National Science Foundation of the 20 * United States of America, and the CMU Sphinx Speech Consortium. 21 * 22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * 34 * ==================================================================== 35 * 36 */ 37 38 /* srch.h 39 * HISTORY 40 * $Log$ 41 * Revision 1.1 2006/04/05 20:27:30 dhdfu 42 * A Great Reorganzation of header files and executables 43 * 44 * Revision 1.2 2006/02/23 15:26:10 arthchan2003 45 * Merged from SPHINX3_5_2_RCI_IRII: 46 * 47 * Summary of changes. Detail could be seen in the comments from the 48 * branches. 49 * 50 * After 6 months, we have two more searches using interface 51 * provided by srch.c. That included an adapted version of Sphinx 2's FSG 52 * search. Also, the original version of flat-lexicon decoding search. 53 * 54 * Second stage search operation is still not properly put in the srch_t 55 * structure. We should create function hooks that allow developer to 56 * put the code more properly than now. 57 * 58 * The interface of srch.c is still not very completed. Things we should 59 * support include switching of AM and MLLR. They are currently 60 * commented. 61 * 62 * Mode 5, the word-dependent tree copies are now fended off from the 63 * users. 64 * 65 * Mode 2, the FSG search are opened. It is not very well tested so the 66 * user will be warned about its nature. 67 * 68 * Revision 1.1.4.15 2006/01/16 20:01:20 arthchan2003 69 * Added Commented code in srch.[ch] for second-stage rescoring. Not used for now. 70 * 71 * Revision 1.1.4.14 2005/11/17 06:36:36 arthchan2003 72 * There are several important changes. 1, acoustic score scale has changed back to put it the search structure. This fixed a bug introduced pre-2005 code branching where only the scaling factor of the last frame. 2, Added a fmt argument of matchseg_write , implemented segmentation output for s2 and ctm file format. matchseg_write also now shared across the flat and tree decoder now. 3, Added Rong's read_seg_hyp_line. 73 * 74 * Revision 1.1.4.13 2005/09/25 19:23:55 arthchan2003 75 * 1, Added arguments for turning on/off LTS rules. 2, Added arguments for turning on/off composite triphones. 3, Moved dict2pid deallocation back to dict2pid. 4, Tidying up the clean up code. 76 * 77 * Revision 1.1.4.12 2005/09/18 01:44:12 arthchan2003 78 * Very boldly, started to support flat lexicon decoding (mode 3) in srch.c. Add log_hypseg. Mode 3 is implemented as srch-one-frame implementation. Scaling doesn't work at this point. 79 * 80 * Revision 1.1.4.11 2005/09/11 23:07:28 arthchan2003 81 * srch.c now support lattice rescoring by rereading the generated lattice in a file. When it is operated, silence cannot be unlinked from the dictionary. This is a hack and its reflected in the code of dag, kbcore and srch. code 82 * 83 * Revision 1.1.4.10 2005/08/02 21:37:28 arthchan2003 84 * 1, Used s3_cd_gmm_compute_sen instead of approx_cd_gmm_compute_sen in mode 2, 4 and 5. This will suppose to make s3.0 to be able to read SCHMM and use them as well. 2, Change srch_gmm_compute_lv2 to accept a two-dimensional array (no_stream*no_coeff) instead of a one dimensional array (no_coeff). 85 * 86 * Revision 1.1.4.9 2005/07/24 19:35:59 arthchan2003 87 * Added GAUDEN_EVAL_WINDOW in srch.h. Assuming this is property of a search. 88 * 89 * Revision 1.1.4.8 2005/07/24 01:39:26 arthchan2003 90 * Added srch_on_srch_frame_lv[12] in the search abstraction routine. This will allow implementation just provide the search for one frame without supplying all function pointer in the standard abstraction. 91 * 92 * Revision 1.1.4.7 2005/07/22 03:41:05 arthchan2003 93 * 1, (Incomplete) Add function pointers for flat foward search. Notice implementation is not yet filled in. 2, adding log_hypstr and log_hyp_detailed. It is sphinx 3.0 version of matchwrite. Add it to possible code merge. 94 * 95 * Revision 1.1.4.6 2005/07/17 05:54:55 arthchan2003 96 * replace vithist_dag_write_header with dag_write_header 97 * 98 * Revision 1.1.4.5 2005/07/13 18:46:39 arthchan2003 99 * Re-included srch_fsg.h 100 * 101 * Revision 1.1.4.4 2005/07/07 02:37:39 arthchan2003 102 * 1, Changed names of srchmode* functions to srch_mode*, 2, complete srch_mode_index_to_str, 3, Remove srch_rescoring and ask implementation to call these "rescoring functions" themselves. The reason is rescoring is not as universal as I would think in the general search. I think search implementer should be the one who decide whether rescoring is one part of their search algorithms 103 * 104 * Revision 1.1.4.3 2005/07/04 07:18:49 arthchan2003 105 * Disabled support of FSG. Added comments for srch_utt_begin and srch_utt_end. 106 * 107 * Revision 1.1.4.2 2005/07/03 23:04:55 arthchan2003 108 * 1, Added srchmode_str_to_index, 2, called the deallocation routine of the search implementation layer in srch_uninit 109 * 110 * Revision 1.1.4.1 2005/06/28 07:03:01 arthchan2003 111 * Added read_fsg operation as one method. Currently, it is still not clear how it should iteract with lm 112 * 113 * Revision 1.1 2005/06/22 02:24:42 arthchan2003 114 * Log. A search interface implementation are checked in. I will call 115 * srch_t to be search abstraction or search mechanism from now on. The 116 * major reason of separating with the search implementation routine 117 * (srch_*.[ch]) is that search is something that people could come up 118 * with thousands of ways to implement. 119 * 120 * Such a design shows a certain sense of defiance of conventional ways 121 * of designing speech recognition. Namely, **always** using generic 122 * graph as the grandfather ancester of every search lattice. This could 123 * 1) break a lot of legacy optimization code. 2) could be slow depends 124 * on the implementation. 125 * 126 * The current design only specify the operations that are supposed to be 127 * generic in every search (or atomic search operations (ASOs)). 128 * Ideally, users only need to implement the interface to make the code 129 * work for another search. 130 * 131 * From this point of view, the current check-in still have some 132 * fundamental flaws. For example, the communication mechanism between 133 * different atomic search operations are not clearly defined. Scores are 134 * now computed and put into structures of ascr. (ascr has no clear 135 * interface to outside world). This is something we need to improve. 136 * 137 * Revision 1.18 2005/06/16 04:59:10 archan 138 * Sphinx3 to s3.generic, a gentle-refactored version of Dave's change in senone scale. 139 * 140 * Revision 1.17 2005/06/10 03:40:57 archan 141 * 1, Fixed doxygen documentation of srch.h, 2, eliminate srch.h C-style functions. 3, Start to fend off the users for using mode 5. We are ready to merge the code. 142 * 143 * Revision 1.16 2005/06/10 03:01:50 archan 144 * Fixed file_open. 145 * 146 * Revision 1.15 2005/06/09 21:03:33 archan 147 * Update srch.h and srch_debug.c such that include files doesn't depend on explicitly specified directory name. Rather it would be taken care by -I option in Makefile.am 148 * 149 * Revision 1.14 2005/05/11 06:10:38 archan 150 * Code for lattice and back track pointer table dumping is now wrapped in reg_result_dump. The function is shared across mode 4 and mode 5. Possibly later for mode 3 and mode 6 as well. 151 * 152 * Revision 1.13 2005/05/11 00:18:45 archan 153 * Add comments on srch.h and srch_time_switch_tree.h and srch_debug.h on how things work. A very detail comment is added in srch.h to describe how generally srch_t is interacting with other parts of the code. 154 * 155 * Revision 1.12 2005/05/04 05:15:25 archan 156 * reverted the last change, seems to be not working because of compilation issue. Try not to deal with it now. 157 * 158 * Revision 1.1 2005/05/04 04:46:04 archan 159 * Move srch.c and srch.h to search. More and more this type of refactoring will be done in future 160 * 161 * Revision 1.10 2005/05/03 04:09:09 archan 162 * Implemented the heart of word copy search. For every ci-phone, every word end, a tree will be allocated to preserve its pathscore. This is different from 3.5 or below, only the best score for a particular ci-phone, regardless of the word-ends will be preserved at every frame. The graph propagation will not collect unused word tree at this point. srch_WST_propagate_wd_lv2 is also as the most stupid in the century. But well, after all, everything needs a start. I will then really get the results from the search and see how it looks. 163 * 164 * Revision 1.9 2005/04/25 19:22:47 archan 165 * Refactor out the code of rescoring from lexical tree. Potentially we want to turn off the rescoring if we need. 166 * 167 * Revision 1.8 2005/04/22 04:22:36 archan 168 * Add gmm_wrap, this will share code across op_mode 4 and op_mode 5. Also it also separate active senone selection into a different process. I hope this is the final step before making the WST search works. At the current stage, the code of mode-5 looks very much alike mode-4. This is intended because in Prototype 4, tail sharing will be used to reduce memory. 169 * 170 * Revision 1.7 2005/04/21 23:50:26 archan 171 * Some more refactoring on the how reporting of structures inside kbcore_t is done, it is now 50% nice. Also added class-based LM test case into test-decode.sh.in. At this moment, everything in search mode 5 is already done. It is time to test the idea whether the search can really be used. 172 * 173 * Revision 1.6 2005/04/20 03:42:55 archan 174 * srch.c now is the only of the master search driver. When there is any change in the **interaction** of different blocks, srch.c should be changed first. Then the search implenetation, such as srch_time_switch_tree.c 175 * 176 * Revision 1.5 2005/03/30 01:22:47 archan 177 * Fixed mistakes in last updates. Add 178 * 179 * 180 * 17-Mar-2005 A. Chan (archan@cs.cmu.edu) at Carnegie Mellon University 181 * 1 Started. This replaced utt.c starting from Sphinx 3.6. 182 */ 183 184 #include <stdio.h> 185 186 #include <s3types.h> 187 #include <glist.h> 188 #include "dag.h" 189 #include "lm.h" 190 #include "ascr.h" 191 #include "adaptor.h" 192 #include "stat.h" 193 #include "fast_algo_struct.h" 194 #include "kbcore.h" 195 #include "kb.h" 196 197 198 /* Mode 1 */ 199 #include "srch_allphone.h" 200 201 /* Mode 2 */ 202 #include "srch_fsg.h" 203 204 /* Mode 3 */ 205 #include "srch_flat_fwd.h" 206 207 /* Mode 4 */ 208 #include "srch_time_switch_tree.h" 209 210 /* Mode 5 */ 211 #include "srch_word_switch_tree.h" 212 213 /* Mode 1368*/ 214 #include "srch_do_nothing.h" 215 216 /* Mode 1369*/ 217 #include "srch_debug.h" 218 219 220 #include "srch_output.h" 221 222 #ifndef _SRCH_H_ 223 #define _SRCH_H_ 224 225 226 #ifdef __cplusplus 227 extern "C" { 228 #endif 229 #if 0 230 /* Fool Emacs. */ 231 } 232 #endif 233 234 #define SRCH_SUCCESS 0 235 #define SRCH_FAILURE 1 236 237 /** 238 The many operation modes of srch.c. 239 240 Mode 1-4 are working modes. Not all of them are functioning as at 241 but we have plans to migrate to that. 242 243 Just for the sake of it. mode 6-10 are assigned to 5 working 244 programmers of CMU Sphinx. We don't expect too much from them. So 245 let's see how it goes. 246 247 Mode 88 is also assigned but this is just for the sake of it. 248 */ 249 250 #define OPERATION_ALIGN 0 /**< (Reserved but not used) force 251 alignment mode. It is 252 questionable whether this belongs 253 in the decoder at all since it's 254 mostly used in training. 255 */ 256 #define OPERATION_ALLPHONE 1 /**< Phoneme recognition. 257 */ 258 #define OPERATION_GRAPH 2 /**< FSG mode, adapated from sphinx 2 259 specify -fsg in the current decode 260 interface should work. 261 */ 262 #define OPERATION_FLATFWD 3 /**< Flat-lexicon decoding mode 263 In CMU, it is well-used as a 264 research tool for acoustic and 265 language modeling research. Its 266 shell, decode_anytopo is still 267 one of the standard executables. 268 */ 269 #define OPERATION_TST_DECODE 4 /**< Tree-lexicon decoding with 270 time-switching tree copies 271 the current default. Or what I 272 called the "magic wheel" search. 273 It is a practical and useful 274 search techniques. 275 */ 276 #define OPERATION_WST_DECODE 5 /**< (Reserved but not used) 277 Tree-lexicon decoding with 278 word-switching tree copies. Work 279 for bigram with trigram 280 rescoring. Not well developed 281 enough to be deployed. 282 */ 283 #define OPERATION_EVANDRO_MODE 6 /**< (Reserved but not used) Dr. 284 Evandro Gouvea is a very defensive 285 programmer. He is very clever and 286 has strong mathematical 287 background. I would guess his 288 search code would be very solid. 289 */ 290 #define OPERATION_DAVID_MODE 7 /**< (Reserved but not used) It is 291 not an illusion that David 292 Huggins-Daines is productive. 293 The reason he hasn't touched 294 the search part yet is because 295 he hasn't read Richard Bellman's 296 "Dynamic Programming" and Arthur 297 Chan's C++ implementation of 298 arec. He will definitely show 299 his super-fast speed in search 300 implementation as well. 301 */ 302 #define OPERATION_ARTHUR_MODE 8 /**< (Reserved but not used) Arthur 303 Chan is the guy who think of this 304 search mode thing. He wrote mode 305 5. Just like plan 9. It is quite 306 a failure. Though, this guy is 307 not that easy-going, he must have 308 think of something this time. 309 */ 310 #define OPERATION_YITAO_MODE 9 /**< (Reserved but not used) Yitao Sun 311 works mainly on grammar, CFG, FSG 312 and stuffs. Likely his search will 313 also reflect his research direction. 314 */ 315 #define OPERATION_RAVI_MODE 10 /**< (Reserved but not used) Mosur 316 Ravishankar works out both s2 and 317 s3 He is also the original author 318 of mode 2, 3 and 4. (In practice, 319 Arthur Chan's mode 5 really 320 couldn't match mode 4 at all.) 321 He always has a lot of 322 interesting ideas in his mind. Likely 323 it might think of interesting search 324 similar to mode 4. 325 */ 326 #define OPERATION_STEVE_MODE 88 /**< (Reserved but not used) Steven 327 Lee is the guy who work out the 328 a-star search for the MIT 329 segmental recognizer. He yielded 330 to Arthur Chan and become one of 331 the developers of CMU Sphinx. 332 But his heart is still with MIT, 333 MIT SLS. Likely he will still 334 be happy that 88 is a very lucky 335 number in Chinese. 336 */ 337 #define OPERATION_DO_NOTHING 1368 /**< Do nothing mode. as it means 338 it does nothing. It is used as 339 one of the function test of the 340 search operator. 341 */ 342 343 #define OPERATION_DEBUG 1369 /**< Debug mode, it dumps the 344 internal function call in srch.c 345 layer. ARCHAN 20050329: 1369 has 346 no meaning at all, I just love to 347 call it in that way. 348 */ 349 350 #define GRAPH_STRUCT_FLAT 0 351 #define GRAPH_STRUCT_TST 1 352 #define GRAPH_STRUCT_WST 2 353 #define GRAPH_STRUCT_GENGRAPH 3 354 #define GRAPH_STRUCT_PHMM 4 355 356 #define GMM_STRUCT_CDHMM 0 357 #define GMM_STRUCT_SCHMM 1 358 359 360 #define GAUDEN_EVAL_WINDOW 8 /*Moving window length when frames are 361 considered as blocks, currently used in 362 3.0 family of tools. */ 363 364 #define DFLT_UTT_SIZE 5000 /**< The default number of frames of an utterance */ 365 #define DFLT_NUM_SEGS 200 /**< The default number of segments of an utterance */ 366 367 368 369 /* \struct grp_str_t 370 */ 371 typedef struct { 372 void *graph_struct; /**< The graph structure */ 373 int32 graph_type; /**< The graph type */ 374 }grp_str_t; 375 376 377 378 /** 379 \file srch.h 380 \brief search abstraction. 381 382 Written at 20050510 by ARCHAN. 383 384 Mechanism/implementation separation 385 386 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 387 388 Starting from Sphinx 3.6, the implementation of the search has 389 conceptually separated into two layers. The top layer srch.c 390 defines the top level atomic function defintion (implemented as 391 function pointers) that every search-like operations will share. 392 393 To understand what it means, consider two seemingly different 394 search-like operations, alignment and decoding. Both of them 395 required 1) initialization, 2) propropagte 1 frame. 3) termniation. 396 Each of these operation can well be defined as one single 397 interface. 398 399 Some people call this type of implementation as "C-class" which I 400 think is very appropiate. Because in general, C++, Objective-C and 401 D actually used the same mechanism to implement the concept 402 "class". Obviously, the actual implementation was hidden in the 403 compiler in those cases so that's why so many people are confused. 404 405 Why polymorphism is important? 406 407 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 408 409 Before sphinx 3.6, maintainers (that includes me) tend to think 410 that different operations would require different treatment. The 411 consequence of that is different opertations tends to have 412 different implementations, styles. Code tends to be duplicated if 413 programmers think in this way. One big problem is the fact that we 414 had so called s3 slow and s3 fast and there were 3000 lines 415 duplication. 416 417 Code duplication also tends compound. i.e. if code was duplicated 418 twice, it will be duplicated four times. That reduces 419 productivitiy of the team. 420 421 Other reasons why srch.c is written 422 423 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 424 425 At the time when we implemented this routine, we actually didn't 426 know whether mode 5 (word tree copies) will work or not. 427 Therefore, letting the code of mode 4 (lucky wheel search) and mode 428 5 to coexist will be the best. 429 430 431 "Embedding" of multi-stage search into the dynamic programming 432 433 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 434 435 Another interesting part (Interesting in the sense that I would 436 write a paper on this.) of the search is that we (I will say mainly 437 me because this is not new at all.) realize that approximiate 438 search (i.e. something like phoneme-lookahead), detail search 439 (i.e. something like hard-core tree and fsm search) and high level 440 rescoring using knowledge source (e.g. using LM to rescore N-best) 441 could all be incorporated into the first pass search and 442 potentially make the code more verstille. 443 444 They key concept on how to do this is here: at one frame, one can 445 carry out the above three operations and make the search faster 446 because of 1) heuristics value obtained in the approximate search 447 2) early incorporation of high-level knowledge in the rescoring stage. 448 449 The naming of function interfaces shows this tendency. lv1 means the 450 approximate first-stage search. lv2 means the detail second-stage 451 search. rescoring means the use of high-level information in 452 rescoring during the search. 453 454 The scope of what srch.c defines 455 456 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 457 458 Written at 20050510 459 460 At the time I wrote this, we only implement mode 4 (Ravi's 461 implementation in Sphinx 3.2, I usually call it "lucky wheel" 462 implementation. (See my description in srch_time_switch_tree.h), 463 mode 5 (My implementation using tree copies.) and mode 1369 (A 464 debug mode of the search mechanism where only a prompt will be 465 displayed and show that a function is called. Why 1369? coz I am a 466 nerd. :-)) 467 468 Though only three modes of the search were written. I found it to 469 be utmost important to share my imagination of what we can actually 470 do using this interface of the search. In theory, the slow search 471 (pre-defined as mode 3) and the fast searches (pre-defined as mode 472 4 and mode 5) can be incorporated using this interface. s3align and 473 s3allphone (pre-defined as mode 0 and 1) in s3.0 family can also be 474 implemented in this way. 475 476 Applications such as keyword spotting and speaker verifcation. They 477 are not defined at this point but they are possible using the 478 architecture 479 480 What limits this architecture is that it might not be able to 481 implement a wide class of segmental models. I will speculate if a 482 recursive formulat can be derived for a particular type of model 483 (segmental or non-segmental, HMM or non-HMM). The current 484 interface could be overloaded and allow implementation of them. 485 There are definitely some types of segmental model are very hard to 486 implement using this structure; e.g. MIT segmental model (pre-1999) 487 where segmentation was generated by low-level information first and 488 search were performed on segments. 489 490 491 A general guide-line on how to write a search implementation 492 493 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 494 495 1, The batch mode search operation and the on-line mode operation 496 can always share code. If the implementation is duplicated, it 497 shows a bad omen. 498 499 2, Knowledge source (such as LM and finite state grammer) always 500 change the graph structure. That is why srch_{set,add,delete}_lm 501 was defined, if we have multiple types of knowledge 502 sources. Interfaces should be added in srch.c and the search 503 operation (e.g. search/srch_time_switch_tree.c) provides the actualy 504 implemenation 505 506 3, GMM computation and search can be easily separated into two 507 routines. Therefore, instead of wrap it up in individual searches, 508 a new set of function called gmmwrap.c is used which conform the 509 interface in srch.c. 510 511 4, Mechanism such as phoneme lookahead and generally fast match are 512 all similar in the sense that. They look forward a couple of frames 513 and compute a heuristic score. The windows controlling mechanism 514 should be coded in srch.c . The actual heuristic should be coded 515 somewhere else. 516 517 5, grh contains the actual data structure that stores the graph. We 518 didn't use WFST as a common structure because we found that it has 519 certain fact we don't know at the time we implemented. 520 521 6, Always get back to the debug mode (mode 1369) if you want to 522 debug the search mechanism 523 524 7, (V. Imp.) When in doubt, ask Arthur Chan. If he is not in CMU 525 any more, brainstorm how to get him back. He is a nice guy and has 526 no problem in talking. 527 528 */ 529 typedef struct srch_funcs_s { 530 /* 531 Function pointers that perform the operations. Every mode will 532 set these pointers at the beginning of the search. 533 */ 534 535 /** Initialization of the search, coz the graph type can be different */ 536 int (*init)(kb_t *kb, /**< Pointer of kb_t which srch_init wants to copy from */ 537 void* srch_struct /**< a pointer of srch_t */ 538 ); 539 540 /**< Un-Initialize of the search. */ 541 int (*uninit)( 542 void* srch_struct /**< a pointer of srch_t */ 543 ); 544 /**< Begin search for one utterance */ 545 int (*utt_begin)( 546 void* srch_struct /**< a pointer of srch_t */ 547 ); 548 549 /** End search for one utterance */ 550 int (*utt_end)( 551 void* srch_struct /**< a pointer of srch_t */ 552 ); 553 /** Actual decoding operation */ 554 int (*decode)( 555 void* srch_struct /**< a pointer of srch_t */ 556 ); 557 558 /** Set LM operation. */ 559 int (*set_lm)( 560 void* srch_struct, /**< a pointer of srch_t */ 561 const char *lmname /**< The LM name */ 562 ); 563 564 /** Add LM operation */ 565 int (*add_lm)(void* srch_struct, /**< a pointer of srch_t */ 566 lm_t* lm, /**< A new lm */ 567 const char *lmname /**< The LM name */ 568 ); 569 570 /** Delete LM operation */ 571 int (*delete_lm)(void* srch_struct, /**< a pointer of srch_t */ 572 const char *lmname /**< The LM name */ 573 ); 574 575 /** Read FSG operation*/ 576 #if 0 577 word_fsg_t* (*read_fsgfile)(void* srch_struct, /**< a pointer of srch_t */ 578 const char* fsgname /** The fsg file name*/ 579 580 ); 581 #endif 582 /* The 4 operations that require switching during the approximate search process */ 583 /**< lv1 stands for approximate search. Currently not used. */ 584 585 /** Compute Approximate GMM */ 586 int (*gmm_compute_lv1)(void* srch_struct, /**< a pointer of srch_t */ 587 float32 *feat, /**< The feature vector */ 588 int32 frmno_lp1, /**< The frame for the cache */ 589 int32 frmno_lp2 /**< The frame for the windows */ 590 ); 591 592 593 /* The level 1 search functions are not yet fully used. Not all of them are defined nowWhen fast 594 match is needed. We will need them more. 595 */ 596 int (*one_srch_frame_lv1)(void* srch_struct /**< a pointer of srch_t */ 597 ); 598 599 int (*hmm_compute_lv1)(void* srch_struct); 600 int (*eval_beams_lv1)(void* srch_struct); 601 int (*propagate_graph_ph_lv1)(void* srch_struct); 602 int (*propagate_graph_wd_lv1)(void* srch_struct); 603 604 /* The 4 operations that require switching during the detail search process */ 605 /** lv2 stands for detail search. */ 606 607 608 /** Compute detail (CD) GMM scores or lv2*/ 609 int (*gmm_compute_lv2)(void* srch_struct, /**< a pointer of srch_t */ 610 float32 **feat, /**< A feature vector */ 611 int32 time /**< The frame we want to compute detail score */ 612 ); 613 614 615 /** A short-cut function that allows implementer could just 616 implement searching for one frame without implement the 617 following 4 fuctions. 618 */ 619 int (*one_srch_frame_lv2)(void* srch_struct /**< a pointer of srch_t */ 620 ); 621 622 623 /** Compute detail (CD) HMM scores or lv2*/ 624 int (*hmm_compute_lv2)(void* srch_struct, /**< a pointer of srch_t */ 625 int32 frmno /**< The frame we want to compute detail score */ 626 ); 627 628 /** Compute the beams*/ 629 int (*eval_beams_lv2)(void* srch_struct /**< a pointer of srch_t */ 630 ); 631 632 /** Propagate the graph in phone level */ 633 int (*propagate_graph_ph_lv2)(void* srch_struct, /**< a pointer of srch_t */ 634 int32 frmno /**< The frame no. */ 635 ); 636 637 /** Propagate the graph in word level */ 638 int (*propagate_graph_wd_lv2)(void* srch_struct, /**< a pointer of srch_t */ 639 int32 frmno /**< The frame no. */ 640 ); 641 642 /** Rescoring srch */ 643 int (*rescoring) (void* srch_struct, /**< a pointer of srch_t */ 644 int32 frmno /**< The frame no. */ 645 ); 646 647 int (*frame_windup) (void * srch_struct, int32 frmno); 648 int (*compute_heuristic) (void * srch_struct, int32 win_efv); 649 int (*shift_one_cache_frame) (void *srch_struct,int32 win_efv); 650 int (*select_active_gmm) (void *srch_struct); 651 652 653 /** 654 Second stage functions. They provide a generalized interface for 655 different modes to generate output 656 */ 657 /** 658 Generation of hypothesis (*.hyp). Notice, displaying hypothesis is taken care by srch.c itself. 659 */ 660 glist_t (*gen_hyp) (void * srch_struct /**< a pointer of srch_t */ 661 ); 662 663 /** 664 Generation of directed acyclic graph (*.lat.gz). Notice , dumping 665 the dag will be taken care by srch.c. There is mode specific 666 optimization. 667 @return a dag which represent the word graphs. 668 */ 669 dag_t* (*gen_dag) (void* srch_struct, /**< a pointer of srch_t */ 670 glist_t hyp 671 ); 672 673 /** 674 Dump vithist 675 */ 676 int (*dump_vithist)(void * srch_struct /**< a pointer of srch_t */ 677 ); 678 679 /** 680 Interface of best path search. 681 */ 682 glist_t (*bestpath_impl)(void *srch_struct, /**< a pointer of srch_t */ 683 dag_t *dag 684 ); 685 686 /** 687 Interface for sphinx3 dag dumping function 688 */ 689 int (*dag_dump) (void * srch_struct, 690 dag_t *dag 691 ); 692 693 /** 694 Interface of N-best search. 695 */ 696 glist_t (*nbest_impl)(void *srch_struct, /**< a pointer of srch_t */ 697 dag_t *dag 698 ); 699 700 /** Empty "guard" element which does nothing. */ 701 void *nothing; 702 } srch_funcs_t; 703 704 typedef struct srch_s { 705 /** 706 * Methods specific to a particular search mode. 707 **/ 708 srch_funcs_t *funcs; 709 710 grp_str_t* grh; /**< Pointer to search specific structures */ 711 int op_mode; /**< The operation mode */ 712 stat_t *stat; /**< Pointer to the statistics structure */ 713 char *uttid; /**< Reference to UttID */ 714 char *uttfile; /**< Reference to Utterance filename (or NULL) */ 715 716 /* 717 These variables control the logistic of a search operation. The 718 are global to all different search modes. 719 */ 720 int32 cache_win; /**< The windows lengths of the cache for approximate search */ 721 int32 cache_win_strt; /**< The start index of the window near the end of a block */ 722 723 int32 senscale; /**< TEMPORARY VARIABLE: Senone scale */ 724 725 int32 *ascale; /**< Same as senscale but it records the senscale for the whole sentence. 726 The default size is 3000 frames. 727 */ 728 int32 ascale_sz; /**< Size of the ascr_scale*/ 729 int32 num_frm; /**< Number of frames processed */ 730 731 int32 *segsz; /**< Size of segments for each call */ 732 int32 segsz_sz; /**< Size of segments size*/ 733 734 int32 num_segs; /**< In one search, (i.e. between one srch_utt_begin and one srch_utt_end) 735 The number of segments the search has decoded. 736 That also means the number of times srch_utt_decode_blk is called. 737 */ 738 739 740 /* 741 Auxillary Structures for the search. 742 */ 743 int32 exit_id; /**< Lattice ID for exit word (-1 if 744 * utterance is not complete) */ 745 dag_t *dag; /**< Word graph representation of completed utterance. */ 746 747 /* ARCHAN: Various pruning beams, put them together such that it looks more logical. */ 748 ascr_t *ascr; /**< Pointer to Senone and composite senone scores for one frame */ 749 beam_t *beam; /**< Pointer to Structure that wraps up parameters related to beam pruning */ 750 fast_gmm_t *fastgmm; /**< Pointer to Structure that wraps up parameters for fast GMM computation */ 751 pl_t *pl; /**< Pointer to Structure that wraps up parameter for phoneme look-ahead */ 752 adapt_am_t * adapt_am; /** Pointer to AM adaptation structure */ 753 kbcore_t *kbc; /**< Pointer to the kbcore */ 754 755 756 FILE *matchfp; /**< Copy of File handle for the match file */ 757 FILE *matchsegfp; /**< Copy of File handle for the match segmentation file */ 758 759 FILE *hmmdumpfp; /**< Copy of File handle for dumping hmms for debugging */ 760 761 /* FIXME, duplicated with fwd_dbg_t */ 762 int32 hmm_dump_sf; /**< Start frame for HMMs to be dumped for debugging */ 763 int32 hmm_dump_ef; /**< End frame for HMMs to be dumped for debugging */ 764 }srch_t; 765 766 /** 767 Translate mode string (as passed in the -mode argument) to number. 768 Names chosen for familiarity with Sphinx2/PocketSphinx. 769 770 The currently valid operation modes are 771 772 Strings Operation Mode(Mode Idx) 773 --------------------------------------- 774 allphone OPERATION_ALLPHONE(1) 775 776 fsg OPERATION_GRAPH(2) 777 778 fwdflat OPERATION_FLATFWD(3) 779 780 fwdtree OPERATION_TST_DECODE(4) 781 782 @return the mode index if it is valid, -1 if it is not. 783 */ 784 785 int32 srch_mode_str_to_index(const char* mode_str); 786 787 /** Translate mode string to mode index. User need to supply an initialized 788 @return a mode string; 789 @see srchmode_str_to_index 790 */ 791 792 char* srch_mode_index_to_str(int32 index); 793 794 795 /* The following are C-style method for srch structure. In theory, 796 users could used both C-style and function pointer style to access 797 functionalities of the code. However, we recommend developers to use 798 the C-style functions because 1) it won't scare people that match, 2) 799 it is more consistent with other modules in sphinx 3. 800 */ 801 802 /** Initialize the search routine, this will specify the type of search 803 drivers and initialized all resouces. It will do the following things 804 805 1, Set all function pointers depends on the mode number. 806 807 2, Initialize parameters which the default search abstraction 808 mechanism required. This currently include, cache_win, 809 cache_win_start. They control how much delay of the first level 810 and second level search. Also senscale, a parameter which store 811 the largest gmm score for each frame. 812 813 3, Initialize search-specific data structure using srch_init 814 method of the search class. That is s->srch_init will be called. 815 816 @return an initialized srch_t with operation mode op_mode. 817 */ 818 819 srch_t* srch_init(kb_t *kb, /**< In: knowledge base */ 820 int32 op_mode /**< In: operation mode of the search */ 821 ); 822 823 /** Report the search routine */ 824 /** using file name of the model definition and directory name to initialize */ 825 826 /** Report the search structure 827 */ 828 void srch_report(srch_t* srch /**< In: a search structure */ 829 ); 830 831 /** 832 * Begin decoding of speech for one utterance. 833 * 834 * @return SRCH_SUCCESS if succeed, SRCH_FAILURE if failed. 835 * @see srch_utt_end 836 */ 837 838 int32 srch_utt_begin(srch_t* srch /**< In: a search structure */ 839 ); 840 841 /** 842 Decode one block of speech and provide the implementation of the default search abstraction 843 */ 844 S3DECODER_EXPORT 845 int32 srch_utt_decode_blk(srch_t* srch, /**< In: a search structure */ 846 float ***block_feat, /**< In: a pointer of a two dimensional array */ 847 int32 block_nfeatvec, /**< In: Number of feature vector */ 848 int32 *curfrm /**< In/Out: a pointer of the current frame index*/ 849 ); 850 851 /** 852 End decoding of speech for one utterance. 853 */ 854 int32 srch_utt_end(srch_t* srch /**< In: a search structure */ 855 ); 856 857 /** Wrap up the search routine*/ 858 int32 srch_uninit(srch_t* srch /**< In: a search structure */ 859 ); 860 861 /** 862 * Get a recognition backtrace, as a list of srch_hyp_t structures. 863 * You must free this with glist_free() after using it. */ 864 glist_t srch_get_hyp(srch_t *srch /**< In: a search structure */ 865 ); 866 867 /** 868 * Get the complete set of possible hypotheses, as a word graph (DAG). 869 * The dag_t returned by this function remains owned by srch_t. Do 870 * NOT attempt to free it. 871 */ 872 dag_t *srch_get_dag(srch_t *srch); 873 874 875 /** Dump recognition result */ 876 void reg_result_dump (srch_t* s, /**< A search structure */ 877 int32 id /**< Utterance ID */ 878 ); 879 /** 880 Dump the best senone score for each frame 881 */ 882 void write_bstsenscr(FILE *fp, /**< A file pointer */ 883 int32 numframe, /**< Number of frame in one recognition */ 884 int32* scale /**< Scales */ 885 ); 886 887 888 /** using file name of the LM or defined lmctlfn mechanism */ 889 S3DECODER_EXPORT 890 int32 srch_set_lm(srch_t* srch, /**< A search structure */ 891 const char *lmname /**< LM fie name */ 892 ); 893 894 /** delete lm */ 895 int32 srch_delete_lm(srch_t* srch, /**< A search structure */ 896 const char *lmname /**< LM fie name */ 897 ); 898 899 #if 0 /*Tentative: but not yet implemented */ 900 int32 srch_set_am(void); 901 902 /** add new am */ 903 int32 srch_add_am(void); 904 905 /** delete am */ 906 int32 srch_delete_am(void); 907 908 /** add new lm */ 909 int32 srch_add_lm(void); 910 911 912 /** using file name of the regression class mapping or a directory name to initialize */ 913 int32 srch_set_mllr(void); 914 915 /** add new mllr */ 916 int32 srch_add_mllr(void); 917 918 /** delete mllr */ 919 int32 srch_delete_mllr(void); 920 921 /** using file name of interpolation file to initialize it */ 922 int32 srch_set_lamdafn(void); 923 924 /** add new mllr */ 925 int32 srch_add_lamdafn(void); 926 927 /** delete mllr */ 928 int32 srch_delete_lamdafn(void); 929 930 /** add new words into the dictionary */ 931 int32 srch_add_words_to_dict(void); 932 933 #endif /* End not implemented */ 934 935 #ifdef __cplusplus 936 } 937 #endif 938 939 940 #endif /*_SRCH_H_ */ 941