# Copyright (c) 2008 Carnegie Mellon University. All rights
# reserved.
#
# You may copy, modify, and distribute this code under the same terms
# as PocketSphinx or Python, at your convenience, as long as this
# notice is not removed.
#
# Author: David Huggins-Daines <dhuggins@cs.cmu.edu>

cdef class LatNode:
    """
    Node in a word lattice.

    @ivar word: Word this node corresponds to (with pronunciation variant).
    @type word: str
    @ivar baseword: Base word (no pronunciation variant) this node corresponds to.
    @type baseword: str
    @ivar sf: Start frame for this node.
    @type sf: int
    @ivar fef: First ending frame for this node.
    @type fef: int
    @ivar lef: Last ending frame for this node.
    @type lef: int
    @ivar best_exit: Best scoring exit link from this node
    @type best_exit: LatLink
    @ivar prob: Posterior probability for this node.
    @type prob: float
    """
    def __cinit__(self):
        self.node = NULL

    cdef set_node(LatNode self, ps_lattice_t *dag, ps_latnode_t *node):
        """
        Internal function - binds this to a PocketSphinx lattice node.

        Copies the word strings, frame times and probability out of
        the C node into Python-visible attributes.  C{best_exit} is
        left as C{None} when the C API reports no best exit link.
        """
        cdef short fef, lef
        cdef ps_latlink_t *best_exit

        self.dag = dag
        self.node = node
        self.word = ps_latnode_word(dag, node)
        self.baseword = ps_latnode_baseword(dag, node)
        # ps_latnode_times() returns the start frame and fills in the
        # first/last ending frames through the output pointers.
        self.sf = ps_latnode_times(node, &fef, &lef)
        self.fef = fef
        self.lef = lef
        self.best_exit = None
        best_exit = NULL
        self.prob = sb.logmath_log_to_ln(ps_lattice_get_logmath(dag),
                                         ps_latnode_prob(dag, node, &best_exit))
        if best_exit != NULL:
            self.best_exit = LatLink()
            self.best_exit.set_link(dag, best_exit)

    def exits(self):
        """
        Obtain an iterator over arcs exiting this node.

        @return: Iterator over arcs exiting this node
        @rtype: LatLinkIterator
        """
        cdef LatLinkIterator itor
        cdef ps_latlink_iter_t *citor

        citor = ps_latnode_exits(self.node)
        itor = LatLinkIterator()
        itor.itor = citor
        itor.dag = self.dag
        return itor

    def entries(self):
        """
        Obtain an iterator over arcs entering this node.

        @return: Iterator over arcs entering this node
        @rtype: LatLinkIterator
        """
        cdef LatLinkIterator itor
        cdef ps_latlink_iter_t *citor

        citor = ps_latnode_entries(self.node)
        itor = LatLinkIterator()
        itor.itor = citor
        itor.dag = self.dag
        return itor


cdef class LatNodeIterator:
    """
    Iterator over word lattice nodes.

    Only nodes whose start frame C{sf} satisfies
    C{start <= sf < end} are returned.
    """
    def __init__(self, start, end):
        self.itor = NULL
        self.first_node = True
        self.start = start
        self.end = end

    def __iter__(self):
        return self

    def __next__(self):
        """
        Advance iterator and return the next node.

        @return: Next lattice node in this iterator.
        @rtype: LatNode
        """
        cdef LatNode node
        cdef int start
        cdef ps_latnode_t *cnode

        # Make sure we keep raising exceptions at the end
        if self.itor == NULL:
            raise StopIteration
        # Advance the iterator if this isn't the first item
        if self.first_node:
            self.first_node = False
        else:
            self.itor = ps_latnode_iter_next(self.itor)
            if self.itor == NULL:
                raise StopIteration
        # Look for the next node within the given time range
        cnode = ps_latnode_iter_node(self.itor)
        start = ps_latnode_times(cnode, NULL, NULL)
        while start < self.start or start >= self.end:
            self.itor = ps_latnode_iter_next(self.itor)
            if self.itor == NULL:
                raise StopIteration
            cnode = ps_latnode_iter_node(self.itor)
            start = ps_latnode_times(cnode, NULL, NULL)
        node = LatNode()
        node.set_node(self.dag, cnode)
        return node


cdef class LatLink:
    """
    Link (edge) in a word lattice, connecting two nodes.

    @ivar word: Word (with pronunciation variant) for this link.
    @type word: str
    @ivar baseword: Base word (no pronunciation variant) for this link.
    @type baseword: str
    @ivar sf: Start frame for this link.
    @type sf: int
    @ivar ef: Ending frame for this link.
    @type ef: int
    @ivar prob: Posterior probability for this link.
    @type prob: float
    """
    def __cinit__(self):
        self.link = NULL

    cdef set_link(LatLink self, ps_lattice_t *dag, ps_latlink_t *link):
        """
        Internal function - binds this to a PocketSphinx lattice link.

        C{link} must not be NULL; callers check before binding.
        """
        cdef short sf

        self.dag = dag
        self.link = link
        self.word = ps_latlink_word(dag, link)
        self.baseword = ps_latlink_baseword(dag, link)
        # ps_latlink_times() returns the end frame and fills in the
        # start frame through the output pointer.
        self.ef = ps_latlink_times(link, &sf)
        self.sf = sf
        self.prob = sb.logmath_log_to_ln(ps_lattice_get_logmath(dag),
                                         ps_latlink_prob(dag, link, NULL))

    def nodes(self):
        """
        Get source and destination nodes for this link.

        @return: Source and destination nodes for this link
        @rtype: (LatNode, LatNode)
        """
        cdef LatNode src, dest
        cdef ps_latnode_t *csrc, *cdest

        cdest = ps_latlink_nodes(self.link, &csrc)
        src = LatNode()
        src.set_node(self.dag, csrc)
        dest = LatNode()
        dest.set_node(self.dag, cdest)
        return src, dest

    def pred(self):
        """
        Get backpointer from this link.

        @return: Backpointer from this link, set by bestpath search,
                 or C{None} if there is none.
        @rtype: LatLink
        """
        cdef LatLink pred
        cdef ps_latlink_t *cpred

        cpred = ps_latlink_pred(self.link)
        if cpred == NULL:
            return None
        pred = LatLink()
        pred.set_link(self.dag, cpred)
        return pred


cdef class LatLinkIterator:
    """
    Iterator over word lattice links.
    """
    def __cinit__(self):
        self.itor = NULL
        self.first_link = True

    def __iter__(self):
        return self

    def __next__(self):
        """
        Advance iterator and return the next link.

        @return: Next lattice link in this iterator.
        @rtype: LatLink
        """
        cdef LatLink link

        # Make sure we keep raising exceptions at the end (and for an
        # iterator that was never bound), rather than handing a NULL
        # iterator to ps_latlink_iter_next().
        if self.itor == NULL:
            raise StopIteration
        if self.first_link:
            self.first_link = False
        else:
            self.itor = ps_latlink_iter_next(self.itor)
            if self.itor == NULL:
                raise StopIteration
        link = LatLink()
        link.set_link(self.dag, ps_latlink_iter_link(self.itor))
        return link


cdef class Lattice:
    """
    Word lattice.

    The word lattice is a compact representation of the set of
    hypotheses considered by the decoder when recognizing an
    utterance.

    A lattice object can be constructed either from a lattice file
    on disk or from a 'boxed' object passed in from GStreamer (or,
    in theory, anything else that uses GLib).  In the first case,
    the C{ps} argument is required.

    @param ps: PocketSphinx decoder.
    @type ps: Decoder
    @param latfile: Filename of lattice file to read.
    @type latfile: str
    @param boxed: Boxed pointer from GStreamer containing a lattice
    @type boxed: PyGBoxed

    @ivar n_frames: Number of frames of audio covered by this lattice
    @type n_frames: int
    @ivar start: Start node
    @type start: LatNode
    @ivar end: End node
    @type end: LatNode
    """
    def __init__(self, ps=None, latfile=None, boxed=None):
        self.dag = NULL
        if latfile:
            self.read_dag(ps, latfile)
        if boxed:
            self.set_boxed(boxed)

    cdef read_dag(Lattice self, Decoder ps, latfile):
        """
        Internal function - reads the lattice from C{latfile}.

        @raise RuntimeError: If the lattice could not be read.
        """
        if ps:
            self.dag = ps_lattice_read(ps.ps, latfile)
        else:
            self.dag = ps_lattice_read(NULL, latfile)
        # Check for failure *before* touching the lattice, so a failed
        # read raises instead of passing NULL to ps_lattice_n_frames().
        if self.dag == NULL:
            raise RuntimeError("Failed to read lattice from %s" % latfile)
        self.n_frames = ps_lattice_n_frames(self.dag)

    cdef set_dag(Lattice self, ps_lattice_t *dag):
        """
        Internal function - binds this object to an existing lattice.

        The new lattice is retained before the old one is released so
        that rebinding to the same pointer is safe.
        """
        ps_lattice_retain(dag)
        ps_lattice_free(self.dag)
        self.dag = dag
        self.n_frames = ps_lattice_n_frames(dag)

    cdef set_boxed(Lattice self, box):
        """
        Internal function - binds this object to the lattice held in a
        boxed GLib pointer.
        """
        self.set_dag(<ps_lattice_t *>(<PyGBoxed *>box).boxed)

    def __dealloc__(self):
        ps_lattice_free(self.dag)

    def bestpath(self, NGramModel lmset, float lwf, float ascale):
        """
        Find the best path through the lattice, optionally using a
        language model.

        This function performs best-path search on the lattice, and
        returns the final link in the best path found.  The existing
        acoustic scores on the lattice links are used in conjunction
        with an optional language model.  A scaling factor can be
        applied to the acoustic scores to produce more useful
        posterior probabilities (in conjunction with C{posterior()},
        below).

        @param lmset: Language model (set) to use for rescoring
        @type lmset: sphinxbase.NGramModel
        @param lwf: Weight to apply to language model scores (on top
        of any existing language model weight set in C{lmset}).
        @type lwf: float
        @param ascale: Weight to apply to acoustic model scores.
        @type ascale: float
        @return: Final link in best path.
        @rtype: LatLink
        @raise RuntimeError: If the search returned no final link.
        """
        cdef ps_latlink_t *end
        cdef LatLink link

        end = ps_lattice_bestpath(self.dag, lmset.lm, lwf, ascale)
        # A NULL link cannot be bound to a LatLink (set_link reads
        # through the pointer), so report the failure instead.
        if end == NULL:
            raise RuntimeError("Failed to find best path in lattice")
        link = LatLink()
        link.set_link(self.dag, end)
        return link

    def posterior(self, NGramModel lmset, float ascale):
        """
        Calculate posterior probabilities of all links in a lattice.

        This function performs the backward part of forward-backward
        calculation of posterior probabilities for all links in the
        lattice.  It assumes that C{bestpath()} has already been
        called on the lattice.

        @param lmset: Language model (set) to use for rescoring
        @type lmset: sphinxbase.NGramModel
        @param ascale: Weight to apply to acoustic model scores.
        @type ascale: float
        @return: Log-probability of the lattice as a whole.
        @rtype: float
        """
        cdef logmath_t *lmath

        lmath = ps_lattice_get_logmath(self.dag)
        return sb.logmath_log_to_ln(lmath,
                                    ps_lattice_posterior(self.dag, lmset.lm, ascale))

    def nodes(self, start=0, end=-1):
        """
        Get an iterator over all nodes in the lattice.

        @param start: First frame to iterate over.
        @type start: int
        @param end: Frame at which to stop (exclusive: only nodes
        starting before this frame are returned), or -1 for all
        remaining frames.
        @type end: int
        @return: Iterator over nodes.
        @rtype: LatNodeIterator
        """
        cdef LatNodeIterator itor

        if end == -1:
            end = ps_lattice_n_frames(self.dag)
        itor = LatNodeIterator(start, end)
        itor.dag = self.dag
        itor.itor = ps_latnode_iter(self.dag)
        return itor

    def write(self, outfile):
        """
        Write the lattice to an output file.

        @param outfile: Name of file to write to.
        @type outfile: str
        @raise RuntimeError: If the lattice could not be written.
        """
        cdef int rv

        rv = ps_lattice_write(self.dag, outfile)
        if rv < 0:
            raise RuntimeError("Failed to write lattice to %s" % outfile)


cdef class Segment:
    """
    One word segment of a hypothesis, as produced by
    C{Decoder.segments()}.
    """
    def __init__(self):
        self.seg = NULL

    cdef set_seg(self, ps_seg_t *seg):
        """
        Internal function - binds this to a PocketSphinx segment.
        """
        self.seg = seg

    def word(self):
        """
        Get the word string for this segment.

        @return: Word for this segment.
        @rtype: str
        """
        return ps_seg_word(self.seg)

    def frames(self):
        """
        Get the start and end frames of this segment.

        @return: Start frame and end frame.
        @rtype: (int, int)
        """
        cdef int sf, ef

        ps_seg_frames(self.seg, &sf, &ef)
        return (sf, ef)

    def prob(self):
        """
        Get the score components of this segment.

        The value returned by C{ps_seg_prob()} itself is discarded;
        only the three values it writes through its output arguments
        are returned.

        @return: The C{ascr}, C{lscr} and C{lback} values filled in
        by C{ps_seg_prob()}.
        @rtype: (int, int, int)
        """
        cdef int32 ascr, lscr, lback

        ps_seg_prob(self.seg, &ascr, &lscr, &lback)
        return (ascr, lscr, lback)


cdef class SegmentIterator:
    """
    Iterator over the word segments of the best hypothesis.
    """
    def __init__(self):
        self.itor = NULL
        self.first_seg = False

    cdef set_iter(self, ps_seg_t *seg):
        """
        Internal function - binds this to the first segment of a
        PocketSphinx segment iterator.
        """
        self.itor = seg
        self.first_seg = True

    def __iter__(self):
        return self

    def __next__(self):
        """
        Advance iterator and return the next segment.

        @return: Next word segment in this iterator.
        @rtype: Segment
        """
        cdef Segment seg

        # Make sure we keep raising exceptions at the end (and for an
        # iterator that was never bound), rather than handing a NULL
        # segment to ps_seg_next().
        if self.itor == NULL:
            raise StopIteration
        if self.first_seg:
            self.first_seg = False
        else:
            self.itor = ps_seg_next(self.itor)
            if self.itor == NULL:
                raise StopIteration
        seg = Segment()
        seg.set_seg(self.itor)
        return seg


cdef class Decoder:
    """
    PocketSphinx speech decoder.

    To initialize the PocketSphinx decoder, pass a list of keyword
    arguments to the constructor::

        d = pocketsphinx.Decoder(hmm='/path/to/acoustic/model',
                                 lm='/path/to/language/model',
                                 dict='/path/to/dictionary',
                                 beam='1e-80')

    If no arguments are passed, the default acoustic and language
    models will be loaded, which may be acceptable for general English
    speech.  Any arguments supported by the PocketSphinx decoder are
    allowed here.  Only the most frequent ones are described below.

    @param boxed: Boxed pointer from GStreamer containing a decoder
    @type boxed: PyGBoxed
    @param hmm: Path to acoustic model directory
    @type hmm: str
    @param dict: Path to dictionary file
    @type dict: str
    @param lm: Path to language model file
    @type lm: str
    @param jsgf: Path to JSGF grammar file
    @type jsgf: str
    """
    def __init__(self, **kwargs):
        cdef cmd_ln_t *config
        cdef int i

        # Construct from an existing GObject pointer if given
        if 'boxed' in kwargs:
            self.argc = 0
            self.set_boxed(kwargs['boxed'])
            return

        # A much more concise version of what pocketsphinx_parse_argdict used to do
        # Each keyword becomes a "-name value" pair in a C argv array,
        # which is kept until __dealloc__ releases it.
        self.argc = len(kwargs) * 2
        self.argv = <char **>sb.ckd_calloc(self.argc, sizeof(char *))
        i = 0
        for k, v in kwargs.iteritems():
            if k[0] != '-':
                k = '-' + k
            self.argv[i] = sb.ckd_salloc(k)
            self.argv[i+1] = sb.ckd_salloc(v)
            i = i + 2
        config = sb.cmd_ln_parse_r(NULL, ps_args(), self.argc, self.argv, 0)
        if config == NULL:
            raise RuntimeError("Failed to parse argument list")
        self.ps = ps_init(config)
        sb.cmd_ln_free_r(config)
        if self.ps == NULL:
            raise RuntimeError("Failed to initialize PocketSphinx")

    cdef set_boxed(Decoder self, box):
        """
        Internal function - binds this object to the decoder held in a
        boxed GLib pointer.

        The new decoder is retained before the old one is released.
        """
        cdef ps_decoder_t *ps

        ps = <ps_decoder_t *>(<PyGBoxed *>box).boxed
        ps_retain(ps)
        ps_free(self.ps)
        self.ps = ps

    def __dealloc__(self):
        cdef int i

        ps_free(self.ps)
        for i from 0 <= i < self.argc:
            sb.ckd_free(self.argv[i])
        sb.ckd_free(self.argv)
        self.argv = NULL
        self.argc = 0

    def decode_raw(self, fh, uttid=None, maxsamps=-1):
        """
        Decode raw audio from a file.

        @param fh: Filehandle to read audio from.
        @type fh: file
        @param uttid: Identifier to give to this utterance.
        @type uttid: str
        @param maxsamps: Maximum number of samples to read.  If not
        specified or -1, the rest of the file will be read.
        @type maxsamps: int
        @return: The value returned by C{ps_decode_raw()}.
        @raise TypeError: If C{fh} does not yield a C file pointer.
        """
        cdef FILE *cfh
        cdef char *cuttid

        cfh = PyFile_AsFile(fh)
        # PyFile_AsFile() yields NULL when fh is not a usable Python
        # file object; do not hand that to the decoder.
        if cfh == NULL:
            raise TypeError("fh must be an open file object")
        if uttid is None:
            cuttid = NULL
        else:
            cuttid = uttid
        return ps_decode_raw(self.ps, cfh, cuttid, maxsamps)

    def decode_senscr(self, fh, uttid=None):
        """
        Decode senone scores from a file.

        @param fh: Filehandle to read senone scores from.
        @type fh: file
        @param uttid: Identifier to give to this utterance.
        @type uttid: str
        @return: The value returned by C{ps_decode_senscr()}.
        @raise TypeError: If C{fh} does not yield a C file pointer.
        """
        cdef FILE *cfh
        cdef char *cuttid

        cfh = PyFile_AsFile(fh)
        # PyFile_AsFile() yields NULL when fh is not a usable Python
        # file object; do not hand that to the decoder.
        if cfh == NULL:
            raise TypeError("fh must be an open file object")
        if uttid is None:
            cuttid = NULL
        else:
            cuttid = uttid
        return ps_decode_senscr(self.ps, cfh, cuttid)

    def start_utt(self, uttid=None):
        """
        Prepare the decoder to recognize an utterance.

        @param uttid: Identifier to give to this utterance.
        @type uttid: str
        @raise RuntimeError: If utterance processing could not be started.
        """
        cdef char *cuttid

        if uttid is None:
            cuttid = NULL
        else:
            cuttid = uttid
        if ps_start_utt(self.ps, cuttid) < 0:
            raise RuntimeError("Failed to start utterance processing")

    def process_raw(self, data, no_search=False, full_utt=False):
        """
        Process (decode) some audio data.

        @param data: Audio data to process.  This is packed binary
        data, which consists of single-channel, 16-bit PCM audio, at
        the sample rate specified when the decoder was initialized.
        @type data: str
        @param no_search: Buffer the data without actually processing it (default is to process the
        data as it is received).
        @type no_search: bool
        @param full_utt: This block of data is an entire utterance.
        Processing an entire utterance at once may improve
        recognition, particularly for the first utterance passed to
        the decoder.
        @type full_utt: bool
        @raise RuntimeError: If the decoder failed to process the data.
        """
        cdef Py_ssize_t nbytes
        cdef Py_ssize_t nsamples
        cdef char* strdata
        cdef raw_data_ptr cdata

        PyString_AsStringAndSize(data, &strdata, &nbytes)
        cdata = strdata
        # Two bytes per 16-bit sample.
        nsamples = nbytes // 2
        if ps_process_raw(self.ps, cdata, nsamples, no_search, full_utt) < 0:
            # The sample count is computed before formatting: "%" and
            # "/" share precedence, so the former `"..." % len / 2`
            # divided the formatted string by 2 and raised TypeError.
            raise RuntimeError("Failed to process %d samples of audio data" % nsamples)

    def end_utt(self):
        """
        Finish processing an utterance.

        @raise RuntimeError: If utterance processing could not be stopped.
        """
        if ps_end_utt(self.ps) < 0:
            raise RuntimeError("Failed to stop utterance processing")

    def get_hyp(self):
        """
        Get a hypothesis string.

        This function returns the text which has been recognized so
        far, or, if C{end_utt()} has been called, the final
        recognition result.

        @return: Hypothesis string, utterance ID, recognition score.
        C{(None, None, 0)} is returned when there is no result.
        @rtype: (str, str, int)
        """
        cdef const_char_ptr hyp
        cdef const_char_ptr uttid
        cdef int score

        hyp = ps_get_hyp(self.ps, &score, &uttid)

        # No result
        if hyp == NULL:
            return None, None, 0

        return hyp, uttid, score

    def get_prob(self):
        """
        Get a posterior probability.

        Returns the posterior in linear scale.

        @return: posterior probability of the result
        @rtype: float
        """
        cdef logmath_t *lmath
        cdef const_char_ptr uttid

        lmath = ps_get_logmath(self.ps)
        return sb.logmath_exp(lmath, ps_get_prob(self.ps, &uttid))

    def get_lattice(self):
        """
        Get the word lattice.

        This function returns all hypotheses which have been
        considered so far, in the form of a word lattice.

        @return: Word lattice
        @rtype: Lattice
        @raise RuntimeError: If no lattice could be obtained.
        """
        cdef ps_lattice_t *dag
        cdef Lattice lat

        dag = ps_get_lattice(self.ps)
        if dag == NULL:
            raise RuntimeError("Failed to create word lattice")
        lat = Lattice()
        lat.set_dag(dag)
        return lat

    def get_lmset(self):
        """
        Get the language model set.

        This function returns the language model set, which allows you
        to obtain language model scores or switch language models.

        @return: Language model set
        @rtype: sphinxbase.NGramModel
        """
        cdef ngram_model_t *clm
        cdef logmath_t *lmath
        cdef cmd_ln_t *config
        cdef NGramModel lm

        lm = NGramModel()
        clm = sb.ngram_model_retain(ps_get_lmset(self.ps))
        lm.set_lm(clm)
        lmath = sb.logmath_retain(ps_get_logmath(self.ps))
        lm.set_lmath(lmath)
        config = ps_get_config(self.ps)

        # This is not necessarily true but it will have to do
        lm.lw = sb.cmd_ln_float32_r(config, "-lw")
        lm.wip = sb.cmd_ln_float32_r(config, "-wip")
        lm.uw = sb.cmd_ln_float32_r(config, "-uw")
        return lm

    def update_lmset(self, NGramModel lmset):
        """
        Notifies the decoder that the LMset has been modified.  Primarily
        used after adding/removing LMs or after switching to a particular
        LM within the set.

        @param lmset: the modified lmset
        @type lmset: sphinxbase.NGramModel

        @return: this decoder (C{self})
        @rtype: Decoder
        """
        ps_update_lmset(self.ps, sb.ngram_model_retain(lmset.lm))
        return self

    def add_word(self, word, phones, update=True):
        """
        Add a word to the dictionary and current language model.

        @param word: Name of the word to add.
        @type word: str
        @param phones: Pronunciation of the word, a space-separated list of phones.
        @type phones: str
        @param update: Update the decoder to recognize this new word.
        If adding a number of words at once you may wish to pass
        C{False} here.
        @type update: bool
        @return: The value returned by C{ps_add_word()}.
        """
        return ps_add_word(self.ps, word, phones, update)

    def load_dict(self, dictfile, fdictfile=None, format=None):
        """
        Load a new pronunciation dictionary.

        @param dictfile: Dictionary filename.
        @type dictfile: str
        @param fdictfile: Filler dictionary filename.
        @type fdictfile: str
        @param format: Dictionary format, currently unused.
        @type format: str
        @return: The value returned by C{ps_load_dict()}.
        """
        return ps_load_dict(self.ps, dictfile, fdictfile, format)

    def save_dict(self, dictfile, format=None):
        """
        Save current pronunciation dictionary to a file.

        @param dictfile: Dictionary filename.
        @type dictfile: str
        @param format: Dictionary format, currently unused.
        @type format: str
        @return: The value returned by C{ps_save_dict()}.
        """
        return ps_save_dict(self.ps, dictfile, format)

    def segments(self):
        """
        Get an iterator over the word segments of the best hypothesis.

        @return: Segment iterator and the score filled in by
        C{ps_seg_iter()}.
        @rtype: (SegmentIterator, int)
        @raise RuntimeError: If no segment iterator could be created.
        """
        cdef int32 score
        cdef ps_seg_t *first_seg
        cdef SegmentIterator itor

        first_seg = ps_seg_iter(self.ps, &score)
        if first_seg == NULL:
            raise RuntimeError("Failed to create best path word segment iterator")
        itor = SegmentIterator()
        itor.set_iter(first_seg)
        return (itor, score)