1# Copyright (c) 2008 Carnegie Mellon University. All rights
2# reserved.
3#
4# You may copy, modify, and distribute this code under the same terms
5# as PocketSphinx or Python, at your convenience, as long as this
6# notice is not removed.
7#
8# Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
9
cdef class LatNode:
    """
    A single node of a word lattice.

    @ivar word: Word for this node, including its pronunciation variant.
    @type word: str
    @ivar baseword: Base word for this node, without pronunciation variant.
    @type baseword: str
    @ivar sf: Start frame for this node.
    @type sf: int
    @ivar fef: First ending frame for this node.
    @type fef: int
    @ivar lef: Last ending frame for this node.
    @type lef: int
    @ivar best_exit: Best scoring exit link from this node, or None
    if the lattice reports no such link.
    @type best_exit: LatLink
    @ivar prob: Posterior probability for this node, as converted by
    C{logmath_log_to_ln()}.
    @type prob: float
    """
    def __cinit__(self):
        # An unbound node wraps nothing until set_node() is called.
        self.node = NULL

    cdef set_node(LatNode self, ps_lattice_t *dag, ps_latnode_t *node):
        """
        Internal function - attaches this object to a PocketSphinx
        lattice node and copies the node's fields into its attributes.
        """
        cdef short first_end, last_end
        cdef ps_latlink_t *exit_link
        cdef LatLink wrapped

        self.dag = dag
        self.node = node
        self.word = ps_latnode_word(dag, node)
        self.baseword = ps_latnode_baseword(dag, node)

        # The start frame is the return value; the two ending frames
        # come back through the output arguments.
        self.sf = ps_latnode_times(node, &first_end, &last_end)
        self.fef = first_end
        self.lef = last_end

        # ps_latnode_prob() also reports the best exit link through
        # its last argument.
        self.best_exit = None
        exit_link = NULL
        self.prob = sb.logmath_log_to_ln(ps_lattice_get_logmath(dag),
                                         ps_latnode_prob(dag, node, &exit_link))
        if exit_link == NULL:
            return
        wrapped = LatLink()
        self.best_exit = wrapped
        wrapped.set_link(dag, exit_link)

    def exits(self):
        """
        Iterate over the arcs leaving this node.

        @return: Iterator over arcs exiting this node
        @rtype: LatLinkIterator
        """
        cdef LatLinkIterator links

        links = LatLinkIterator()
        links.itor = ps_latnode_exits(self.node)
        links.dag = self.dag
        return links

    def entries(self):
        """
        Iterate over the arcs arriving at this node.

        @return: Iterator over arcs entering this node
        @rtype: LatLinkIterator
        """
        cdef LatLinkIterator links

        links = LatLinkIterator()
        links.itor = ps_latnode_entries(self.node)
        links.dag = self.dag
        return links
84
cdef class LatNodeIterator:
    """
    Iterator over the nodes of a word lattice whose start frame lies
    in the half-open range [start, end).
    """
    def __init__(self, start, end):
        self.itor = NULL
        self.first_node = True
        self.start = start
        self.end = end

    def __iter__(self):
        return self

    def __next__(self):
        """
        Advance the iterator and return the next node in range.

        @return: Next lattice node in this iterator.
        @rtype: LatNode
        """
        cdef LatNode wrapped
        cdef int sf
        cdef ps_latnode_t *cnode

        # Once exhausted (or never bound), keep signalling the end.
        if self.itor == NULL:
            raise StopIteration

        # The underlying iterator already points at the first item, so
        # only step forward on the second and later calls.
        if self.first_node:
            self.first_node = False
        else:
            self.itor = ps_latnode_iter_next(self.itor)

        # Skip nodes whose start frame falls outside the requested range.
        while self.itor != NULL:
            cnode = ps_latnode_iter_node(self.itor)
            sf = ps_latnode_times(cnode, NULL, NULL)
            if sf >= self.start and sf < self.end:
                wrapped = LatNode()
                wrapped.set_node(self.dag, cnode)
                return wrapped
            self.itor = ps_latnode_iter_next(self.itor)
        raise StopIteration
131
cdef class LatLink:
    """
    Link (edge) in a word lattice, connecting two nodes.

    @ivar word: Word (with pronunciation variant) for this link.
    @type word: str
    @ivar baseword: Base word (no pronunciation variant) for this link.
    @type baseword: str
    @ivar sf: Start frame for this link.
    @type sf: int
    @ivar ef: Ending frame for this link.
    @type ef: int
    @ivar prob: Posterior probability for this link, as converted by
    C{logmath_log_to_ln()}.
    @type prob: float
    """
    def __cinit__(self):
        # An unbound link wraps nothing until set_link() is called.
        self.link = NULL

    cdef set_link(LatLink self, ps_lattice_t *dag, ps_latlink_t *link):
        """
        Internal function - binds this to a PocketSphinx lattice link
        and copies the link's fields into the instance attributes.

        The caller must pass a non-NULL C{link}; it is handed straight
        to the PocketSphinx accessors.
        """
        cdef short start_frame

        self.dag = dag
        self.link = link
        self.word = ps_latlink_word(dag, link)
        self.baseword = ps_latlink_baseword(dag, link)
        # The end frame is the return value; the start frame comes back
        # through the output argument.
        self.ef = ps_latlink_times(link, &start_frame)
        self.sf = start_frame
        self.prob = sb.logmath_log_to_ln(ps_lattice_get_logmath(dag),
                                         ps_latlink_prob(dag, link, NULL))

    def nodes(self):
        """
        Get source and destination nodes for this link.

        @return: Source and destination nodes for this link
        @rtype: (LatNode, LatNode)
        """
        cdef LatNode src, dest
        cdef ps_latnode_t *csrc, *cdest

        # Destination is the return value, source is the output argument.
        cdest = ps_latlink_nodes(self.link, &csrc)
        src = LatNode()
        src.set_node(self.dag, csrc)
        dest = LatNode()
        dest.set_node(self.dag, cdest)
        return src, dest

    def pred(self):
        """
        Get backpointer from this link.

        @return: Backpointer from this link, set by bestpath search,
        or None if this link has no backpointer.
        @rtype: LatLink
        """
        cdef LatLink pred
        cdef ps_latlink_t *cpred

        cpred = ps_latlink_pred(self.link)
        if cpred == NULL:
            return None
        pred = LatLink()
        pred.set_link(self.dag, cpred)
        return pred
197
cdef class LatLinkIterator:
    """
    Iterator over word lattice links.
    """
    def __cinit__(self):
        self.itor = NULL
        self.first_link = True

    def __iter__(self):
        return self

    def __next__(self):
        """
        Advance iterator and return the next link.

        @return: Next lattice link in this iterator.
        @rtype: LatLink
        """
        cdef LatLink link

        # Make sure we keep raising exceptions at the end (and for an
        # empty iterator) instead of advancing a NULL iterator.
        if self.itor == NULL:
            raise StopIteration
        # The underlying iterator already points at the first item, so
        # only advance on the second and later calls.
        if self.first_link:
            self.first_link = False
        else:
            self.itor = ps_latlink_iter_next(self.itor)
            if self.itor == NULL:
                raise StopIteration
        link = LatLink()
        link.set_link(self.dag, ps_latlink_iter_link(self.itor))
        return link
226
cdef class Lattice:
    """
    Word lattice.

    The word lattice is a compact representation of the set of
    hypotheses considered by the decoder when recognizing an
    utterance.

    A lattice object can be constructed either from a lattice file
    on disk or from a 'boxed' object passed in from GStreamer (or,
    in theory, anything else that uses GLib).  In the first case,
    the C{ps} argument is required.

    @param ps: PocketSphinx decoder.
    @type ps: Decoder
    @param latfile: Filename of lattice file to read.
    @type latfile: str
    @param boxed: Boxed pointer from GStreamer containing a lattice
    @type boxed: PyGBoxed

    @ivar n_frames: Number of frames of audio covered by this lattice
    @type n_frames: int
    @ivar start: Start node
    @type start: LatNode
    @ivar end: End node
    @type end: LatNode
    """
    def __init__(self, ps=None, latfile=None, boxed=None):
        self.dag = NULL
        if latfile:
            self.read_dag(ps, latfile)
        if boxed:
            self.set_boxed(boxed)

    cdef read_dag(Lattice self, Decoder ps, latfile):
        """
        Internal function - reads a lattice from C{latfile}.

        Raises RuntimeError if the file cannot be read; in that case
        the object is left unchanged.
        """
        cdef ps_lattice_t *dag

        if ps is None:
            dag = ps_lattice_read(NULL, latfile)
        else:
            dag = ps_lattice_read(ps.ps, latfile)
        # Check for failure before touching the lattice: querying the
        # frame count of a NULL lattice is not safe.
        if dag == NULL:
            raise RuntimeError("Failed to read lattice from %s" % latfile)
        self.dag = dag
        self.n_frames = ps_lattice_n_frames(dag)

    cdef set_dag(Lattice self, ps_lattice_t *dag):
        """
        Internal function - binds this object to an existing lattice,
        taking a new reference to it and releasing the previous one.
        """
        # Retain before freeing so that rebinding to the lattice we
        # already hold cannot drop its last reference.
        ps_lattice_retain(dag)
        ps_lattice_free(self.dag)
        self.dag = dag
        self.n_frames = ps_lattice_n_frames(dag)

    cdef set_boxed(Lattice self, box):
        """
        Internal function - binds this object to the lattice carried
        by a boxed GLib pointer.
        """
        self.set_dag(<ps_lattice_t *>(<PyGBoxed *>box).boxed)

    def __dealloc__(self):
        ps_lattice_free(self.dag)

    def bestpath(self, NGramModel lmset, float lwf, float ascale):
        """
        Find the best path through the lattice, optionally using a
        language model.

        This function performs best-path search on the lattice, and
        returns the final link in the best path found.  The existing
        acoustic scores on the lattice links are used in conjunction
        with an optional language model.  A scaling factor can be
        applied to the acoustic scores to produce more useful
        posterior probabilities (in conjunction with C{posterior()},
        below).

        @param lmset: Language model (set) to use for rescoring, or
        None to search without a language model.
        @type lmset: sphinxbase.NGramModel
        @param lwf: Weight to apply to language model scores (on top
        of any existing language model weight set in C{lmset}).
        @type lwf: float
        @param ascale: Weight to apply to acoustic model scores.
        @type ascale: float
        @return: Final link in best path.
        @rtype: LatLink
        @raise RuntimeError: If no best path could be found.
        """
        cdef ps_latlink_t *end
        cdef ngram_model_t *clm
        cdef LatLink link

        # Attribute access on a None extension-type argument is not
        # checked, so map None to a NULL model explicitly.
        if lmset is None:
            clm = NULL
        else:
            clm = lmset.lm
        end = ps_lattice_bestpath(self.dag, clm, lwf, ascale)
        # A NULL link cannot be wrapped: LatLink.set_link() passes it
        # straight to the C accessors.
        if end == NULL:
            raise RuntimeError("Failed to find best path in lattice")
        link = LatLink()
        link.set_link(self.dag, end)
        return link

    def posterior(self, NGramModel lmset, float ascale):
        """
        Calculate posterior probabilities of all links in a lattice.

        This function performs the backward part of forward-backward
        calculation of posterior probabilities for all links in the
        lattice.  It assumes that C{bestpath()} has already been
        called on the lattice.

        @param lmset: Language model (set) to use for rescoring, or
        None to use no language model.
        @type lmset: sphinxbase.NGramModel
        @param ascale: Weight to apply to acoustic model scores.
        @type ascale: float
        @return: Log-probability of the lattice as a whole.
        @rtype: float
        """
        cdef logmath_t *lmath
        cdef ngram_model_t *clm

        # See bestpath(): None must not be dereferenced.
        if lmset is None:
            clm = NULL
        else:
            clm = lmset.lm
        lmath = ps_lattice_get_logmath(self.dag)
        return sb.logmath_log_to_ln(lmath,
                                    ps_lattice_posterior(self.dag, clm, ascale))

    def nodes(self, start=0, end=-1):
        """
        Get an iterator over all nodes in the lattice.

        Only nodes whose start frame is at least C{start} and less
        than C{end} are returned.

        @param start: First frame to iterate over.
        @type start: int
        @param end: Last frame to iterate over, or -1 for all remaining
        @type end: int
        @return: Iterator over nodes.
        @rtype: LatNodeIterator
        """
        cdef LatNodeIterator itor

        if end == -1:
            end = ps_lattice_n_frames(self.dag)
        itor = LatNodeIterator(start, end)
        itor.dag = self.dag
        itor.itor = ps_latnode_iter(self.dag)
        return itor

    def write(self, outfile):
        """
        Write the lattice to an output file.

        @param outfile: Name of file to write to.
        @type outfile: str
        @raise RuntimeError: If the lattice could not be written.
        """
        cdef int rv

        rv = ps_lattice_write(self.dag, outfile)
        if rv < 0:
            raise RuntimeError("Failed to write lattice to %s" % outfile)
370
371
cdef class Segment:
    """
    One word segment of a hypothesis, wrapping a PocketSphinx
    segment iterator position.
    """

    def __init__(self):
        self.seg = NULL

    cdef set_seg(self, ps_seg_t *seg):
        """
        Internal function - binds this to a PocketSphinx segment.
        """
        self.seg = seg

    def word(self):
        """
        Get the word for this segment.

        @return: Word string reported by C{ps_seg_word()}.
        @rtype: str
        """
        return ps_seg_word(self.seg)

    def frames(self):
        """
        Get the frame span for this segment.

        @return: Start and end frame as filled in by C{ps_seg_frames()}.
        @rtype: (int, int)
        """
        cdef int start_frame, end_frame
        ps_seg_frames(self.seg, &start_frame, &end_frame)
        return (start_frame, end_frame)

    def prob(self):
        """
        Get the scores for this segment.

        @return: The three values filled in by C{ps_seg_prob()}
        (presumably acoustic score, language model score and language
        model backoff -- confirm against the PocketSphinx API).
        @rtype: (int, int, int)
        """
        cdef int32 acoustic, language, backoff
        ps_seg_prob(self.seg, &acoustic, &language, &backoff)
        return (acoustic, language, backoff)
392
cdef class SegmentIterator:
    """
    Iterator over the word segments of the best hypothesis.
    """
    def __init__(self):
        self.itor = NULL
        self.first_seg = False

    cdef set_iter(self, ps_seg_t *seg):
        """
        Internal function - binds this to a PocketSphinx segment
        iterator positioned at its first segment.
        """
        self.itor = seg
        self.first_seg = True

    def __iter__(self):
        return self

    def __next__(self):
        """
        Advance iterator and return the next segment.

        @return: Next word segment in this iterator.
        @rtype: Segment
        """
        cdef Segment seg

        # Keep raising at the end (and for an unbound iterator)
        # instead of advancing a NULL iterator.
        if NULL == self.itor:
            raise StopIteration
        # The underlying iterator already points at the first segment,
        # so only advance on the second and later calls.
        if self.first_seg:
            self.first_seg = False
        else:
            self.itor = ps_seg_next(self.itor)
            if NULL == self.itor:
                raise StopIteration
        seg = Segment()
        seg.set_seg(self.itor)
        return seg
420
421
cdef class Decoder:
    """
    PocketSphinx speech decoder.

    To initialize the PocketSphinx decoder, pass a list of keyword
    arguments to the constructor::

     d = pocketsphinx.Decoder(hmm='/path/to/acoustic/model',
                              lm='/path/to/language/model',
                              dict='/path/to/dictionary',
                              beam='1e-80')

    If no arguments are passed, the default acoustic and language
    models will be loaded, which may be acceptable for general English
    speech.  Any arguments supported by the PocketSphinx decoder are
    allowed here.  Only the most frequent ones are described below.

    @param boxed: Boxed pointer from GStreamer containing a decoder.
    If given, all other arguments are ignored.
    @type boxed: PyGBoxed
    @param hmm: Path to acoustic model directory
    @type hmm: str
    @param dict: Path to dictionary file
    @type dict: str
    @param lm: Path to language model file
    @type lm: str
    @param jsgf: Path to JSGF grammar file
    @type jsgf: str
    """
    def __init__(self, **kwargs):
        cdef cmd_ln_t *config
        cdef int i

        # Construct from an existing GObject pointer if given
        if 'boxed' in kwargs:
            self.argc = 0
            self.set_boxed(kwargs['boxed'])
            return

        # Turn the keyword arguments into a flat "-name value" argument
        # vector.  The strings are owned by this object and released
        # in __dealloc__.
        self.argc = len(kwargs) * 2
        self.argv = <char **>sb.ckd_calloc(self.argc, sizeof(char *))
        i = 0
        for k, v in kwargs.iteritems():
            if k[0] != '-':
                k = '-' + k
            self.argv[i] = sb.ckd_salloc(k)
            self.argv[i+1] = sb.ckd_salloc(v)
            i = i + 2
        config = sb.cmd_ln_parse_r(NULL, ps_args(), self.argc, self.argv, 0)
        if config == NULL:
            raise RuntimeError("Failed to parse argument list")
        self.ps = ps_init(config)
        sb.cmd_ln_free_r(config)
        if self.ps == NULL:
            raise RuntimeError("Failed to initialize PocketSphinx")

    cdef set_boxed(Decoder self, box):
        """
        Internal function - binds this object to the decoder carried
        by a boxed GLib pointer, taking a new reference to it.
        """
        cdef ps_decoder_t *ps
        ps = <ps_decoder_t *>(<PyGBoxed *>box).boxed
        # Retain before freeing so that rebinding to the decoder we
        # already hold cannot drop its last reference.
        ps_retain(ps)
        ps_free(self.ps)
        self.ps = ps

    def __dealloc__(self):
        cdef int i

        ps_free(self.ps)
        for i from 0 <= i < self.argc:
            sb.ckd_free(self.argv[i])
        sb.ckd_free(self.argv)
        self.argv = NULL
        self.argc = 0

    def decode_raw(self, fh, uttid=None, maxsamps=-1):
        """
        Decode raw audio from a file.

        @param fh: Filehandle to read audio from.
        @type fh: file
        @param uttid: Identifier to give to this utterance.
        @type uttid: str
        @param maxsamps: Maximum number of samples to read.  If not
        specified or -1, the rest of the file will be read.
        @type maxsamps: int
        @return: The value returned by C{ps_decode_raw()}.
        @rtype: int
        @raise TypeError: If C{fh} is not an open file object.
        """
        cdef FILE *cfh
        cdef char *cuttid

        cfh = PyFile_AsFile(fh)
        # PyFile_AsFile() yields NULL for anything that is not an open
        # file object; never hand that to the decoder.
        if cfh == NULL:
            raise TypeError("fh must be an open file object")
        if uttid is None:
            cuttid = NULL
        else:
            cuttid = uttid
        return ps_decode_raw(self.ps, cfh, cuttid, maxsamps)

    def decode_senscr(self, fh, uttid=None):
        """
        Decode senone scores from a file.

        @param fh: Filehandle to read senone scores from.
        @type fh: file
        @param uttid: Identifier to give to this utterance.
        @type uttid: str
        @return: The value returned by C{ps_decode_senscr()}.
        @rtype: int
        @raise TypeError: If C{fh} is not an open file object.
        """
        cdef FILE *cfh
        cdef char *cuttid

        cfh = PyFile_AsFile(fh)
        # See decode_raw(): a NULL FILE pointer must not reach C code.
        if cfh == NULL:
            raise TypeError("fh must be an open file object")
        if uttid is None:
            cuttid = NULL
        else:
            cuttid = uttid
        return ps_decode_senscr(self.ps, cfh, cuttid)

    def start_utt(self, uttid=None):
        """
        Prepare the decoder to recognize an utterance.

        @param uttid: Identifier to give to this utterance.
        @type uttid: str
        @raise RuntimeError: If utterance processing could not start.
        """
        cdef char *cuttid

        if uttid is None:
            cuttid = NULL
        else:
            cuttid = uttid
        if ps_start_utt(self.ps, cuttid) < 0:
            raise RuntimeError("Failed to start utterance processing")

    def process_raw(self, data, no_search=False, full_utt=False):
        """
        Process (decode) some audio data.

        @param data: Audio data to process.  This is packed binary
        data, which consists of single-channel, 16-bit PCM audio, at
        the sample rate specified when the decoder was initialized.
        @type data: str
        @param no_search: Buffer the data without actually processing it (default is to process the
        data as it is received).
        @type no_search: bool
        @param full_utt: This block of data is an entire utterance.
        Processing an entire utterance at once may improve
        recognition, particularly for the first utterance passed to
        the decoder.
        @type full_utt: bool
        @raise RuntimeError: If the decoder failed to process the data.
        """
        cdef Py_ssize_t nbytes
        cdef Py_ssize_t nsamples
        cdef char* strdata
        cdef raw_data_ptr cdata

        PyString_AsStringAndSize(data, &strdata, &nbytes)
        cdata = strdata
        # Samples are 16 bits wide, i.e. two bytes each.
        nsamples = nbytes // 2
        if ps_process_raw(self.ps, cdata, nsamples, no_search, full_utt) < 0:
            # Format the count itself: '%' and '/' share precedence, so
            # an unparenthesized "... % n / 2" would divide the string.
            raise RuntimeError("Failed to process %d samples of audio data"
                               % nsamples)

    def end_utt(self):
        """
        Finish processing an utterance.

        @raise RuntimeError: If utterance processing could not be stopped.
        """
        if ps_end_utt(self.ps) < 0:
            raise RuntimeError("Failed to stop utterance processing")

    def get_hyp(self):
        """
        Get a hypothesis string.

        This function returns the text which has been recognized so
        far, or, if C{end_utt()} has been called, the final
        recognition result.

        @return: Hypothesis string, utterance ID, recognition score.
        If there is no result, C{(None, None, 0)} is returned.
        @rtype: (str, str, int)
        """
        cdef const_char_ptr hyp
        cdef const_char_ptr uttid
        cdef int score

        hyp = ps_get_hyp(self.ps, &score, &uttid)

        # No result
        if hyp == NULL:
            return None, None, 0

        return hyp, uttid, score

    def get_prob(self):
        """
        Get a posterior probability.

        Returns the posterior in linear scale.

        @return: posterior probability of the result
        @rtype: float
        """
        cdef logmath_t *lmath
        cdef const_char_ptr uttid
        lmath = ps_get_logmath(self.ps)
        return sb.logmath_exp(lmath, ps_get_prob(self.ps, &uttid))

    def get_lattice(self):
        """
        Get the word lattice.

        This function returns all hypotheses which have been
        considered so far, in the form of a word lattice.

        @return: Word lattice
        @rtype: Lattice
        @raise RuntimeError: If no lattice could be created.
        """
        cdef ps_lattice_t *dag
        cdef Lattice lat

        dag = ps_get_lattice(self.ps)
        if dag == NULL:
            raise RuntimeError("Failed to create word lattice")
        lat = Lattice()
        lat.set_dag(dag)
        return lat

    def get_lmset(self):
        """
        Get the language model set.

        This function returns the language model set, which allows you
        to obtain language model scores or switch language models.

        @return: Language model set
        @rtype: sphinxbase.NGramModel
        """
        cdef ngram_model_t *clm
        cdef logmath_t *lmath
        cdef cmd_ln_t *config
        cdef NGramModel lm

        lm = NGramModel()
        # The wrapper gets its own references to the model and the
        # log-math object.
        clm = sb.ngram_model_retain(ps_get_lmset(self.ps))
        lm.set_lm(clm)
        lmath = sb.logmath_retain(ps_get_logmath(self.ps))
        lm.set_lmath(lmath)
        config = ps_get_config(self.ps)

        # This is not necessarily true but it will have to do
        lm.lw = sb.cmd_ln_float32_r(config, "-lw")
        lm.wip = sb.cmd_ln_float32_r(config, "-wip")
        lm.uw = sb.cmd_ln_float32_r(config, "-uw")
        return lm

    def update_lmset(self, NGramModel lmset):
        """
        Notifies the decoder that the LMset has been modified.  Primarily used
        after adding/removing LMs or after switching to particular
        LM within the set

        @param lmset: the modified lmset
        @type lmset: sphinxbase.NGramModel

        @return: this decoder
        @rtype: Decoder
        """
        ps_update_lmset(self.ps, sb.ngram_model_retain(lmset.lm))
        return self

    def add_word(self, word, phones, update=True):
        """
        Add a word to the dictionary and current language model.

        @param word: Name of the word to add.
        @type word: str
        @param phones: Pronunciation of the word, a space-separated list of phones.
        @type phones: str
        @param update: Update the decoder to recognize this new word.
        If adding a number of words at once you may wish to pass
        C{False} here.
        @type update: bool
        @return: The value returned by C{ps_add_word()}.
        @rtype: int
        """
        return ps_add_word(self.ps, word, phones, update)

    def load_dict(self, dictfile, fdictfile=None, format=None):
        """
        Load a new pronunciation dictionary.

        @param dictfile: Dictionary filename.
        @type dictfile: str
        @param fdictfile: Filler dictionary filename, or None.
        @type fdictfile: str
        @param format: Dictionary format, currently unused.
        @type format: str
        @return: The value returned by C{ps_load_dict()}.
        @rtype: int
        """
        cdef char *cfdictfile
        cdef char *cformat

        # None cannot be converted to a C string, so the optional
        # arguments are passed to the C API as NULL when omitted.
        if fdictfile is None:
            cfdictfile = NULL
        else:
            cfdictfile = fdictfile
        if format is None:
            cformat = NULL
        else:
            cformat = format
        return ps_load_dict(self.ps, dictfile, cfdictfile, cformat)

    def save_dict(self, dictfile, format=None):
        """
        Save current pronunciation dictionary to a file.

        @param dictfile: Dictionary filename.
        @type dictfile: str
        @param format: Dictionary format, currently unused.
        @type format: str
        @return: The value returned by C{ps_save_dict()}.
        @rtype: int
        """
        cdef char *cformat

        # None cannot be converted to a C string; pass NULL instead.
        if format is None:
            cformat = NULL
        else:
            cformat = format
        return ps_save_dict(self.ps, dictfile, cformat)

    def segments(self):
        """
        Get an iterator over the word segments of the best hypothesis.

        @return: Segment iterator and the score reported by
        C{ps_seg_iter()}.
        @rtype: (SegmentIterator, int)
        @raise RuntimeError: If the segment iterator could not be created.
        """
        cdef int32 score
        cdef ps_seg_t *first_seg
        cdef SegmentIterator itor
        first_seg = ps_seg_iter(self.ps, &score)
        if first_seg == NULL:
            raise RuntimeError("Failed to create best path word segment iterator")
        itor = SegmentIterator()
        itor.set_iter(first_seg)
        return (itor, score)
733
734