1# Copyright (c) 2008 Carnegie Mellon University. All rights
2# reserved.
3#
4# You may copy, modify, and distribute this code under the same terms
5# as PocketSphinx or Python, at your convenience, as long as this
6# notice is not removed.
7#
8# Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
9
cdef class LogMath:
    """
    Log-space math class.

    This class provides fast logarithmic math functions in base
    1.000+epsilon, useful for fixed point speech recognition.

    @param base: The base B in which computation is to be done.
    @type base: float
    @param shift: Log values are shifted right by this many bits.
    @type shift: int
    @param use_table: Whether to use an add table or not.
    @type use_table: bool
    """
    def __init__(self, base=1.0001, shift=0, use_table=1):
        # Allocate the underlying C logmath_t; released in __dealloc__.
        self.lmath = logmath_init(base, shift, use_table)

    def __dealloc__(self):
        """
        Destructor for LogMath class.
        """
        logmath_free(self.lmath)

    def get_zero(self):
        """
        Get the log-zero value.

        @return: Smallest number representable by this object.
        @rtype: int
        """
        return logmath_get_zero(self.lmath)

    def add(self, a, b):
        """
        Add two numbers in log-space.

        @param a: Logarithm A.
        @type a: int
        @param b: Logarithm B.
        @type b: int
        @return: log(exp(a)+exp(b))
        @rtype: int
        """
        return logmath_add(self.lmath, a, b)

    def log(self, x):
        """
        Return log-value of a number.

        @param x: Number (in linear space)
        @type x: float
        @return: Log-value of x.
        @rtype: int
        """
        return logmath_log(self.lmath, x)

    def exp(self, x):
        """
        Return linear value of a log-value.

        @param x: Logarithm X (in this object's base)
        @type x: int
        @return: Exponent (linear value) of X.
        @rtype: float
        """
        return logmath_exp(self.lmath, x)

    def log_to_ln(self, x):
        """
        Return natural logarithm of a log-value.

        @param x: Logarithm X (in this object's base)
        @type x: int
        @return: Natural log equivalent of x.
        @rtype: float
        """
        return logmath_log_to_ln(self.lmath, x)

    def log_to_log10(self, x):
        """
        Return logarithm in base 10 of a log-value.

        @param x: Logarithm X (in this object's base)
        @type x: int
        @return: log10 equivalent of x.
        @rtype: float
        """
        return logmath_log_to_log10(self.lmath, x)

    def ln_to_log(self, x):
        """
        Return log-value of a natural logarithm.

        @param x: Logarithm X (in base e)
        @type x: float
        @return: Log-value equivalent of x.
        @rtype: int
        """
        return logmath_ln_to_log(self.lmath, x)

    def log10_to_log(self, x):
        """
        Return log-value of a base 10 logarithm.

        @param x: Logarithm X (in base 10)
        @type x: float
        @return: Log-value equivalent of x.
        @rtype: int
        """
        return logmath_log10_to_log(self.lmath, x)
120
# Unfortunately, Cython doesn't actually export enums to Python...
# so re-export the C enum values as module-level constants.
AUTO = NGRAM_AUTO    # Detect the LM file format automatically
ARPA = NGRAM_ARPA    # ARPABO text format
DMP = NGRAM_DMP      # Sphinx DMP binary format
UPPER = NGRAM_UPPER  # Case-folding direction for NGramModel.casefold()
LOWER = NGRAM_LOWER  # Case-folding direction for NGramModel.casefold()
127
cdef class NGramModel:
    """
    N-Gram language model class.

    This class provides access to N-Gram language models stored on
    disk.  These can be in ARPABO text format or Sphinx DMP format.
    Methods are provided for scoring N-Grams based on the model
    and looking up words in the model.

    @param file: Path to an N-Gram model file.
    @type file: string
    @param lw: Language weight to apply to model probabilities.
    @type lw: float
    @param wip: Word insertion penalty to add to model probabilities
    @type wip: float
    @param uw: Weight to give unigrams when interpolating with uniform distribution.
    @type uw: float
    @param lmctl: Path to a language model set control file (mutually
        exclusive with C{file}; only consulted when C{file} is not given).
    @type lmctl: string
    """
    def __init__(self, file=None, lw=1.0, wip=1.0, uw=1.0, lmctl=None):
        # Private log-math object used for score conversions; may later
        # be replaced via set_lmath() with one shared by another object.
        self.lmath = logmath_init(1.0001, 0, 0)
        if file:
            # NGRAM_AUTO: let the reader detect ARPA vs. DMP format.
            self.lm = ngram_model_read(NULL, file, NGRAM_AUTO, self.lmath)
            ngram_model_apply_weights(self.lm, lw, wip, uw)
        elif lmctl:
            # A control file describes a set of language models.
            self.lm = ngram_model_set_read(NULL, lmctl, self.lmath)
        else:
            self.lm = NULL
        self.lw = lw
        self.wip = wip
        self.uw = uw

    # Adopt an existing C language model: retain the new one first,
    # then release whatever we previously held.
    cdef set_lm(NGramModel self, ngram_model_t *lm):
        ngram_model_retain(lm)
        ngram_model_free(self.lm)
        self.lm = lm

    # Adopt an existing C log-math object, same retain-then-free pattern.
    cdef set_lmath(NGramModel self, logmath_t *lmath):
        logmath_retain(lmath)
        logmath_free(self.lmath)
        self.lmath = lmath

    def set_add(NGramModel self, NGramModel lm, name, float weight=1.0, int reuse_widmap=0):
        """
        Adds a language model to the lmset

        @param lm: language model to add
        @type lm: sphinxbase.NGramModel
        @param name: name of the language model
        @type name: string
        @param weight: language model weight (defaults to 1.0)
        @type weight: float
        @param reuse_widmap: whether to reuse the word ID mapping
        @type reuse_widmap: int
        """
        ngram_model_set_add(self.lm, lm.lm, name, weight, reuse_widmap)

    def set_select(NGramModel self, name):
        """
        Instructs the LMSet to switch to the LM specified by the name param

        @param name: the name associated with the language model
        @type name: string
        """
        ngram_model_set_select(self.lm, name)

    def __dealloc__(self):
        """
        Destructor for N-Gram model class.
        """
        logmath_free(self.lmath)
        ngram_model_free(self.lm)

    def apply_weights(self, lw=1.0, wip=1.0, uw=1.0):
        """
        Change the language model weights applied in L{score}.

        @param lw: Language weight to apply to model probabilities.
        @type lw: float
        @param wip: Word insertion penalty to add to model probabilities
        @type wip: float
        @param uw: Weight to give unigrams when interpolating with uniform distribution.
        @type uw: float
        """
        self.lw = lw
        self.wip = wip
        self.uw = uw
        ngram_model_apply_weights(self.lm, lw, wip, uw)

    def get_size(self):
        """
        Get the order of this model (i.e. the 'N' in 'N-gram')

        @return: Order of this model
        @rtype: int
        """
        return ngram_model_get_size(self.lm)

    def get_counts(self):
        """
        Get the counts of each size of N-gram.

        @return: Counts of 1, 2, ..., N grams
        @rtype: tuple(int)
        """
        cdef const_int_ptr counts
        counts = ngram_model_get_counts(self.lm)
        return tuple([counts[i] for i in range(ngram_model_get_size(self.lm))])

    def unknown_wid(self):
        """
        Get the ID for an unknown word.

        In the case of a closed-vocabulary language model this will be -1.

        @return: Word ID for the unknown word.
        @rtype: int
        """
        return ngram_unknown_wid(self.lm)

    def zero(self):
        """
        Get the log-zero value for this language model.

        @return: Log value used to represent zero, converted to base e.
        @rtype: float
        """
        return logmath_log_to_ln(self.lmath, ngram_zero(self.lm))

    def wid(self, word):
        """
        Get the internal ID for a word.

        @param word: Word in question
        @type word: string
        @return: Internal ID for word, or -1 if not present
        @rtype: int
        """
        return ngram_wid(self.lm, word)

    def word(self, wid):
        """
        Get the string corresponding to an internal word ID.

        @param wid: Word ID in question
        @type wid: int
        @return: String for word, or None if not present
        @rtype: string
        """
        return ngram_word(self.lm, wid)

    # Note that this and prob() are almost exactly the same...
    def score(self, word, *args):
        """
        Get the score for an N-Gram.

        The argument list consists of the history words (as
        null-terminated strings) of the N-Gram, in reverse order.
        Therefore, if you wanted to get the N-Gram score for 'a whole
        joy', you would call::

         score, n_used = model.score('joy', 'whole', 'a')

        This function returns a tuple, consisting of the score and the
        number of words used in computing it (i.e. the effective size
        of the N-Gram).  The score is returned in logarithmic form,
        using base e.

        If one of the words is not in the LM's vocabulary, the result
        will depend on whether this is an open or closed vocabulary
        language model.  For an open-vocabulary model, unknown words
        are all mapped to the unigram <UNK> which has a non-zero
        probability and also participates in higher-order N-Grams.
        Therefore, you will get a score of some sort in this case.

        For a closed-vocabulary model, unknown words are impossible
        and thus have zero probability.  Therefore, if C{word} is
        unknown, this function will return a 'zero' log-probability,
        i.e. a large negative number.
        """
        cdef int32 wid
        cdef int32 *hist
        cdef int32 n_hist
        cdef int32 n_used
        cdef int32 score
        wid = ngram_wid(self.lm, word)
        n_hist = len(args)
        # Temporary C array for the history word IDs; freed below.
        hist = <int32 *>ckd_calloc(n_hist, sizeof(int32))
        for i from 0 <= i < n_hist:
            spam = args[i]
            hist[i] = ngram_wid(self.lm, spam)
        score = ngram_ng_score(self.lm, wid, hist, n_hist, &n_used)
        ckd_free(hist)
        # Convert from the model's log base to natural log.
        return logmath_log_to_ln(self.lmath, score), n_used

    def prob(self, word, *args):
        """
        Get the log-probability for an N-Gram.

        This works effectively the same way as L{score}, except that
        any weights (language weight, insertion penalty) applied to
        the language model are ignored and the 'raw' probability value
        is returned.
        """
        cdef int32 wid
        cdef int32 *hist
        cdef int32 n_hist
        cdef int32 n_used
        cdef int32 score
        wid = ngram_wid(self.lm, word)
        n_hist = len(args)
        # Temporary C array for the history word IDs; freed below.
        hist = <int32 *>ckd_calloc(n_hist, sizeof(int32))
        for i from 0 <= i < n_hist:
            spam = args[i]
            hist[i] = ngram_wid(self.lm, spam)
        score = ngram_ng_prob(self.lm, wid, hist, n_hist, &n_used)
        ckd_free(hist)
        return logmath_log_to_ln(self.lmath, score), n_used

    def mgrams(self, m):
        """
        Return an iterator over each N-gram of order m+1.

        This allows Pythonic iteration over the parameters of an
        N-Gram model.

        @param m: Order of requested N-grams minus one
        @type m: int
        @return: Iterator over M+1-grams
        @rtype: NGramIter
        """
        cdef NGramIter itor
        itor = NGramIter(self, m)
        itor.itor = ngram_model_mgrams(self.lm, m)
        return itor

    def ngram(self, word, *args):
        """
        Return an N-Gram iterator pointing to a given N-gram.

        This allows you to iterate over its successors among other
        things.

        @param word: Head word of requested N-gram.
        @type word: str
        @param args: History words of requested N-gram
        @type args: str
        @return: Iterator pointing to N-gram.
        """
        cdef NGramIter itor
        cdef int32 wid
        cdef int32 *hist
        cdef int32 n_hist
        wid = ngram_wid(self.lm, word)
        n_hist = len(args)
        hist = <int32 *>ckd_calloc(n_hist, sizeof(int32))
        for i from 0 <= i < n_hist:
            spam = args[i]
            hist[i] = ngram_wid(self.lm, spam)
        itor = NGramIter(self, n_hist)
        # We do set_iter here, because we're returning something the
        # user is immediately going to do stuff with.
        itor.set_iter(ngram_ng_iter(self.lm, wid, hist, n_hist))
        ckd_free(hist)
        return itor

    def add_word(self, word, weight=1.0):
        """
        Add a word to the unigrams of the language model.

        @param word: Word string to add.
        @type word: string
        @param weight: Weight of this word relative to uniform
            (defaults to 1.0).
        @type weight: float
        @return: Result of ngram_model_add_word (presumably the new
            word's ID, or a negative value on failure -- TODO confirm).
        @rtype: int
        """
        return ngram_model_add_word(self.lm, word, weight)

    def recode(self, frum, too):
        """
        Recode the word strings in the model from one character
        encoding to another.

        @param frum: Name of the source encoding.
        @type frum: string
        @param too: Name of the target encoding.
        @type too: string
        @raise ValueError: If recoding failed.
        """
        cdef int rv
        rv = ngram_model_recode(self.lm, frum, too)
        if rv == -1:
            raise ValueError, "Recode from %s to %s failed" % (frum, too)

    def casefold(self, kase):
        """
        Case-fold the word strings in the model.

        @param kase: Direction of case folding (the module-level
            UPPER or LOWER constants).
        @type kase: int
        @raise ValueError: If case folding failed.
        """
        cdef int rv
        rv = ngram_model_casefold(self.lm, kase)
        if rv == -1:
            raise ValueError, "Casefolding failed"

    def write(self, file_name, format=NGRAM_AUTO):
        """
        Write the language model to a file.

        @param file_name: Path of the file to write to.
        @type file_name: string
        @param format: File format to write (the module-level AUTO,
            ARPA, or DMP constants).
        @type format: int
        @raise ValueError: If writing failed.
        """
        cdef int rv
        rv = ngram_model_write(self.lm, file_name, format)
        if rv == -1:
            raise ValueError, "Write %s to file failed" % file_name
413
cdef class NGramIter:
    """
    N-Gram language model iterator class.

    This class provides access to the individual N-grams stored in a
    language model.

    Each iteration step exposes the current N-gram through the
    C{words}, C{log_prob} and C{log_bowt} attributes (log values are
    in base e).
    """
    def __cinit__(self, NGramModel lm, int m):
        # The C iterator is attached later, either directly (see
        # NGramModel.mgrams, successors) or via set_iter().
        self.itor = NULL
        self.lm = lm
        # Order of the iterated N-grams minus one.
        self.m = m
        self.first_item = True

    def __iter__(self):
        # Reset so __next__ yields the initial position first.
        self.first_item = True
        return self

    # Move this object to the C iterator position `itor` and extract
    # the current N-gram's words, probability and backoff weight.
    # A NULL iterator means the end of iteration.
    cdef set_iter(NGramIter self, ngram_iter_t *itor):
        cdef int32 prob, bowt
        cdef const_int32_ptr wids

        if itor == NULL:
            raise StopIteration
        self.itor = itor
        if self.first_item:
            self.first_item = False
        wids = ngram_iter_get(itor, &prob, &bowt)
        # Convert from the model's log base to natural log.
        self.log_prob = logmath_log_to_ln(self.lm.lmath, prob)
        self.log_bowt = logmath_log_to_ln(self.lm.lmath, bowt)
        self.words = []
        for i in range(0, self.m+1):
            self.words.append(ngram_word(self.lm.lm, wids[i]))

    def __next__(self):
        # On the first call, yield the current position itself rather
        # than advancing past it.
        if self.first_item:
            self.set_iter(self.itor)
        else:
            self.set_iter(ngram_iter_next(self.itor))
        return self

    def successors(self):
        """
        Get an iterator over N+1-gram successors of an N-gram.

        @return: Iterator over the N+1-grams extending this N-gram.
        @rtype: NGramIter
        """
        cdef NGramIter itor
        itor = NGramIter(self.lm, self.m + 1)
        itor.itor = ngram_iter_successors(self.itor)
        return itor
462
def binstr(str val, int nbits):
    """
    Silly function to format a string as a binary string.

    Emits the high-order bits of each character, most significant bit
    first, up to nbits bits in total.
    """
    cdef int i
    pieces = []
    for ch in val:
        byteval = ord(ch)
        nb = min(8, nbits)
        for i in range(0, nb):
            pieces.append("%d" % ((byteval & (1 << 7-i)) != 0))
        nbits -= 8
    return "".join(pieces)
476
def bincw(int cw, int nbits):
    """
    Silly function to format an int as a binary string.

    Renders the low nbits bits of cw, most significant bit first.
    """
    cdef int i
    digits = []
    for i in range(0, nbits):
        digits.append("%s" % (cw & 1))
        cw >>= 1
    digits.reverse()
    return "".join(digits)
487
488# FIXME: Due to the style of IO in huff_code API this part of the code
489# is not compatible with Python 3. This needs to be converted to
490# the new Python io module.
491
cdef class HuffCode:
    """
    Huffman coding class.

    You can either construct a Huffman code from an alphabet of
    symbols with frequencies, or read one from a file.  Either the
    alphabet or infile argument (but not both) must be passed to the
    constructor.

    NOTE(review): this class uses the Python 2 C{file} builtin and
    C{PyFile_AsFile}, so it is not Python 3 compatible (see the FIXME
    above the class).

    @param alphabet: Alphabet of (symbol, frequency) pairs
    @type alphabet: [(str, int)]
    @param infile: File handle or filename to read from
    @type infile: file | str
    """
    def __init__(self, alphabet=None, infile=None):
        cdef char **symbols
        cdef int *frequencies
        cdef int nsym

        if alphabet == None and infile == None:
            raise ValueError, "One of alphabet or infile must be passed to constructor"
        if alphabet != None and infile != None:
            raise ValueError, "Only one of alphabet or infile must be passed to constructor"

        self.fh = None
        if infile:
            self.read(infile)
            return

        nsym = len(alphabet)
        frequencies = <int *>ckd_calloc(nsym, sizeof(int))
        symbols = <char **>ckd_calloc(nsym, sizeof(char *))
        # Need to create separate Python objects for each string,
        # otherwise we get random duplicates of the codewords...
        # NOTE(review): codebook keys are repr(sym) -- encode() and
        # encode_to_file() below apply the same repr(), so lookups agree.
        bogus = []
        for i, spam in enumerate(alphabet):
            sym, freq = spam
            bogus.append(repr(sym))
            frequencies[i] = freq
            symbols[i] = bogus[-1]
        self.hc = huff_code_build_str(symbols, frequencies, nsym)
        # Free the scratch arrays (NOTE(review): assumes
        # huff_code_build_str copies what it needs -- confirm in C API).
        ckd_free(frequencies)
        ckd_free(symbols)

    def read(self, infile):
        """
        Read a codebook from a file handle or filename, replacing any
        codebook this object currently holds.
        """
        if not isinstance(infile, file):
            infile = file(infile, "rb")
        # Release the previous codebook before overwriting the pointer.
        huff_code_free(self.hc)
        self.hc = huff_code_read(PyFile_AsFile(infile))

    def write(self, outfile):
        """
        Write the codebook to a file handle or filename.
        """
        if not isinstance(outfile, file):
            outfile = file(outfile, "wb")
        huff_code_write(self.hc, PyFile_AsFile(outfile))

    def dump(self, outfile):
        """
        Dump the codebook to a file handle or filename in text form.
        """
        if not isinstance(outfile, file):
            outfile = file(outfile, "w")
        huff_code_dump(self.hc, PyFile_AsFile(outfile))

    def encode(self, seq):
        """
        Encode a sequence of symbols to a byte array, returning that
        array and the bit offset of the next codeword in the last
        byte (i.e. 8 minus the number of extra zero bits)
        """
        cdef unsigned int cw
        cdef int cwlen, nbits = 0, nbytes, offset, i
        cdef unsigned char buf = 0
        cdef char *output

        # First pass: total up the bit length so we can size the buffer.
        for sym in seq:
            sss = repr(sym)
            cwlen = huff_code_encode_str(self.hc, sss, &cw)
            nbits += cwlen
        # Round up to whole bytes.
        nbytes = int((nbits + 7) / 8)
        offset = 0
        output = <char *>PyMem_Malloc(nbytes + 1)
        output[nbytes] = 0
        i = 0
        nbits = 0
        # Second pass: pack codewords MSB-first into output, buffering
        # partial bytes in buf; offset is the bit position within buf.
        for sym in seq:
            sss = repr(sym)
            cwlen = huff_code_encode_str(self.hc, sss, &cw)
            #print "sym: %s cw: %s buf: %s output: %s" \
            #      % (sym, bincw(cw, cwlen), bincw(buf >> (8-offset), offset),
            #         binstr(output, nbits))
            #print "cwlen",cwlen
            # Do one byte at a time while full bytes are available
            while cwlen >= 8:
                # Fill low bits of buf with high bits of cw
                buf |= (cw >> (cwlen - (8 - offset))) & ((1 << (8 - offset)) - 1)
                # Append buf to output
                output[i] = buf
                i += 1
                nbits += 8
                # Fill high bits of buf with low bits of this byte
                cwlen -= 8
                buf = (cw >> cwlen) & ((1 << offset) - 1)
                buf <<= (8-offset)
                #print "cwlen",cwlen
            # Now cwlen will be less than 8, but it might still be
            # more than the available space in buf.
            if cwlen >= (8 - offset):
                # Fill low bits of buf with (8-offset) highest bits of cw
                buf |= (cw >> (cwlen - (8 - offset))) & ((1 << (8 - offset)) - 1)
                # Append buf to output
                output[i] = buf
                i += 1
                nbits += 8
                # cwlen is down to the remaining bits
                cwlen -= (8 - offset)
                # Offset is now zero since we just completed and emptied buf
                offset = 0
                # buf is zero, because we just emptied it without putting stuff in
                buf = 0
                #print "cwlen",cwlen
                # Any remaining  bits will be taken care of below (we hope)
            # Add remaining high bits of cw to low bits of buf
            #print "cwlen",cwlen
            buf |= ((cw & ((1 << cwlen) - 1)) << (8 - offset - cwlen))
            offset += cwlen
            #print "after buf: %s output: %s" \
            #      % (bincw(buf >> (8-offset), offset), binstr(output, nbits))
        if offset > 0:
            # Append buf to output
            output[i] = buf
            nbits += offset
            i += 1
        #print "output:", binstr(output, nbits)
        outstr = PyString_FromStringAndSize(output, nbytes)
        PyMem_Free(output)
        return (outstr, offset)

    def decode(self, data):
        """
        Decode a sequence of symbols from a string, returning the
        sequence and the bit offset of the next codeword in the last
        byte (i.e. 8 minus the number of remaining bits)
        """
        cdef int offset
        cdef const_char_ptr dptr
        cdef const_char_ptr strval
        cdef size_t dlen

        dlen = len(data)
        offset = 0
        dptr = data
        output = []
        # The C decoder advances dptr/offset and decrements dlen as it
        # consumes input; NULL means no further codeword could be read.
        while True:
            strval = huff_code_decode_str(self.hc, &dptr, &dlen, &offset)
            if strval == NULL:
                break
            output.append(strval)
        # More than the trailing partial byte left over means the
        # input contained data that did not decode to any codeword.
        if dlen > 1:
            raise ValueError, "Invalid data at position %d" % (len(data) - dlen)
        return (output, offset)

    def attach(self, fh, char *mode):
        """
        Attach the codebook to a file for streaming encoding/decoding
        (see encode_to_file and decode_from_file).

        @param fh: File handle or filename.
        @param mode: File mode string (passed to the C layer).
        """
        if not isinstance(fh, file):
            fh = file(fh, mode)
        # Keep a Python reference so the file stays open while attached.
        self.fh = fh
        huff_code_attach(self.hc, PyFile_AsFile(fh), mode)

    def detach(self):
        """
        Detach the codebook from its attached file.
        """
        huff_code_detach(self.hc)
        self.fh = None

    def encode_to_file(self, seq):
        """
        Encode a sequence of symbols to the attached file.

        @raise RuntimeError: If no file is attached.
        """
        if self.fh == None:
            raise RuntimeError, "No file is attached"
        for sym in seq:
            strsym = repr(sym)
            # NULL codeword pointer: presumably the C layer writes the
            # codeword to the attached file instead -- see huff_code API.
            huff_code_encode_str(self.hc, strsym, NULL)

    def decode_from_file(self):
        """
        Decode a single symbol from the attached file.

        @return: The decoded symbol, or None when nothing could be read.
        @raise RuntimeError: If no file is attached.
        """
        cdef const_char_ptr sym
        if self.fh == None:
            raise RuntimeError, "No file is attached"
        sym = huff_code_decode_str(self.hc, NULL, NULL, NULL)
        if sym == NULL:
            return None
        else:
            return sym

    def __dealloc__(self):
        """
        Destructor: detach any attached file and free the codebook.
        """
        if self.fh:
            self.detach()
        huff_code_free(self.hc)
681