# Copyright (c) 2008 Carnegie Mellon University. All rights
# reserved.
#
# You may copy, modify, and distribute this code under the same terms
# as PocketSphinx or Python, at your convenience, as long as this
# notice is not removed.
#
# Author: David Huggins-Daines <dhuggins@cs.cmu.edu>

cdef class LogMath:
    """
    Log-space math class.

    This class provides fast logarithmic math functions in base
    1.000+epsilon, useful for fixed point speech recognition.

    @param base: The base B in which computation is to be done.
    @type base: float
    @param shift: Log values are shifted right by this many bits.
    @type shift: int
    @param use_table: Whether to use an add table or not
    @type use_table: bool
    """
    def __init__(self, base=1.0001, shift=0, use_table=1):
        self.lmath = logmath_init(base, shift, use_table)

    def __dealloc__(self):
        """
        Destructor for LogMath class.
        """
        logmath_free(self.lmath)

    def get_zero(self):
        """
        Get the log-zero value.

        @return: Smallest number representable by this object.
        @rtype: int
        """
        return logmath_get_zero(self.lmath)

    def add(self, a, b):
        """
        Add two numbers in log-space.

        @param a: Logarithm A.
        @type a: int
        @param b: Logarithm B.
        @type b: int
        @return: log(exp(a)+exp(b))
        @rtype: int
        """
        return logmath_add(self.lmath, a, b)

    def log(self, x):
        """
        Return log-value of a number.

        @param x: Number (in linear space)
        @type x: float
        @return: Log-value of x.
        @rtype: int
        """
        return logmath_log(self.lmath, x)

    def exp(self, x):
        """
        Return linear of a log-value

        @param x: Logarithm X (in this object's base)
        @type x: int
        @return: Exponent (linear value) of X.
        @rtype: float
        """
        return logmath_exp(self.lmath, x)

    def log_to_ln(self, x):
        """
        Return natural logarithm of a log-value.

        @param x: Logarithm X (in this object's base)
        @type x: int
        @return: Natural log equivalent of x.
        @rtype: float
        """
        return logmath_log_to_ln(self.lmath, x)

    def log_to_log10(self, x):
        """
        Return logarithm in base 10 of a log-value.

        @param x: Logarithm X (in this object's base)
        @type x: int
        @return: log10 equivalent of x.
        @rtype: float
        """
        return logmath_log_to_log10(self.lmath, x)

    def ln_to_log(self, x):
        """
        Return log-value of a natural logarithm.

        @param x: Logarithm X (in base e)
        @type x: float
        @return: Log-value equivalent of x.
        @rtype: int
        """
        return logmath_ln_to_log(self.lmath, x)

    def log10_to_log(self, x):
        """
        Return log-value of a base 10 logarithm.

        @param x: Logarithm X (in base 10)
        @type x: float
        @return: Log-value equivalent of x.
        @rtype: int
        """
        return logmath_log10_to_log(self.lmath, x)

# Unfortunately, Cython doesn't actually export enums to Python...
# These mirror the ngram_file_type_t / case constants from the C API
# so Python callers can pass them to NGramModel.write() and casefold().
AUTO = NGRAM_AUTO
ARPA = NGRAM_ARPA
DMP = NGRAM_DMP
UPPER = NGRAM_UPPER
LOWER = NGRAM_LOWER

cdef class NGramModel:
    """
    N-Gram language model class.

    This class provides access to N-Gram language models stored on
    disk.  These can be in ARPABO text format or Sphinx DMP format.
    Methods are provided for scoring N-Grams based on the model
    and looking up words in the model.

    @param file: Path to an N-Gram model file.
    @type file: string
    @param lw: Language weight to apply to model probabilities.
    @type lw: float
    @param wip: Word insertion penalty to add to model probabilities
    @type wip: float
    @param uw: Weight to give unigrams when interpolating with uniform distribution.
    @type uw: float
    """
    def __init__(self, file=None, lw=1.0, wip=1.0, uw=1.0, lmctl=None):
        # An internal LogMath object is always created; scores returned
        # to Python are converted to natural log with logmath_log_to_ln.
        self.lmath = logmath_init(1.0001, 0, 0)
        if file:
            # Single LM file; format is auto-detected (NGRAM_AUTO).
            self.lm = ngram_model_read(NULL, file, NGRAM_AUTO, self.lmath)
            ngram_model_apply_weights(self.lm, lw, wip, uw)
        elif lmctl:
            # LM control file describing a set of language models.
            self.lm = ngram_model_set_read(NULL, lmctl, self.lmath)
        else:
            self.lm = NULL
        self.lw = lw
        self.wip = wip
        self.uw = uw

    cdef set_lm(NGramModel self, ngram_model_t *lm):
        # Take a reference to a foreign ngram_model_t, releasing any
        # model previously held by this wrapper.
        ngram_model_retain(lm)
        ngram_model_free(self.lm)
        self.lm = lm

    cdef set_lmath(NGramModel self, logmath_t *lmath):
        # Take a reference to a foreign logmath_t, releasing the one
        # previously held by this wrapper.
        logmath_retain(lmath)
        logmath_free(self.lmath)
        self.lmath = lmath

    def set_add(NGramModel self, NGramModel lm, name, float weight=1.0, int reuse_widmap=0):
        """
        Add a language model to the lmset.

        @param lm: language model to add
        @type lm: sphinxbase.NGramModel
        @param name: name of the language model
        @type name: string
        @param weight: language model weight (defaults to 1.0)
        @type weight: float
        @param reuse_widmap: whether to reuse the word ID mapping
        @type reuse_widmap: int
        """
        ngram_model_set_add(self.lm, lm.lm, name, weight, reuse_widmap)

    def set_select(NGramModel self, name):
        """
        Instructs the LMSet to switch to the LM specified by the name param

        @param name: the name associated with the language model
        @type name: string
        """
        ngram_model_set_select(self.lm, name)

    def __dealloc__(self):
        """
        Destructor for N-Gram model class.
        """
        logmath_free(self.lmath)
        ngram_model_free(self.lm)

    def apply_weights(self, lw=1.0, wip=1.0, uw=1.0):
        """
        Change the language model weights applied in L{score}.

        @param lw: Language weight to apply to model probabilities.
        @type lw: float
        @param wip: Word insertion penalty to add to model probabilities
        @type wip: float
        @param uw: Weight to give unigrams when interpolating with uniform distribution.
        @type uw: float
        """
        self.lw = lw
        self.wip = wip
        self.uw = uw
        ngram_model_apply_weights(self.lm, lw, wip, uw)

    def get_size(self):
        """
        Get the order of this model (i.e. the 'N' in 'N-gram')

        @return: Order of this model
        @rtype: int
        """
        return ngram_model_get_size(self.lm)

    def get_counts(self):
        """
        Get the counts of each size of N-gram.

        @return: Counts of 1, 2, ..., N grams
        @rtype: tuple(int)
        """
        cdef const_int_ptr counts
        counts = ngram_model_get_counts(self.lm)
        return tuple([counts[i] for i in range(ngram_model_get_size(self.lm))])

    def unknown_wid(self):
        """
        Get the ID for an unknown word.

        In the case of a closed-vocabulary language model this will be -1.

        @return: Word ID for the unknown word.
        @rtype: int
        """
        return ngram_unknown_wid(self.lm)

    def zero(self):
        """
        Get the log-zero value for this language model.

        @return: Log value used to represent zero.
        @rtype: float
        """
        return logmath_log_to_ln(self.lmath, ngram_zero(self.lm))

    def wid(self, word):
        """
        Get the internal ID for a word.

        @param word: Word in question
        @type word: string
        @return: Internal ID for word, or -1 if not present
        @rtype: int
        """
        return ngram_wid(self.lm, word)

    def word(self, wid):
        """
        Get the string corresponding to an internal word ID.

        @param word: Word ID in question
        @type word: int
        @return: String for word, or None if not present
        @rtype: string
        """
        return ngram_word(self.lm, wid)

    # Note that this and prob() are almost exactly the same...
    def score(self, word, *args):
        """
        Get the score for an N-Gram.

        The argument list consists of the history words (as
        null-terminated strings) of the N-Gram, in reverse order.
        Therefore, if you wanted to get the N-Gram score for 'a whole
        joy', you would call::

         score, n_used = model.score('joy', 'whole', 'a')

        This function returns a tuple, consisting of the score and the
        number of words used in computing it (i.e. the effective size
        of the N-Gram).  The score is returned in logarithmic form,
        using base e.

        If one of the words is not in the LM's vocabulary, the result
        will depend on whether this is an open or closed vocabulary
        language model.  For an open-vocabulary model, unknown words
        are all mapped to the unigram <UNK> which has a non-zero
        probability and also participates in higher-order N-Grams.
        Therefore, you will get a score of some sort in this case.

        For a closed-vocabulary model, unknown words are impossible
        and thus have zero probability.  Therefore, if C{word} is
        unknown, this function will return a 'zero' log-probability,
        i.e. a large negative number.
        """
        cdef int32 wid
        cdef int32 *hist
        cdef int32 n_hist
        cdef int32 n_used
        cdef int32 score
        wid = ngram_wid(self.lm, word)
        n_hist = len(args)
        # Temporary C array of history word IDs; freed after scoring.
        hist = <int32 *>ckd_calloc(n_hist, sizeof(int32))
        for i from 0 <= i < n_hist:
            spam = args[i]
            hist[i] = ngram_wid(self.lm, spam)
        score = ngram_ng_score(self.lm, wid, hist, n_hist, &n_used)
        ckd_free(hist)
        return logmath_log_to_ln(self.lmath, score), n_used

    def prob(self, word, *args):
        """
        Get the log-probability for an N-Gram.

        This works effectively the same way as L{score}, except that
        any weights (language weight, insertion penalty) applied to
        the language model are ignored and the 'raw' probability value
        is returned.
        """
        cdef int32 wid
        cdef int32 *hist
        cdef int32 n_hist
        cdef int32 n_used
        cdef int32 score
        wid = ngram_wid(self.lm, word)
        n_hist = len(args)
        # Temporary C array of history word IDs; freed after scoring.
        hist = <int32 *>ckd_calloc(n_hist, sizeof(int32))
        for i from 0 <= i < n_hist:
            spam = args[i]
            hist[i] = ngram_wid(self.lm, spam)
        score = ngram_ng_prob(self.lm, wid, hist, n_hist, &n_used)
        ckd_free(hist)
        return logmath_log_to_ln(self.lmath, score), n_used

    def mgrams(self, m):
        """
        Return an iterator over each N-gram of order m+1.

        This allows Pythonic iteration over the parameters of an
        N-Gram model.

        @param m: Order of requested N-grams minus one
        @type m: int
        @return: Iterator over M+1-grams
        @rtype: NGramIter
        """
        cdef NGramIter itor
        itor = NGramIter(self, m)
        itor.itor = ngram_model_mgrams(self.lm, m)
        return itor

    def ngram(self, word, *args):
        """
        Return an N-Gram iterator pointing to a given N-gram.

        This allows you to iterate over its successors among other
        things.

        @param word: Head word of requested N-gram.
        @type word: str
        @param args: History words of requested N-gram
        @type args: str
        @return: Iterator pointing to N-gram.
        """
        cdef NGramIter itor
        cdef int32 wid
        cdef int32 *hist
        cdef int32 n_hist
        wid = ngram_wid(self.lm, word)
        n_hist = len(args)
        hist = <int32 *>ckd_calloc(n_hist, sizeof(int32))
        for i from 0 <= i < n_hist:
            spam = args[i]
            hist[i] = ngram_wid(self.lm, spam)
        itor = NGramIter(self, n_hist)
        # We do set_iter here, because we're returning something the
        # user is immediately going to do stuff with.
        itor.set_iter(ngram_ng_iter(self.lm, wid, hist, n_hist))
        ckd_free(hist)
        return itor

    def add_word(self, word, weight=1.0):
        """
        Add a word to the language model's vocabulary.

        @param word: Word string to add.
        @type word: string
        @param weight: Weight for the new word's unigram probability.
        @type weight: float
        @return: Word ID of the newly added word.
        @rtype: int
        """
        return ngram_model_add_word(self.lm, word, weight)

    def recode(self, frum, too):
        """
        Recode the language model vocabulary between character encodings.

        @param frum: Source encoding name.
        @type frum: string
        @param too: Target encoding name.
        @type too: string
        @raise ValueError: If the underlying recode operation fails.
        """
        cdef int rv
        rv = ngram_model_recode(self.lm, frum, too)
        if rv == -1:
            raise ValueError, "Recode from %s to %s failed" % (frum, too)

    def casefold(self, kase):
        """
        Case-fold the words in the language model.

        @param kase: One of the module constants UPPER or LOWER.
        @type kase: int
        @raise ValueError: If the underlying casefold operation fails.
        """
        cdef int rv
        rv = ngram_model_casefold(self.lm, kase)
        if rv == -1:
            raise ValueError, "Casefolding failed"

    def write(self, file_name, format=NGRAM_AUTO):
        """
        Write the language model to a file.

        @param file_name: Path of the file to write.
        @type file_name: string
        @param format: Output format (AUTO, ARPA, or DMP module constants).
        @type format: int
        @raise ValueError: If the underlying write operation fails.
        """
        cdef int rv
        rv = ngram_model_write(self.lm, file_name, format)
        if rv == -1:
            raise ValueError, "Write %s to file failed" % file_name

cdef class NGramIter:
    """
    N-Gram language model iterator class.

    This class provides access to the individual N-grams stored in a
    language model.
    """
    def __cinit__(self, NGramModel lm, int m):
        # m is the N-gram order minus one (0 for unigrams, etc.)
        self.itor = NULL
        self.lm = lm
        self.m = m
        # first_item defers advancing the C iterator until the second
        # __next__ call, so the initial position is not skipped.
        self.first_item = True

    def __iter__(self):
        self.first_item = True
        return self

    cdef set_iter(NGramIter self, ngram_iter_t *itor):
        # Point this wrapper at a C iterator position and cache the
        # current N-gram's log-probability, backoff weight, and words.
        cdef int32 prob, bowt
        cdef const_int32_ptr wids

        if itor == NULL:
            # End of iteration in the C API is a NULL iterator.
            raise StopIteration
        self.itor = itor
        if self.first_item:
            self.first_item = False
        wids = ngram_iter_get(itor, &prob, &bowt)
        # Convert model log values to natural log for Python callers.
        self.log_prob = logmath_log_to_ln(self.lm.lmath, prob)
        self.log_bowt = logmath_log_to_ln(self.lm.lmath, bowt)
        self.words = []
        for i in range(0, self.m+1):
            self.words.append(ngram_word(self.lm.lm, wids[i]))

    def __next__(self):
        if self.first_item:
            # First call yields the iterator's initial position.
            self.set_iter(self.itor)
        else:
            self.set_iter(ngram_iter_next(self.itor))
        return self

    def successors(self):
        """
        Get an iterator over N+1-gram successors of an N-gram.
        """
        cdef NGramIter itor
        itor = NGramIter(self.lm, self.m + 1)
        itor.itor = ngram_iter_successors(self.itor)
        return itor

def binstr(str val, int nbits):
    """
    Silly function to format a string as a binary string
    """
    cdef int i
    outstr = ""
    for c in val:
        cval = ord(c)
        # Only emit up to nbits total bits, most-significant first.
        cnb = min(8, nbits)
        for i in range(0,cnb):
            outstr += "%d" % ((cval & (1 << 7-i)) != 0)
        nbits -= 8
    return outstr

def bincw(int cw, int nbits):
    """
    Silly function to format an int as a binary string
    """
    cdef int i
    outstr = ""
    for i in range(0,nbits):
        # Prepend each bit, lowest bit last in the output string.
        outstr = "%s" % (cw & 1) + outstr
        cw >>= 1
    return outstr

# FIXME: Due to the style of IO in huff_code API this part of the code
# is not compatible with Python 3.  This needs to be converted to
# the new Python io module.

cdef class HuffCode:
    """
    Huffman coding class.

    You can either construct a Huffman code from an alphabet of
    symbols with frequencies, or read one from a file.  Either the
    alphabet or infile argument (but not both) must be passed to the
    constructor.

    @param alphabet: Alphabet of (symbol, frequency) pairs
    @type alphabet: [(str, int)]
    @param infile: File handle or filename to read from
    @type infile: file | str
    """
    def __init__(self, alphabet=None, infile=None):
        cdef char **symbols
        cdef int *frequencies
        cdef int nsym

        if alphabet == None and infile == None:
            raise ValueError, "One of alphabet or infile must be passed to constructor"
        if alphabet != None and infile != None:
            raise ValueError, "Only one of alphabet or infile must be passed to constructor"

        self.fh = None
        if infile:
            self.read(infile)
            return

        nsym = len(alphabet)
        frequencies = <int *>ckd_calloc(nsym, sizeof(int))
        symbols = <char **>ckd_calloc(nsym, sizeof(char *))
        # Need to create separate Python objects for each string,
        # otherwise we get random duplicates of the codewords...
        bogus = []
        for i, spam in enumerate(alphabet):
            sym, freq = spam
            # repr() gives each symbol a canonical string form; the
            # bogus list keeps the Python strings alive while the C
            # char* array borrows their buffers.
            bogus.append(repr(sym))
            frequencies[i] = freq
            symbols[i] = bogus[-1]
        self.hc = huff_code_build_str(symbols, frequencies, nsym)
        ckd_free(frequencies)
        ckd_free(symbols)

    def read(self, infile):
        """
        Read a Huffman code from a file, replacing any current one.

        @param infile: File handle or filename to read from.
        @type infile: file | str
        """
        if not isinstance(infile, file):
            infile = file(infile, "rb")
        huff_code_free(self.hc)
        self.hc = huff_code_read(PyFile_AsFile(infile))

    def write(self, outfile):
        """
        Write this Huffman code to a file in binary format.

        @param outfile: File handle or filename to write to.
        @type outfile: file | str
        """
        if not isinstance(outfile, file):
            outfile = file(outfile, "wb")
        huff_code_write(self.hc, PyFile_AsFile(outfile))

    def dump(self, outfile):
        """
        Dump a human-readable description of this code to a file.

        @param outfile: File handle or filename to write to.
        @type outfile: file | str
        """
        if not isinstance(outfile, file):
            outfile = file(outfile, "w")
        huff_code_dump(self.hc, PyFile_AsFile(outfile))

    def encode(self, seq):
        """
        Encode a sequence of symbols to a byte array, returning that
        array and the bit offset of the next codeword in the last
        byte (i.e. 8 minus the number of extra zero bits)
        """
        cdef unsigned int cw
        cdef int cwlen, nbits = 0, nbytes, offset, i
        cdef unsigned char buf = 0
        cdef char *output

        # First pass: total up the number of bits so the output buffer
        # can be allocated exactly once.
        for sym in seq:
            sss = repr(sym)
            cwlen = huff_code_encode_str(self.hc, sss, &cw)
            nbits += cwlen
        nbytes = int((nbits + 7) / 8)
        offset = 0
        output = <char *>PyMem_Malloc(nbytes + 1)
        output[nbytes] = 0
        i = 0
        nbits = 0
        # Second pass: pack each codeword MSB-first into output, using
        # buf as a partial-byte accumulator and offset as the number of
        # bits already filled in buf.
        for sym in seq:
            sss = repr(sym)
            cwlen = huff_code_encode_str(self.hc, sss, &cw)
            #print "sym: %s cw: %s buf: %s output: %s" \
            #      % (sym, bincw(cw, cwlen), bincw(buf >> (8-offset), offset),
            #         binstr(output, nbits))
            #print "cwlen",cwlen
            # Do one byte at a time while full bytes are available
            while cwlen >= 8:
                # Fill low bits of buf with high bits of cw
                buf |= (cw >> (cwlen - (8 - offset))) & ((1 << (8 - offset)) - 1)
                # Append buf to output
                output[i] = buf
                i += 1
                nbits += 8
                # Fill high bits of buf with low bits of this byte
                cwlen -= 8
                buf = (cw >> cwlen) & ((1 << offset) - 1)
                buf <<= (8-offset)
                #print "cwlen",cwlen
            # Now cwlen will be less than 8, but it might still be
            # more than the available space in buf.
            if cwlen >= (8 - offset):
                # Fill low bits of buf with (8-offset) highest bits of cw
                buf |= (cw >> (cwlen - (8 - offset))) & ((1 << (8 - offset)) - 1)
                # Append buf to output
                output[i] = buf
                i += 1
                nbits += 8
                # cwlen is down to the remaining bits
                cwlen -= (8 - offset)
                # Offset is now zero since we just completed and emptied buf
                offset = 0
                # buf is zero, because we just emptied it without putting stuff in
                buf = 0
                #print "cwlen",cwlen
            # Any remaining bits will be taken care of below (we hope)
            # Add remaining high bits of cw to low bits of buf
            #print "cwlen",cwlen
            buf |= ((cw & ((1 << cwlen) - 1)) << (8 - offset - cwlen))
            offset += cwlen
            #print "after buf: %s output: %s" \
            #      % (bincw(buf >> (8-offset), offset), binstr(output, nbits))
        if offset > 0:
            # Append buf to output
            output[i] = buf
            nbits += offset
            i += 1
        #print "output:", binstr(output, nbits)
        outstr = PyString_FromStringAndSize(output, nbytes)
        PyMem_Free(output)
        return (outstr, offset)

    def decode(self, data):
        """
        Decode a sequence of symbols from a string, returning the
        sequence and the bit offset of the next codeword in the last
        byte (i.e. 8 minus the number of remaining bits)
        """
        cdef int offset
        cdef const_char_ptr dptr
        cdef const_char_ptr strval
        cdef size_t dlen

        dlen = len(data)
        offset = 0
        dptr = data
        output = []
        while True:
            # huff_code_decode_str advances dptr/dlen/offset in place
            # and returns NULL when no full codeword remains.
            strval = huff_code_decode_str(self.hc, &dptr, &dlen, &offset)
            if strval == NULL:
                break
            output.append(strval)
        if dlen > 1:
            # More than one undecoded byte left means the input did not
            # consist solely of valid codewords.
            raise ValueError, "Invalid data at position %d" % (len(data) - dlen)
        return (output, offset)

    def attach(self, fh, char *mode):
        """
        Attach a file for streaming encode/decode operations.

        @param fh: File handle or filename to attach.
        @type fh: file | str
        @param mode: File mode string (e.g. "rb" or "wb").
        @type mode: str
        """
        if not isinstance(fh, file):
            fh = file(fh, mode)
        # Keep a Python reference so the file object outlives the
        # C-level attachment.
        self.fh = fh
        huff_code_attach(self.hc, PyFile_AsFile(fh), mode)

    def detach(self):
        """
        Detach the currently attached file, if any.
        """
        huff_code_detach(self.hc)
        self.fh = None

    def encode_to_file(self, seq):
        """
        Encode a sequence of symbols directly to the attached file.

        @param seq: Sequence of symbols to encode.
        @raise RuntimeError: If no file is attached.
        """
        if self.fh == None:
            raise RuntimeError, "No file is attached"
        for sym in seq:
            strsym = repr(sym)
            # NULL codeword pointer: the C API writes to the attached
            # file instead of returning the codeword.
            huff_code_encode_str(self.hc, strsym, NULL)

    def decode_from_file(self):
        """
        Decode a single symbol from the attached file.

        @return: The decoded symbol, or None at end of input.
        @raise RuntimeError: If no file is attached.
        """
        cdef const_char_ptr sym
        if self.fh == None:
            raise RuntimeError, "No file is attached"
        sym = huff_code_decode_str(self.hc, NULL, NULL, NULL)
        if sym == NULL:
            return None
        else:
            return sym

    def __dealloc__(self):
        if self.fh:
            self.detach()
        huff_code_free(self.hc)