1 // $Id: ngramtable.h 34 2010-06-03 09:19:34Z nicolabertoldi $ 2 3 /****************************************************************************** 4 IrstLM: IRST Language Model Toolkit, compile LM 5 Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy 6 7 This library is free software; you can redistribute it and/or 8 modify it under the terms of the GNU Lesser General Public 9 License as published by the Free Software Foundation; either 10 version 2.1 of the License, or (at your option) any later version. 11 12 This library is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 Lesser General Public License for more details. 16 17 You should have received a copy of the GNU Lesser General Public 18 License along with this library; if not, write to the Free Software 19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 21 ******************************************************************************/ 22 23 #ifndef MF_NGRAMTABLE_H 24 #define MF_NGRAMTABLE_H 25 26 //Backoff symbol 27 #ifndef BACKOFF_ 28 #define BACKOFF_ "_backoff_" 29 #endif 30 31 //Dummy symbol 32 #ifndef DUMMY_ 33 #define DUMMY_ "_dummy_" 34 #endif 35 36 // internal data structure 37 38 #ifdef MYCODESIZE 39 #define DEFCODESIZE MYCODESIZE 40 #else 41 #define DEFCODESIZE (int)2 42 #endif 43 44 #define SHORTSIZE (int)2 45 #define PTRSIZE (int)sizeof(char *) 46 #define INTSIZE (int)4 47 #define CHARSIZE (int)1 48 49 50 //Node flags 51 #define FREQ1 (unsigned char) 1 52 #define FREQ2 (unsigned char) 2 53 #define FREQ4 (unsigned char) 4 54 #define INODE (unsigned char) 8 55 #define LNODE (unsigned char) 16 56 #define SNODE (unsigned char) 32 57 #define FREQ6 (unsigned char) 64 58 #define FREQ3 (unsigned char) 128 59 60 typedef char* node; //inodes, lnodes, snodes 61 typedef char* table; //inode table, lnode table, singleton table 62 63 typedef unsigned char NODETYPE; 64 65 66 typedef enum {FIND, //!< search: find an entry 67 ENTER, //!< search: enter an entry 68 DELETE, //!< search: find and remove entry 69 INIT, //!< scan: start scan 70 CONT //!< scan: continue scan 71 } ACTION; 72 73 74 typedef enum {COUNT, //!< table: only counters 75 LEAFPROB, //!< table: only probs on leafs 76 FLEAFPROB, //!< table: only probs on leafs and FROZEN 77 LEAFPROB2, //!< table: only probs on leafs 78 LEAFPROB3, //!< table: only probs on leafs 79 LEAFPROB4, //!< table: only probs on leafs 80 LEAFCODE, //!< table: only codes on leafs 81 SIMPLE_I, //!< table: simple interpolated LM 82 SIMPLE_B, //!< table: simple backoff LM 83 SHIFTBETA_I, //!< table: interpolated shiftbeta 84 SHIFTBETA_B, //!< table: backoff shiftbeta 85 MSHIFTBETA_I,//!< table: interp modified shiftbeta 86 MSHIFTBETA_B,//!< table: backoff modified shiftbeta 87 FULL, //!< table: full fledged table 88 89 } TABLETYPE; 90 91 92 93 class tabletype 94 { 95 96 TABLETYPE ttype; 97 98 public: 99 100 int CODESIZE; //sizeof word codes 101 long long code_range[7]; //max code for each size 102 103 //Offsets of internal node fields 104 int WORD_OFFS; //word code position 105 int MSUCC_OFFS; //number of successors 106 int MTAB_OFFS; //pointer to successors 107 int FLAGS_OFFS; //flag table 108 int SUCC1_OFFS; //number of successors with freq=1 109 int SUCC2_OFFS; //number of successors with freq=2 110 int BOFF_OFFS; //back-off probability 111 int I_FREQ_OFFS; //frequency offset 112 int I_FREQ_NUM; //number of internal frequencies 113 int L_FREQ_NUM; //number of leaf frequencies 114 int L_FREQ_SIZE; //minimum size for leaf frequencies 115 116 //Offsets of leaf node fields 117 int L_FREQ_OFFS; //frequency offset 118 tbtype()119 TABLETYPE tbtype() { 120 return ttype; 121 } 122 123 tabletype(TABLETYPE tt,int codesize=DEFCODESIZE) { 124 125 if (codesize<=4 && codesize>0) 126 CODESIZE=codesize; 127 else { 128 cerr << "ngramtable wrong codesize\n"; 129 exit(1); 130 } 131 132 code_range[1]=255; 133 code_range[2]=65535; 134 code_range[3]=16777214; 135 code_range[4]=2147483640; 136 code_range[6]=140737488360000LL; //stay below true limit 137 // code_range[6]=281474977000000LL; //stay below true limit 138 139 //information which is useful to initialize 140 //LEAFPROB tables 141 L_FREQ_SIZE=FREQ1; 142 143 WORD_OFFS =0; 144 MSUCC_OFFS =CODESIZE; 145 MTAB_OFFS =MSUCC_OFFS+CODESIZE; 146 FLAGS_OFFS =MTAB_OFFS+PTRSIZE; 147 148 switch (tt) { 149 150 case COUNT: 151 SUCC1_OFFS =0; 152 SUCC2_OFFS =0; 153 BOFF_OFFS =0; 154 I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE; 155 I_FREQ_NUM=1; 156 L_FREQ_NUM=1; 157 158 ttype=tt; 159 break; 160 161 case FULL: 162 case MSHIFTBETA_B: 163 SUCC1_OFFS =FLAGS_OFFS+CHARSIZE; 164 SUCC2_OFFS =SUCC1_OFFS+CODESIZE; 165 BOFF_OFFS =SUCC2_OFFS+CODESIZE; 166 I_FREQ_OFFS=BOFF_OFFS+INTSIZE; 167 L_FREQ_OFFS=CODESIZE; 168 I_FREQ_NUM=2; 169 L_FREQ_NUM=1; 170 171 ttype=tt; 172 break; 173 174 case MSHIFTBETA_I: 175 SUCC1_OFFS =FLAGS_OFFS+CHARSIZE; 176 SUCC2_OFFS =SUCC1_OFFS+CODESIZE; 177 BOFF_OFFS =0; 178 I_FREQ_OFFS=SUCC2_OFFS+CODESIZE; 179 L_FREQ_OFFS=CODESIZE; 180 I_FREQ_NUM=2; 181 L_FREQ_NUM=1; 182 183 ttype=tt; 184 break; 185 186 case SIMPLE_I: 187 SUCC1_OFFS = 0; 188 SUCC2_OFFS = 0; 189 BOFF_OFFS = 0; 190 I_FREQ_OFFS= FLAGS_OFFS+CHARSIZE; 191 L_FREQ_OFFS=CODESIZE; 192 I_FREQ_NUM=1; 193 L_FREQ_NUM=1; 194 195 ttype=tt; 196 break; 197 198 case SIMPLE_B: 199 200 SUCC1_OFFS = 0; 201 SUCC2_OFFS = 0; 202 BOFF_OFFS = FLAGS_OFFS+CHARSIZE; 203 I_FREQ_OFFS = BOFF_OFFS+INTSIZE; 204 L_FREQ_OFFS = CODESIZE; 205 I_FREQ_NUM = 1; 206 L_FREQ_NUM = 1; 207 208 ttype=tt; 209 break; 210 211 case SHIFTBETA_I: 212 SUCC1_OFFS = FLAGS_OFFS+CHARSIZE; 213 SUCC2_OFFS = 0; 214 BOFF_OFFS = 0; 215 I_FREQ_OFFS= SUCC1_OFFS+CODESIZE; 216 L_FREQ_OFFS=CODESIZE; 217 I_FREQ_NUM=1; 218 L_FREQ_NUM=1; 219 220 ttype=tt; 221 break; 222 223 case SHIFTBETA_B: 224 225 SUCC1_OFFS = FLAGS_OFFS+CHARSIZE; 226 SUCC2_OFFS = 0; 227 BOFF_OFFS = SUCC1_OFFS+CODESIZE; 228 I_FREQ_OFFS = BOFF_OFFS+INTSIZE; 229 L_FREQ_OFFS = CODESIZE; 230 I_FREQ_NUM = 1; 231 L_FREQ_NUM = 1; 232 233 ttype=tt; 234 break; 235 236 case LEAFPROB: 237 case FLEAFPROB: 238 SUCC1_OFFS = 0; 239 SUCC2_OFFS = 0; 240 BOFF_OFFS = 0; 241 I_FREQ_OFFS = FLAGS_OFFS+CHARSIZE; 242 I_FREQ_NUM = 0; 243 L_FREQ_NUM = 1; 244 245 ttype=tt; 246 break; 247 248 case LEAFPROB2: 249 SUCC1_OFFS =0; 250 SUCC2_OFFS =0; 251 BOFF_OFFS =0; 252 I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE; 253 I_FREQ_NUM=0; 254 L_FREQ_NUM=2; 255 256 ttype=LEAFPROB; 257 break; 258 259 case LEAFPROB3: 260 SUCC1_OFFS =0; 261 SUCC2_OFFS =0; 262 BOFF_OFFS =0; 263 I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE; 264 I_FREQ_NUM=0; 265 L_FREQ_NUM=3; 266 267 ttype=LEAFPROB; 268 break; 269 270 case LEAFPROB4: 271 SUCC1_OFFS =0; 272 SUCC2_OFFS =0; 273 BOFF_OFFS =0; 274 I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE; 275 I_FREQ_NUM=0; 276 L_FREQ_NUM=4; 277 278 ttype=LEAFPROB; 279 break; 280 281 default: 282 assert(tt==COUNT); 283 } 284 285 L_FREQ_OFFS=CODESIZE; 286 } 287 inodesize(int s)288 int inodesize(int s) { 289 290 return I_FREQ_OFFS + I_FREQ_NUM * s; 291 292 } 293 lnodesize(int s)294 int lnodesize(int s) { 295 return L_FREQ_OFFS + L_FREQ_NUM * s; 296 } 297 298 }; 299 300 301 class ngramtable:tabletype 302 { 303 304 node tree; // ngram table root 305 int maxlev; // max storable n-gram 306 NODETYPE treeflags; 307 char info[100]; //information put in the header 308 int resolution; //max resolution for probabilities 309 double decay; //decay constant 310 311 storage* mem; //memory storage class 312 313 int* memory; // memory load per level 314 int* occupancy; // memory occupied per level 315 long long* mentr; // multiple entries per level 316 long long card; //entries at maxlev 317 318 int idx[MAX_NGRAM+1]; 319 320 int oov_code,oov_size,du_code, bo_code; //used by prob; 321 322 int backoff_state; //used by prob; 323 324 public: 325 326 int corrcounts; //corrected counters flag 327 328 dictionary *dict; // dictionary 329 330 // filtering dictionary: 331 // if the first word of the ngram does not belong to filterdict 332 // do not insert the ngram 333 dictionary *filterdict; 334 335 ngramtable(char* filename,int maxl,char* is, 336 dictionary* extdict, 337 char* filterdictfile, 338 int googletable=0, 339 int dstco=0,char* hmask=NULL,int inplen=0, 340 TABLETYPE tt=FULL,int codesize=DEFCODESIZE); 341 342 inline char* ngtype(char *str=NULL) { 343 if (str!=NULL) strcpy(info,str); 344 return info; 345 } 346 347 virtual ~ngramtable(); 348 freetree()349 inline void freetree() { 350 freetree(tree); 351 }; 352 void freetree(node nd); 353 resetngramtable()354 void resetngramtable() { 355 //clean up all memory and restart from an empty table 356 357 freetree(); //clean memory pool 358 memset(tree,0,inodesize(6)); //reset tree 359 //1-gram table initial flags 360 361 if (maxlev>1) mtflags(tree,INODE | FREQ4); 362 else if (maxlev==1) mtflags(tree,LNODE | FREQ4); 363 364 word(tree,0); //dummy word 365 msucc(tree,0); // number of different n-grams 366 mtable(tree,NULL); // table of n-gram 367 368 for (int i=1; i<=maxlev; i++) 369 mentr[i]=memory[i]=occupancy[i]=0; 370 371 } 372 373 void stat(int level=4); 374 375 inline long long totfreq(long long v=-1) { 376 return (v==-1?freq(tree,INODE):freq(tree,INODE,v)); 377 } 378 379 inline long long btotfreq(long long v=-1) { 380 return (v==-1?getfreq(tree,treeflags,1):setfreq(tree,treeflags,v,1)); 381 } 382 entries(int lev)383 inline long long entries(int lev) { 384 return mentr[lev]; 385 } 386 maxlevel()387 int maxlevel() { 388 return maxlev; 389 } 390 391 // void savetxt(char *filename,int sz=0); 392 void savetxt(char *filename,int sz=0,int googleformat=0); 393 void loadtxt(char *filename,int googletable=0); 394 395 396 void savebin(char *filename,int sz=0); 397 void savebin(mfstream& out); 398 void savebin(mfstream& out,node nd,NODETYPE ndt,int lev,int mlev); 399 400 void loadbin(const char *filename); 401 void loadbin(mfstream& inp); 402 void loadbin(mfstream& inp,node nd,NODETYPE ndt,int lev); 403 404 void loadbinold(char *filename); 405 void loadbinold(mfstream& inp,node nd,NODETYPE ndt,int lev); 406 407 void generate(char *filename,dictionary *extdict=NULL); 408 void generate_dstco(char *filename,int dstco); 409 void generate_hmask(char *filename,char* hmask,int inplen=0); 410 411 void augment(ngramtable* ngt); 412 413 int scan(ngram& ng,ACTION action=CONT,int maxlev=-1) { 414 return scan(tree,INODE,0,ng,action,maxlev); 415 } 416 succscan(ngram & h,ngram & ng,ACTION action,int lev)417 int succscan(ngram& h,ngram& ng,ACTION action,int lev) { 418 //return scan(h.link,h.info,h.lev,ng,action,lev); 419 return scan(h.link,h.info,lev-1,ng,action,lev); 420 } 421 422 double prob(ngram ng); 423 424 int scan(node nd,NODETYPE ndt,int lev,ngram& ng,ACTION action=CONT,int maxl=-1); 425 426 void show(); 427 428 void *search(table *tb,NODETYPE ndt,int lev,int n,int sz,int *w, 429 ACTION action,char **found=(char **)NULL); 430 431 int mybsearch(char *ar, int n, int size, unsigned char *key, int *idx); 432 433 int put(ngram& ng); 434 int put(ngram& ng,node nd,NODETYPE ndt,int lev); 435 get(ngram & ng)436 inline int get(ngram& ng) { 437 return get(ng,maxlev,maxlev); 438 } 439 virtual int get(ngram& ng,int n,int lev); 440 441 int comptbsize(int n); 442 table *grow(table *tb,NODETYPE ndt,int lev,int n,int sz,NODETYPE oldndt=0); 443 444 445 bool check_dictsize_bound(); 446 447 putmem(char * ptr,int value,int offs,int size)448 inline int putmem(char* ptr,int value,int offs,int size) { 449 assert(ptr!=NULL); 450 for (int i=0; i<size; i++) 451 ptr[offs+i]=(value >> (8 * i)) & 0xff; 452 return value; 453 } 454 getmem(char * ptr,int * value,int offs,int size)455 inline int getmem(char* ptr,int* value,int offs,int size) { 456 assert(ptr!=NULL); 457 *value=ptr[offs] & 0xff; 458 for (int i=1; i<size; i++) 459 *value= *value | ( ( ptr[offs+i] & 0xff ) << (8 *i)); 460 return *value; 461 } 462 putmem(char * ptr,long long value,int offs,int size)463 inline long putmem(char* ptr,long long value,int offs,int size) { 464 assert(ptr!=NULL); 465 for (int i=0; i<size; i++) 466 ptr[offs+i]=(value >> (8 * i)) & 0xffLL; 467 return value; 468 } 469 getmem(char * ptr,long long * value,int offs,int size)470 inline long getmem(char* ptr,long long* value,int offs,int size) { 471 assert(ptr!=NULL); 472 *value=ptr[offs] & 0xff; 473 for (int i=1; i<size; i++) 474 *value= *value | ( ( ptr[offs+i] & 0xffLL ) << (8 *i)); 475 return *value; 476 } 477 478 inline void tb2ngcpy(int* wordp,char* tablep,int n=1) { 479 for (int i=0; i<n; i++) 480 getmem(tablep,&wordp[i],i*CODESIZE,CODESIZE); 481 } 482 483 inline void ng2tbcpy(char* tablep,int* wordp,int n=1) { 484 for (int i=0; i<n; i++) 485 putmem(tablep,wordp[i],i*CODESIZE,CODESIZE); 486 } 487 488 inline int ngtbcmp(int* wordp,char* tablep,int n=1) { 489 int word; 490 for (int i=0; i<n; i++) { 491 getmem(tablep,&word,i*CODESIZE,CODESIZE); 492 if (wordp[i]!=word) return 1; 493 } 494 return 0; 495 } 496 word(node nd,int value)497 inline int word(node nd,int value) { 498 putmem(nd,value,WORD_OFFS,CODESIZE); 499 return value; 500 } 501 word(node nd)502 inline int word(node nd) { 503 int v; 504 getmem(nd,&v,WORD_OFFS,CODESIZE); 505 return v; 506 } 507 mtflags(node nd,unsigned char value)508 unsigned char mtflags(node nd,unsigned char value) { 509 return *(unsigned char *)(nd+FLAGS_OFFS)=value; 510 } 511 mtflags(node nd)512 unsigned char mtflags(node nd) { 513 return *(unsigned char *)(nd+FLAGS_OFFS); 514 } 515 codecmp(char * a,char * b)516 int codecmp(char * a,char *b) { 517 register int i,result; 518 for (i=(CODESIZE-1); i>=0; i--) { 519 result=(unsigned char)a[i]-(unsigned char)b[i]; 520 if(result) return result; 521 } 522 return 0; 523 }; 524 codediff(node a,node b)525 int codediff(node a,node b) { 526 return word(a)-word(b); 527 }; 528 529 update(ngram ng)530 int update(ngram ng) { 531 532 if (!get(ng,ng.size,ng.size)) { 533 cerr << "cannot find " << ng << "\n"; 534 exit (1); 535 } 536 537 freq(ng.link,ng.pinfo,ng.freq); 538 539 return 1; 540 } 541 freq(node nd,NODETYPE ndt,long long value)542 long long freq(node nd,NODETYPE ndt,long long value) { 543 int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; 544 545 if (ndt & FREQ1) 546 putmem(nd,value,offs,1); 547 else if (ndt & FREQ2) 548 putmem(nd,value,offs,2); 549 else if (ndt & FREQ3) 550 putmem(nd,value,offs,3); 551 else if (ndt & FREQ4) 552 putmem(nd,value,offs,4); 553 else 554 putmem(nd,value,offs,6); 555 return value; 556 } 557 freq(node nd,NODETYPE ndt)558 long long freq(node nd,NODETYPE ndt) { 559 int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; 560 long long value; 561 562 if (ndt & FREQ1) 563 getmem(nd,&value,offs,1); 564 else if (ndt & FREQ2) 565 getmem(nd,&value,offs,2); 566 else if (ndt & FREQ3) 567 getmem(nd,&value,offs,3); 568 else if (ndt & FREQ4) 569 getmem(nd,&value,offs,4); 570 else 571 getmem(nd,&value,offs,6); 572 573 return value; 574 } 575 576 577 long long setfreq(node nd,NODETYPE ndt,long long value,int index=0) { 578 int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; 579 580 if (ndt & FREQ1) 581 putmem(nd,value,offs+index * 1,1); 582 else if (ndt & FREQ2) 583 putmem(nd,value,offs+index * 2,2); 584 else if (ndt & FREQ3) 585 putmem(nd,value,offs+index * 3,3); 586 else if (ndt & FREQ4) 587 putmem(nd,value,offs+index * 4,4); 588 else 589 putmem(nd,value,offs+index * 6,6); 590 591 return value; 592 } 593 594 long long getfreq(node nd,NODETYPE ndt,int index=0) { 595 int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; 596 597 long long value; 598 599 if (ndt & FREQ1) 600 getmem(nd,&value,offs+ index * 1,1); 601 else if (ndt & FREQ2) 602 getmem(nd,&value,offs+ index * 2,2); 603 else if (ndt & FREQ3) 604 getmem(nd,&value,offs+ index * 3,3); 605 else if (ndt & FREQ4) 606 getmem(nd,&value,offs+ index * 4,4); 607 else 608 getmem(nd,&value,offs+ index * 6,6); 609 610 return value; 611 } 612 613 boff(node nd)614 double boff(node nd) { 615 int value=0; 616 getmem(nd,&value,BOFF_OFFS,INTSIZE); 617 618 return double (value/(double)1000000000.0); 619 } 620 621 myround(double x)622 double myround(double x) { 623 long int i=(long int)(x); 624 return (x-i)>0.500?i+1.0:(double)i; 625 } 626 boff(node nd,double value)627 int boff(node nd,double value) { 628 int v=(int)myround(value * 1000000000.0); 629 putmem(nd,v,BOFF_OFFS,INTSIZE); 630 631 return 1; 632 } 633 succ2(node nd,int value)634 int succ2(node nd,int value) { 635 putmem(nd,value,SUCC2_OFFS,CODESIZE); 636 return value; 637 } 638 succ2(node nd)639 int succ2(node nd) { 640 int value=0; 641 getmem(nd,&value,SUCC2_OFFS,CODESIZE); 642 return value; 643 } 644 succ1(node nd,int value)645 int succ1(node nd,int value) { 646 putmem(nd,value,SUCC1_OFFS,CODESIZE); 647 return value; 648 } 649 succ1(node nd)650 int succ1(node nd) { 651 int value=0; 652 getmem(nd,&value,SUCC1_OFFS,CODESIZE); 653 return value; 654 } 655 msucc(node nd,int value)656 int msucc(node nd,int value) { 657 putmem(nd,value,MSUCC_OFFS,CODESIZE); 658 return value; 659 } 660 msucc(node nd)661 int msucc(node nd) { 662 int value; 663 getmem(nd,&value,MSUCC_OFFS,CODESIZE); 664 return value; 665 } 666 mtable(node nd)667 table mtable(node nd) { 668 char v[PTRSIZE];; 669 for (int i=0; i<PTRSIZE; i++) 670 v[i]=nd[MTAB_OFFS+i]; 671 672 return *(table *)v; 673 } 674 mtable(node nd,table value)675 table mtable(node nd,table value) { 676 char *v=(char *)&value; 677 for (int i=0; i<PTRSIZE; i++) 678 nd[MTAB_OFFS+i]=v[i]; 679 return value; 680 } 681 mtablesz(node nd)682 int mtablesz(node nd) { 683 if (mtflags(nd) & LNODE) { 684 if (mtflags(nd) & FREQ1) 685 return lnodesize(1); 686 else if (mtflags(nd) & FREQ2) 687 return lnodesize(2); 688 else if (mtflags(nd) & FREQ3) 689 return lnodesize(3); 690 else if (mtflags(nd) & FREQ4) 691 return lnodesize(4); 692 else 693 return lnodesize(6); 694 } else if (mtflags(nd) & INODE) { 695 if (mtflags(nd) & FREQ1) 696 return inodesize(1); 697 else if (mtflags(nd) & FREQ2) 698 return inodesize(2); 699 else if (mtflags(nd) & FREQ3) 700 return inodesize(3); 701 else if (mtflags(nd) & FREQ4) 702 return inodesize(4); 703 else 704 return inodesize(6); 705 } else { 706 cerr << "node has wrong flags\n"; 707 exit(1); 708 } 709 } 710 711 712 int bo_state(int value=-1) { 713 return (value==-1?backoff_state:backoff_state=value); 714 } 715 716 717 }; 718 719 #endif 720 721 722 723 724