1 /******************************************************************************
2 IrstLM: IRST Language Model Toolkit
3 Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
19 ******************************************************************************/
20
21 #include <cmath>
22 #include <string>
23 #include <assert.h>
24 #include <vector>
25 #include "util.h"
26 #include "mfstream.h"
27 #include "mempool.h"
28 #include "htable.h"
29 #include "dictionary.h"
30 #include "n_gram.h"
31 #include "mempool.h"
32 #include "ngramcache.h"
33 #include "ngramtable.h"
34 #include "normcache.h"
35 #include "interplm.h"
36 #include "mdiadapt.h"
37 #include "shiftlm.h"
38 #include "lmtable.h"
39
40 using namespace std;
41
42 //
43 //Minimum discrimination adaptation for interplm
44 //
mdiadaptlm(char * ngtfile,int depth,TABLETYPE tbtype)45 mdiadaptlm::mdiadaptlm(char* ngtfile,int depth,TABLETYPE tbtype):
46 interplm(ngtfile,depth,tbtype)
47 {
48 adaptlev=0;
49 forelm=NULL;
50 cache=NULL;
51 m_save_per_level=true;
52 };
53
~mdiadaptlm()54 mdiadaptlm::~mdiadaptlm()
55 {
56 if (cache) delete cache;
57 delete_caches();
58 };
59
delete_caches(int level)60 void mdiadaptlm::delete_caches(int level)
61 {
62 if (probcache[level]) delete probcache[level];
63 if (backoffcache[level]) delete backoffcache[level];
64 };
65
void mdiadaptlm::delete_caches()
{
  // Destroys every per-level cache and the arrays that hold them.
  // Compiled out entirely when caching is disabled at build time.
#ifdef MDIADAPTLM_CACHE_ENABLE
  for (int i=0; i<=max_caching_level; i++) delete_caches(i);

  delete [] probcache;
  delete [] backoffcache;
#endif
};
75
void mdiadaptlm::caches_stat()
{
  // Prints usage statistics of every active per-level cache to stderr.
  // Level 0 is skipped: that slot is never used (see create_caches).
#ifdef MDIADAPTLM_CACHE_ENABLE
  for (int i=1; i<=max_caching_level; i++) {
    if (probcache[i]) {
      cerr << "Statistics of probcache at level " << i << " (of " << lmsize() << ") ";
      probcache[i]->stat();
    }
    if (backoffcache[i]) {
      cerr << "Statistics of backoffcache at level " << i << " (of " << lmsize() << ") ";
      backoffcache[i]->stat();
    }
  }
#endif
};
91
92
create_caches(int mcl)93 void mdiadaptlm::create_caches(int mcl)
94 {
95 max_caching_level=(mcl>=0 && mcl<lmsize())?mcl:lmsize()-1;
96
97 probcache = new NGRAMCACHE_t*[max_caching_level+1]; //index 0 will never be used, index=max_caching_level is not used
98 backoffcache = new NGRAMCACHE_t*[max_caching_level+1]; //index 0 will never be used, index=max_caching_level is not used
99 for (int i=0; i<=max_caching_level; i++) {
100 probcache[i]=NULL;
101 backoffcache[i]=NULL;
102 }
103
104 init_caches();
105 }
106
107
void mdiadaptlm::init_caches(int level)
{
  // Allocates the probability and back-off caches for one level.
  // The slots must be empty (guaranteed after create_caches).
  assert(probcache[level]==NULL);
  assert(backoffcache[level]==NULL);
  // each cache stores one double per n-gram key; initial capacity 400000
  probcache[level]=new NGRAMCACHE_t(level,sizeof(double),400000);
  backoffcache[level]=new NGRAMCACHE_t(level,sizeof(double),400000);
};
115
void mdiadaptlm::init_caches()
{
  // Initializes the caches of all usable levels (1..max_caching_level);
  // no-op when caching is disabled at build time.
#ifdef MDIADAPTLM_CACHE_ENABLE
  for (int i=1; i<=max_caching_level; i++) init_caches(i);
#endif
};
122
check_cache_levels(int level)123 void mdiadaptlm::check_cache_levels(int level)
124 {
125 if (probcache[level] && probcache[level]->isfull()) probcache[level]->reset(probcache[level]->cursize());
126 if (backoffcache[level] && backoffcache[level]->isfull()) backoffcache[level]->reset(backoffcache[level]->cursize());
127 };
128
void mdiadaptlm::check_cache_levels()
{
  // Flushes any full cache on every usable level; no-op when caching
  // is disabled at build time.
#ifdef MDIADAPTLM_CACHE_ENABLE
  for (int i=1; i<=max_caching_level; i++) check_cache_levels(i);
#endif
};
135
reset_caches(int level)136 void mdiadaptlm::reset_caches(int level)
137 {
138 if (probcache[level]) probcache[level]->reset(MAX(probcache[level]->cursize(),probcache[level]->maxsize()));
139 if (backoffcache[level]) backoffcache[level]->reset(MAX(backoffcache[level]->cursize(),backoffcache[level]->maxsize()));
140 };
141
void mdiadaptlm::reset_caches()
{
  // Flushes the caches on every usable level; no-op when caching is
  // disabled at build time.
#ifdef MDIADAPTLM_CACHE_ENABLE
  for (int i=1; i<=max_caching_level; i++) reset_caches(i);
#endif
};
148
149
// Accessor for the per-level probability cache (may be NULL).
// NOTE(review): `inline` on an out-of-class definition in a .cpp file limits
// use of this function to this translation unit — confirm that is intended.
inline NGRAMCACHE_t* mdiadaptlm::get_probcache(int level)
{
  return probcache[level];
}
154
// Accessor for the per-level back-off weight cache (may be NULL).
inline NGRAMCACHE_t* mdiadaptlm::get_backoffcache(int level)
{
  return backoffcache[level];
}
159
// Trains the foreground (adaptation) unigram model from ngtfile and
// precomputes `oovscaling`, the scaling factor applied to words that are
// unknown to the foreground model. Always returns 1; exits on error.
int mdiadaptlm::scalefact(char *ngtfile)
{
  // drop any previous foreground model / normalization cache
  if (forelm!=NULL) delete forelm;
  if (cache!=NULL) delete cache;
  cache=new normcache(dict);

  // unigram shift-beta model estimated on the adaptation data
  forelm=new shiftbeta(ngtfile,1);
  forelm->train();

  //compute oov scalefact term
  ngram fng(forelm->dict,1);
  ngram ng(dict,1);
  int* w=fng.wordp(1);

  // oovscaling starts as the background mass NOT covered by the
  // foreground vocabulary: 1 - sum of background unigrams of known words
  oovscaling=1.0;
  for ((*w)=0; (*w)<forelm->dict->size(); (*w)++)
    if ((*w) != forelm->dict->oovcode()) {
      ng.trans(fng);
      if (*ng.wordp(1)==dict->oovcode()) {
        // a foreground word missing from the background dictionary is
        // not supported here
        cerr << "adaptation file contains new words: use -ao=yes option\n";
        exit(1);
      }
      //forbidden situation
      oovscaling-=backunig(ng);
    }
  *w=forelm->dict->oovcode();
  // ratio between foreground OOV mass and residual background mass
  oovscaling=foreunig(fng)/oovscaling;

  return 1;
};
190
// Writes the per-word adaptation scaling factors (as log10 values) to
// `filename` in an ARPA-like unigram format. Returns 1.
int mdiadaptlm::savescalefactor(char* filename)
{

  ngram ng(dict,1);
  int* w=ng.wordp(1);

  mfstream out(filename,ios::out);

  out << "\n\\data\\" << "\nngram 1=" << dict->size() << "\n\n1grams:\n";

  for ((*w)=0; (*w)<dict->size(); (*w)++) {
    double ratio=scalefact(ng);
    // -99 is the conventional stand-in for log10(0)
    out << (float) (ratio?log10(ratio):-99);
    if (*w==dict->oovcode())
      out << "\t" << "<unk>\n";
    else
      out << "\t" << (char *)dict->decode(*w) << "\n";

  }
  out << "\\end\\\n";

  return 1;
}
214
// GIS scaling factor of a unigram: (P_fore(w)/P_back(w))^gis_step,
// using the precomputed `oovscaling` for words unknown to the
// foreground model.
double mdiadaptlm::scalefact(ngram ng)
{
  ngram fng(forelm->dict,1);
  fng.trans(ng);
  if (*fng.wordp(1)==forelm->dict->oovcode())
    return pow(oovscaling,gis_step);
  else {
    double prback=backunig(ng);
    double prfore=foreunig(ng);
    return pow(prfore/prback,gis_step);
  }
}
227
228
foreunig(ngram ng)229 double mdiadaptlm::foreunig(ngram ng)
230 {
231
232 double fstar,lambda;
233
234 forelm->discount(ng,1,fstar,lambda);
235
236 return fstar;
237 }
238
backunig(ngram ng)239 double mdiadaptlm::backunig(ngram ng)
240 {
241
242 double fstar,lambda;
243
244 discount(ng,1,fstar,lambda,0);
245
246 return fstar;
247 };
248
249
250
// Runs MDI adaptation on the n-gram statistics found in ngtfile, up to
// adaptation level `alev` (clipped to [1, lmsize()]) with GIS step `step`.
// Trains the foreground model, precomputes the unigram normalization
// constant zeta0 and warms the bigram entries of the zeta cache.
// Returns 1; exits if ngtfile is NULL.
int mdiadaptlm::adapt(char* ngtfile,int alev,double step)
{

  if (alev > lmsize() || alev<=0) {
    cerr << "setting adaptation level to " << lmsize() << "\n";
    alev=lmsize();
  }
  adaptlev=alev;


  cerr << "adapt ....";
  gis_step=step;

  if (ngtfile==NULL) {
    cerr << "adaptation file is missing\n";
    exit(1);
  }

  //compute the scaling factor;

  scalefact(ngtfile);

  //compute 1-gram zeta
  ngram ng(dict,2);
  int* w=ng.wordp(1);

  cerr << "precomputing 1-gram normalization ...\n";
  zeta0=0;
  // zeta0 = sum over the vocabulary of scalefact(w) * backunig(w)
  for ((*w)=0; (*w)<dict->size(); (*w)++)
    zeta0+=scalefact(ng) * backunig(ng);

  if (alev==1) return 1 ;

  cerr << "precomputing 2-gram normalization:\n";

  //precompute the bigram normalization
  w=ng.wordp(2);
  *ng.wordp(1)=0;

  for ((*w)=0; (*w)<dict->size(); (*w)++) {
    // called for its side effect: fills the 2-gram zeta cache
    zeta(ng,2);
    if ((*w % 1000)==0) cerr << ".";
  }

  cerr << "done\n";

  return 1;
};
299
300
// Computes the MDI normalization term Z for the history of ng at the
// given order. Order 1 returns the precomputed zeta0; 2- and 3-gram
// values are memoized in `cache`. The recursion mirrors interpolation:
// Z = sum over observed successors of scalefact*fstar + lambda*Z(lower).
double mdiadaptlm::zeta(ngram ng,int size)
{

  assert(size>=1);

  double z=0; // compute normalization term

  ng.size=size;

  if (size==1) return zeta0;
  else { //size>1

    //check in the 2gr and 3gr cache
    if (size <=3 && cache->get(ng,size,z)) return z;

    double fstar,lambda;
    ngram histo=ng;
    int succ=0;

    discount(ng,size,fstar,lambda,(int)0);

    // only histories with lambda<1 and present in the table contribute
    // an explicit successor sum
    if ((lambda<1) && get(histo,size,size-1)) {
      ;

      //scan all its successors
      succ=0;

      succscan(histo,ng,INIT,size);
      while(succscan(histo,ng,CONT,size)) {

        discount(ng,size,fstar,lambda,0);
        if (fstar>0) {
          z+=(scalefact(ng) * fstar);
          succ++;
          //cerr << ng << "zeta= " << z << "\n";
        }
      }
    }

    // remaining (back-off) mass is normalized by the lower-order zeta
    z+=lambda*zeta(ng,size-1);

    // cache only histories with more than one observed successor
    if (size<=3 && succ>1) cache->put(ng,size,z);

    return z;
  }

}
348
349
// MDI-adapted discounting: obtains fstar/lambda from the underlying
// smoothing method and, for orders up to adaptlev, rescales them with
// the GIS factor and the zeta normalization terms. Back-off weights are
// cached per history when caching is enabled. Always returns 1.
int mdiadaptlm::discount(ngram ng_,int size,double& fstar,double& lambda,int /* unused parameter: cv */)
{

  ngram ng(dict);
  ng.trans(ng_);

  double __fstar, __lambda;
  bool lambda_cached=0;
  int size_lambda=size-1;

  // history obtained by dropping the predicted word
  ngram histo=ng;
  histo.shift();

  if (size_lambda>0 && histo.size>=size_lambda) {
#ifdef MDIADAPTLM_CACHE_ENABLE
    if (size_lambda<=max_caching_level) {
      //backoffcache hit
      if (backoffcache[size_lambda] && backoffcache[size_lambda]->get(histo.wordp(size_lambda),__lambda))
        lambda_cached=1;
    }
#endif
  }

  // NOTE(review): unqualified call — presumably dispatches to the
  // discount() override of the concrete smoothing subclass rather than
  // recursing into this method; confirm discount() is virtual.
  discount(ng,size,__fstar,__lambda,0);

  if ((size>0) && (size<=adaptlev) && (__lambda<1)) {

    if (size>1) {
      // rescale by scalefact/zeta; lambda uses the lower-order zeta
      double numlambda, numfstar, den;
      numfstar=scalefact(ng);
      den=zeta(ng,size);
      __fstar=__fstar * numfstar/den;
      if (!lambda_cached) {
        numlambda=zeta(ng,size-1);
        __lambda=__lambda * numlambda/den;
      }
    } else if (size==1) {
      // unigram case: normalize with the precomputed zeta0
      double ratio;
      ratio=scalefact(ng)/zeta0;
      __fstar=__fstar * ratio;
      if (!lambda_cached) {
        __lambda=__lambda * ratio;
      }
    } else {
      //size==0 do nothing
    }
  }

#ifdef MDIADAPTLM_CACHE_ENABLE
  //backoffcache insert
  if (!lambda_cached && size_lambda>0 && size_lambda<=max_caching_level && histo.size>=size_lambda && backoffcache[size_lambda])
    backoffcache[size_lambda]->add(histo.wordp(size_lambda),__lambda);
#endif

  lambda=__lambda;
  fstar=__fstar;
  return 1;
}
408
409
// Precomputes, for every history up to order lmsize()-1, the back-off
// denominator: 1 minus the sum of lower-order probabilities of all
// successors that receive an explicit (fstar>0) estimate. The value is
// stored on the history node via boff(). Sets this->backoff=1.
int mdiadaptlm::compute_backoff_per_level()
{

  double fstar,lambda;

  this->backoff=1;

  for (int size=1; size<lmsize(); size++) {

    ngram hg(dict,size);

    scan(hg,INIT,size);

    while(scan(hg,CONT,size)) {

      ngram ng=hg;
      ng.pushc(0); //ng.size is now hg.size+1

      double pr=1.0;

      succscan(hg,ng,INIT,size+1);
      while(succscan(hg,ng,CONT,size+1)) {

        mdiadaptlm::discount(ng,ng.size,fstar,lambda);

        if (fstar>0){
          // drop the predicted word's context extension and subtract
          // its lower-order probability
          ng.size=ng.size-1;
          pr -= mdiadaptlm::prob(ng,size);
        }
      }

      // NOTE(review): strict bounds may trip on floating-point rounding
      // when the successor probabilities sum very close to 1 — confirm
      assert(pr>0 && pr<=1);

      boff(hg.link,pr);
    }

  }

  cerr << "done\n";

  return 1;
}
452
453
// Unsupported combination: back-off mixture models can only be saved
// per level. Reports the limitation and aborts — this function never
// returns (note: the message typos "saveperllevel" are kept as-is since
// they are runtime output).
int mdiadaptlm::compute_backoff_per_word()
{
  cerr << "Current implementation does not support the usage of backoff (-bo=yes) mixture models (-lm=mix) combined with the per-word saving (-saveperllevel=no)." << endl;
  cerr << "Please, either choose a per-level saving (-saveperllevel=yes) or do not use backoff (-bo=no) " << endl;

  exit(1);
}
461
462
// Interpolated probability of ng at the given order that also exposes
// the discounted frequency fstar to the caller.
double mdiadaptlm::prob2(ngram ng,int size,double& fstar)
{

  double lambda;

  mdiadaptlm::discount(ng,size,fstar,lambda);

  if (size>1)
    return fstar + lambda * prob(ng,size-1);
  else
    return fstar;
}
475
476
477 //inline double mdiadaptlm::prob(ngram ng,int size){
prob(ngram ng,int size)478 double mdiadaptlm::prob(ngram ng,int size)
479 {
480 double fstar,lambda,bo;
481 return prob(ng,size,fstar,lambda,bo);
482 }
483
// Probability of ng at the given order, exposing the discounted
// frequency (fstar), interpolation weight (lambda) and back-off weight
// (bo). Uses the per-level probability cache when enabled, and follows
// either the back-off or the interpolation formulation depending on
// this->backoff. Exits if fstar or lambda exceed 1 beyond single
// precision tolerance.
double mdiadaptlm::prob(ngram ng,int size,double& fstar,double& lambda, double& bo)
{
  double pr;

#ifdef MDIADAPTLM_CACHE_ENABLE
  //probcache hit
  if (size<=max_caching_level && probcache[size] && ng.size>=size && probcache[size]->get(ng.wordp(size),pr))
    return pr;
#endif

  //probcache miss
  mdiadaptlm::bodiscount(ng,size,fstar,lambda,bo);

  if (fstar>UPPER_SINGLE_PRECISION_OF_1 || lambda>UPPER_SINGLE_PRECISION_OF_1) {
    cerr << "wrong probability: " << ng
         << " , size " << size
         << " , fstar " << fstar
         << " , lambda " << lambda << "\n";
    exit(1);
  }
  if (backoff) {

    if (size>1) {
      if (fstar>0){
        // explicit estimate available: use it directly
        pr=fstar;
      }else {
        if (lambda<1){
          // back off, renormalizing with the precomputed bo weight
          pr = lambda/bo * prob(ng,size-1);
        }else {
          // lambda==1 within single precision: pure back-off
          assert(lambda<UPPER_SINGLE_PRECISION_OF_1);
          pr = prob(ng,size-1);
        }
      }
    } else
      pr = fstar;
  }

  else { //interpolation

    if (size>1)
      pr = fstar + lambda * prob(ng,size-1);
    else
      pr = fstar;
  }

#ifdef MDIADAPTLM_CACHE_ENABLE
  //probcache insert
  if (size<=max_caching_level && probcache[size] && ng.size>=size)
    probcache[size]->add(ng.wordp(size),pr);
#endif

  return pr;
}
537
538
// Like discount(), but additionally retrieves the back-off weight bo of
// the history (stored by compute_backoff_per_level) when back-off mode
// is active; bo stays 1.0 otherwise. Always returns 1.
int mdiadaptlm::bodiscount(ngram ng_,int size,double& fstar,double& lambda,double& bo)
{
  ngram ng(dict);
  ng.trans(ng_);

  mdiadaptlm::discount(ng,size,fstar,lambda);

  bo=1.0; // neutral value when no back-off weight applies

  if (backoff) { //get back-off probability

    if (size>1 && lambda<1) {

      ngram hg=ng;

      // cerr<< "hg:|" << hg << "| size:|" << size << "|" << endl;
      // the history must exist in the table; report loudly before the
      // assert fires
      if (! get(hg,size,size-1)){
        cerr << "ERROR: int mdiadaptlm::bodiscount(ngram ng_,int size,double& fstar,double& lambda,double& bo) -> get(hg,size,size-1) returns NULL\n";
      }
      assert(get(hg,size,size-1));

      bo=boff(hg.link);

      // if (lambda > bo){
      // cerr << " mdiadaptlm::bodiscount ERROR: " << " lambda:" << lambda << " bo:" << bo << "\n";
      // exit(1);
      // }
    }
  }

  return 1;
}
571
572
// Interpolated probability with an add-one smoothed unigram floor:
// at order 1 every word (including unseen ones) gets frequency >= 1,
// normalized over totfreq() plus the dictionary upper bound correction.
double mdiadaptlm::txclprob(ngram ng,int size)
{

  double fstar,lambda;

  if (size>1) {
    mdiadaptlm::discount(ng,size,fstar,lambda);
    return fstar + lambda * txclprob(ng,size-1);
  } else {
    double freq=1;
    if ((*ng.wordp(1)!=dict->oovcode()) && get(ng,1,1))
      freq+=ng.freq;

    // dub() is the dictionary upper bound used to reserve mass for
    // words never seen in training
    double N=totfreq()+dict->dub()-dict->size();
    return freq/N;
  }
}
590
591
// Counts the entries the generated LM will contain and reports a
// cumulative total per level on stdout. Returns the grand total.
int mdiadaptlm::netsize()
{
  double fstar,lambda;
  int size,totsize;
  ngram ng(dict);

  cerr << "Computing LM size:\n";

  // presumably two entries per word at level 1 (prob + back-off) — TODO confirm
  totsize=dict->size() * 2;

  cout << "1-gram " << totsize << "\n";

  for (int i=2; i<=maxlevel(); i++) {

    size=0;

    scan(ng,INIT,i);

    while (scan(ng,CONT,i)) {

      mdiadaptlm::discount(ng,i,fstar,lambda);

      // only n-grams with a positive discounted frequency are stored
      if (fstar>0) size++;

    }

    // non-top levels double the count (extra back-off entry per n-gram)
    size+=size * (i<maxlevel());

    totsize+=size;

    // note: prints the cumulative total, not the per-level count
    cout << i << "-gram " << totsize << "\n";

  }

  return totsize;
}
628
629
630
631 /*
632 * trigram file format:
633
634 --------------------------------
635
636 <idx> dictionary length
637
638 repeat [ dictionary length ] {
639 <newline terminated string> word;
640 }
641
642 while [ first word != STOP ] {
643 <idx> first word
644 <idx> number of successors
645 repeat [ number of successors ] {
646 <idx> second word
647 <float> prob
648 }
649 }
650
651 <idx> STOP
652
653 while [ first word != STOP ] {
654 <idx> first word
655 <idx> number of successor sets
656 repeat [ number of successor sets ] {
657 <idx> second word
658 <idx> number of successors
659 repeat [ number of successors ] {
660 <idx> third word
661 <float> prob
662 }
663 }
664 }
665
666 <idx> STOP
667
668 */
669
670
671 //void writeNull(mfbstream& out,unsigned short nullCode,float nullProb){
672 // out.writex(&nullCode,sizeof(short));
673 // out.writex(&nullProb,sizeof(float));
674 //}
675
676
// In-place byte-order reversal of `n` consecutive items of `sz` bytes
// each, starting at `p`. Items smaller than two bytes or a non-positive
// count make it a no-op. Always returns 0.
int swapbytes(char *p, int sz, int n)
{
  if ((n<1) || (sz<2)) return 0;

  for (int i=0; i<n; ++i, p+=sz) {
    char* lo = p;
    char* hi = p + sz - 1;
    while (lo < hi) {
      char tmp = *lo;
      *lo = *hi;
      *hi = tmp;
      ++lo;
      --hi;
    }
  }
  return 0;
};
688
fwritex(char * p,int sz,int n,FILE * f)689 void fwritex(char *p,int sz,int n,FILE* f)
690 {
691
692 if(*(short *)"AB"==0x4241) {
693 swapbytes((char*)p, sz,n);
694 }
695
696 fwrite((char *)p,sz,n,f);
697
698 if(*(short *)"AB"==0x4241) swapbytes((char*)p, sz,n);
699
700 }
701
ifwrite(long loc,void * ptr,int size,int,FILE * f)702 void ifwrite(long loc,void *ptr,int size,int /* unused parameter: n */,FILE* f)
703 {
704 fflush(f);
705
706 long pos=ftell(f);
707
708 fseek(f,loc,SEEK_SET);
709
710 fwritex((char *)ptr,size,1,f);
711
712 fseek(f,pos,SEEK_SET);
713
714 fflush(f);
715 }
716
// Emits the reserved NULL successor entry: its code followed by its
// (back-off) probability, both in the file's byte order.
void writeNull(unsigned short nullCode,float nullProb,FILE* f)
{
  fwritex((char *)&nullCode,sizeof(short),1,f);
  fwritex((char *)&nullProb,sizeof(float),1,f);
}
722
723
// Saves the LM (order 1..3 only) in the binary ASR format documented in
// the comment block above: dictionary section, then unigram, bigram and
// trigram successor lists with unsigned short word codes, patching the
// reserved successor counters in place via ifwrite. The `backoff`
// parameter is unused. Returns 1; exits on unsupported configurations.
int mdiadaptlm::saveASR(char *filename,int /* unused parameter: backoff */,char* subdictfile)
{
  int totbg,tottr;

  dictionary* subdict;

  if (subdictfile)
    subdict=new dictionary(subdictfile);
  else
    subdict=dict; // default is subdict=dict

  typedef unsigned short code;

  system("date");

  if (lmsize()>3 || lmsize()<1) {
    cerr << "wrong lmsize\n";
    exit(1);
  }

  // NOTE(review): `&&` accepts the case where only one of the two
  // dictionaries overflows the 16-bit code space — confirm `||` was not
  // intended here
  if (dict->size()>=0xffff && subdict->size()>=0xffff) {
    cerr << "save bin requires unsigned short codes\n";
    exit(1);
  }

  FILE* f=fopen(filename,"w");

  double fstar,lambda,boff;
  float pr;
  long succ1pos,succ2pos;
  code succ1,succ2,w,h1,h2;
  code stop=0xffff; // section terminator

  //dictionary
  //#dictsize w1\n ..wN\n NULL\n

  code oovcode=subdict->oovcode();

  //includes at least NULL
  code subdictsz=subdict->size()+1;

  fwritex((char *)&subdictsz,sizeof(code),1,f);

  subdictsz--;
  for (w=0; w<subdictsz; w++)
    fprintf(f,"%s\n",(char *)subdict->decode(w));

  fprintf(f,"____\n"); // placeholder name for the reserved NULL entry

  //unigram part
  //NULL #succ w1 pr1 ..wN prN

  h1=subdictsz;
  fwritex((char *)&h1,sizeof(code),1,f); //NULL

  // reserve the successor counter; patched via ifwrite once known
  succ1=0;
  succ1pos=ftell(f);
  fwritex((char *)&succ1,sizeof(code),1,f);

  ngram ng(dict);
  ngram sng(subdict);

  ng.size=sng.size=1;

  scan(ng,INIT,1);
  while(scan(ng,CONT,1)) {
    sng.trans(ng);
    if (sng.containsWord(subdict->OOV(),1))
      continue;

    pr=(float)mdiadaptlm::prob(ng,1);
    if (pr>1e-50) { //do not consider too low probabilities
      succ1++;
      w=*sng.wordp(1);
      fwritex((char *)&w,sizeof(code),1,f);
      fwritex((char *)&pr,sizeof(float),1,f);
    } else {
      cerr << "small prob word " << ng << "\n";
    }
  }

  // update number of unigrams
  ifwrite(succ1pos,&succ1,sizeof(code),1,f);

  cerr << "finito unigrammi " << succ1 << "\n";
  fflush(f);

  if (lmsize()==1) {
    fclose(f);
    return 1;
  }

  // rest of bigrams
  // w1 #succ w1 pr1 .. wN prN

  succ1=0;
  h1=subdictsz;
  totbg=subdictsz;

  ngram hg1(dict,1);

  ng.size=sng.size=2;

  scan(hg1,INIT,1);
  while(scan(hg1,CONT,1)) {

    if (hg1.containsWord(dict->OOV(),1)) continue;

    assert((*hg1.wordp(1))<dict->size());

    *ng.wordp(2)=*hg1.wordp(1);
    *ng.wordp(1)=0;

    sng.trans(ng);
    if (sng.containsWord(dict->OOV(),1)) continue;

    mdiadaptlm::bodiscount(ng,2,fstar,lambda,boff);

    // lambda==1 means the history distributes no explicit mass: skip it
    if (lambda < 1.0) {

      h1=*sng.wordp(2);

      fwritex((char *)&h1,sizeof(code),1,f);

      succ1=0;
      succ1pos=ftell(f);
      fwritex((char *)&succ1,sizeof(code),1,f);

      ngram shg=hg1;
      get(shg,1,1);

      succscan(shg,ng,INIT,2);
      while(succscan(shg,ng,CONT,2)) {

        if (*ng.wordp(1)==oovcode) continue;

        sng.trans(ng);
        if (sng.containsWord(dict->OOV(),2)) continue;

        mdiadaptlm::discount(ng,2,fstar,lambda);

        if (fstar>1e-50) {
          w=*sng.wordp(1);
          fwritex((char *)&w,sizeof(code),1,f);
          pr=(float)mdiadaptlm::prob(ng,2);
          //cerr << ng << " prob=" << log(pr) << "\n";

          fwritex((char *)&pr,sizeof(float),1,f);
          succ1++;
        }
      }

      if (succ1) {
        lambda/=boff; //consider backoff
        // the NULL entry carries the (renormalized) back-off weight
        writeNull(subdictsz,(float)lambda,f);
        succ1++;
        totbg+=succ1;
        ifwrite(succ1pos,&succ1,sizeof(code),1,f);
      } else {
        //go back one word
        fseek(f,succ1pos-(streampos)sizeof(code),SEEK_SET);
      }
    }
  }

  fwritex((char *)&stop,sizeof(code),1,f);

  cerr << " finito bigrammi! " << subdictsz << "\n";
  fflush(f);

  system("date");

  if (lmsize()<3) {
    fclose(f);
    return 1;
  }

  //TRIGRAM PART

  h1=subdictsz;
  h2=subdictsz;
  tottr=0;
  succ1=0;
  succ2=0;

  ngram hg2(dict,2);

  ng.size=sng.size=3;

  scan(hg1,INIT,1);
  while(scan(hg1,CONT,1)) {

    if ((*hg1.wordp(1)==oovcode)) continue;

    *ng.wordp(3)=*hg1.wordp(1);

    sng.trans(ng);
    if (sng.containsWord(dict->OOV(),1)) continue;

    assert((*sng.wordp(3))<subdictsz);

    h1=*sng.wordp(3);
    fwritex((char *)&h1,sizeof(code),1,f);

    succ1=0;
    succ1pos=ftell(f);
    fwritex((char *)&succ1,sizeof(code),1,f);

    ngram shg1=ng;
    get(shg1,3,1);

    succscan(shg1,hg2,INIT,2);
    while(succscan(shg1,hg2,CONT,2)) {

      if (*hg2.wordp(1)==oovcode) continue;

      *ng.wordp(2)=*hg2.wordp(1);
      *ng.wordp(1)=0;

      sng.trans(ng);
      if (sng.containsWord(dict->OOV(),2)) continue;

      mdiadaptlm::bodiscount(ng,3,fstar,lambda,boff);

      if (lambda < 1.0) {

        h2=*sng.wordp(2);
        fwritex((char *)&h2,sizeof(code),1,f);

        succ2=0;
        succ2pos=ftell(f);
        fwritex((char *)&succ2,sizeof(code),1,f);

        ngram shg2=ng;
        get(shg2,3,2);

        succscan(shg2,ng,INIT,3);
        while(succscan(shg2,ng,CONT,3)) {

          if (*ng.wordp(1)==oovcode) continue;

          sng.trans(ng);
          if (sng.containsWord(dict->OOV(),3)) continue;

          mdiadaptlm::discount(ng,3,fstar,lambda);
          //pr=(float)mdiadaptlm::prob2(ng,3,fstar);

          if (fstar>1e-50) {

            w=*sng.wordp(1);
            fwritex((char *)&w,sizeof(code),1,f);

            pr=(float)mdiadaptlm::prob(ng,3);

            // cerr << ng << " prob=" << log(pr) << "\n";
            fwritex((char *)&pr,sizeof(float),1,f);
            succ2++;
          }
        }

        if (succ2) {
          lambda/=boff;
          writeNull(subdictsz,(float)lambda,f);
          succ2++;
          tottr+=succ2;
          ifwrite(succ2pos,&succ2,sizeof(code),1,f);
          succ1++;
        } else {
          //go back one word
          fseek(f,succ2pos-(long)sizeof(code),SEEK_SET);
        }
      }
    }

    if (succ1)
      ifwrite(succ1pos,&succ1,sizeof(code),1,f);
    else
      fseek(f,succ1pos-(long)sizeof(code),SEEK_SET);
  }

  fwritex((char *)&stop,sizeof(code),1,f);

  fclose(f);

  cerr << "Tot bg: " << totbg << " tg: " << tottr<< "\n";

  system("date");

  return 1;
};
1014
1015
1016 ///// Save in IRST MT format
1017
// Saves the LM in the IRST MT text format: a header line, the
// subdictionary, then for each level the (possibly quantized, via
// `resolution`/`decay`) probabilities encoded as pseudo-frequencies,
// plus BACKOFF_-marked entries carrying the back-off weights.
// Missing OOV unigrams are added first. Returns 1.
int mdiadaptlm::saveMT(char *filename,int backoff,
                       char* subdictfile,int resolution,double decay)
{

  double logalpha=log(decay);
  dictionary* subdict;

  if (subdictfile)
    subdict=new dictionary(subdictfile);
  else
    subdict=dict; // default is subdict=dict

  ngram ng(dict,lmsize());
  ngram sng(subdict,lmsize());

  cerr << "Adding unigram of OOV word if missing\n";

  for (int i=1; i<=maxlevel(); i++)
    *ng.wordp(i)=dict->oovcode();

  if (!get(ng,maxlevel(),1)) {
    cerr << "oov is missing in the ngram-table\n";
    // f(oov) = dictionary size (Witten Bell)
    ng.freq=dict->freq(dict->oovcode());
    cerr << "adding oov unigram " << ng << "\n";
    put(ng);
  }

  cerr << "Eventually adding OOV symbol to subdictionary\n";
  subdict->encode(OOV_);

  system("date");

  mfstream out(filename,ios::out);

  //add special symbols

  subdict->incflag(1);
  int bo_code=subdict->encode(BACKOFF_);
  int du_code=subdict->encode(DUMMY_);
  subdict->incflag(0);

  out << "nGrAm " << lmsize() << " " << 0
      << " " << "LM_ "
      << resolution << " "
      << decay << "\n";

  subdict->save(out);

  //start writing ngrams

  cerr << "write unigram of oov probability\n";
  ng.size=1;
  *ng.wordp(1)=dict->oovcode();
  double pr=(float)mdiadaptlm::prob(ng,1);
  sng.trans(ng);
  sng.size=lmsize();
  // pad shorter n-grams with the DUMMY_ symbol up to full order
  for (int s=2; s<=lmsize(); s++) *sng.wordp(s)=du_code;
  // probabilities are stored as integer pseudo-frequencies
  sng.freq=(int)ceil(pr * (double)10000000)-1;
  out << sng << "\n";

  for (int i=1; i<=lmsize(); i++) {
    cerr << "LEVEL " << i << "\n";

    double fstar,lambda,bo,dummy;

    scan(ng,INIT,i);
    while(scan(ng,CONT,i)) {

      sng.trans(ng);

      sng.size=lmsize();
      for (int s=i+1; s<=lmsize(); s++)
        *sng.wordp(s)=du_code;

      if (i>=1 && sng.containsWord(subdict->OOV(),sng.size)) {
        cerr << "skipping : " << sng << "\n";
        continue;
      }

      // skip also eos symbols not at the final
      //if (i>=1 && sng.containsWord(dict->EoS(),sng.size))
      //continue;

      mdiadaptlm::discount(ng,i,fstar,dummy);

      //out << sng << " fstar " << fstar << " lambda " << lambda << "\n";
      //if (i==1 && sng.containsWord(subdict->OOV(),i)){
      // cerr << sng << " fstar " << fstar << "\n";
      //}

      if (fstar>0) {

        double pr=(float)mdiadaptlm::prob(ng,i);

        if (i>1 && resolution<10000000) {
          // logarithmic quantization with base `decay`
          sng.freq=resolution-(int)(log(pr)/logalpha)-1;
          sng.freq=(sng.freq>=0?sng.freq:0);
        } else
          sng.freq=(int)ceil(pr * (double)10000000)-1;

        out << sng << "\n";

      }

      if (i<lmsize()) { /// write backoff of higher order!!

        ngram ng2=ng;
        ng2.pushc(0); //extend by one
        mdiadaptlm::bodiscount(ng2,i+1,dummy,lambda,bo);
        assert(!backoff || (lambda ==1 || bo<1 ));

        // BACKOFF_ marks this entry as a back-off weight record
        sng.pushc(bo_code);
        sng.size=lmsize();

        if (lambda<1) {
          if (resolution<10000000) {
            sng.freq=resolution-(int)((log(lambda) - log(bo))/logalpha)-1;
            sng.freq=(sng.freq>=0?sng.freq:0);
          } else
            sng.freq=(int)ceil(lambda/bo * (double)10000000)-1;

          out << sng << "\n";
        }
      }
    }
    cerr << "LEVEL " << i << "DONE \n";
  }
  return 1;
};
1148
///// Save in binary format for backoff N-gram models
1150
saveBIN_per_word(char * filename,int backoff,char * subdictfile,int mmap)1151 int mdiadaptlm::saveBIN_per_word(char *filename,int backoff,char* subdictfile,int mmap)
1152 {
1153 VERBOSE(2,"mdiadaptlm::saveBIN_per_word START\n");
1154 system("date");
1155
1156 //subdict
1157 dictionary* subdict;
1158
1159 //accumulated unigram oov prob
1160 //CHECK why this is not used (differently from what happens in the other save functions
1161 // double oovprob=0;
1162
1163
1164 if (subdictfile) subdict=new dictionary(subdictfile);
1165 else subdict=dict; // default is subdict=dict
1166
1167 if (mmap) {
1168 VERBOSE(2,"savebin with memory map: " << filename << "\n");
1169 } else {
1170 VERBOSE(2,"savebin: " << filename << "\n");
1171 }
1172
1173
1174 vector<streampos> pos(lmsize()+1);
1175 int maxlev=lmsize();
1176 char buff[100];
1177 int isQuant=0; //savebin for quantized LM is not yet implemented
1178
1179 //temporary filename to save the LM related to a single term
1180 char tmpfilename[BUFSIZ];
1181
1182 //create temporary output file stream to store single levels for all terms
1183 assert(strlen(filename)<1000);
1184 char tfilename[MAX_NGRAM][1000];
1185 mfstream *tout[MAX_NGRAM];
1186
1187 for (int i=1; i<=lmsize(); i++) {
1188 sprintf(tfilename[i],"%s-%dgrams",filename,i);
1189 tout[i]=new mfstream(tfilename[i],ios::out);
1190 }
1191
1192 // print header in the main output file
1193 mfstream out(filename,ios::out);
1194 out << "blmt " << maxlev;
1195
1196 for (int i=1; i<=maxlev; i++) { //reserve space for ngram statistics (which are not yet avalable)
1197 pos[i]=out.tellp();
1198 sprintf(buff," %10d",0);
1199 out << buff;
1200 }
1201 out << "\n";
1202 subdict->save(out);
1203 out.flush();
1204
1205 ngram ng(dict,lmsize());
1206 ngram oldng(dict,lmsize());
1207 ngram locng(dict,lmsize());
1208
1209 ngram sng(subdict,lmsize());
1210
1211 double fstar,lambda,bo,dummy,dummy2,pr,ibow;
1212
1213 //n-gram counters
1214 table_entry_pos_t num[lmsize()+1];
1215 for (int i=1; i<=lmsize(); i++) num[i]=0;
1216
1217 lmtable* lmt = new lmtable();
1218
1219 lmt->configure(maxlev,isQuant);
1220 lmt->setDict(subdict);
1221 lmt->expand_level(1,dict->size(),filename,mmap);
1222
1223 //main loop
1224 for (int w=0; w<dict->size(); w++) {
1225 sprintf(tmpfilename,"%s_tmp_%d",filename,w);
1226
1227 if (!w % 10000) cerr << ".";
1228
1229 //1-gram
1230 ngram ung(dict,1);
1231 *ung.wordp(1)=w;
1232 sng.trans(ung);
1233
1234 //exclude words not occurring in the subdictionary
1235 if (sng.containsWord(subdict->OOV(),1) && !ung.containsWord(dict->OOV(),1)) continue;
1236
1237
1238 pr=mdiadaptlm::prob(ung,1);
1239 pr=(pr?log10(pr):-99);
1240
1241 if (lmsize()>1) { //compute back-off
1242 ung.pushc(0); //extend by one
1243 mdiadaptlm::bodiscount(ung,2,fstar,lambda,bo);
1244 ung.shift();//shrink by one
1245
1246 assert(!backoff || ((lambda<UPPER_SINGLE_PRECISION_OF_1 && lambda>LOWER_SINGLE_PRECISION_OF_1) || bo<UPPER_SINGLE_PRECISION_OF_1 ));
1247
1248 if (backoff){
1249 ibow=log10(lambda) - log10(bo);
1250 }else{
1251 if (lambda<LOWER_SINGLE_PRECISION_OF_1){
1252 ibow = log10(lambda);
1253 }else { //force to be 0.0
1254 ibow = 0.0;
1255 }
1256 }
1257 }
1258 else {
1259 ibow=0.0; //default value for backoff weight at the lowest level
1260 }
1261
1262 lmt->addwithoffset(ung,(float)pr,(float)ibow);
1263 num[1]++;
1264
1265 //manage n-grams
1266 if (get(ung,1,1)) {
1267
1268 //create n-gram with history w
1269 *ng.wordp(lmsize())=w;
1270
1271 //create sentinel n-gram
1272 for (int i=1; i<=lmsize(); i++) *oldng.wordp(i)=-1;
1273
1274 //create the table for all levels but the level 1, with the maximum number of possible entries
1275 for (int i=2; i<=lmsize(); i++)
1276 lmt->expand_level(i,entries(i),tmpfilename,mmap);
1277
1278 scan(ung.link,ung.info,1,ng,INIT,lmsize());
1279 while(scan(ung.link,ung.info,1,ng,CONT,lmsize())) {
1280 sng.trans(ng); // convert to subdictionary
1281 locng=ng; // make a local copy
1282
1283 //find first internal level that changed
1284 int f=lmsize()-1; //unigrams have been already covered
1285 while (f>1 && (*oldng.wordp(f)==*ng.wordp(f))){ f--; }
1286
1287 for (int l=lmsize()-(f-1); l<=lmsize(); l++){
1288
1289 locng=ng; // make a local copy
1290 if (l<lmsize()) locng.shift(lmsize()-l); //reduce the ngram, which has size level
1291
1292 if (sng.containsWord(subdict->OOV(),l)) continue;
1293
1294 // skip also eos symbols not at the final
1295 if (sng.containsWord(dict->EoS(),l-1)) continue;
1296
1297 pr=mdiadaptlm::prob(locng,l,fstar,dummy,dummy2);
1298
1299 //PATCH by Nicola (16-04-2008)
1300
1301 if (!(pr<=1.0 && pr > 1e-10)) {
1302 cerr << ng << " " << pr << "\n";
1303 assert(pr<=1.0);
1304 cerr << "prob modified to 1e-10\n";
1305 pr=1e-10;
1306 }
1307
1308 if (l<lmsize()) {
1309
1310 locng.pushc(0); //extend by one
1311
1312 mdiadaptlm::bodiscount(locng,l+1,dummy,lambda,bo);
1313
1314 locng.shift();
1315
1316 if (fstar>=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) {
1317 ibow=log10(lambda) - log10(bo);
1318 if (lmt->addwithoffset(locng,(float)log10(pr),(float)ibow)){
1319 num[l]++;
1320 }else{
1321 continue;
1322 }
1323 }
1324 else{
1325 continue; //skip n-grams with too small fstar
1326 }
1327 } else {
1328 if (fstar>=UPPER_SINGLE_PRECISION_OF_0) {
1329 ibow=0.0; //value for backoff weight at the highest level
1330 if (lmt->addwithoffset(locng,(float)log10(pr),(float)ibow)){
1331 num[l]++;
1332 }else{
1333 continue;
1334 }
1335 }
1336 else{
1337 continue; //skip n-grams with too small fstar
1338 }
1339 }
1340 }
1341 oldng=ng;
1342 }
1343 }
1344 else{
1345 //create empty tables for all levels but the level 1, to keep consistency with the rest of the code
1346 for (int i=2; i<=lmsize(); i++)
1347 lmt->expand_level(i,0,tmpfilename,mmap);
1348 }
1349
1350
1351 //level 1 is not modified until everything is done
1352 //because it has to contain the full dictionary
1353 //which provides the direct access to the second level
1354 for (int i=2; i<=lmsize(); i++){
1355
1356 if (i>2) {
1357 lmt->checkbounds(i-1);
1358 lmt->appendbin_level(i-1, *tout[i-1], mmap);
1359 }
1360
1361 // now we can resize table at level i
1362 lmt->resize_level(i, tmpfilename, mmap);
1363 }
1364
1365 // now we can save table at level maxlev, if not equal to 1
1366 if (lmsize()>1){
1367 lmt->appendbin_level(maxlev, *tout[maxlev], mmap);
1368 }
1369
1370 //delete levels from 2 to lmsize();
1371 for (int i=2; i<=lmsize(); i++) lmt->delete_level(i, tmpfilename, mmap);
1372
1373 //update table offsets
1374 for (int i=2; i<=lmsize(); i++) lmt->update_offset(i,num[i]);
1375 }
1376 //close levels from 2 to lmsize()
1377 for (int i=2; i<=lmsize(); i++) tout[i]->close();
1378
1379 //now we can save level 1, which contains all unigrams
1380 //cerr << "saving level 1" << "...\n";
1381 lmt->savebin_level(1, filename, mmap);
1382
1383 //update headers
1384 for (int i=1; i<=lmsize(); i++) {
1385 sprintf(buff," %10d",num[i]);
1386 out.seekp(pos[i]);
1387 out << buff;
1388 }
1389
1390 out.close();
1391
1392 //concatenate files for each single level into one file
1393 //single level files should have a name derived from "filename"
1394 lmt->compact_all_levels(filename);
1395
1396 cerr << "\n";
1397 system("date");
1398
1399 VERBOSE(2,"mdiadaptlm::saveBIN_per_word END\n");
1400 return 1;
1401 };
1402
///// Save in binary format for backoff N-gram models
// Save the LM in IRSTLM binary ("blmt") format, building and flushing one
// level at a time to bound memory usage.
// filename:    output file (per-level temp files derive their names from it)
// backoff:     if nonzero, write true back-off weights log10(lambda)-log10(bo)
// subdictfile: optional subdictionary restricting the saved vocabulary
//              (NULL -> use the full dictionary)
// mmap:        if nonzero, levels are built through memory-mapped files
// Always returns 1.
int mdiadaptlm::saveBIN_per_level(char *filename,int backoff,char* subdictfile,int mmap)
{
  VERBOSE(2,"mdiadaptlm::saveBIN_per_level START\n");
  system("date");

  //subdict: vocabulary actually written out; defaults to the full dictionary
  dictionary* subdict;

  //accumulated unigram oov prob: mass of words pruned away by the subdict
  double oovprob=0;

  if (subdictfile) subdict=new dictionary(subdictfile);
  else subdict=dict; // default is subdict=dict

  if (mmap) {
    VERBOSE(2,"savebin with memory map: " << filename << "\n");
  } else {
    VERBOSE(2,"savebin: " << filename << "\n");
  }

  vector<streampos> pos(lmsize()+1);  // file positions of the per-level counters in the header
  int maxlev=lmsize();
  char buff[100];
  int isQuant=0; //savebin for quantized LM is not yet implemented

  // print header
  fstream out(filename,ios::out);
  out << "blmt " << maxlev;

  for (int i=1; i<=maxlev; i++) { //reserve space for ngram statistics (which are not yet avalable)
    pos[i]=out.tellp();
    sprintf(buff," %10d",0);
    out << buff;
  }
  out << "\n";
  // NOTE(review): lmt (and subdict when loaded from subdictfile) are never
  // deleted in this function — possible leak; confirm whether intentional.
  lmtable* lmt = new lmtable();

  lmt->configure(maxlev,isQuant);

  lmt->setDict(subdict);
  subdict->save(out);
  out.flush();


  //start adding n-grams to lmtable

  for (int i=1; i<=lmsize(); i++) {
    cerr << "saving level " << i << "...\n";
    table_entry_pos_t numberofentries;
    if (i==1) { //unigram
      numberofentries = (table_entry_pos_t) subdict->size();
    } else {
      numberofentries = (table_entry_pos_t) entries(i);
    }
    system("date");
    lmt->expand_level(i,numberofentries,filename,mmap);

    double totp=0;
    double fstar,lambda,bo,dummy,dummy2,pr,ibow;

    ngram ng(dict,1);
    ngram ng2(dict);
    ngram sng(subdict,1);

    if (i==1) { //unigram case

      //scan the dictionary
      for (int w=0; w<dict->size(); w++) {
        *ng.wordp(1)=w;

        sng.trans(ng);
        pr=mdiadaptlm::prob(ng,1);
        totp+=pr;

        // words outside the subdict (mapped to its OOV) donate their mass to oovprob
        if (sng.containsWord(subdict->OOV(),i) && !ng.containsWord(dict->OOV(),i)) {
          oovprob+=pr; //accumulate oov probability
          continue;
        }


        if (ng.containsWord(dict->OOV(),i)) pr+=oovprob;

        //cerr << ng << " freq " << dict->freq(w) << " - Pr " << pr << "\n";
        pr=(pr?log10(pr):-99); // -99 is the conventional log-prob floor for zero probability

        if (w==dict->oovcode()){
          //CHECK whether we can avoid this reassignment because dict should be lmt->getDict()
          *ng.wordp(1)=lmt->getDict()->oovcode();
          ibow=0.0;
        }
        else {
          // } //do nothing

          if (lmsize()>1) {
            ngram ng2=ng; // NOTE(review): shadows the outer ng2 declared above
            ng2.pushc(0); //extend by one

            //cerr << ng2 << "\n";

            mdiadaptlm::bodiscount(ng2,i+1,fstar,lambda,bo);
            assert(!backoff || ((lambda<UPPER_SINGLE_PRECISION_OF_1 && lambda>LOWER_SINGLE_PRECISION_OF_1) || bo<UPPER_SINGLE_PRECISION_OF_1));

            if (backoff){
              ibow=log10(lambda) - log10(bo);
            }else{
              if (lambda<LOWER_SINGLE_PRECISION_OF_1){
                ibow = log10(lambda);
              }else { //force to be 0.0
                ibow = 0.0;
              }
            }
          }else {
            ibow=0.0; //default value for backoff weight at the lowest level
          }
        }
        lmt->add(ng,(float)pr,(float)ibow);
      }
      //cerr << "totprob = " << totp << "\n";
    }
    else { //i>1 , bigrams, trigrams, fourgrams...
      *ng.wordp(1)=0;
      get(ng,1,1); //this
      scan(ng,INIT,i);
      while(scan(ng,CONT,i)) {
        sng.trans(ng);

        if (sng.containsWord(subdict->OOV(),i)) continue;

        // skip also eos symbols not at the final
        if (sng.containsWord(dict->EoS(),i-1)) continue;

        // mdiadaptlm::discount(ng,i,fstar,dummy);
        // pr=mdiadaptlm::prob(ng,i);
        pr=mdiadaptlm::prob(ng,i,fstar,dummy,dummy2);

        //clamp out-of-range probabilities to 1e-10 (PATCH by Nicola, 16-04-2008)
        if (!(pr<=1.0 && pr > 1e-10)) {
          cerr << ng << " " << pr << "\n";
          assert(pr<=1.0);
          cerr << "prob modified to 1e-10\n";
          pr=1e-10;
        }

        if (i<lmsize()) {
          ng2=ng;
          ng2.pushc(0); //extend by one

          mdiadaptlm::bodiscount(ng2,i+1,dummy,lambda,bo);

          //store only n-grams with non-negligible fstar or a meaningful back-off weight
          if (fstar>=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) {
            ibow=log10(lambda) - log10(bo);
            lmt->add(ng,(float)log10(pr),(float)ibow);
          }
        } else {
          if (fstar >= UPPER_SINGLE_PRECISION_OF_0) {
            ibow=0.0; //value for backoff weight at the highest level
            lmt->add(ng,(float)log10(pr),(float)ibow);
          }
        }
      }
    }

    // now we can fix table at level i-1
    // now we can save table at level i-1
    // now we can remove table at level i-1
    if (maxlev>1 && i>1) {
      lmt->checkbounds(i-1);
      lmt->savebin_level(i-1, filename, mmap);
    }

    // now we can resize table at level i
    lmt->resize_level(i, filename, mmap);

  }
  // now we can save table at level maxlev
  lmt->savebin_level(maxlev, filename, mmap);

  //update headers: overwrite the placeholder counters reserved above
  for (int i=1; i<=lmsize(); i++) {
    sprintf(buff," %10d",lmt->getCurrentSize(i));
    out.seekp(pos[i]);
    out << buff;
  }
  out.close();

  //concatenate files for each single level into one file
  //single level files should have a name derived from "filename"
  lmt->compact_all_levels(filename);

  VERBOSE(2,"mdiadaptlm::saveBIN_per_level END\n");
  return 1;
}
1595
1596
1597 ///// Save in format for ARPA backoff N-gram models
saveARPA_per_word(char * filename,int backoff,char * subdictfile)1598 int mdiadaptlm::saveARPA_per_word(char *filename,int backoff,char* subdictfile )
1599 {
1600 VERBOSE(2,"mdiadaptlm::saveARPA_per_word START\n");
1601 system("date");
1602
1603 //subdict
1604 dictionary* subdict;
1605
1606 //accumulated unigram oov prob
1607 //CHECK why this is not used (differently from what happens in the other save functions
1608 // double oovprob=0;
1609
1610
1611 if (subdictfile) subdict=new dictionary(subdictfile);
1612 else subdict=dict; // default is subdict=dict
1613
1614 //main output file
1615 mfstream out(filename,ios::out);
1616
1617 //create temporary output file stream
1618 assert(strlen(filename)<1000);
1619 char tfilename[MAX_NGRAM][1000];
1620 mfstream *tout[MAX_NGRAM];
1621
1622 for (int i=1; i<=lmsize(); i++) {
1623 sprintf(tfilename[i],"%s.%d",filename,i);
1624 tout[i]=new mfstream(tfilename[i],ios::out);
1625 *tout[i] << "\n\\" << i << "-grams:\n";
1626 }
1627
1628
1629 ngram ng(dict,lmsize());
1630 ngram oldng(dict,lmsize());
1631 ngram locng(dict,lmsize());
1632
1633 ngram sng(subdict,lmsize());
1634
1635 double fstar,lambda,bo,dummy,dummy2, pr;
1636
1637 //n-gram counters
1638 table_entry_pos_t num[lmsize()+1];
1639 for (int i=1; i<=lmsize(); i++) num[i]=0;
1640
1641
1642 //main loop
1643 for (int w=0; w<dict->size(); w++) {
1644
1645 if (!w % 10000) cerr << ".";
1646
1647 //1-gram
1648 ngram ung(dict,1);
1649 *ung.wordp(1)=w;
1650 sng.trans(ung);
1651
1652 //exclude words not occurring in the subdictionary
1653 if (sng.containsWord(subdict->OOV(),1) && !ung.containsWord(dict->OOV(),1)) continue;
1654
1655 pr=mdiadaptlm::prob(ung,1);
1656 pr=(pr?log10(pr):-99);
1657
1658 if (w==dict->oovcode())
1659 *tout[1] << (float) pr << "\t" << "<unk>";
1660 else
1661 *tout[1] << (float) pr << "\t" << (char *)dict->decode(w);
1662
1663 num[1]++;
1664
1665 if (lmsize()>1) { //print back-off
1666 ung.pushc(0); //extend by one
1667 mdiadaptlm::bodiscount(ung,2,fstar,lambda,bo);
1668 ung.shift();//shrink by one
1669
1670 assert(!backoff || ((lambda<UPPER_SINGLE_PRECISION_OF_1 && lambda>LOWER_SINGLE_PRECISION_OF_1) || bo<UPPER_SINGLE_PRECISION_OF_1 ));
1671
1672 if (backoff){
1673 *tout[1] << "\t" << (float) (log10(lambda) - log10(bo));
1674 }else{
1675 if (lambda<LOWER_SINGLE_PRECISION_OF_1){
1676 *tout[1] << "\t" << (float) log10(lambda);
1677 } //no output if log10(lambda)==0
1678 }
1679 }
1680 *tout[1] << "\n";
1681
1682 //manage n-grams
1683 if (get(ung,1,1)) {
1684
1685 //create n-gram with history w
1686 *ng.wordp(lmsize())=w;
1687
1688 //create sentinel n-gram
1689 for (int i=1; i<=lmsize(); i++) *oldng.wordp(i)=-1;
1690
1691 scan(ung.link,ung.info,1,ng,INIT,lmsize());
1692 while(scan(ung.link,ung.info,1,ng,CONT,lmsize())) {
1693 //cerr << ng << "\n";
1694 sng.trans(ng); // convert to subdictionary
1695 locng=ng; // make a local copy
1696
1697 //find first internal level that changed
1698 int f=lmsize()-1; //unigrams have been already covered
1699 while (f>1 && (*oldng.wordp(f)==*ng.wordp(f))){ f--; }
1700
1701 for (int l=lmsize(); l>lmsize()-f;l--){
1702
1703 if (l<lmsize()) locng.shift(); //ngram has size level
1704
1705 if (sng.containsWord(subdict->OOV(),l)) continue;
1706
1707 // skip also eos symbols not at the final
1708 if (sng.containsWord(dict->EoS(),l-1)) continue;
1709
1710 pr=mdiadaptlm::prob(locng,l,fstar,dummy,dummy2);
1711
1712 //PATCH by Nicola (16-04-2008)
1713
1714 if (!(pr<=1.0 && pr > 1e-10)) {
1715 cerr << ng << " " << pr << "\n";
1716 assert(pr<=1.0);
1717 cerr << "prob modified to 1e-10\n";
1718 pr=1e-10;
1719 }
1720
1721 if (l<lmsize()) {
1722
1723 locng.pushc(0); //extend by one
1724
1725 mdiadaptlm::bodiscount(locng,l+1,dummy,lambda,bo);
1726
1727 locng.shift();
1728
1729 if (fstar>=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) {
1730 *tout[l] << (float) log10(pr);
1731 *tout[l] << "\t" << (char *)dict->decode(*locng.wordp(l));
1732 for (int j=l-1; j>0; j--)
1733 *tout[l] << " " << (char *)dict->decode(*locng.wordp(j));
1734
1735 if (lambda < LOWER_SINGLE_PRECISION_OF_1) //output back-off prob
1736 *tout[l] << "\t" << (float) (log10(lambda) -log10(bo));
1737 *tout[l] << "\n";
1738
1739 num[l]++;
1740 } else continue; //skip n-grams with too small fstar
1741 } else {
1742 if (fstar>=UPPER_SINGLE_PRECISION_OF_0 ) {
1743 *tout[l] << (float) log10(pr);
1744 *tout[l] << "\t" << (char *)dict->decode(*locng.wordp(l));
1745 for (int j=l-1; j>0; j--)
1746 *tout[l] << " " << (char *)dict->decode(*locng.wordp(j));
1747 *tout[l] << "\n";
1748 num[l]++;
1749 } else continue; //skip n-grams with too small fstar
1750 }
1751
1752 }
1753 oldng=ng;
1754 }
1755 }
1756
1757 }
1758
1759
1760 //print header
1761 out << "\n\\data\\" << "\n";
1762 char buff[100];
1763 for (int i=1; i<=lmsize(); i++) {
1764 sprintf(buff,"ngram %2d=%10d\n",i,num[i]);
1765 out << buff;
1766 }
1767 out << "\n";
1768
1769 //append and remove temporary files
1770 for (int i=1; i<=lmsize(); i++) {
1771 delete tout[i];
1772 tout[i]=new mfstream(tfilename[i],ios::in);
1773 out << tout[i]->rdbuf();
1774 delete tout[i];
1775 removefile(tfilename[i]);
1776 }
1777
1778 out << "\\end\\" << "\n";
1779
1780 cerr << "\n";
1781 system("date");
1782
1783 VERBOSE(2,"mdiadaptlm::saveARPA_per_word END\n");
1784 return 1;
1785 };
1786
1787 ///// Save in format for ARPA backoff N-gram models
// Save the LM in ARPA format, writing one full n-gram level at a time.
// filename:    output ARPA file; the "ngram N=..." header counts are written
//              as placeholders first and patched in place at the end
// backoff:     if nonzero, write true back-off weights log10(lambda)-log10(bo)
// subdictfile: optional subdictionary restricting the saved vocabulary
//              (NULL -> use the full dictionary)
// Always returns 1.
int mdiadaptlm::saveARPA_per_level(char *filename,int backoff,char* subdictfile )
{
  VERBOSE(2,"mdiadaptlm::saveARPA_per_level START\n");
  system("date");

  //subdict: vocabulary actually written out; defaults to the full dictionary
  dictionary* subdict;

  //accumulated unigram oov prob: mass of words pruned away by the subdict
  double oovprob=0;

  if (subdictfile) {
    subdict=new dictionary(subdictfile);
  } else
    subdict=dict; // default is subdict=dict

  fstream out(filename,ios::out);
  // out.precision(15);

  vector<streampos> pos(lmsize()+1); // file positions of the "ngram N=..." header lines
  table_entry_pos_t num[lmsize()+1]; // per-level n-gram counters
  char buff[100];

  //print header with placeholder counts; they are patched in place at the end
  out << "\n\\data\\" << "\n";

  for (int i=1; i<=lmsize(); i++) {
    num[i]=0;
    pos[i]=out.tellp();
    sprintf(buff,"ngram %2d=%10d\n",i,num[i]);
    out << buff;
  }

  out << "\n";

  //start writing n-grams, one full level at a time

  for (int i=1; i<=lmsize(); i++) {
    cerr << "saving level " << i << "...\n";


    out << "\n\\" << i << "-grams:\n";

    double totp=0;
    double fstar,lambda,bo,dummy,dummy2,pr;


    ngram ng(dict,1);
    ngram ng2(dict);
    ngram sng(subdict,1);

    if (i==1) { //unigram case

      //scan the dictionary

      for (int w=0; w<dict->size(); w++) {
        *ng.wordp(1)=w;

        sng.trans(ng);
        pr=mdiadaptlm::prob(ng,1);
        totp+=pr;

        // words outside the subdict (mapped to its OOV) donate their mass to oovprob
        if (sng.containsWord(subdict->OOV(),i) && !ng.containsWord(dict->OOV(),i)) {
          oovprob+=pr; //accumulate oov probability
          continue;
        }


        if (ng.containsWord(dict->OOV(),i)) pr+=oovprob;

        //cerr << ng << " freq " << dict->freq(w) << " - Pr " << pr << "\n";
        out << (float) (pr?log10(pr):-99); // -99 is the conventional log-prob floor for zero probability

        num[i]++;

        if (w==dict->oovcode())
          out << "\t" << "<unk>\n";
        else {
          out << "\t" << (char *)dict->decode(w);

          if (lmsize()>1) {
            ngram ng2=ng; // NOTE(review): shadows the outer ng2 declared above
            ng2.pushc(0); //extend by one

            mdiadaptlm::bodiscount(ng2,i+1,fstar,lambda,bo);

            assert(!backoff || ((lambda<UPPER_SINGLE_PRECISION_OF_1 && lambda>LOWER_SINGLE_PRECISION_OF_1) || bo<UPPER_SINGLE_PRECISION_OF_1 ));

            if (backoff){
              out << "\t" << (float) (log10(lambda) - log10(bo));
            }else{
              if (lambda<LOWER_SINGLE_PRECISION_OF_1){
                out << "\t" << (float) log10(lambda);
              } //no output if log10(lambda)==0
            }
          }
          out << "\n";
        }
      }
      //cerr << "totprob = " << totp << "\n";
    }
    else { //i>1 , bigrams, trigrams, fourgrams...
      *ng.wordp(1)=0;
      get(ng,1,1); //this
      scan(ng,INIT,i);
      while(scan(ng,CONT,i)) {

        sng.trans(ng);
        if (sng.containsWord(subdict->OOV(),i)) continue;

        // skip also eos symbols not at the final
        if (sng.containsWord(dict->EoS(),i-1)) continue;

        pr=mdiadaptlm::prob(ng,i,fstar,dummy,dummy2);

        //clamp out-of-range probabilities to 1e-10 (PATCH by Nicola, 16-04-2008)

        if (!(pr<=1.0 && pr > 1e-10)) {
          cerr << ng << " " << pr << "\n";
          assert(pr<=1.0);
          cerr << "prob modified to 1e-10\n";
          pr=1e-10;
        }

        if (i<lmsize()) {
          ng2=ng;
          ng2.pushc(0); //extend by one

          mdiadaptlm::bodiscount(ng2,i+1,dummy,lambda,bo);

          //keep only n-grams with non-negligible fstar or a meaningful back-off weight
          if (fstar>=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) {
            out << (float) log10(pr);
            out << "\t" << (char *)dict->decode(*ng.wordp(i));
            for (int j=i-1; j>0; j--)
              out << " " << (char *)dict->decode(*ng.wordp(j));
            if (backoff){
              out << "\t" << (float) (log10(lambda) - log10(bo));
            }else{
              if (lambda<LOWER_SINGLE_PRECISION_OF_1){
                out << "\t" << (float) log10(lambda);
              } //no output if log10(lambda)==0
            }
            out << "\n";
            num[i]++;
          }
        } else {
          if (fstar>=UPPER_SINGLE_PRECISION_OF_0) {
            out << (float) log10(pr);
            out << "\t" << (char *)dict->decode(*ng.wordp(i));
            for (int j=i-1; j>0; j--)
              out << " " << (char *)dict->decode(*ng.wordp(j));
            out << "\n";

            num[i]++;
          }
        }
      }
    }

    cerr << i << "grams tot:" << num[i] << "\n";
  }

  streampos last=out.tellp();

  //update headers: overwrite the placeholder counts written above
  for (int i=1; i<=lmsize(); i++) {
    sprintf(buff,"ngram %2d=%10d\n",i,num[i]);
    out.seekp(pos[i]);
    out << buff;
  }

  out.seekp(last);
  out << "\\end\\" << "\n";
  system("date");

  VERBOSE(2,"mdiadaptlm::saveARPA_per_level END\n");
  return 1;
};
1966
1967
1968 /*
1969 main(int argc,char** argv){
1970 char* dictname=argv[1];
1971 char* backngram=argv[2];
1972 int depth=atoi(argv[3]);
1973 char* forengram=argv[4];
1974 char* testngram=argv[5];
1975
1976 dictionary dict(dictname);
1977 ngramtable test(&dict,testngram,depth);
1978
1979 shiftbeta lm2(&dict,backngram,depth);
1980 lm2.train();
1981 //lm2.test(test,depth);
1982
1983 mdi lm(&dict,backngram,depth);
1984 lm.train();
1985 for (double w=0.0;w<=1.0;w+=0.1){
1986 lm.getforelm(forengram);
1987 lm.adapt(w);
1988 lm.test(test,depth);
1989 }
1990 }
1991 */
1992
1993