1 /******************************************************************************
2 IrstLM: IRST Language Model Toolkit
3 Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
4 
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9 
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 Lesser General Public License for more details.
14 
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
18 
19 ******************************************************************************/
20 
21 #include <cmath>
22 #include <string>
23 #include <assert.h>
24 #include <vector>
25 #include "util.h"
26 #include "mfstream.h"
27 #include "mempool.h"
28 #include "htable.h"
29 #include "dictionary.h"
30 #include "n_gram.h"
31 #include "mempool.h"
32 #include "ngramcache.h"
33 #include "ngramtable.h"
34 #include "normcache.h"
35 #include "interplm.h"
36 #include "mdiadapt.h"
37 #include "shiftlm.h"
38 #include "lmtable.h"
39 
40 using namespace std;
41 
42 //
43 //Minimum discrimination adaptation for interplm
44 //
mdiadaptlm(char * ngtfile,int depth,TABLETYPE tbtype)45 mdiadaptlm::mdiadaptlm(char* ngtfile,int depth,TABLETYPE tbtype):
46   interplm(ngtfile,depth,tbtype)
47 {
48   adaptlev=0;
49   forelm=NULL;
50   cache=NULL;
51 	m_save_per_level=true;
52 };
53 
// Destructor: releases the normalization cache and the per-level
// prob/backoff caches.
// NOTE(review): delete_caches() touches probcache/backoffcache
// unconditionally (when MDIADAPTLM_CACHE_ENABLE is set); presumably
// they are NULL-initialized before create_caches() runs — confirm.
mdiadaptlm::~mdiadaptlm()
{
  if (cache) delete cache;
  delete_caches();
};
59 
delete_caches(int level)60 void mdiadaptlm::delete_caches(int level)
61 {
62   if (probcache[level]) delete probcache[level];
63   if (backoffcache[level]) delete backoffcache[level];
64 };
65 
delete_caches()66 void mdiadaptlm::delete_caches()
67 {
68 #ifdef MDIADAPTLM_CACHE_ENABLE
69   for (int i=0; i<=max_caching_level; i++) delete_caches(i);
70 
71   delete [] probcache;
72   delete [] backoffcache;
73 #endif
74 };
75 
caches_stat()76 void mdiadaptlm::caches_stat()
77 {
78 #ifdef MDIADAPTLM_CACHE_ENABLE
79   for (int i=1; i<=max_caching_level; i++) {
80     if (probcache[i]) {
81       cerr << "Statistics of probcache at level " << i << " (of " << lmsize() << ") ";
82       probcache[i]->stat();
83     }
84     if (backoffcache[i]) {
85       cerr << "Statistics of backoffcache at level " << i << " (of " << lmsize() << ") ";
86       backoffcache[i]->stat();
87     }
88   }
89 #endif
90 };
91 
92 
// Allocate the per-level cache pointer arrays and create the caches.
// mcl: requested maximum caching level; clipped into [0, lmsize()-1].
void mdiadaptlm::create_caches(int mcl)
{
  max_caching_level=(mcl>=0 && mcl<lmsize())?mcl:lmsize()-1;

  // arrays are indexed 1..max_caching_level; slot 0 is allocated but
  // never filled (init_caches() starts from level 1)
  probcache = new NGRAMCACHE_t*[max_caching_level+1];
  backoffcache = new NGRAMCACHE_t*[max_caching_level+1];
  for (int i=0; i<=max_caching_level; i++) {
    probcache[i]=NULL;
    backoffcache[i]=NULL;
  }

  // NOTE(review): calling create_caches() twice would leak the previous
  // arrays — presumably callers invoke it at most once; confirm.
  init_caches();
}
106 
107 
init_caches(int level)108 void mdiadaptlm::init_caches(int level)
109 {
110   assert(probcache[level]==NULL);
111   assert(backoffcache[level]==NULL);
112   probcache[level]=new NGRAMCACHE_t(level,sizeof(double),400000);
113   backoffcache[level]=new NGRAMCACHE_t(level,sizeof(double),400000);
114 };
115 
init_caches()116 void mdiadaptlm::init_caches()
117 {
118 #ifdef MDIADAPTLM_CACHE_ENABLE
119   for (int i=1; i<=max_caching_level; i++)		init_caches(i);
120 #endif
121 };
122 
check_cache_levels(int level)123 void mdiadaptlm::check_cache_levels(int level)
124 {
125   if (probcache[level] && probcache[level]->isfull()) probcache[level]->reset(probcache[level]->cursize());
126   if (backoffcache[level] && backoffcache[level]->isfull()) backoffcache[level]->reset(backoffcache[level]->cursize());
127 };
128 
check_cache_levels()129 void mdiadaptlm::check_cache_levels()
130 {
131 #ifdef MDIADAPTLM_CACHE_ENABLE
132   for (int i=1; i<=max_caching_level; i++)		check_cache_levels(i);
133 #endif
134 };
135 
reset_caches(int level)136 void mdiadaptlm::reset_caches(int level)
137 {
138   if (probcache[level]) probcache[level]->reset(MAX(probcache[level]->cursize(),probcache[level]->maxsize()));
139   if (backoffcache[level]) backoffcache[level]->reset(MAX(backoffcache[level]->cursize(),backoffcache[level]->maxsize()));
140 };
141 
reset_caches()142 void mdiadaptlm::reset_caches()
143 {
144 #ifdef MDIADAPTLM_CACHE_ENABLE
145   for (int i=1; i<=max_caching_level; i++)		reset_caches(i);
146 #endif
147 };
148 
149 
get_probcache(int level)150 inline NGRAMCACHE_t* mdiadaptlm::get_probcache(int level)
151 {
152   return probcache[level];
153 }
154 
get_backoffcache(int level)155 inline NGRAMCACHE_t* mdiadaptlm::get_backoffcache(int level)
156 {
157   return backoffcache[level];
158 }
159 
// Train the 1-gram "foreground" (adaptation) model on ngtfile and
// precompute the OOV scaling term:
//   oovscaling = f_fore(oov) / (1 - sum_w f_back(w))
// where the sum runs over all adaptation words translated into the
// background dictionary. Exits if the adaptation data contains words
// unknown to the background model. Returns 1.
int mdiadaptlm::scalefact(char *ngtfile)
{
  if (forelm!=NULL) delete forelm;
  if (cache!=NULL) delete cache;
  cache=new normcache(dict);

  forelm=new shiftbeta(ngtfile,1);  // foreground LM: shift-beta unigrams
  forelm->train();

  //compute oov scalefact term
  ngram fng(forelm->dict,1);
  ngram ng(dict,1);
  int* w=fng.wordp(1);

  oovscaling=1.0;
  for ((*w)=0; (*w)<forelm->dict->size(); (*w)++)
    if ((*w) != forelm->dict->oovcode()) {
      ng.trans(fng);  // map the foreground word into the background dict
      if (*ng.wordp(1)==dict->oovcode()) {
        cerr << "adaptation file contains new words: use -ao=yes option\n";
        exit(1);
      }
      //forbidden situation
      oovscaling-=backunig(ng);  // subtract background discounted prob
    }
  *w=forelm->dict->oovcode();
  // NOTE(review): if the background unigrams summed exactly to 1 this
  // would divide by zero — presumably prevented by discounting; confirm.
  oovscaling=foreunig(fng)/oovscaling;

  return 1;
};
190 
// Write the per-word scaling factors (as log10) to filename in an
// ARPA-like 1-gram section; the OOV word is printed as <unk>.
// Returns 1.
int mdiadaptlm::savescalefactor(char* filename)
{

  ngram ng(dict,1);
  int* w=ng.wordp(1);

  mfstream out(filename,ios::out);

  out << "\n\\data\\" << "\nngram 1=" << dict->size() << "\n\n1grams:\n";

  for ((*w)=0; (*w)<dict->size(); (*w)++) {
    double ratio=scalefact(ng);
    out << (float)  (ratio?log10(ratio):-99);  // -99 stands in for log10(0)
    if (*w==dict->oovcode())
      out << "\t" << "<unk>\n";
    else
      out << "\t" << (char *)dict->decode(*w) << "\n";

  }
  out << "\\end\\\n";

  return 1;
}
214 
scalefact(ngram ng)215 double mdiadaptlm::scalefact(ngram ng)
216 {
217   ngram fng(forelm->dict,1);
218   fng.trans(ng);
219   if (*fng.wordp(1)==forelm->dict->oovcode())
220     return pow(oovscaling,gis_step);
221   else {
222     double prback=backunig(ng);
223     double prfore=foreunig(ng);
224     return pow(prfore/prback,gis_step);
225   }
226 }
227 
228 
foreunig(ngram ng)229 double mdiadaptlm::foreunig(ngram ng)
230 {
231 
232   double fstar,lambda;
233 
234   forelm->discount(ng,1,fstar,lambda);
235 
236   return fstar;
237 }
238 
backunig(ngram ng)239 double mdiadaptlm::backunig(ngram ng)
240 {
241 
242   double fstar,lambda;
243 
244   discount(ng,1,fstar,lambda,0);
245 
246   return fstar;
247 };
248 
249 
250 
// Prepare MDI adaptation: trains the foreground model on ngtfile, sets
// the GIS step and the adaptation level, and precomputes the 1-gram
// normalization (zeta0) plus — when alev>1 — the 2-gram normalization
// terms (cached inside zeta()).
// alev is clipped into [1, lmsize()]; exits if ngtfile is NULL.
// Returns 1.
int mdiadaptlm::adapt(char* ngtfile,int alev,double step)
{

  if (alev > lmsize() || alev<=0) {
    cerr << "setting adaptation level to " << lmsize() << "\n";
    alev=lmsize();
  }
  adaptlev=alev;


  cerr << "adapt ....";
  gis_step=step;

  if (ngtfile==NULL) {
    cerr << "adaptation file is missing\n";
    exit(1);
  }

  //compute the scaling factor;
  scalefact(ngtfile);

  //compute 1-gram zeta
  ngram ng(dict,2);
  int* w=ng.wordp(1);

  cerr << "precomputing 1-gram normalization ...\n";
  zeta0=0;
  // zeta0 = sum over the vocabulary of scalefact(w) * backunig(w)
  for ((*w)=0; (*w)<dict->size(); (*w)++)
    zeta0+=scalefact(ng) * backunig(ng);

  if (alev==1) return 1 ;

  cerr << "precomputing 2-gram normalization:\n";

  //precompute the bigram normalization (fills the normcache via zeta)
  w=ng.wordp(2);
  *ng.wordp(1)=0;

  for ((*w)=0; (*w)<dict->size(); (*w)++) {
    zeta(ng,2);
    if ((*w % 1000)==0) cerr << ".";  // progress indicator
  }

  cerr << "done\n";

  return 1;
};
299 
300 
zeta(ngram ng,int size)301 double mdiadaptlm::zeta(ngram ng,int size)
302 {
303 
304   assert(size>=1);
305 
306   double z=0; // compute normalization term
307 
308   ng.size=size;
309 
310   if (size==1) return zeta0;
311   else { //size>1
312 
313     //check in the 2gr and 3gr cache
314     if (size <=3 && cache->get(ng,size,z)) return z;
315 
316     double fstar,lambda;
317     ngram histo=ng;
318     int succ=0;
319 
320     discount(ng,size,fstar,lambda,(int)0);
321 
322     if ((lambda<1) && get(histo,size,size-1)) {
323       ;
324 
325       //scan all its successors
326       succ=0;
327 
328       succscan(histo,ng,INIT,size);
329       while(succscan(histo,ng,CONT,size)) {
330 
331         discount(ng,size,fstar,lambda,0);
332         if (fstar>0) {
333           z+=(scalefact(ng) * fstar);
334           succ++;
335           //cerr << ng << "zeta= " << z << "\n";
336         }
337       }
338     }
339 
340     z+=lambda*zeta(ng,size-1);
341 
342     if (size<=3 && succ>1) cache->put(ng,size,z);
343 
344     return z;
345   }
346 
347 }
348 
349 
// MDI-adapted discounting: compute fstar and lambda for ng at the given
// order, rescaling the background estimates by the adaptation factors
// when size<=adaptlev. The backoff weight of the (size-1)-word history
// is looked up in / inserted into backoffcache when caching is enabled.
// The trailing unnamed int (cross-validation flag) is unused.
// Always returns 1.
int mdiadaptlm::discount(ngram ng_,int size,double& fstar,double& lambda,int /* unused parameter: cv */)
{

  // work on a copy translated into the background dictionary
  ngram ng(dict);
  ng.trans(ng_);

  double __fstar, __lambda;
  bool lambda_cached=0;
  int size_lambda=size-1;  // order of the history whose lambda we need

  ngram histo=ng;
  histo.shift();  // drop the predicted word, keep the history

  if (size_lambda>0 && histo.size>=size_lambda) {
#ifdef MDIADAPTLM_CACHE_ENABLE
    if (size_lambda<=max_caching_level) {
      //backoffcache hit
      if (backoffcache[size_lambda]  && backoffcache[size_lambda]->get(histo.wordp(size_lambda),__lambda))
        lambda_cached=1;
    }
#endif
  }

  // background (unadapted) discounted estimates
  discount(ng,size,__fstar,__lambda,0);

  // apply MDI rescaling up to the adaptation level
  if ((size>0) && (size<=adaptlev) && (__lambda<1)) {

    if (size>1) {
      double numlambda, numfstar, den;
      numfstar=scalefact(ng);       // scaling factor of the full n-gram
      den=zeta(ng,size);            // normalization of the history
      __fstar=__fstar * numfstar/den;
      if (!lambda_cached) {
        numlambda=zeta(ng,size-1);  // normalization at the lower order
        __lambda=__lambda * numlambda/den;
      }
    } else if (size==1) {
      double ratio;
      ratio=scalefact(ng)/zeta0;
      __fstar=__fstar * ratio;
      if (!lambda_cached) {
        __lambda=__lambda * ratio;
      }
    } else {
      //size==0 do nothing
    }
  }

#ifdef MDIADAPTLM_CACHE_ENABLE
  //backoffcache insert
  if (!lambda_cached && size_lambda>0 && size_lambda<=max_caching_level && histo.size>=size_lambda && backoffcache[size_lambda])
    backoffcache[size_lambda]->add(histo.wordp(size_lambda),__lambda);
#endif

  lambda=__lambda;
  fstar=__fstar;
  return 1;
}
408 
409 
// For every history hg of order 1..lmsize()-1, compute and store (via
// boff()) the backoff denominator: 1 minus the sum of lower-order
// probabilities of all successor words that receive a positive
// discounted estimate at the higher order. Switches the model into
// backoff mode. Always returns 1.
int mdiadaptlm::compute_backoff_per_level()
{

  double fstar,lambda;

  this->backoff=1;  // from now on the model operates in backoff mode

  for (int size=1; size<lmsize(); size++) {

    ngram hg(dict,size);

    scan(hg,INIT,size);

    while(scan(hg,CONT,size)) {

      ngram ng=hg;
      ng.pushc(0); //ng.size is now hg.size+1

      double pr=1.0;

      succscan(hg,ng,INIT,size+1);
      while(succscan(hg,ng,CONT,size+1)) {

        mdiadaptlm::discount(ng,ng.size,fstar,lambda);

        if (fstar>0){
          ng.size=ng.size-1;  // score ng at the lower order (size)
          pr -= mdiadaptlm::prob(ng,size);
        }
      }

      // NOTE(review): with floating-point rounding pr could in principle
      // hit 0 or drift slightly negative, aborting here — confirm the
      // discounting guarantees strictly positive leftover mass.
      assert(pr>0 && pr<=1);

      boff(hg.link,pr);
    }

  }

  cerr << "done\n";

  return 1;
}
452 
453 
// Per-word backoff computation is not implemented: print an explanatory
// message and abort. Declared int for symmetry with
// compute_backoff_per_level(); never returns.
int mdiadaptlm::compute_backoff_per_word()
{
	cerr << "Current implementation does not support the usage of backoff (-bo=yes) mixture models (-lm=mix) combined with the per-word saving (-saveperllevel=no)." << endl;
	cerr << "Please, either choose a per-level saving (-saveperllevel=yes) or do not use backoff (-bo=no) " << endl;

	exit(1);
}
461 
462 
prob2(ngram ng,int size,double & fstar)463 double mdiadaptlm::prob2(ngram ng,int size,double& fstar)
464 {
465 
466   double lambda;
467 
468   mdiadaptlm::discount(ng,size,fstar,lambda);
469 
470   if (size>1)
471     return fstar  + lambda * prob(ng,size-1);
472   else
473     return fstar;
474 }
475 
476 
477 //inline double mdiadaptlm::prob(ngram ng,int size){
prob(ngram ng,int size)478 double mdiadaptlm::prob(ngram ng,int size)
479 {
480   double fstar,lambda,bo;
481   return prob(ng,size,fstar,lambda,bo);
482 }
483 
// Probability of ng at the given order, with per-level caching.
// Out-parameters fstar/lambda/bo receive the discounted estimate, the
// backoff weight and the backoff normalization used for ng.
// Backoff mode: p = fstar if positive, else (lambda/bo) * p(lower order);
// interpolation mode: p = fstar + lambda * p(lower order).
// Exits if discounting returns an out-of-range fstar or lambda.
double mdiadaptlm::prob(ngram ng,int size,double& fstar,double& lambda, double& bo)
{
  double pr;

#ifdef MDIADAPTLM_CACHE_ENABLE
  //probcache hit
  if (size<=max_caching_level && probcache[size] && ng.size>=size && probcache[size]->get(ng.wordp(size),pr))
    return pr;
#endif

  //probcache miss
  mdiadaptlm::bodiscount(ng,size,fstar,lambda,bo);

  if (fstar>UPPER_SINGLE_PRECISION_OF_1 || lambda>UPPER_SINGLE_PRECISION_OF_1) {
    cerr << "wrong probability: " << ng
         << " , size " << size
         << " , fstar " << fstar
         << " , lambda " << lambda << "\n";
    exit(1);
  }
  if (backoff) {

    if (size>1) {
      if (fstar>0){
        pr=fstar;
      }else {
        if (lambda<1){
          pr = lambda/bo * prob(ng,size-1);
        }else {
          // lambda numerically ~1: all mass goes to the lower order
          assert(lambda<UPPER_SINGLE_PRECISION_OF_1);
          pr = prob(ng,size-1);
        }
      }
    } else
      pr = fstar;
  }

  else { //interpolation

    if (size>1)
      pr = fstar  + lambda * prob(ng,size-1);
    else
      pr = fstar;
  }

#ifdef MDIADAPTLM_CACHE_ENABLE
  //probcache insert
  if (size<=max_caching_level && probcache[size] && ng.size>=size)
    probcache[size]->add(ng.wordp(size),pr);
#endif

  return pr;
}
537 
538 
// Discounting plus backoff-normalization retrieval.
// fstar/lambda come from mdiadaptlm::discount; when the model is in
// backoff mode and size>1 with lambda<1, bo is set to the stored
// backoff value of the (size-1)-gram history, otherwise it stays 1.0.
// Always returns 1.
int mdiadaptlm::bodiscount(ngram ng_,int size,double& fstar,double& lambda,double& bo)
{
  ngram ng(dict);
  ng.trans(ng_);

  mdiadaptlm::discount(ng,size,fstar,lambda);

  bo=1.0;

  if (backoff) { //get back-off probability

    if (size>1 && lambda<1) {

      ngram hg=ng;

      // the history must exist in the table: its backoff value was
      // attached there (see compute_backoff_per_level)
			if (! get(hg,size,size-1)){
				cerr << "ERROR: int mdiadaptlm::bodiscount(ngram ng_,int size,double& fstar,double& lambda,double& bo)   -> get(hg,size,size-1) returns NULL\n";
			}
      assert(get(hg,size,size-1));

      bo=boff(hg.link);
    }
  }

  return 1;
}
571 
572 
txclprob(ngram ng,int size)573 double mdiadaptlm::txclprob(ngram ng,int size)
574 {
575 
576   double fstar,lambda;
577 
578   if (size>1) {
579     mdiadaptlm::discount(ng,size,fstar,lambda);
580     return fstar  + lambda * txclprob(ng,size-1);
581   } else {
582     double freq=1;
583     if ((*ng.wordp(1)!=dict->oovcode()) && get(ng,1,1))
584       freq+=ng.freq;
585 
586     double N=totfreq()+dict->dub()-dict->size();
587     return freq/N;
588   }
589 }
590 
591 
// Estimate the number of entries in the compiled LM: unigrams count as
// dict->size()*2 (probability + backoff record); for every higher
// order, n-grams with a positive discounted estimate are counted, and
// doubled on non-final levels to account for their backoff records.
// Prints cumulative per-level counts and returns the total.
int mdiadaptlm::netsize()
{
  double fstar,lambda;
  int size,totsize;
  ngram ng(dict);

  cerr << "Computing LM size:\n";

  totsize=dict->size() * 2;  // each unigram stores prob and backoff

  cout << "1-gram " << totsize << "\n";

  for (int i=2; i<=maxlevel(); i++) {

    size=0;

    scan(ng,INIT,i);

    while (scan(ng,CONT,i)) {

      mdiadaptlm::discount(ng,i,fstar,lambda);

      if (fstar>0) size++;

    }

    // non-final levels double: surviving n-grams also carry a backoff
    size+=size * (i<maxlevel());

    totsize+=size;

    cout << i << "-gram " << totsize << "\n";

  }

  return totsize;
}
628 
629 
630 
631 /*
632  * trigram file format:
633 
634  --------------------------------
635 
636    <idx> dictionary length
637 
638    repeat [ dictionary length ] {
639         <newline terminated string> word;
640    }
641 
642    while [ first word != STOP ] {
643         <idx> first word
644         <idx> number of successors
645         repeat [ number of successors ] {
646                 <idx>   second word
647                 <float> prob
648         }
649    }
650 
651    <idx> STOP
652 
653    while [ first word != STOP ] {
654             <idx> first word
655 	    <idx> number of successor sets
656 	         repeat [ number of successor sets ] {
657 		        <idx>   second word
658 			<idx>   number of successors
659 			repeat [ number of successors ] {
660 			      <idx>   third word
661 			      <float> prob
662 			}
663 		 }
664    }
665 
666    <idx> STOP
667 
668 */
669 
670 
671 //void writeNull(mfbstream& out,unsigned short nullCode,float nullProb){
672 //  out.writex(&nullCode,sizeof(short));
673 //  out.writex(&nullProb,sizeof(float));
674 //}
675 
676 
// Reverse the byte order of each of the n items of width sz stored at p.
// No-op (returns 0) when n<1 or sz<2; always returns 0.
int swapbytes(char *p, int sz, int n)
{
  if (n<1 || sz<2) return 0;

  for (int i=0; i<n; i++, p+=sz) {
    char *lo=p;
    char *hi=p+sz-1;
    while (lo<hi) {
      char tmp=*lo;
      *lo++=*hi;
      *hi--=tmp;
    }
  }
  return 0;
};

// Write n items of width sz in big-endian (network) order.
// On little-endian hosts (where the string "AB" reinterpreted as a
// short reads 0x4241) the buffer is byte-swapped before the write and
// swapped back afterwards, so the caller's data is left unmodified.
void fwritex(char *p,int sz,int n,FILE* f)
{
  const bool host_is_little_endian = (*(short *)"AB"==0x4241);

  if (host_is_little_endian) swapbytes((char*)p, sz,n);

  fwrite((char *)p,sz,n,f);

  if (host_is_little_endian) swapbytes((char*)p, sz,n);
}
701 
ifwrite(long loc,void * ptr,int size,int,FILE * f)702 void ifwrite(long loc,void *ptr,int size,int /* unused parameter: n */,FILE* f)
703 {
704   fflush(f);
705 
706   long pos=ftell(f);
707 
708   fseek(f,loc,SEEK_SET);
709 
710   fwritex((char *)ptr,size,1,f);
711 
712   fseek(f,pos,SEEK_SET);
713 
714   fflush(f);
715 }
716 
// Append the reserved "NULL word" record: its code followed by the
// associated (backoff) probability, both in big-endian order.
void writeNull(unsigned short nullCode,float nullProb,FILE* f)
{
  fwritex((char *)&nullCode,sizeof(short),1,f);
  fwritex((char *)&nullProb,sizeof(float),1,f);
}
722 
723 
// Save the LM in the binary ASR format described in the "trigram file
// format" comment above: dictionary, then unigram/bigram/trigram
// sections with per-history successor lists, each section terminated
// by the 0xffff stop code. Only 1- to 3-gram models are supported.
// The unnamed int parameter (backoff flag) is unused. If subdictfile
// is given, output is restricted to that sub-dictionary. Returns 1.
int mdiadaptlm::saveASR(char *filename,int /* unused parameter: backoff */,char* subdictfile)
{
  int totbg,tottr;

  dictionary* subdict;

  // NOTE(review): a subdict allocated here is never deleted before
  // returning — a one-shot leak in this save routine.
  if (subdictfile)
    subdict=new dictionary(subdictfile);
  else
    subdict=dict; // default is subdict=dict

  typedef unsigned short code;

  system("date");

  if (lmsize()>3 || lmsize()<1) {
    cerr << "wrong lmsize\n";
    exit(1);
  }

  // codes are 16-bit with 0xffff reserved as stop symbol.
  // NOTE(review): '&&' only aborts when BOTH dictionaries overflow —
  // confirm '||' was not intended here.
  if (dict->size()>=0xffff && subdict->size()>=0xffff) {
    cerr << "save bin requires unsigned short codes\n";
    exit(1);
  }

  FILE* f=fopen(filename,"w");

  double fstar,lambda,boff;  // boff is a local, not the member accessor
  float pr;
  long succ1pos,succ2pos;    // offsets of the counters to back-patch
  code succ1,succ2,w,h1,h2;
  code stop=0xffff;

  //dictionary
  //#dictsize w1\n ..wN\n NULL\n

  code oovcode=subdict->oovcode();

  //includes at least NULL
  code subdictsz=subdict->size()+1;

  fwritex((char *)&subdictsz,sizeof(code),1,f);

  subdictsz--;
  for (w=0; w<subdictsz; w++)
    fprintf(f,"%s\n",(char *)subdict->decode(w));

  fprintf(f,"____\n");  // placeholder string for the NULL word

  //unigram part
  //NULL #succ w1 pr1 ..wN prN

  h1=subdictsz;
  fwritex((char *)&h1,sizeof(code),1,f); //NULL

  succ1=0;
  succ1pos=ftell(f);  // counter patched via ifwrite once known
  fwritex((char *)&succ1,sizeof(code),1,f);

  ngram ng(dict);
  ngram sng(subdict);

  ng.size=sng.size=1;

  scan(ng,INIT,1);
  while(scan(ng,CONT,1)) {
    sng.trans(ng);
    if (sng.containsWord(subdict->OOV(),1))
      continue;

    pr=(float)mdiadaptlm::prob(ng,1);
    if (pr>1e-50) { //do not consider too low probabilities
      succ1++;
      w=*sng.wordp(1);
      fwritex((char *)&w,sizeof(code),1,f);
      fwritex((char *)&pr,sizeof(float),1,f);
    } else {
      cerr << "small prob word " << ng << "\n";
    }
  }

  // update number of unigrams
  ifwrite(succ1pos,&succ1,sizeof(code),1,f);

  cerr << "finito unigrammi " << succ1 << "\n";
  fflush(f);

  if (lmsize()==1) {
    fclose(f);
    return 1;
  }

  // rest of bigrams
  // w1 #succ w1 pr1 .. wN prN

  succ1=0;
  h1=subdictsz;
  totbg=subdictsz;

  ngram hg1(dict,1);

  ng.size=sng.size=2;

  scan(hg1,INIT,1);
  while(scan(hg1,CONT,1)) {

    if (hg1.containsWord(dict->OOV(),1)) continue;

    assert((*hg1.wordp(1))<dict->size());

    *ng.wordp(2)=*hg1.wordp(1);
    *ng.wordp(1)=0;

    sng.trans(ng);
    if (sng.containsWord(dict->OOV(),1)) continue;

    mdiadaptlm::bodiscount(ng,2,fstar,lambda,boff);

    if (lambda < 1.0) {  // history leaves some mass to its successors

      h1=*sng.wordp(2);

      fwritex((char *)&h1,sizeof(code),1,f);

      succ1=0;
      succ1pos=ftell(f);
      fwritex((char *)&succ1,sizeof(code),1,f);

      ngram shg=hg1;
      get(shg,1,1);

      succscan(shg,ng,INIT,2);
      while(succscan(shg,ng,CONT,2)) {

        if (*ng.wordp(1)==oovcode) continue;

        sng.trans(ng);
        if (sng.containsWord(dict->OOV(),2)) continue;

        mdiadaptlm::discount(ng,2,fstar,lambda);

        if (fstar>1e-50) {
          w=*sng.wordp(1);
          fwritex((char *)&w,sizeof(code),1,f);
          pr=(float)mdiadaptlm::prob(ng,2);

          fwritex((char *)&pr,sizeof(float),1,f);
          succ1++;
        }
      }

      if (succ1) {
        lambda/=boff; //consider backoff
        // the NULL word carries the history's backoff weight
        writeNull(subdictsz,(float)lambda,f);
        succ1++;
        totbg+=succ1;
        ifwrite(succ1pos,&succ1,sizeof(code),1,f);
      } else {
        //go back one word
        fseek(f,succ1pos-(streampos)sizeof(code),SEEK_SET);
      }
    }
  }

  fwritex((char *)&stop,sizeof(code),1,f);

  cerr << " finito bigrammi! " << subdictsz << "\n";
  fflush(f);

  system("date");

  if (lmsize()<3) {
    fclose(f);
    return 1;
  }

  //TRIGRAM PART

  h1=subdictsz;
  h2=subdictsz;
  tottr=0;
  succ1=0;
  succ2=0;

  ngram hg2(dict,2);

  ng.size=sng.size=3;

  scan(hg1,INIT,1);
  while(scan(hg1,CONT,1)) {

    if ((*hg1.wordp(1)==oovcode)) continue;

    *ng.wordp(3)=*hg1.wordp(1);

    sng.trans(ng);
    if (sng.containsWord(dict->OOV(),1)) continue;

    assert((*sng.wordp(3))<subdictsz);

    h1=*sng.wordp(3);
    fwritex((char *)&h1,sizeof(code),1,f);

    succ1=0;
    succ1pos=ftell(f);
    fwritex((char *)&succ1,sizeof(code),1,f);

    ngram shg1=ng;
    get(shg1,3,1);

    succscan(shg1,hg2,INIT,2);
    while(succscan(shg1,hg2,CONT,2)) {

      if (*hg2.wordp(1)==oovcode) continue;

      *ng.wordp(2)=*hg2.wordp(1);
      *ng.wordp(1)=0;

      sng.trans(ng);
      if (sng.containsWord(dict->OOV(),2)) continue;

      mdiadaptlm::bodiscount(ng,3,fstar,lambda,boff);

      if (lambda < 1.0) {

        h2=*sng.wordp(2);
        fwritex((char *)&h2,sizeof(code),1,f);

        succ2=0;
        succ2pos=ftell(f);
        fwritex((char *)&succ2,sizeof(code),1,f);

        ngram shg2=ng;
        get(shg2,3,2);

        succscan(shg2,ng,INIT,3);
        while(succscan(shg2,ng,CONT,3)) {

          if (*ng.wordp(1)==oovcode) continue;

          sng.trans(ng);
          if (sng.containsWord(dict->OOV(),3)) continue;

          mdiadaptlm::discount(ng,3,fstar,lambda);

          if (fstar>1e-50) {

            w=*sng.wordp(1);
            fwritex((char *)&w,sizeof(code),1,f);

            pr=(float)mdiadaptlm::prob(ng,3);

            fwritex((char *)&pr,sizeof(float),1,f);
            succ2++;
          }
        }

        if (succ2) {
          lambda/=boff;
          writeNull(subdictsz,(float)lambda,f);
          succ2++;
          tottr+=succ2;
          ifwrite(succ2pos,&succ2,sizeof(code),1,f);
          succ1++;
        } else {
          //go back one word
          fseek(f,succ2pos-(long)sizeof(code),SEEK_SET);
        }
      }
    }

    if (succ1)
      ifwrite(succ1pos,&succ1,sizeof(code),1,f);
    else
      fseek(f,succ1pos-(long)sizeof(code),SEEK_SET);
  }

  fwritex((char *)&stop,sizeof(code),1,f);

  fclose(f);

  cerr << "Tot bg: " << totbg << " tg: " << tottr<< "\n";

  system("date");

  return 1;
};
1014 
1015 
1016 ///// Save in IRST MT format
1017 
// Save the LM in the IRST MT textual format: a "nGrAm" header, the
// sub-dictionary, then for each order the n-grams with a positive
// discounted estimate (probabilities encoded as pseudo-frequencies,
// either quantized on a log scale with base `decay` when
// resolution<10000000, or linearly) and the backoff weights of the
// next order, marked with the special BACKOFF_ symbol.
// Guarantees a unigram for the OOV word exists before writing.
// Returns 1.
int mdiadaptlm::saveMT(char *filename,int backoff,
                       char* subdictfile,int resolution,double decay)
{

  double logalpha=log(decay);
  dictionary* subdict;

  // NOTE(review): a subdict allocated here is never deleted — leak.
  if (subdictfile)
    subdict=new dictionary(subdictfile);
  else
    subdict=dict; // default is subdict=dict

  ngram ng(dict,lmsize());
  ngram sng(subdict,lmsize());

  cerr << "Adding unigram of OOV word if missing\n";

  for (int i=1; i<=maxlevel(); i++)
    *ng.wordp(i)=dict->oovcode();

  if (!get(ng,maxlevel(),1)) {
    cerr << "oov is missing in the ngram-table\n";
    // f(oov) = dictionary size (Witten Bell)
    ng.freq=dict->freq(dict->oovcode());
    cerr << "adding oov unigram " << ng << "\n";
    put(ng);
  }

  cerr << "Eventually adding OOV symbol to subdictionary\n";
  subdict->encode(OOV_);

  system("date");

  mfstream out(filename,ios::out);

  //add special symbols

  subdict->incflag(1);
  int bo_code=subdict->encode(BACKOFF_);  // marks backoff-weight entries
  int du_code=subdict->encode(DUMMY_);    // pads n-grams below lmsize()
  subdict->incflag(0);

  out << "nGrAm " << lmsize() << " " << 0
      << " " << "LM_ "
      << resolution << " "
      << decay << "\n";

  subdict->save(out);

  //start writing ngrams

  cerr << "write unigram of oov probability\n";
  ng.size=1;
  *ng.wordp(1)=dict->oovcode();
  double pr=(float)mdiadaptlm::prob(ng,1);
  sng.trans(ng);
  sng.size=lmsize();
  for (int s=2; s<=lmsize(); s++) *sng.wordp(s)=du_code;
  // linear pseudo-frequency encoding of the probability
  sng.freq=(int)ceil(pr * (double)10000000)-1;
  out << sng << "\n";

  for (int i=1; i<=lmsize(); i++) {
    cerr << "LEVEL " << i << "\n";

    double fstar,lambda,bo,dummy;

    scan(ng,INIT,i);
    while(scan(ng,CONT,i)) {

      sng.trans(ng);

      // pad up to lmsize() with the DUMMY_ symbol
      sng.size=lmsize();
      for (int s=i+1; s<=lmsize(); s++)
        *sng.wordp(s)=du_code;

      if (i>=1 && sng.containsWord(subdict->OOV(),sng.size)) {
        cerr << "skipping : " << sng << "\n";
        continue;
      }

      // skip also eos symbols not at the final
      //if (i>=1 && sng.containsWord(dict->EoS(),sng.size))
      //continue;

      mdiadaptlm::discount(ng,i,fstar,dummy);

      if (fstar>0) {

        double pr=(float)mdiadaptlm::prob(ng,i);

        if (i>1 && resolution<10000000) {
          // log-quantized pseudo-frequency, clamped at 0
          sng.freq=resolution-(int)(log(pr)/logalpha)-1;
          sng.freq=(sng.freq>=0?sng.freq:0);
        } else
          sng.freq=(int)ceil(pr * (double)10000000)-1;

        out << sng << "\n";

      }

      if (i<lmsize()) { /// write backoff of higher order!!

        ngram ng2=ng;
        ng2.pushc(0); //extend by one
        mdiadaptlm::bodiscount(ng2,i+1,dummy,lambda,bo);
        assert(!backoff || (lambda ==1 || bo<1 ));

        sng.pushc(bo_code);
        sng.size=lmsize();

        if (lambda<1) {
          if (resolution<10000000) {
            sng.freq=resolution-(int)((log(lambda) - log(bo))/logalpha)-1;
            sng.freq=(sng.freq>=0?sng.freq:0);
          } else
            sng.freq=(int)ceil(lambda/bo * (double)10000000)-1;

          out << sng << "\n";
        }
      }
    }
    cerr << "LEVEL " << i << "DONE \n";
  }
  return 1;
};
1148 
///// Save in binary format for backoff N-gram models
1150 
saveBIN_per_word(char * filename,int backoff,char * subdictfile,int mmap)1151 int mdiadaptlm::saveBIN_per_word(char *filename,int backoff,char* subdictfile,int mmap)
1152 {
1153 	VERBOSE(2,"mdiadaptlm::saveBIN_per_word START\n");
1154   system("date");
1155 
1156   //subdict
1157   dictionary* subdict;
1158 
1159   //accumulated unigram oov prob
1160 	//CHECK why this is not used (differently from what happens in the other save functions
1161 	//	double oovprob=0;
1162 
1163 
1164   if (subdictfile) subdict=new dictionary(subdictfile);
1165   else   subdict=dict; // default is subdict=dict
1166 
1167 	if (mmap) {
1168     VERBOSE(2,"savebin with memory map: " << filename << "\n");
1169   } else {
1170     VERBOSE(2,"savebin: " << filename << "\n");
1171   }
1172 
1173 
1174   vector<streampos> pos(lmsize()+1);
1175   int maxlev=lmsize();
1176   char buff[100];
1177   int isQuant=0; //savebin for quantized LM is not yet implemented
1178 
1179 	//temporary filename to save the LM related to a single term
1180   char tmpfilename[BUFSIZ];
1181 
1182 	//create temporary output file stream to store single levels for all terms
1183   assert(strlen(filename)<1000);
1184   char tfilename[MAX_NGRAM][1000];
1185   mfstream *tout[MAX_NGRAM];
1186 
1187 	for (int i=1; i<=lmsize(); i++) {
1188     sprintf(tfilename[i],"%s-%dgrams",filename,i);
1189     tout[i]=new mfstream(tfilename[i],ios::out);
1190   }
1191 
1192 	// print header in the main output file
1193   mfstream out(filename,ios::out);
1194   out << "blmt " << maxlev;
1195 
1196   for (int i=1; i<=maxlev; i++) { //reserve space for ngram statistics (which are not yet avalable)
1197     pos[i]=out.tellp();
1198     sprintf(buff," %10d",0);
1199     out << buff;
1200   }
1201   out << "\n";
1202 	subdict->save(out);
1203 	out.flush();
1204 
1205   ngram ng(dict,lmsize());
1206   ngram oldng(dict,lmsize());
1207   ngram locng(dict,lmsize());
1208 
1209   ngram sng(subdict,lmsize());
1210 
1211   double fstar,lambda,bo,dummy,dummy2,pr,ibow;
1212 
1213   //n-gram counters
1214   table_entry_pos_t num[lmsize()+1];
1215   for (int i=1; i<=lmsize(); i++) num[i]=0;
1216 
1217 	lmtable* lmt = new lmtable();
1218 
1219 	lmt->configure(maxlev,isQuant);
1220 	lmt->setDict(subdict);
1221 	lmt->expand_level(1,dict->size(),filename,mmap);
1222 
1223   //main loop
1224   for (int w=0; w<dict->size(); w++) {
1225 		sprintf(tmpfilename,"%s_tmp_%d",filename,w);
1226 
1227     if (!w % 10000) cerr << ".";
1228 
1229     //1-gram
1230     ngram ung(dict,1);
1231     *ung.wordp(1)=w;
1232     sng.trans(ung);
1233 
1234     //exclude words not occurring in the subdictionary
1235     if (sng.containsWord(subdict->OOV(),1) && !ung.containsWord(dict->OOV(),1))	continue;
1236 
1237 
1238 		pr=mdiadaptlm::prob(ung,1);
1239 		pr=(pr?log10(pr):-99);
1240 
1241 		if (lmsize()>1) { //compute back-off
1242 			ung.pushc(0); //extend by one
1243 			mdiadaptlm::bodiscount(ung,2,fstar,lambda,bo);
1244 			ung.shift();//shrink by one
1245 
1246 			assert(!backoff || ((lambda<UPPER_SINGLE_PRECISION_OF_1 && lambda>LOWER_SINGLE_PRECISION_OF_1) || bo<UPPER_SINGLE_PRECISION_OF_1 ));
1247 
1248 			if (backoff){
1249 				ibow=log10(lambda) - log10(bo);
1250 			}else{
1251 				if (lambda<LOWER_SINGLE_PRECISION_OF_1){
1252 					ibow = log10(lambda);
1253 				}else { //force to be 0.0
1254 					ibow = 0.0;
1255 				}
1256 			}
1257 		}
1258 		else {
1259 			ibow=0.0; //default value for backoff weight at the lowest level
1260 		}
1261 
1262 		lmt->addwithoffset(ung,(float)pr,(float)ibow);
1263 		num[1]++;
1264 
1265     //manage n-grams
1266     if (get(ung,1,1)) {
1267 
1268       //create n-gram with history w
1269       *ng.wordp(lmsize())=w;
1270 
1271       //create sentinel n-gram
1272       for (int i=1; i<=lmsize(); i++) *oldng.wordp(i)=-1;
1273 
1274       //create the table for all levels but the level 1, with the maximum number of possible entries
1275       for (int i=2; i<=lmsize(); i++)
1276 				lmt->expand_level(i,entries(i),tmpfilename,mmap);
1277 
1278       scan(ung.link,ung.info,1,ng,INIT,lmsize());
1279       while(scan(ung.link,ung.info,1,ng,CONT,lmsize())) {
1280         sng.trans(ng); // convert to subdictionary
1281 				locng=ng;      // make a local copy
1282 
1283 				//find first internal level that changed
1284 				int f=lmsize()-1; //unigrams have been already covered
1285 				while (f>1 && (*oldng.wordp(f)==*ng.wordp(f))){ f--; }
1286 
1287 				for (int l=lmsize()-(f-1); l<=lmsize(); l++){
1288 
1289 					locng=ng;      // make a local copy
1290 					if (l<lmsize()) locng.shift(lmsize()-l); //reduce the ngram, which has size level
1291 
1292 					if (sng.containsWord(subdict->OOV(),l)) continue;
1293 
1294           // skip also eos symbols not at the final
1295           if (sng.containsWord(dict->EoS(),l-1)) continue;
1296 
1297           pr=mdiadaptlm::prob(locng,l,fstar,dummy,dummy2);
1298 
1299           //PATCH by Nicola (16-04-2008)
1300 
1301           if (!(pr<=1.0 && pr > 1e-10)) {
1302             cerr << ng << " " << pr << "\n";
1303             assert(pr<=1.0);
1304             cerr << "prob modified to 1e-10\n";
1305             pr=1e-10;
1306           }
1307 
1308           if (l<lmsize()) {
1309 
1310             locng.pushc(0); //extend by one
1311 
1312             mdiadaptlm::bodiscount(locng,l+1,dummy,lambda,bo);
1313 
1314             locng.shift();
1315 
1316             if (fstar>=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) {
1317 							ibow=log10(lambda) - log10(bo);
1318 							if (lmt->addwithoffset(locng,(float)log10(pr),(float)ibow)){
1319 								num[l]++;
1320 							}else{
1321 								continue;
1322 							}
1323             }
1324 						else{
1325 							continue; //skip n-grams with too small fstar
1326 						}
1327           } else {
1328             if (fstar>=UPPER_SINGLE_PRECISION_OF_0) {
1329 							ibow=0.0; //value for backoff weight at the highest level
1330 							if (lmt->addwithoffset(locng,(float)log10(pr),(float)ibow)){
1331 								num[l]++;
1332 							}else{
1333 								continue;
1334 							}
1335             }
1336 						else{
1337 							continue; //skip n-grams with too small fstar
1338 						}
1339           }
1340         }
1341         oldng=ng;
1342       }
1343 		}
1344 		else{
1345       //create empty tables for all levels but the level 1, to keep consistency with the rest of the code
1346       for (int i=2; i<=lmsize(); i++)
1347 				lmt->expand_level(i,0,tmpfilename,mmap);
1348 		}
1349 
1350 
1351 		//level 1 is not modified until everything is done
1352 		//because it has to contain the full dictionary
1353 		//which provides the direct access to the second level
1354 		for (int i=2; i<=lmsize(); i++){
1355 
1356 			if (i>2) {
1357 				lmt->checkbounds(i-1);
1358 				lmt->appendbin_level(i-1, *tout[i-1], mmap);
1359 			}
1360 
1361 			// now we can resize table at level i
1362 			lmt->resize_level(i, tmpfilename, mmap);
1363 		}
1364 
1365 		// now we can save table at level maxlev, if not equal to 1
1366 		if (lmsize()>1){
1367 			lmt->appendbin_level(maxlev, *tout[maxlev], mmap);
1368 		}
1369 
1370 		//delete levels from 2 to lmsize();
1371 		for (int i=2; i<=lmsize(); i++)			lmt->delete_level(i, tmpfilename, mmap);
1372 
1373 		//update table offsets
1374 		for (int i=2; i<=lmsize(); i++) lmt->update_offset(i,num[i]);
1375   }
1376 	//close levels from 2 to lmsize()
1377 	for (int i=2; i<=lmsize(); i++) tout[i]->close();
1378 
1379 	//now we can save level 1, which contains all unigrams
1380 	//cerr << "saving level 1" << "...\n";
1381 	lmt->savebin_level(1, filename, mmap);
1382 
1383 	//update headers
1384   for (int i=1; i<=lmsize(); i++) {
1385     sprintf(buff," %10d",num[i]);
1386     out.seekp(pos[i]);
1387     out << buff;
1388   }
1389 
1390 	out.close();
1391 
1392   //concatenate files for each single level into one file
1393 	//single level files should have a name derived from "filename"
1394 	lmt->compact_all_levels(filename);
1395 
1396   cerr << "\n";
1397   system("date");
1398 
1399 	VERBOSE(2,"mdiadaptlm::saveBIN_per_word END\n");
1400   return 1;
1401 };
1402 
///// Save in binary format for backoff N-gram models
// Save the adapted LM in binary (blmt) format, building one whole n-gram
// level at a time (memory grows with the size of a full level, unlike the
// per-word variant above).
//   filename:    output file; per-level temporary files are derived from it
//   backoff:     if true, store true back-off weights log10(lambda)-log10(bo);
//                otherwise store log10(lambda) only when lambda < 1
//   subdictfile: optional sub-dictionary restricting the saved vocabulary (NULL = full dict)
//   mmap:        if non-zero, levels are built through memory-mapped files
// Always returns 1.
int mdiadaptlm::saveBIN_per_level(char *filename,int backoff,char* subdictfile,int mmap)
{
	VERBOSE(2,"mdiadaptlm::saveBIN_per_level START\n");
  system("date");

  //optional sub-dictionary restricting the saved vocabulary
  dictionary* subdict;

  //accumulated unigram oov prob (mass of words pruned by the subdict)
  double oovprob=0;

  if (subdictfile)     subdict=new dictionary(subdictfile);
  else    subdict=dict; // default is subdict=dict

  if (mmap) {
    VERBOSE(2,"savebin with memory map: " << filename << "\n");
  } else {
    VERBOSE(2,"savebin: " << filename << "\n");
  }

  vector<streampos> pos(lmsize()+1);
  int maxlev=lmsize();
  char buff[100];
  int isQuant=0; //savebin for quantized LM is not yet implemented

  // print header; per-level counts are back-patched later via pos[]
  fstream out(filename,ios::out);
  out << "blmt " << maxlev;

  for (int i=1; i<=maxlev; i++) { //reserve space for ngram statistics (which are not yet avalable)
    pos[i]=out.tellp();
    sprintf(buff," %10d",0);
    out << buff;
  }
  out << "\n";
  lmtable* lmt = new lmtable();

  lmt->configure(maxlev,isQuant);

  lmt->setDict(subdict);
	subdict->save(out);
	out.flush();


  //start adding n-grams to lmtable, level by level

  for (int i=1; i<=lmsize(); i++) {
    cerr << "saving level " << i << "...\n";
    table_entry_pos_t numberofentries;
    if (i==1) { //unigram
      numberofentries = (table_entry_pos_t) subdict->size();
    } else {
      numberofentries = (table_entry_pos_t) entries(i);
    }
    system("date");
    lmt->expand_level(i,numberofentries,filename,mmap);

    double totp=0;
    double fstar,lambda,bo,dummy,dummy2,pr,ibow;

    ngram ng(dict,1);
    ngram ng2(dict);
    ngram sng(subdict,1);

    if (i==1) { //unigram case

      //scan the dictionary
      for (int w=0; w<dict->size(); w++) {
        *ng.wordp(1)=w;

        sng.trans(ng);
        pr=mdiadaptlm::prob(ng,1);
        totp+=pr;

        // words outside the subdictionary are not saved; their probability
        // mass is accumulated and folded into the OOV entry below
        if (sng.containsWord(subdict->OOV(),i) && !ng.containsWord(dict->OOV(),i)) {
          oovprob+=pr; //accumulate oov probability
          continue;
        }


        if (ng.containsWord(dict->OOV(),i)) pr+=oovprob;

        //cerr << ng << " freq " << dict->freq(w) << " -  Pr " << pr << "\n";
        pr=(pr?log10(pr):-99); // -99 stands for log10(0)

        if (w==dict->oovcode()){
					//CHECK whether we can avoid this reassignment because dict should be lmt->getDict()
          *ng.wordp(1)=lmt->getDict()->oovcode();
					ibow=0.0;
				}
        else {
					//				} //do nothing

					if (lmsize()>1) {
						ngram ng2=ng;
						ng2.pushc(0); //extend by one

						//cerr << ng2 << "\n";

						mdiadaptlm::bodiscount(ng2,i+1,fstar,lambda,bo);
						assert(!backoff || ((lambda<UPPER_SINGLE_PRECISION_OF_1 && lambda>LOWER_SINGLE_PRECISION_OF_1) || bo<UPPER_SINGLE_PRECISION_OF_1));

						if (backoff){
							ibow = log10(lambda) - log10(bo);
						}else{
							if (lambda<LOWER_SINGLE_PRECISION_OF_1){
								ibow = log10(lambda);
							}else { //force to be 0.0
								ibow = 0.0;
							}
						}
					}else {
						ibow=0.0; //default value for backoff weight at the lowest level
					}
				}
        lmt->add(ng,(float)pr,(float)ibow);
      }
      //cerr << "totprob = " << totp << "\n";
    }
		else { //i>1 , bigrams, trigrams, fourgrams...
			*ng.wordp(1)=0;
			get(ng,1,1); //this
      scan(ng,INIT,i);
      while(scan(ng,CONT,i)) {
        sng.trans(ng);

        if (sng.containsWord(subdict->OOV(),i)) continue;

        // skip also eos symbols not at the final position
        if (sng.containsWord(dict->EoS(),i-1)) continue;

        //	mdiadaptlm::discount(ng,i,fstar,dummy);
        //	pr=mdiadaptlm::prob(ng,i);
        pr=mdiadaptlm::prob(ng,i,fstar,dummy,dummy2);

        //PATCH by Nicola (16-04-2008): guard against numerical problems
        if (!(pr<=1.0 && pr > 1e-10)) {
          cerr << ng << " " << pr << "\n";
          assert(pr<=1.0);
          cerr << "prob modified to 1e-10\n";
          pr=1e-10;
        }

        if (i<lmsize()) {
          ng2=ng;
          ng2.pushc(0); //extend by one

          mdiadaptlm::bodiscount(ng2,i+1,dummy,lambda,bo);

          // keep the entry only if it carries probability mass or a
          // non-trivial back-off weight
          if (fstar>=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) {
            ibow=log10(lambda) - log10(bo);
            lmt->add(ng,(float)log10(pr),(float)ibow);
          }
        } else {
          if (fstar >= UPPER_SINGLE_PRECISION_OF_0) {
            ibow=0.0; //value for backoff weight at the highest level
            lmt->add(ng,(float)log10(pr),(float)ibow);
          }
        }
      }
    }

    // now we can fix table at level i-1
    // now we can save table at level i-1
    // now we can remove table at level i-1
    if (maxlev>1 && i>1) {
      lmt->checkbounds(i-1);
      lmt->savebin_level(i-1, filename, mmap);
    }

    // now we can resize table at level i
    lmt->resize_level(i, filename, mmap);

  }
  // now we can save table at level maxlev
  lmt->savebin_level(maxlev, filename, mmap);

  //update headers with the final per-level n-gram counts
  for (int i=1; i<=lmsize(); i++) {
    sprintf(buff," %10d",lmt->getCurrentSize(i));
    out.seekp(pos[i]);
    out << buff;
  }
  out.close();

  //concatenate files for each single level into one file
	//single level files should have a name derived from "filename"
	lmt->compact_all_levels(filename);

	VERBOSE(2,"mdiadaptlm::saveBIN_per_level END\n");
  return 1;
}
1595 
1596 
1597 ///// Save in format for ARPA backoff N-gram models
saveARPA_per_word(char * filename,int backoff,char * subdictfile)1598 int mdiadaptlm::saveARPA_per_word(char *filename,int backoff,char* subdictfile )
1599 {
1600 	VERBOSE(2,"mdiadaptlm::saveARPA_per_word START\n");
1601   system("date");
1602 
1603   //subdict
1604   dictionary* subdict;
1605 
1606   //accumulated unigram oov prob
1607 //CHECK why this is not used (differently from what happens in the other save functions
1608 //	double oovprob=0;
1609 
1610 
1611   if (subdictfile) subdict=new dictionary(subdictfile);
1612   else   subdict=dict; // default is subdict=dict
1613 
1614   //main output file
1615   mfstream out(filename,ios::out);
1616 
1617   //create temporary output file stream
1618   assert(strlen(filename)<1000);
1619   char tfilename[MAX_NGRAM][1000];
1620   mfstream *tout[MAX_NGRAM];
1621 
1622   for (int i=1; i<=lmsize(); i++) {
1623     sprintf(tfilename[i],"%s.%d",filename,i);
1624     tout[i]=new mfstream(tfilename[i],ios::out);
1625     *tout[i] << "\n\\" << i << "-grams:\n";
1626   }
1627 
1628 
1629   ngram ng(dict,lmsize());
1630   ngram oldng(dict,lmsize());
1631   ngram locng(dict,lmsize());
1632 
1633   ngram sng(subdict,lmsize());
1634 
1635   double fstar,lambda,bo,dummy,dummy2, pr;
1636 
1637   //n-gram counters
1638   table_entry_pos_t num[lmsize()+1];
1639   for (int i=1; i<=lmsize(); i++) num[i]=0;
1640 
1641 
1642   //main loop
1643   for (int w=0; w<dict->size(); w++) {
1644 
1645     if (!w % 10000) cerr << ".";
1646 
1647     //1-gram
1648     ngram ung(dict,1);
1649     *ung.wordp(1)=w;
1650     sng.trans(ung);
1651 
1652     //exclude words not occurring in the subdictionary
1653     if (sng.containsWord(subdict->OOV(),1) && !ung.containsWord(dict->OOV(),1))	continue;
1654 
1655     pr=mdiadaptlm::prob(ung,1);
1656 		pr=(pr?log10(pr):-99);
1657 
1658     if (w==dict->oovcode())
1659       *tout[1] << (float) pr << "\t" << "<unk>";
1660     else
1661       *tout[1] << (float) pr << "\t" << (char *)dict->decode(w);
1662 
1663     num[1]++;
1664 
1665     if (lmsize()>1) { //print back-off
1666       ung.pushc(0); //extend by one
1667       mdiadaptlm::bodiscount(ung,2,fstar,lambda,bo);
1668 			ung.shift();//shrink by one
1669 
1670       assert(!backoff || ((lambda<UPPER_SINGLE_PRECISION_OF_1 && lambda>LOWER_SINGLE_PRECISION_OF_1) || bo<UPPER_SINGLE_PRECISION_OF_1 ));
1671 
1672 			if (backoff){
1673 				*tout[1] << "\t" << (float) (log10(lambda) - log10(bo));
1674 			}else{
1675 				if (lambda<LOWER_SINGLE_PRECISION_OF_1){
1676 					*tout[1] << "\t" << (float) log10(lambda);
1677 				} //no output if log10(lambda)==0
1678 			}
1679     }
1680     *tout[1] << "\n";
1681 
1682     //manage n-grams
1683     if (get(ung,1,1)) {
1684 
1685       //create n-gram with history w
1686       *ng.wordp(lmsize())=w;
1687 
1688       //create sentinel n-gram
1689       for (int i=1; i<=lmsize(); i++) *oldng.wordp(i)=-1;
1690 
1691       scan(ung.link,ung.info,1,ng,INIT,lmsize());
1692       while(scan(ung.link,ung.info,1,ng,CONT,lmsize())) {
1693         //cerr << ng << "\n";
1694         sng.trans(ng); // convert to subdictionary
1695         locng=ng;      // make a local copy
1696 
1697 				//find first internal level that changed
1698 				int f=lmsize()-1; //unigrams have been already covered
1699 				while (f>1 && (*oldng.wordp(f)==*ng.wordp(f))){ f--; }
1700 
1701 				for (int l=lmsize(); l>lmsize()-f;l--){
1702 
1703 					if (l<lmsize()) locng.shift(); //ngram has size level
1704 
1705           if (sng.containsWord(subdict->OOV(),l)) continue;
1706 
1707           // skip also eos symbols not at the final
1708           if (sng.containsWord(dict->EoS(),l-1)) continue;
1709 
1710           pr=mdiadaptlm::prob(locng,l,fstar,dummy,dummy2);
1711 
1712           //PATCH by Nicola (16-04-2008)
1713 
1714           if (!(pr<=1.0 && pr > 1e-10)) {
1715             cerr << ng << " " << pr << "\n";
1716             assert(pr<=1.0);
1717             cerr << "prob modified to 1e-10\n";
1718             pr=1e-10;
1719           }
1720 
1721           if (l<lmsize()) {
1722 
1723             locng.pushc(0); //extend by one
1724 
1725             mdiadaptlm::bodiscount(locng,l+1,dummy,lambda,bo);
1726 
1727             locng.shift();
1728 
1729             if (fstar>=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) {
1730               *tout[l] << (float) log10(pr);
1731               *tout[l] << "\t" << (char *)dict->decode(*locng.wordp(l));
1732               for (int j=l-1; j>0; j--)
1733                 *tout[l] << " " << (char *)dict->decode(*locng.wordp(j));
1734 
1735               if (lambda < LOWER_SINGLE_PRECISION_OF_1) //output back-off prob
1736                 *tout[l] << "\t" << (float) (log10(lambda) -log10(bo));
1737               *tout[l] << "\n";
1738 
1739               num[l]++;
1740             } else continue; //skip n-grams with too small fstar
1741           } else {
1742             if (fstar>=UPPER_SINGLE_PRECISION_OF_0 ) {
1743               *tout[l] << (float) log10(pr);
1744               *tout[l] << "\t" << (char *)dict->decode(*locng.wordp(l));
1745               for (int j=l-1; j>0; j--)
1746                 *tout[l] << " " << (char *)dict->decode(*locng.wordp(j));
1747               *tout[l] << "\n";
1748               num[l]++;
1749             } else continue; //skip n-grams with too small fstar
1750           }
1751 
1752         }
1753         oldng=ng;
1754       }
1755     }
1756 
1757   }
1758 
1759 
1760   //print header
1761   out << "\n\\data\\" << "\n";
1762   char buff[100];
1763   for (int i=1; i<=lmsize(); i++) {
1764     sprintf(buff,"ngram %2d=%10d\n",i,num[i]);
1765     out << buff;
1766   }
1767   out << "\n";
1768 
1769   //append and remove temporary files
1770   for (int i=1; i<=lmsize(); i++) {
1771     delete tout[i];
1772     tout[i]=new mfstream(tfilename[i],ios::in);
1773     out << tout[i]->rdbuf();
1774     delete tout[i];
1775     removefile(tfilename[i]);
1776   }
1777 
1778   out << "\\end\\" << "\n";
1779 
1780   cerr << "\n";
1781   system("date");
1782 
1783 	VERBOSE(2,"mdiadaptlm::saveARPA_per_word END\n");
1784   return 1;
1785 };
1786 
1787 ///// Save in format for ARPA backoff N-gram models
// Save the adapted LM in ARPA text format, writing one whole n-gram level at
// a time directly into the output file; per-level counts in the \data\
// header are back-patched at the end via saved stream positions.
//   filename:    output ARPA file
//   backoff:     if true, always print back-off weight log10(lambda)-log10(bo);
//                otherwise print log10(lambda) only when lambda < 1
//   subdictfile: optional sub-dictionary restricting the saved vocabulary (NULL = full dict)
// Always returns 1.
int mdiadaptlm::saveARPA_per_level(char *filename,int backoff,char* subdictfile )
{
	VERBOSE(2,"mdiadaptlm::saveARPA_per_level START\n");
  system("date");

  //optional sub-dictionary restricting the saved vocabulary
  dictionary* subdict;

  //accumulated unigram oov prob (mass of words pruned by the subdict)
  double oovprob=0;

  if (subdictfile) {
    subdict=new dictionary(subdictfile);
  } else
    subdict=dict; // default is subdict=dict

  fstream out(filename,ios::out);
  //  out.precision(15);

  vector<streampos> pos(lmsize()+1);
  table_entry_pos_t num[lmsize()+1];
  char buff[100];

  //print header; counts are zero for now and back-patched later
  out << "\n\\data\\" << "\n";

  for (int i=1; i<=lmsize(); i++) {
    num[i]=0;
    pos[i]=out.tellp();
    sprintf(buff,"ngram %2d=%10d\n",i,num[i]);
    out << buff;
  }

  out << "\n";

  //start writing n-grams, level by level

  for (int i=1; i<=lmsize(); i++) {
    cerr << "saving level " << i << "...\n";


    out << "\n\\" << i << "-grams:\n";

    double totp=0;
    double fstar,lambda,bo,dummy,dummy2,pr;


    ngram ng(dict,1);
    ngram ng2(dict);
    ngram sng(subdict,1);

    if (i==1) { //unigram case

      //scan the dictionary

      for (int w=0; w<dict->size(); w++) {
        *ng.wordp(1)=w;

        sng.trans(ng);
        pr=mdiadaptlm::prob(ng,1);
        totp+=pr;

        // words outside the subdictionary are not saved; their probability
        // mass is accumulated and folded into the OOV entry below
        if (sng.containsWord(subdict->OOV(),i) && !ng.containsWord(dict->OOV(),i)) {
          oovprob+=pr; //accumulate oov probability
          continue;
        }


        if (ng.containsWord(dict->OOV(),i)) pr+=oovprob;

        //cerr << ng << " freq " << dict->freq(w) << " -  Pr " << pr << "\n";
        out << (float)  (pr?log10(pr):-99); // -99 stands for log10(0)

        num[i]++;

        if (w==dict->oovcode())
          out << "\t" << "<unk>\n";
        else {
          out << "\t" << (char *)dict->decode(w);

          if (lmsize()>1) {
            ngram ng2=ng;
            ng2.pushc(0); //extend by one

            mdiadaptlm::bodiscount(ng2,i+1,fstar,lambda,bo);

	    assert(!backoff || ((lambda<UPPER_SINGLE_PRECISION_OF_1 && lambda>LOWER_SINGLE_PRECISION_OF_1) || bo<UPPER_SINGLE_PRECISION_OF_1 ));

	    if (backoff){
              out << "\t" << (float) (log10(lambda) - log10(bo));
	    }else{
	      if (lambda<LOWER_SINGLE_PRECISION_OF_1){
		out << "\t" << (float) log10(lambda);
	      } //no output if log10(lambda)==0
	    }
          }
          out << "\n";
        }
      }
      //cerr << "totprob = " << totp << "\n";
    }
		else { //i>1 , bigrams, trigrams, fourgrams...
			*ng.wordp(1)=0;
			get(ng,1,1); //this
      scan(ng,INIT,i);
      while(scan(ng,CONT,i)) {

        sng.trans(ng);
        if (sng.containsWord(subdict->OOV(),i)) continue;

        // skip also eos symbols not at the final position
        if (sng.containsWord(dict->EoS(),i-1)) continue;

        pr=mdiadaptlm::prob(ng,i,fstar,dummy,dummy2);

        //PATCH by Nicola (16-04-2008): guard against numerical problems

        if (!(pr<=1.0 && pr > 1e-10)) {
          cerr << ng << " " << pr << "\n";
          assert(pr<=1.0);
          cerr << "prob modified to 1e-10\n";
          pr=1e-10;
        }

        if (i<lmsize()) {
          ng2=ng;
          ng2.pushc(0); //extend by one

          mdiadaptlm::bodiscount(ng2,i+1,dummy,lambda,bo);

          // keep the entry only if it carries probability mass or a
          // non-trivial back-off weight
          if (fstar>=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) {
            out << (float) log10(pr);
            out << "\t" << (char *)dict->decode(*ng.wordp(i));
            for (int j=i-1; j>0; j--)
              out << " " << (char *)dict->decode(*ng.wordp(j));
            if (backoff){
               out << "\t" << (float) (log10(lambda) - log10(bo));
            }else{
               if (lambda<LOWER_SINGLE_PRECISION_OF_1){
                 out << "\t" << (float) log10(lambda);
               } //no output if log10(lambda)==0
            }
            out << "\n";
            num[i]++;
          }
        } else {
          if (fstar>=UPPER_SINGLE_PRECISION_OF_0) {
            out << (float) log10(pr);
            out << "\t" << (char *)dict->decode(*ng.wordp(i));
            for (int j=i-1; j>0; j--)
              out << " " << (char *)dict->decode(*ng.wordp(j));
            out << "\n";

            num[i]++;
          }
        }
      }
    }

    cerr << i << "grams tot:" << num[i] << "\n";
  }

  streampos last=out.tellp();

  //update headers with the final per-level n-gram counts
  for (int i=1; i<=lmsize(); i++) {
    sprintf(buff,"ngram %2d=%10d\n",i,num[i]);
    out.seekp(pos[i]);
    out << buff;
  }

  out.seekp(last);
  out << "\\end\\" << "\n";
  system("date");

	VERBOSE(2,"mdiadaptlm::saveARPA_per_level END\n");
  return 1;
};
1966 
1967 
1968 /*
1969 main(int argc,char** argv){
1970   char* dictname=argv[1];
1971   char* backngram=argv[2];
1972   int depth=atoi(argv[3]);
1973   char* forengram=argv[4];
1974   char* testngram=argv[5];
1975 
1976   dictionary dict(dictname);
1977   ngramtable test(&dict,testngram,depth);
1978 
1979   shiftbeta lm2(&dict,backngram,depth);
1980   lm2.train();
1981   //lm2.test(test,depth);
1982 
1983   mdi lm(&dict,backngram,depth);
1984   lm.train();
1985   for (double w=0.0;w<=1.0;w+=0.1){
1986   lm.getforelm(forengram);
1987   lm.adapt(w);
1988   lm.test(test,depth);
1989   }
1990 }
1991 */
1992 
1993