1 //===========================================================================
2 // Copyright (C) 2000 Matt Timmermans
3 // Free for non-commercial purposes as long as this notice remains intact.
4 // To discuss commercial use, mail me at matt@timmermans.org
5 //
6 // sufftree.h
7 //
8 // This file started life as Jesper Larsson's excellent sliding window
9 // suffix tree.  Now it implements a modified PPM* model for the bicom
10 // bijective compressor.  Much of the original elegance of Jesper's code
11 // has been destroyed with the addition of new functionality.  In particular,
12 // the clever hashing scheme for children was ditched, because I had to keep
13 // linked lists anyway.
14 //
15 // This model does a few things differently:
16 //
17 // Contexts are unbounded.  We start at the longest context and escape
18 // through shorter ones.  The first non-deterministic context we look at is
19 // the one with the highest P(MPS), i.e., we use Charle's blooks Local Order
20 // Estimation.
21 // To avoid being too slow, we look at onlythe longest determinstic context,
22 // and at most SuffixTreeModel::SuffixTreeModel::MAXWALK non-deterministic
23 // contexts.  We will only escape through SuffixTreeModel::MAXNONDET
24 // non-deterministic contexts.
25 // We also stop escaping when we hit a context that could result in more than
26 // MAXEXCLS exclusions.  These don't tend to be too useful anyway.
27 //
28 // In non-deterministic contexts, escape probabilities are estimated
29 // originally using PPM method C.  In deterministic contexts, the initial
30 // estimate is based on the context length.
31 //
32 // In both cases, this estimate is used only as an index into a table of
33 // adaptive binary contexts that produce the probability we actually use.
34 //
35 // The result is like Charles Bloom's SEE, but not as good -- might change
36 // that later.
37 //
38 // This is Jesper's original copyright notice:
39 //
40 //============================================================================
41 // Copyright 1999, N. Jesper Larsson, all rights reserved.
42 //
43 // This file contains an implementation of a sliding window index using a
44 // suffix tree. It corresponds to the algorithms and representation presented
45 // in the Ph.D. thesis "Structures of String Matching and Data Compression",
46 // Lund University, 1999.
47 //
48 // This software may be used freely for any purpose. However, when distributed,
49 // the original source must be clearly stated, and, when the source code is
50 // distributed, the copyright notice must be retained and any alterations in
51 // the code must be clearly marked. No warranty is given regarding the quality
52 // of this software.
53 //===========================================================================
54 // (note: ALL of Jesper's code has been altered, so consider this a clear
55 // marking of that fact ;-)
56 
57 #include "sufftree.h"
58 
59 #include <stdlib.h>
60 #include <limits.h>
61 #include <time.h>
62 
63 #ifdef _DEBUG
64 #include <assert.h>
65 #define _ASSERT(x) assert(x)
66 #else
67 #define _ASSERT(x)
68 #endif
69 
70 #define RANDOMIZE_NODE_ORDER 1
71 #define K (UCHAR_MAX+1)
72 
73 /* INode numbering:
74 
75    INode 0 is nil.
76    INode 1 is root.
77    Nodes 2...m_maxSize-1 are non-root internal nodes.
78    Nodes m_maxSize...2*m_maxSize-1 are leaves.*/
79 
80 
81 
82 /* Macros used to keep indices inside the circular buffer (avoiding
83    modulo operations for speed). M0 is for subtractions to stay
84    nonnegative, MM for additions to stay below m_maxSize.*/
85 #define M0(i)           ((i)<0 ? (i)+m_maxSize : (i))
86 #define MM(i)           ((i)<m_maxSize ? (i) : (i)-m_maxSize)
87 
88 //these constants were gathered by examining the contexts after processing
89 //book2 -- they don't really make too much difference, as long as they're
90 //not silly.  The rows are log(# of leaves).  It's interesting how little
91 //difference the # of leaves in a deterministic context makes.
92 static int dini[20*8]=
93   {
94   25,29,30,33,37,42,44,42,45,47,45,48,49,50,53,59,60,61,61,61,
95   23,31,26,31,38,45,46,42,46,50,47,49,52,49,53,59,60,61,61,61,
96   23,31,31,39,44,48,52,50,54,52,54,55,57,55,57,62,61,62,61,61,
97   23,28,33,46,51,54,54,52,54,56,57,56,56,58,58,62,61,63,63,59,
98   16,27,32,47,52,53,53,56,56,58,58,53,57,60,61,62,63,63,63,57,
99   25,33,38,56,52,53,56,55,58,54,57,56,55,61,62,63,63,62,63,61,
100   25,33,38,56,52,53,56,55,58,54,57,56,55,61,62,63,63,62,63,61,
101   25,33,38,56,52,53,56,55,58,54,57,56,55,61,62,63,63,62,63,61
102   };
103 
SuffixTreeModel(int windowSize,FILE * statfileout)104 SuffixTreeModel::SuffixTreeModel(int windowSize,FILE *statfileout)
105   {
106   m_iblocks=0;
107   m_ifreelist=0;
108   m_rootnode=0;
109 
110   m_leafbuf=new LNode[windowSize];
111   m_maxSize=windowSize;
112 
113   if (m_statfile=statfileout)
114     {
115     //write number of short vars
116     putc(3,m_statfile);
117     }
118 
119   Reset();
120   }
121 
122 
Reset()123 void SuffixTreeModel::Reset()
124   {
125   int i,j;
126 
127     {
128     //free internal nodes
129     LNode *n=m_rootnode;
130     INode *in;
131     while(n)
132       {
133       if (n->isleaf)
134         {n=n->next;continue;}
135       in=(INode *)n;
136       while(n=in->firstchild)
137         {
138         in->firstchild=n->next;
139         n->next=in->next;
140         in->next=n;
141         }
142       n=in->next;
143       DelI(in);
144       }
145     }
146   m_rootnode=NewI(0,0);
147   m_rootnode->childcount=1;  //stays 1 forever
148 
149   m_currentSize=0;
150 
151   //initialize active point.
152   ap.ins=m_rootnode;
153 
154   ap.proj=0;
155   ap.r=0;
156 
157   //initialize window limits.
158   tail=front=0;
159 
160   m_extracountnode=0;
161 
162   //init escape predictors
163   for(i=0;i<20;++i)
164     {
165     for(j=0;j<DETLEAVES;j++)
166       {
167       detpred[i][j].hitcount=dini[i+j*20];
168       detpred[i][j].misscount=64-dini[i+j*20];
169       }
170     }
171 
172   for (i=0;i<63;++i)
173     {
174     escapepredictors[i][0].hitcount=(63-i);
175     escapepredictors[i][0].misscount=(i+1);
176     esc2pred[i][0]=escapepredictors[i][0];
177     for (j=1;j<ESCORDERS;j++)
178       esc2pred[i][j]=escapepredictors[i][j]=esc2pred[i][0];
179     }
180   }
181 
182 
~SuffixTreeModel()183 SuffixTreeModel::~SuffixTreeModel()
184   {
185 #ifdef _DEBUG
186     {
187     long a,b;
188     unsigned i,j;
189     for (j=0;j<DETLEAVES;++j)
190       {
191       for (i=0;i<20;i++)
192         {
193         a=detpred[i][j].hitcount;
194         b=detpred[i][j].misscount+a;
195         a=(a*64+(b>>1))/b;
196         if (a>63)
197           a=63;
198         if (a<1)
199           a=1;
200         printf("%2ld,",a);
201         }
202       printf("\n");
203       }
204     }
205 #endif
206   INodeBlock *iblock;
207   while(iblock=m_iblocks)
208     {
209     m_iblocks=iblock->next;
210     delete iblock;
211     }
212   delete [] m_leafbuf;
213   }
214 
NewIBlock()215 SuffixTreeModel::INode *SuffixTreeModel::NewIBlock()
216   {
217   INodeBlock *block=new INodeBlock;
218   INode *s,*e;
219   block->next=m_iblocks;
220   m_iblocks=block;
221   s=block->data;
222   e=s+INodeBlock::SIZE-1;
223   while(e!=s)
224     {
225     e->next=m_ifreelist;
226     m_ifreelist=e;
227     --e;
228     }
229   return s;
230   }
231 
232 
GetChild(SuffixTreeModel::INode * parent,BYTE c)233 SuffixTreeModel::LNode *SuffixTreeModel::GetChild(SuffixTreeModel::INode *parent,BYTE c)
234   {
235   LNode **pos=&(parent->firstchild),**ipos,*ret;
236   if (!*pos)
237     return 0;
238   if ((*pos)->sym==c)
239     return(*pos);
240   pos=&((*pos)->next);
241   if (!*pos)
242     return 0;
243   if ((*pos)->sym==c)
244     return(*pos);
245 
246   ipos=pos;ret=0;
247   for (pos=&((*pos)->next);*pos;pos=&((*pos)->next))
248     {
249     if ((*pos)->sym==c)
250       {
251       ret=*pos;
252       *pos=ret->next;
253       ret->next=*ipos;
254       *ipos=ret;
255       break;
256       }
257     }
258   return ret;
259   }
260 
261 
262 /* canonize subroutine:
263 
264    r is return value. To avoid unnecessary access to the child list, r is
265    preserved between calls. If r is not 0 it is assumed that
266    r==child(ins, a), and (ins, r) is the edge of the insertion point.*/
Canonize(SuffixTreeModel::Point * p)267 void SuffixTreeModel::Canonize(SuffixTreeModel::Point *p)
268   {
269   int depthdiff;
270   if (p->proj && p->ins==0)
271     {
272     //the root is the child of nil for all symbols
273     p->ins=m_rootnode; --(p->proj); p->r=0;
274     }
275   while (p->proj)
276     {
277     if (!p->r)
278       {
279       p->sym=m_leafbuf[M0(front-p->proj)].bufsym;
280       p->r=GetChild(p->ins, p->sym);
281       }
282     if (p->r->isleaf) //r is leaf
283       break;
284     depthdiff=((INode *)(p->r))->depth-p->ins->depth;
285     if (p->proj<depthdiff)
286       {
287       // insertion point doesn't extend all the way to r
288       break;
289       }
290     p->proj-=depthdiff; p->ins=(INode *)p->r; p->r=0;
291     }
292   }
293 
CreateEdge(SuffixTreeModel::INode * parent,SuffixTreeModel::LNode * child,BYTE c,U32 inicounts)294 void SuffixTreeModel::CreateEdge
295   (
296   SuffixTreeModel::INode *parent,
297   SuffixTreeModel::LNode *child,
298   BYTE c,U32 inicounts
299   )
300   {
301   LNode **pos;
302   child->parent=parent;
303   child->count=inicounts;
304   child->sym=c;
305   parent->contextP1+=inicounts;
306   parent->childcount++;
307   pos=&(parent->firstchild);
308   if (*pos&&((*pos)->count>inicounts))
309     pos=&((*pos)->next);
310   child->next=*pos;
311   *pos=child;
312   }
313 
314 
DeleteEdge(SuffixTreeModel::INode * parent,SuffixTreeModel::LNode * child)315 U32 SuffixTreeModel::DeleteEdge
316   (
317   SuffixTreeModel::INode *parent,
318   SuffixTreeModel::LNode *child
319   )
320   {
321   U32 occurs=child->count;
322   LNode **pos=&(parent->firstchild);
323   if (*pos==child)  //removing MPS
324     {
325     parent->childcount--;
326     parent->contextP1-=occurs;
327     if (*pos=child->next)
328       {
329       LNode **bestpos=pos;
330       U32 bestcount=(*pos)->count;
331       for(pos=&((*pos)->next);*pos;pos=&((*pos)->next))
332         {
333         if ((*pos)->count>bestcount)
334           {
335           bestcount=(*pos)->count;
336           bestpos=pos;
337           }
338         }
339       child=*bestpos;
340       *bestpos=child->next;
341       child->next=parent->firstchild;
342       parent->firstchild=child;
343       }
344     }
345   else  //removing other child
346     {
347     for(pos=&((*pos)->next);*pos&&(*pos!=child);pos=&((*pos)->next));
348     _ASSERT(*pos);
349     if (*pos == child)
350       {
351       parent->childcount--;
352       parent->contextP1-=occurs;
353       *pos=child->next;
354       }
355     }
356   return occurs;
357   }
358 
359 
IncMPC(SuffixTreeModel::INode * parent,SuffixTreeModel::LNode * child)360 U32 SuffixTreeModel::IncMPC
361   (
362   SuffixTreeModel::INode *parent,
363   SuffixTreeModel::LNode *child
364   )
365   {
366   child->count++;
367   parent->contextP1++;
368   if ((parent->firstchild!=child)&&(child->count>parent->firstchild->count))
369     {
370     //new MPS
371     LNode *n;
372     while((n=parent->firstchild)!=child)
373       {
374       parent->firstchild=n->next;
375       n->next=child->next;
376       child->next=n;
377       }
378     }
379   return(child->count);
380   }
381 
382 /* Macro for Update subroutine:
383 
384    Send credits up the tree, updating pos values, until a nonzero credit
385    is found. Sign bit of suf links is used as credit bit.*/
UpdateCredits(SuffixTreeModel::INode * v,int i)386 void SuffixTreeModel::UpdateCredits(SuffixTreeModel::INode *v,int i)
387   {
388   INode *u;
389   int d;
390   int j, ii=M0(i-tail), jj;
391   while (v!=m_rootnode)
392     {
393     u=v->parent;
394     d=u->depth;
395     j=M0(v->pos-d);
396     jj=M0(j-tail);
397     if (ii>jj)
398       {
399       v->pos=MM(i+d);
400       }
401     else
402       {
403       i=j; ii=jj;
404       }
405     if (!v->cred)
406       {
407       v->cred=1;
408       break;
409       }
410     v->cred=0;
411     v=u;
412     }
413   }
414 
415 
Encode(BYTE symbol,ArithmeticEncoder * dest)416 void SuffixTreeModel::Encode(BYTE symbol,ArithmeticEncoder *dest)
417   {
418   Walk(symbol,dest,0);
419   Update(symbol);
420   }
421 
Decode(ArithmeticDecoder * src)422 BYTE SuffixTreeModel::Decode(ArithmeticDecoder *src)
423   {
424   BYTE ret=Walk(0,0,src);
425   Update(ret);
426   return ret;
427   }
428 
iroot(U32 n)429 static U32 iroot(U32 n)
430   {
431   //the easy-but-not-quite-optimal way
432   U32 ret=0,bit;
433 
434   for (bit=1;bit&&(bit*bit<=n);bit<<=1)
435     {ret=bit;}
436   for (bit=ret>>1;bit;bit>>=1)
437     {
438     ret+=bit;
439     if (ret*ret>n)
440       ret-=bit;
441     }
442   return ret;
443   }
444 
445 
put32(U32 i,FILE * f)446 inline void put32(U32 i,FILE *f)
447   {
448   putc((int)(i&255),f);
449   putc((int)((i>>8)&255),f);
450   putc((int)((i>>16)&255),f);
451   putc((int)((i>>24)&255),f);
452   }
453 
454 
Walk(BYTE c,ArithmeticEncoder * dest,ArithmeticDecoder * src)455 BYTE SuffixTreeModel::Walk(BYTE c,ArithmeticEncoder *dest,ArithmeticDecoder *src)
456   {
457   INode *insnode;
458   LNode *edge;
459   BitPred *epred;
460   U32 cesc,chit,p1,clow,nl;
461   bool escaped=false;
462   Point context;
463   double bestmpsp;
464   INode *cnode[MAXNONDET];
465   unsigned i,numcontexts,cnum;
466   unsigned exclsleft;
467   unsigned numnew;
468   LNode *newedges[MAXEXCLS];
469   BYTE prediction;
470 
471   m_excl.Clear();
472   Canonize(&ap);
473   context=ap;
474 
475   if (m_statfile)
476     sv.Clear();
477 
478   if (context.InEdge())  //deterministic context
479     {
480     int j;
481     U32 clen;
482     _ASSERT(!!context.proj);
483 
484     insnode=context.ins;
485     if (context.r->isleaf)
486       {
487       j=MM((context.r-m_leafbuf)+insnode->depth);
488       }
489     else
490       {
491       j=((INode *)context.r)->pos;
492       }
493     prediction=m_leafbuf[MM(j+context.proj)].bufsym;
494     clen=insnode->depth+context.proj;
495     if (clen<1)
496       clen=1;
497 
498     if (!insnode->firstchild)
499       goto DETMISS;
500 
501     nl=insnode->firstchild->count;
502 
503     if (m_statfile)
504       {
505       sv.clen=clen;
506       sv.mcpp=nl;
507       }
508 
509     cesc=clen;
510     if (cesc)
511       --cesc;
512     if (cesc>=8)
513       {
514       cesc=((cesc-8)>>1)+8;
515       chit=12;
516       while(cesc>=chit)
517         {
518         cesc=((cesc-chit)>>1)+chit;
519         chit+=2;
520         }
521       }
522     if (cesc>=20)
523       cesc=19;
524 
525     if (insnode==m_extracountnode)
526       --nl;
527 
528     if (nl)
529       --nl;
530     if (nl>=2)
531       {
532       for(chit=(nl-2),nl=2;chit;chit>>=2,++nl);
533       if (nl>=DETLEAVES)
534         nl=DETLEAVES-1;
535       }
536     epred=&(detpred[cesc][nl]);
537     cesc=epred->misscount;
538     chit=epred->hitcount;
539 
540     p1=cesc+chit;
541     if (src)
542       {
543       escaped=src->GetP(p1)>=chit;
544       if (escaped)
545         src->Narrow(p1,chit,p1);
546       else
547         {
548         src->Narrow(p1,0,chit);
549         c=prediction;
550         }
551       }
552     else
553       escaped=(c!=prediction);
554 
555     epred->Adjust(!escaped,ESCADJUST);
556 
557     if (dest)
558       {
559       if (escaped)
560         dest->Encode(p1,chit,p1);
561       else
562         dest->Encode(p1,0,chit);
563       }
564 
565     if (m_statfile)
566       sv.Write(m_statfile,!escaped);
567 
568     if (!escaped)
569       return c;
570 
571     m_excl.Exclude(prediction);
572 
573     DETMISS:
574     //shorten context until non-deterministic
575     for (;;)
576       {
577       if (insnode->depth<1)
578         goto ORDER0;
579       context.ins=insnode->suffix;
580       context.r=0;
581       Canonize(&context);
582       if (!context.InEdge())
583         break;
584       insnode=context.ins;
585       }
586     }
587 
588   //we are not in a deterministic context
589   //make the list of contexts to escape through
590 
591   numcontexts=0;
592   double mpsp;
593   for (i=0;i<MAXWALK;++i)
594     {
595     if ((!context.ins)||(context.ins==m_rootnode))
596       break;
597     insnode=context.ins;
598     if (insnode->firstchild)
599       {
600       mpsp=(double)insnode->firstchild->count;
601       mpsp/=(double)insnode->contextP1;
602       if (!numcontexts)
603         bestmpsp=mpsp;
604       else if(mpsp>bestmpsp)
605         {
606         numcontexts=0;
607         bestmpsp=mpsp;
608         }
609       if (numcontexts>=MAXNONDET)
610         break;
611       cnode[numcontexts]=context.ins;
612       ++numcontexts;
613       }
614     context.ins=insnode->suffix;
615     Canonize(&context);
616     }
617 
618   //walk the contexts to encode the symbol
619 
620   exclsleft=MAXEXCLS-m_excl.NumExcls();
621 
622   for(cnum=0;(cnum<numcontexts)&&exclsleft;++cnum)
623     {
624     escaped=true;
625     insnode=cnode[cnum];
626     cesc=((unsigned)insnode->childcount)+1;
627     chit=0;
628     numnew=0;
629     if (m_statfile)
630       {
631       sv.Clear();
632       sv.clen=insnode->depth;
633       sv.prevexcls=(BYTE)m_excl.NumExcls();
634       sv.children=insnode->childcount;
635       }
636     for (edge=insnode->firstchild;edge&&exclsleft;edge=edge->next)
637       {
638       prediction=edge->sym;
639       if (!m_excl.Exclude(prediction))
640         {
641         //was already excluded
642         continue;
643         }
644       --exclsleft;
645       newedges[numnew++]=edge;
646       chit+=edge->count;
647       if (prediction==c)
648         escaped=false;
649       }
650     if (!numnew)  //no new symbols
651       continue;
652     if (edge)
653       {m_excl.Backup(numnew);break;}
654 
655     p1=chit;
656 
657     //get escape estimator
658       {
659       int orderindex=insnode->depth;
660       if (orderindex>=ESCORDERS)
661         orderindex=ESCORDERS-1;
662       else if(orderindex)
663         --orderindex;
664       cesc=(cesc<<6)/(cesc+chit);
665       if (cesc>0)
666         --cesc;
667       if (cesc>=63)
668         cesc=62;
669       if (!cnum)
670         epred=&(escapepredictors[cesc][orderindex]);
671       else
672         epred=&(esc2pred[cesc][orderindex]);
673       cesc=epred->misscount;
674       chit=epred->hitcount;
675       }
676 
677     if (src)  //decode escape
678       escaped=(src->GetP(cesc+chit)>=chit);
679 
680     epred->Adjust(!escaped,ESCADJUST);
681 
682     if (m_statfile)
683       {
684       sv.clen=insnode->depth;
685       sv.children=insnode->childcount;
686       _ASSERT(((unsigned)sv.children)+1>=m_excl.NumExcls());
687       sv.mcpmisses=(BYTE)((((unsigned)sv.children)+1)-m_excl.NumExcls());
688       for (i=0;i<numnew;i++)
689         sv.mcpp+=newedges[i]->count;
690       _ASSERT(insnode->contextP1>=sv.mcpp);
691       sv.mcpmissp=insnode->contextP1-sv.mcpp;
692       sv.Write(m_statfile,!escaped);
693       }
694 
695     if (escaped)
696       {
697       if (src)
698         src->Narrow(cesc+chit,chit,cesc+chit);
699       if (dest)
700         dest->Encode(cesc+chit,chit,cesc+chit);
701       continue;
702       }
703     if (src)
704       src->Narrow(cesc+chit,0,chit);
705     if (dest)
706       dest->Encode(cesc+chit,0,chit);
707 
708     //yay! no escape!
709     nl=newedges[0]->count>>2; //I have no idea why this is good!!!
710     newedges[0]->count+=nl;
711     p1+=nl;
712     if (src)
713       {
714       //decode char
715       chit=src->GetP(p1);
716       clow=0;
717       for (i=0;i<numnew;i++)
718         {
719         edge=newedges[i];
720         if (chit<edge->count)
721           {
722           c=edge->sym;
723           chit=edge->count;
724           break;
725           }
726         clow+=edge->count;
727         chit-=edge->count;
728         }
729       _ASSERT(i<numnew);
730       }
731     else
732       {
733       clow=0;
734       for (i=0;i<numnew;i++)
735         {
736         edge=newedges[i];
737         prediction=edge->sym;
738         if (c==prediction)
739           {
740           chit=edge->count;
741           break;
742           }
743         clow+=edge->count;
744         }
745       _ASSERT(i<numnew);
746       }
747 
748 
749     _ASSERT(p1<MAXP1);
750 
751 
752     if (src)
753       src->Narrow(p1,clow,clow+chit);
754     if (dest)
755       dest->Encode(p1,clow,clow+chit);
756     _ASSERT(clow+chit<=p1);
757 
758     newedges[0]->count-=nl;
759     p1-=nl;
760     return(c);
761     }
762 
763   //sigh -- didn't find a match anywhere
764   ORDER0:
765   if (src)
766     c=order0.Decode(src,m_excl);
767   else if (dest)
768     order0.Encode(c,dest,m_excl);
769   order0.Update(c);
770   return(c);
771   }
772 
773 
774 
775 /* Function advancefront:
776 
777    Moves front, the right endpoint of the window, increasing its size.
778 */
779 
AdvanceFront()780 void SuffixTreeModel::AdvanceFront()
781   {
782   INode *u, *v;
783   LNode *s;
784   int j;
785   BYTE b, c;
786 
787   v=0;
788   c=m_leafbuf[front].bufsym;
789   while (1)
790     {
791     Canonize(&ap);
792     if (!ap.r)
793       {
794       //active point at node.
795       m_extracountnode=0;
796       if (!ap.ins)
797         {
798         ap.r=m_rootnode; //root is child of ins for any c
799         break;           //endpoint found.
800         }
801       ap.r=GetChild(ap.ins, c);
802       if (ap.r)
803         {
804         //ins has a child for c.
805         ap.sym=c; //a is first symbol in (ins, r) label
806         IncMPC(ap.ins,ap.r);
807         //if we split this node, we want to remove the count we just added
808         //from the new child
809         m_extracountnode=ap.r;
810         break;           //endpoint found.
811         }
812       else
813         {
814         u=ap.ins; //will add child below u.
815         }
816       }
817     else
818       {
819       // active point on edge.
820       j=(ap.r->isleaf ? MM((ap.r-m_leafbuf)+ap.ins->depth) : ((INode *)ap.r)->pos);
821       b=m_leafbuf[MM(j+ap.proj)].bufsym;    // next symbol in (ins, r) label.
822       if (c==b)           /* if same as front symbol.*/
823         break;           /* endpoint found.*/
824       else
825         {              /* edge must be split.*/
826         u=NewI(ap.ins->depth+ap.proj,M0(front-ap.proj));
827         U32 rcounts=DeleteEdge(ap.ins, ap.r);
828         CreateEdge(ap.ins, u, ap.sym,rcounts);
829         //we added a count when we came into this edge.  That coun't
830         //isn't valid for the part that's getting split off, so
831         //remove it
832         if (m_extracountnode==ap.r)
833           --rcounts;
834         _ASSERT(rcounts>0);
835         //m_extracountnode will get cleared below
836         CreateEdge(u, ap.r, b,rcounts);
837         if (!ap.r->isleaf)
838           ((INode *)ap.r)->pos=MM(j+ap.proj);
839         }
840       }
841     m_extracountnode=0;
842     s=m_leafbuf+M0(front-u->depth);
843     s->childcount=0;
844     s->count=0;
845     s->isleaf=1;
846     s->next=0;
847     s->parent=0;
848     s->sym=0;
849     CreateEdge(u, s, c,1);   /* add new leaf s.*/
850     m_rootnode->childcount=1;  // don't count children of root
851     if (u==ap.ins)            /* skip update if new node.*/
852       UpdateCredits(u, M0(front-u->depth));
853     if (v)
854       v->suffix=u;
855     v=u;
856     ap.ins=ap.ins->suffix;
857     ap.r=0;                   /* force getting new r in canonize.*/
858     }
859   if (v)
860     v->suffix=ap.ins;
861   ++ap.proj;                   /* move active point down.*/
862   front=MM(front+1);
863   }
864 
865 
866 /* Function advancetail:
867 
868    Moves tail, the left endpoint of the window, forward by positions
869    positions, decreasing its size.*/
AdvanceTail()870 void SuffixTreeModel::AdvanceTail()
871   {
872   INode *u, *w;              /* nodes.*/
873   LNode *v,*s;
874   BYTE c;
875   int i, d;
876   U32 oldcounts;
877 
878   Canonize(&ap);
879   v=m_leafbuf+tail; // the leaf to delete.
880   u=v->parent;
881   oldcounts=DeleteEdge(u, v);
882   if (v==ap.r)
883     {
884     //replace instead of delete
885 
886     i=M0(front-(ap.ins->depth+ap.proj));
887     if (m_extracountnode==v)
888       {
889       m_extracountnode=m_leafbuf+i;
890       ++oldcounts;
891       }
892     CreateEdge(ap.ins, m_leafbuf+i, v->sym,oldcounts);
893     UpdateCredits(ap.ins, i);
894     ap.ins=ap.ins->suffix;
895     ap.r=0;                   /* force getting new r in canonize.*/
896     }
897   else if (u!=m_rootnode && !u->childcount)
898     {
899     // u has only one child left, delete it.
900     _ASSERT(u->firstchild&&!u->firstchild->next);
901     c=u->sym;
902     _ASSERT(c==m_leafbuf[u->pos].bufsym);
903     w=u->parent;
904     d=u->depth-w->depth;
905     s=u->firstchild; // the remaining child of u.
906     _ASSERT(s->sym==m_leafbuf[MM(u->pos+d)].bufsym);
907     if (u==ap.ins)
908       {
909       ap.ins=w;
910       ap.proj+=d;
911       ap.sym=c;             // new first symbol of (ins, r) label
912       m_extracountnode=0;
913       }
914     else if (u==ap.r)
915       {
916       if (m_extracountnode==u)
917         m_extracountnode=s;
918       ap.r=s;             // new child(ins, a).
919       }
920     if (u->cred)
921       UpdateCredits(w, M0(u->pos-w->depth));
922     DeleteEdge(u, s);
923     oldcounts=DeleteEdge(w, u);
924     CreateEdge(w, s, c,oldcounts);
925     if (!s->isleaf)
926       ((INode *)s)->pos=M0(((INode *)s)->pos-d);
927     DelI(u);
928     }
929   tail=MM(tail+1);
930   }
931 
932 
933 
934 
Write(FILE * f,bool hit)935 void SuffixTreeModel::StatVec::Write(FILE *f,bool hit)
936   {
937   unsigned n=0,i;
938   U32 vec[8];
939   BYTE code=hit?(BYTE)128:0;
940   BYTE bit;
941 
942   vec[n++]=children;
943   vec[n++]=prevexcls;
944   vec[n++]=mcpmisses;
945   vec[n++]=clen;
946   vec[n++]=mcpp;
947   vec[n++]=mcpmissp;
948 
949   for (bit=1,i=0;i<n;i++,bit<<=1)
950     {
951     if (vec[i])
952       code|=bit;
953     }
954   putc(code,f);
955   for (bit=1,i=0;i<n;i++,bit<<=1)
956     {
957     if (vec[i])
958       {
959       if (i<3)
960         putc((BYTE)vec[i],f);
961       else
962         put32(vec[i],f);
963       }
964     }
965 
966   Clear();
967   }
968 
969 
970