1 //===========================================================================
2 // Copyright (C) 2000 Matt Timmermans
3 // Free for non-commercial purposes as long as this notice remains intact.
4 // To discuss commercial use, mail me at matt@timmermans.org
5 //
6 // sufftree.h
7 //
8 // This file started life as Jesper Larsson's excellent sliding window
9 // suffix tree. Now it implements a modified PPM* model for the bicom
10 // bijective compressor. Much of the original elegance of Jesper's code
11 // has been destroyed with the addition of new functionality. In particular,
12 // the clever hashing scheme for children was ditched, because I had to keep
13 // linked lists anyway.
14 //
15 // This model does a few things differently:
16 //
17 // Contexts are unbounded. We start at the longest context and escape
18 // through shorter ones. The first non-deterministic context we look at is
19 // the one with the highest P(MPS), i.e., we use Charle's blooks Local Order
20 // Estimation.
21 // To avoid being too slow, we look at onlythe longest determinstic context,
22 // and at most SuffixTreeModel::SuffixTreeModel::MAXWALK non-deterministic
23 // contexts. We will only escape through SuffixTreeModel::MAXNONDET
24 // non-deterministic contexts.
25 // We also stop escaping when we hit a context that could result in more than
26 // MAXEXCLS exclusions. These don't tend to be too useful anyway.
27 //
28 // In non-deterministic contexts, escape probabilities are estimated
29 // originally using PPM method C. In deterministic contexts, the initial
30 // estimate is based on the context length.
31 //
32 // In both cases, this estimate is used only as an index into a table of
33 // adaptive binary contexts that produce the probability we actually use.
34 //
35 // The result is like Charles Bloom's SEE, but not as good -- might change
36 // that later.
37 //
38 // This is Jesper's original copyright notice:
39 //
40 //============================================================================
41 // Copyright 1999, N. Jesper Larsson, all rights reserved.
42 //
43 // This file contains an implementation of a sliding window index using a
44 // suffix tree. It corresponds to the algorithms and representation presented
45 // in the Ph.D. thesis "Structures of String Matching and Data Compression",
46 // Lund University, 1999.
47 //
48 // This software may be used freely for any purpose. However, when distributed,
49 // the original source must be clearly stated, and, when the source code is
50 // distributed, the copyright notice must be retained and any alterations in
51 // the code must be clearly marked. No warranty is given regarding the quality
52 // of this software.
53 //===========================================================================
54 // (note: ALL of Jesper's code has been altered, so consider this a clear
55 // marking of that fact ;-)
56
57 #include "sufftree.h"
58
59 #include <stdlib.h>
60 #include <limits.h>
61 #include <time.h>
62
63 #ifdef _DEBUG
64 #include <assert.h>
65 #define _ASSERT(x) assert(x)
66 #else
67 #define _ASSERT(x)
68 #endif
69
70 #define RANDOMIZE_NODE_ORDER 1
71 #define K (UCHAR_MAX+1)
72
73 /* INode numbering:
74
75 INode 0 is nil.
76 INode 1 is root.
77 Nodes 2...m_maxSize-1 are non-root internal nodes.
78 Nodes m_maxSize...2*m_maxSize-1 are leaves.*/
79
80
81
82 /* Macros used to keep indices inside the circular buffer (avoiding
83 modulo operations for speed). M0 is for subtractions to stay
84 nonnegative, MM for additions to stay below m_maxSize.*/
85 #define M0(i) ((i)<0 ? (i)+m_maxSize : (i))
86 #define MM(i) ((i)<m_maxSize ? (i) : (i)-m_maxSize)
87
88 //these constants were gathered by examining the contexts after processing
89 //book2 -- they don't really make too much difference, as long as they're
90 //not silly. The rows are log(# of leaves). It's interesting how little
91 //difference the # of leaves in a deterministic context makes.
92 static int dini[20*8]=
93 {
94 25,29,30,33,37,42,44,42,45,47,45,48,49,50,53,59,60,61,61,61,
95 23,31,26,31,38,45,46,42,46,50,47,49,52,49,53,59,60,61,61,61,
96 23,31,31,39,44,48,52,50,54,52,54,55,57,55,57,62,61,62,61,61,
97 23,28,33,46,51,54,54,52,54,56,57,56,56,58,58,62,61,63,63,59,
98 16,27,32,47,52,53,53,56,56,58,58,53,57,60,61,62,63,63,63,57,
99 25,33,38,56,52,53,56,55,58,54,57,56,55,61,62,63,63,62,63,61,
100 25,33,38,56,52,53,56,55,58,54,57,56,55,61,62,63,63,62,63,61,
101 25,33,38,56,52,53,56,55,58,54,57,56,55,61,62,63,63,62,63,61
102 };
103
SuffixTreeModel(int windowSize,FILE * statfileout)104 SuffixTreeModel::SuffixTreeModel(int windowSize,FILE *statfileout)
105 {
106 m_iblocks=0;
107 m_ifreelist=0;
108 m_rootnode=0;
109
110 m_leafbuf=new LNode[windowSize];
111 m_maxSize=windowSize;
112
113 if (m_statfile=statfileout)
114 {
115 //write number of short vars
116 putc(3,m_statfile);
117 }
118
119 Reset();
120 }
121
122
Reset()123 void SuffixTreeModel::Reset()
124 {
125 int i,j;
126
127 {
128 //free internal nodes
129 LNode *n=m_rootnode;
130 INode *in;
131 while(n)
132 {
133 if (n->isleaf)
134 {n=n->next;continue;}
135 in=(INode *)n;
136 while(n=in->firstchild)
137 {
138 in->firstchild=n->next;
139 n->next=in->next;
140 in->next=n;
141 }
142 n=in->next;
143 DelI(in);
144 }
145 }
146 m_rootnode=NewI(0,0);
147 m_rootnode->childcount=1; //stays 1 forever
148
149 m_currentSize=0;
150
151 //initialize active point.
152 ap.ins=m_rootnode;
153
154 ap.proj=0;
155 ap.r=0;
156
157 //initialize window limits.
158 tail=front=0;
159
160 m_extracountnode=0;
161
162 //init escape predictors
163 for(i=0;i<20;++i)
164 {
165 for(j=0;j<DETLEAVES;j++)
166 {
167 detpred[i][j].hitcount=dini[i+j*20];
168 detpred[i][j].misscount=64-dini[i+j*20];
169 }
170 }
171
172 for (i=0;i<63;++i)
173 {
174 escapepredictors[i][0].hitcount=(63-i);
175 escapepredictors[i][0].misscount=(i+1);
176 esc2pred[i][0]=escapepredictors[i][0];
177 for (j=1;j<ESCORDERS;j++)
178 esc2pred[i][j]=escapepredictors[i][j]=esc2pred[i][0];
179 }
180 }
181
182
~SuffixTreeModel()183 SuffixTreeModel::~SuffixTreeModel()
184 {
185 #ifdef _DEBUG
186 {
187 long a,b;
188 unsigned i,j;
189 for (j=0;j<DETLEAVES;++j)
190 {
191 for (i=0;i<20;i++)
192 {
193 a=detpred[i][j].hitcount;
194 b=detpred[i][j].misscount+a;
195 a=(a*64+(b>>1))/b;
196 if (a>63)
197 a=63;
198 if (a<1)
199 a=1;
200 printf("%2ld,",a);
201 }
202 printf("\n");
203 }
204 }
205 #endif
206 INodeBlock *iblock;
207 while(iblock=m_iblocks)
208 {
209 m_iblocks=iblock->next;
210 delete iblock;
211 }
212 delete [] m_leafbuf;
213 }
214
NewIBlock()215 SuffixTreeModel::INode *SuffixTreeModel::NewIBlock()
216 {
217 INodeBlock *block=new INodeBlock;
218 INode *s,*e;
219 block->next=m_iblocks;
220 m_iblocks=block;
221 s=block->data;
222 e=s+INodeBlock::SIZE-1;
223 while(e!=s)
224 {
225 e->next=m_ifreelist;
226 m_ifreelist=e;
227 --e;
228 }
229 return s;
230 }
231
232
GetChild(SuffixTreeModel::INode * parent,BYTE c)233 SuffixTreeModel::LNode *SuffixTreeModel::GetChild(SuffixTreeModel::INode *parent,BYTE c)
234 {
235 LNode **pos=&(parent->firstchild),**ipos,*ret;
236 if (!*pos)
237 return 0;
238 if ((*pos)->sym==c)
239 return(*pos);
240 pos=&((*pos)->next);
241 if (!*pos)
242 return 0;
243 if ((*pos)->sym==c)
244 return(*pos);
245
246 ipos=pos;ret=0;
247 for (pos=&((*pos)->next);*pos;pos=&((*pos)->next))
248 {
249 if ((*pos)->sym==c)
250 {
251 ret=*pos;
252 *pos=ret->next;
253 ret->next=*ipos;
254 *ipos=ret;
255 break;
256 }
257 }
258 return ret;
259 }
260
261
262 /* canonize subroutine:
263
264 r is return value. To avoid unnecessary access to the child list, r is
265 preserved between calls. If r is not 0 it is assumed that
266 r==child(ins, a), and (ins, r) is the edge of the insertion point.*/
Canonize(SuffixTreeModel::Point * p)267 void SuffixTreeModel::Canonize(SuffixTreeModel::Point *p)
268 {
269 int depthdiff;
270 if (p->proj && p->ins==0)
271 {
272 //the root is the child of nil for all symbols
273 p->ins=m_rootnode; --(p->proj); p->r=0;
274 }
275 while (p->proj)
276 {
277 if (!p->r)
278 {
279 p->sym=m_leafbuf[M0(front-p->proj)].bufsym;
280 p->r=GetChild(p->ins, p->sym);
281 }
282 if (p->r->isleaf) //r is leaf
283 break;
284 depthdiff=((INode *)(p->r))->depth-p->ins->depth;
285 if (p->proj<depthdiff)
286 {
287 // insertion point doesn't extend all the way to r
288 break;
289 }
290 p->proj-=depthdiff; p->ins=(INode *)p->r; p->r=0;
291 }
292 }
293
CreateEdge(SuffixTreeModel::INode * parent,SuffixTreeModel::LNode * child,BYTE c,U32 inicounts)294 void SuffixTreeModel::CreateEdge
295 (
296 SuffixTreeModel::INode *parent,
297 SuffixTreeModel::LNode *child,
298 BYTE c,U32 inicounts
299 )
300 {
301 LNode **pos;
302 child->parent=parent;
303 child->count=inicounts;
304 child->sym=c;
305 parent->contextP1+=inicounts;
306 parent->childcount++;
307 pos=&(parent->firstchild);
308 if (*pos&&((*pos)->count>inicounts))
309 pos=&((*pos)->next);
310 child->next=*pos;
311 *pos=child;
312 }
313
314
DeleteEdge(SuffixTreeModel::INode * parent,SuffixTreeModel::LNode * child)315 U32 SuffixTreeModel::DeleteEdge
316 (
317 SuffixTreeModel::INode *parent,
318 SuffixTreeModel::LNode *child
319 )
320 {
321 U32 occurs=child->count;
322 LNode **pos=&(parent->firstchild);
323 if (*pos==child) //removing MPS
324 {
325 parent->childcount--;
326 parent->contextP1-=occurs;
327 if (*pos=child->next)
328 {
329 LNode **bestpos=pos;
330 U32 bestcount=(*pos)->count;
331 for(pos=&((*pos)->next);*pos;pos=&((*pos)->next))
332 {
333 if ((*pos)->count>bestcount)
334 {
335 bestcount=(*pos)->count;
336 bestpos=pos;
337 }
338 }
339 child=*bestpos;
340 *bestpos=child->next;
341 child->next=parent->firstchild;
342 parent->firstchild=child;
343 }
344 }
345 else //removing other child
346 {
347 for(pos=&((*pos)->next);*pos&&(*pos!=child);pos=&((*pos)->next));
348 _ASSERT(*pos);
349 if (*pos == child)
350 {
351 parent->childcount--;
352 parent->contextP1-=occurs;
353 *pos=child->next;
354 }
355 }
356 return occurs;
357 }
358
359
IncMPC(SuffixTreeModel::INode * parent,SuffixTreeModel::LNode * child)360 U32 SuffixTreeModel::IncMPC
361 (
362 SuffixTreeModel::INode *parent,
363 SuffixTreeModel::LNode *child
364 )
365 {
366 child->count++;
367 parent->contextP1++;
368 if ((parent->firstchild!=child)&&(child->count>parent->firstchild->count))
369 {
370 //new MPS
371 LNode *n;
372 while((n=parent->firstchild)!=child)
373 {
374 parent->firstchild=n->next;
375 n->next=child->next;
376 child->next=n;
377 }
378 }
379 return(child->count);
380 }
381
382 /* Macro for Update subroutine:
383
384 Send credits up the tree, updating pos values, until a nonzero credit
385 is found. Sign bit of suf links is used as credit bit.*/
UpdateCredits(SuffixTreeModel::INode * v,int i)386 void SuffixTreeModel::UpdateCredits(SuffixTreeModel::INode *v,int i)
387 {
388 INode *u;
389 int d;
390 int j, ii=M0(i-tail), jj;
391 while (v!=m_rootnode)
392 {
393 u=v->parent;
394 d=u->depth;
395 j=M0(v->pos-d);
396 jj=M0(j-tail);
397 if (ii>jj)
398 {
399 v->pos=MM(i+d);
400 }
401 else
402 {
403 i=j; ii=jj;
404 }
405 if (!v->cred)
406 {
407 v->cred=1;
408 break;
409 }
410 v->cred=0;
411 v=u;
412 }
413 }
414
415
Encode(BYTE symbol,ArithmeticEncoder * dest)416 void SuffixTreeModel::Encode(BYTE symbol,ArithmeticEncoder *dest)
417 {
418 Walk(symbol,dest,0);
419 Update(symbol);
420 }
421
Decode(ArithmeticDecoder * src)422 BYTE SuffixTreeModel::Decode(ArithmeticDecoder *src)
423 {
424 BYTE ret=Walk(0,0,src);
425 Update(ret);
426 return ret;
427 }
428
iroot(U32 n)429 static U32 iroot(U32 n)
430 {
431 //the easy-but-not-quite-optimal way
432 U32 ret=0,bit;
433
434 for (bit=1;bit&&(bit*bit<=n);bit<<=1)
435 {ret=bit;}
436 for (bit=ret>>1;bit;bit>>=1)
437 {
438 ret+=bit;
439 if (ret*ret>n)
440 ret-=bit;
441 }
442 return ret;
443 }
444
445
put32(U32 i,FILE * f)446 inline void put32(U32 i,FILE *f)
447 {
448 putc((int)(i&255),f);
449 putc((int)((i>>8)&255),f);
450 putc((int)((i>>16)&255),f);
451 putc((int)((i>>24)&255),f);
452 }
453
454
Walk(BYTE c,ArithmeticEncoder * dest,ArithmeticDecoder * src)455 BYTE SuffixTreeModel::Walk(BYTE c,ArithmeticEncoder *dest,ArithmeticDecoder *src)
456 {
457 INode *insnode;
458 LNode *edge;
459 BitPred *epred;
460 U32 cesc,chit,p1,clow,nl;
461 bool escaped=false;
462 Point context;
463 double bestmpsp;
464 INode *cnode[MAXNONDET];
465 unsigned i,numcontexts,cnum;
466 unsigned exclsleft;
467 unsigned numnew;
468 LNode *newedges[MAXEXCLS];
469 BYTE prediction;
470
471 m_excl.Clear();
472 Canonize(&ap);
473 context=ap;
474
475 if (m_statfile)
476 sv.Clear();
477
478 if (context.InEdge()) //deterministic context
479 {
480 int j;
481 U32 clen;
482 _ASSERT(!!context.proj);
483
484 insnode=context.ins;
485 if (context.r->isleaf)
486 {
487 j=MM((context.r-m_leafbuf)+insnode->depth);
488 }
489 else
490 {
491 j=((INode *)context.r)->pos;
492 }
493 prediction=m_leafbuf[MM(j+context.proj)].bufsym;
494 clen=insnode->depth+context.proj;
495 if (clen<1)
496 clen=1;
497
498 if (!insnode->firstchild)
499 goto DETMISS;
500
501 nl=insnode->firstchild->count;
502
503 if (m_statfile)
504 {
505 sv.clen=clen;
506 sv.mcpp=nl;
507 }
508
509 cesc=clen;
510 if (cesc)
511 --cesc;
512 if (cesc>=8)
513 {
514 cesc=((cesc-8)>>1)+8;
515 chit=12;
516 while(cesc>=chit)
517 {
518 cesc=((cesc-chit)>>1)+chit;
519 chit+=2;
520 }
521 }
522 if (cesc>=20)
523 cesc=19;
524
525 if (insnode==m_extracountnode)
526 --nl;
527
528 if (nl)
529 --nl;
530 if (nl>=2)
531 {
532 for(chit=(nl-2),nl=2;chit;chit>>=2,++nl);
533 if (nl>=DETLEAVES)
534 nl=DETLEAVES-1;
535 }
536 epred=&(detpred[cesc][nl]);
537 cesc=epred->misscount;
538 chit=epred->hitcount;
539
540 p1=cesc+chit;
541 if (src)
542 {
543 escaped=src->GetP(p1)>=chit;
544 if (escaped)
545 src->Narrow(p1,chit,p1);
546 else
547 {
548 src->Narrow(p1,0,chit);
549 c=prediction;
550 }
551 }
552 else
553 escaped=(c!=prediction);
554
555 epred->Adjust(!escaped,ESCADJUST);
556
557 if (dest)
558 {
559 if (escaped)
560 dest->Encode(p1,chit,p1);
561 else
562 dest->Encode(p1,0,chit);
563 }
564
565 if (m_statfile)
566 sv.Write(m_statfile,!escaped);
567
568 if (!escaped)
569 return c;
570
571 m_excl.Exclude(prediction);
572
573 DETMISS:
574 //shorten context until non-deterministic
575 for (;;)
576 {
577 if (insnode->depth<1)
578 goto ORDER0;
579 context.ins=insnode->suffix;
580 context.r=0;
581 Canonize(&context);
582 if (!context.InEdge())
583 break;
584 insnode=context.ins;
585 }
586 }
587
588 //we are not in a deterministic context
589 //make the list of contexts to escape through
590
591 numcontexts=0;
592 double mpsp;
593 for (i=0;i<MAXWALK;++i)
594 {
595 if ((!context.ins)||(context.ins==m_rootnode))
596 break;
597 insnode=context.ins;
598 if (insnode->firstchild)
599 {
600 mpsp=(double)insnode->firstchild->count;
601 mpsp/=(double)insnode->contextP1;
602 if (!numcontexts)
603 bestmpsp=mpsp;
604 else if(mpsp>bestmpsp)
605 {
606 numcontexts=0;
607 bestmpsp=mpsp;
608 }
609 if (numcontexts>=MAXNONDET)
610 break;
611 cnode[numcontexts]=context.ins;
612 ++numcontexts;
613 }
614 context.ins=insnode->suffix;
615 Canonize(&context);
616 }
617
618 //walk the contexts to encode the symbol
619
620 exclsleft=MAXEXCLS-m_excl.NumExcls();
621
622 for(cnum=0;(cnum<numcontexts)&&exclsleft;++cnum)
623 {
624 escaped=true;
625 insnode=cnode[cnum];
626 cesc=((unsigned)insnode->childcount)+1;
627 chit=0;
628 numnew=0;
629 if (m_statfile)
630 {
631 sv.Clear();
632 sv.clen=insnode->depth;
633 sv.prevexcls=(BYTE)m_excl.NumExcls();
634 sv.children=insnode->childcount;
635 }
636 for (edge=insnode->firstchild;edge&&exclsleft;edge=edge->next)
637 {
638 prediction=edge->sym;
639 if (!m_excl.Exclude(prediction))
640 {
641 //was already excluded
642 continue;
643 }
644 --exclsleft;
645 newedges[numnew++]=edge;
646 chit+=edge->count;
647 if (prediction==c)
648 escaped=false;
649 }
650 if (!numnew) //no new symbols
651 continue;
652 if (edge)
653 {m_excl.Backup(numnew);break;}
654
655 p1=chit;
656
657 //get escape estimator
658 {
659 int orderindex=insnode->depth;
660 if (orderindex>=ESCORDERS)
661 orderindex=ESCORDERS-1;
662 else if(orderindex)
663 --orderindex;
664 cesc=(cesc<<6)/(cesc+chit);
665 if (cesc>0)
666 --cesc;
667 if (cesc>=63)
668 cesc=62;
669 if (!cnum)
670 epred=&(escapepredictors[cesc][orderindex]);
671 else
672 epred=&(esc2pred[cesc][orderindex]);
673 cesc=epred->misscount;
674 chit=epred->hitcount;
675 }
676
677 if (src) //decode escape
678 escaped=(src->GetP(cesc+chit)>=chit);
679
680 epred->Adjust(!escaped,ESCADJUST);
681
682 if (m_statfile)
683 {
684 sv.clen=insnode->depth;
685 sv.children=insnode->childcount;
686 _ASSERT(((unsigned)sv.children)+1>=m_excl.NumExcls());
687 sv.mcpmisses=(BYTE)((((unsigned)sv.children)+1)-m_excl.NumExcls());
688 for (i=0;i<numnew;i++)
689 sv.mcpp+=newedges[i]->count;
690 _ASSERT(insnode->contextP1>=sv.mcpp);
691 sv.mcpmissp=insnode->contextP1-sv.mcpp;
692 sv.Write(m_statfile,!escaped);
693 }
694
695 if (escaped)
696 {
697 if (src)
698 src->Narrow(cesc+chit,chit,cesc+chit);
699 if (dest)
700 dest->Encode(cesc+chit,chit,cesc+chit);
701 continue;
702 }
703 if (src)
704 src->Narrow(cesc+chit,0,chit);
705 if (dest)
706 dest->Encode(cesc+chit,0,chit);
707
708 //yay! no escape!
709 nl=newedges[0]->count>>2; //I have no idea why this is good!!!
710 newedges[0]->count+=nl;
711 p1+=nl;
712 if (src)
713 {
714 //decode char
715 chit=src->GetP(p1);
716 clow=0;
717 for (i=0;i<numnew;i++)
718 {
719 edge=newedges[i];
720 if (chit<edge->count)
721 {
722 c=edge->sym;
723 chit=edge->count;
724 break;
725 }
726 clow+=edge->count;
727 chit-=edge->count;
728 }
729 _ASSERT(i<numnew);
730 }
731 else
732 {
733 clow=0;
734 for (i=0;i<numnew;i++)
735 {
736 edge=newedges[i];
737 prediction=edge->sym;
738 if (c==prediction)
739 {
740 chit=edge->count;
741 break;
742 }
743 clow+=edge->count;
744 }
745 _ASSERT(i<numnew);
746 }
747
748
749 _ASSERT(p1<MAXP1);
750
751
752 if (src)
753 src->Narrow(p1,clow,clow+chit);
754 if (dest)
755 dest->Encode(p1,clow,clow+chit);
756 _ASSERT(clow+chit<=p1);
757
758 newedges[0]->count-=nl;
759 p1-=nl;
760 return(c);
761 }
762
763 //sigh -- didn't find a match anywhere
764 ORDER0:
765 if (src)
766 c=order0.Decode(src,m_excl);
767 else if (dest)
768 order0.Encode(c,dest,m_excl);
769 order0.Update(c);
770 return(c);
771 }
772
773
774
775 /* Function advancefront:
776
777 Moves front, the right endpoint of the window, increasing its size.
778 */
779
AdvanceFront()780 void SuffixTreeModel::AdvanceFront()
781 {
782 INode *u, *v;
783 LNode *s;
784 int j;
785 BYTE b, c;
786
787 v=0;
788 c=m_leafbuf[front].bufsym;
789 while (1)
790 {
791 Canonize(&ap);
792 if (!ap.r)
793 {
794 //active point at node.
795 m_extracountnode=0;
796 if (!ap.ins)
797 {
798 ap.r=m_rootnode; //root is child of ins for any c
799 break; //endpoint found.
800 }
801 ap.r=GetChild(ap.ins, c);
802 if (ap.r)
803 {
804 //ins has a child for c.
805 ap.sym=c; //a is first symbol in (ins, r) label
806 IncMPC(ap.ins,ap.r);
807 //if we split this node, we want to remove the count we just added
808 //from the new child
809 m_extracountnode=ap.r;
810 break; //endpoint found.
811 }
812 else
813 {
814 u=ap.ins; //will add child below u.
815 }
816 }
817 else
818 {
819 // active point on edge.
820 j=(ap.r->isleaf ? MM((ap.r-m_leafbuf)+ap.ins->depth) : ((INode *)ap.r)->pos);
821 b=m_leafbuf[MM(j+ap.proj)].bufsym; // next symbol in (ins, r) label.
822 if (c==b) /* if same as front symbol.*/
823 break; /* endpoint found.*/
824 else
825 { /* edge must be split.*/
826 u=NewI(ap.ins->depth+ap.proj,M0(front-ap.proj));
827 U32 rcounts=DeleteEdge(ap.ins, ap.r);
828 CreateEdge(ap.ins, u, ap.sym,rcounts);
829 //we added a count when we came into this edge. That coun't
830 //isn't valid for the part that's getting split off, so
831 //remove it
832 if (m_extracountnode==ap.r)
833 --rcounts;
834 _ASSERT(rcounts>0);
835 //m_extracountnode will get cleared below
836 CreateEdge(u, ap.r, b,rcounts);
837 if (!ap.r->isleaf)
838 ((INode *)ap.r)->pos=MM(j+ap.proj);
839 }
840 }
841 m_extracountnode=0;
842 s=m_leafbuf+M0(front-u->depth);
843 s->childcount=0;
844 s->count=0;
845 s->isleaf=1;
846 s->next=0;
847 s->parent=0;
848 s->sym=0;
849 CreateEdge(u, s, c,1); /* add new leaf s.*/
850 m_rootnode->childcount=1; // don't count children of root
851 if (u==ap.ins) /* skip update if new node.*/
852 UpdateCredits(u, M0(front-u->depth));
853 if (v)
854 v->suffix=u;
855 v=u;
856 ap.ins=ap.ins->suffix;
857 ap.r=0; /* force getting new r in canonize.*/
858 }
859 if (v)
860 v->suffix=ap.ins;
861 ++ap.proj; /* move active point down.*/
862 front=MM(front+1);
863 }
864
865
866 /* Function advancetail:
867
868 Moves tail, the left endpoint of the window, forward by positions
869 positions, decreasing its size.*/
AdvanceTail()870 void SuffixTreeModel::AdvanceTail()
871 {
872 INode *u, *w; /* nodes.*/
873 LNode *v,*s;
874 BYTE c;
875 int i, d;
876 U32 oldcounts;
877
878 Canonize(&ap);
879 v=m_leafbuf+tail; // the leaf to delete.
880 u=v->parent;
881 oldcounts=DeleteEdge(u, v);
882 if (v==ap.r)
883 {
884 //replace instead of delete
885
886 i=M0(front-(ap.ins->depth+ap.proj));
887 if (m_extracountnode==v)
888 {
889 m_extracountnode=m_leafbuf+i;
890 ++oldcounts;
891 }
892 CreateEdge(ap.ins, m_leafbuf+i, v->sym,oldcounts);
893 UpdateCredits(ap.ins, i);
894 ap.ins=ap.ins->suffix;
895 ap.r=0; /* force getting new r in canonize.*/
896 }
897 else if (u!=m_rootnode && !u->childcount)
898 {
899 // u has only one child left, delete it.
900 _ASSERT(u->firstchild&&!u->firstchild->next);
901 c=u->sym;
902 _ASSERT(c==m_leafbuf[u->pos].bufsym);
903 w=u->parent;
904 d=u->depth-w->depth;
905 s=u->firstchild; // the remaining child of u.
906 _ASSERT(s->sym==m_leafbuf[MM(u->pos+d)].bufsym);
907 if (u==ap.ins)
908 {
909 ap.ins=w;
910 ap.proj+=d;
911 ap.sym=c; // new first symbol of (ins, r) label
912 m_extracountnode=0;
913 }
914 else if (u==ap.r)
915 {
916 if (m_extracountnode==u)
917 m_extracountnode=s;
918 ap.r=s; // new child(ins, a).
919 }
920 if (u->cred)
921 UpdateCredits(w, M0(u->pos-w->depth));
922 DeleteEdge(u, s);
923 oldcounts=DeleteEdge(w, u);
924 CreateEdge(w, s, c,oldcounts);
925 if (!s->isleaf)
926 ((INode *)s)->pos=M0(((INode *)s)->pos-d);
927 DelI(u);
928 }
929 tail=MM(tail+1);
930 }
931
932
933
934
Write(FILE * f,bool hit)935 void SuffixTreeModel::StatVec::Write(FILE *f,bool hit)
936 {
937 unsigned n=0,i;
938 U32 vec[8];
939 BYTE code=hit?(BYTE)128:0;
940 BYTE bit;
941
942 vec[n++]=children;
943 vec[n++]=prevexcls;
944 vec[n++]=mcpmisses;
945 vec[n++]=clen;
946 vec[n++]=mcpp;
947 vec[n++]=mcpmissp;
948
949 for (bit=1,i=0;i<n;i++,bit<<=1)
950 {
951 if (vec[i])
952 code|=bit;
953 }
954 putc(code,f);
955 for (bit=1,i=0;i<n;i++,bit<<=1)
956 {
957 if (vec[i])
958 {
959 if (i<3)
960 putc((BYTE)vec[i],f);
961 else
962 put32(vec[i],f);
963 }
964 }
965
966 Clear();
967 }
968
969
970