1 //===========================================================================
2 // Copyright (C) 2000 Matt Timmermans
3 // Free for non-commercial purposes as long as this notice remains intact.
4 // To discuss commercial use, mail me at matt@timmermans.org
5 //
6 // sufftree.h
7 //
8 // This file started life as Jesper Larsson's excellent sliding window
9 // suffix tree.  Now it implements a modified PPM* model for the bicom
10 // bijective compressor.  Much of the original elegance of Jesper's code
11 // has been destroyed with the addition of new functionality.  In particular,
12 // the clever hashing scheme for children was ditched, because I had to keep
13 // linked lists anyway.
14 //
15 // This model does a few things differently:
16 //
17 // Contexts are unbounded.  We start at the longest context and escape
18 // through shorter ones.  The first non-deterministic context we look at is
19 // the one with the highest P(MPS), i.e., we use Charle's blooks Local Order
20 // Estimation.
21 // To avoid being too slow, we look at onlythe longest determinstic context,
22 // and at most SuffixTreeModel::SuffixTreeModel::MAXWALK non-deterministic
23 // contexts.  We will only escape through SuffixTreeModel::MAXNONDET
24 // non-deterministic contexts.
25 // We also stop escaping when we hit a context that could result in more than
26 // MAXEXCLS exclusions.  These don't tend to be too useful anyway.
27 //
28 // In non-deterministic contexts, escape probabilities are estimated
29 // originally using PPM method C.  In deterministic contexts, the initial
30 // estimate is based on the context length.
31 //
32 // In both cases, this estimate is used only as an index into a table of
33 // adaptive binary contexts that produce the probability we actually use.
34 //
35 // The result is like Charles Bloom's SEE, but not as good -- might change
36 // that later.
37 //
38 // This is Jesper's original copyright notice:
39 //
40 //============================================================================
41 // Copyright 1999, N. Jesper Larsson, all rights reserved.
42 //
43 // This file contains an implementation of a sliding window index using a
44 // suffix tree. It corresponds to the algorithms and representation presented
45 // in the Ph.D. thesis "Structures of String Matching and Data Compression",
46 // Lund University, 1999.
47 //
48 // This software may be used freely for any purpose. However, when distributed,
49 // the original source must be clearly stated, and, when the source code is
50 // distributed, the copyright notice must be retained and any alterations in
51 // the code must be clearly marked. No warranty is given regarding the quality
52 // of this software.
53 //===========================================================================
54 // (note: ALL of Jesper's code has been altered, so consider this a clear
55 // marking of that fact ;-)
56 
57 #ifndef n43f4esskjqa2rfs4ejr1oxj4w5slu4h
58 #define n43f4esskjqa2rfs4ejr1oxj4w5slu4h
59 
60 #include "arithmetic.h"
61 #include "simplemodel.h"
62 #include "exclude.h"
63 #include "stdio.h"
64 
65 static const int SIGNBIT=(int)(((int)-1)^(((unsigned)(int)(-1))>>1));
66 typedef unsigned long U32;
67 
68 class SuffixTreeModel : public ArithmeticModel
69   {
70   public:
71   SuffixTreeModel(int maxsize,FILE *statfileout=0);
72   ~SuffixTreeModel();
73 
74   void Reset();
75 
76   void Encode(BYTE symbol,ArithmeticEncoder *dest);
77   BYTE Decode(ArithmeticDecoder *src);
78 
Update(BYTE c)79   void Update(BYTE c)
80     {
81     if (m_currentSize==m_maxSize)
82       AdvanceTail();
83     else
84       ++m_currentSize;
85     m_leafbuf[front].bufsym=c;
86     AdvanceFront();
87     }
88   enum {MAXEXCLS=192,MAXWALK=70,MAXNONDET=40,ESCADJUST=128,ESCORDERS=7,DETLEAVES=6};
89   private:
90 
91   struct INode;
92   struct LNode;
93 
94   class Point
95     {
96     public:
InEdge()97     bool InEdge()
98       {return(r!=NULL);}
99     //After Canonize()ing a point, r >0 <=> proj!=0, i.e., point
100     //is inside an edge
101     INode *ins;  //parent of point
102     LNode *r;    //if !=0, then child(ins,a)
103     int proj; //length of point - string(ins)
104     BYTE sym;
105     };
106 
107   struct LNode
108     {
109     U32 count;       //PPM count in context of parent
110     LNode *next;     //next child of the same parent
111     INode *parent;
112     BYTE childcount; //number of children minus one, except
113                      //for the root which always has
114                      //childcount==1.
115                      //leaves always have childcount==0
116     BYTE sym;        //first symbol on the edge to this node
117     BYTE isleaf;     //this node is a leaf?1:0
118     union
119       {
120       BYTE cred;     //for INodes, Fiala's credits
121       BYTE bufsym;   //for LNodes, buffer symbol.  We only put this
122                      //in the LNode structure because we have a byte
123                      //to spare here.
124       };
125     };
126 
127   struct INode : public LNode
128     {
129     int pos;          // start of edge label in m_leafbuf[..].bufsym
130     int depth;        // string depth at this node
131     INode *suffix;    // suffix link
132     U32 contextP1;    // sum of ->count on all children
133     LNode *firstchild;// linked list of children, MPS first
134     };
135 
136   //we use these to manage allocation of internal nodes
137   struct INodeBlock
138     {
139     enum{SIZE=4000};
140     INodeBlock *next;
141     INode data[4000];
142     };
143 
144 
145   class BitPred
146     {
147     public:
Adjust(bool hit,U32 div)148     void Adjust(bool hit,U32 div)
149       {
150       div=((hitcount+misscount)/div)+1;
151       if (hit)
152         hitcount+=div;
153       else
154         misscount+=div;
155       if (hitcount+misscount>=4096)
156         {
157         hitcount=(hitcount+1)>>1;
158         misscount=(misscount+1)>>1;
159         }
160       }
161     U32 hitcount,misscount;
162     };
163 
164 
165   enum  //statfile vars
166     {
167     SV_CHILDREN =0x01,
168     SV_PREVEXCLS=0x02,
169     SV_MCPMISSES=0x04,
170     SV_CLEN     =0x08,
171     SV_MCPP     =0x10,
172     SV_MCPMISSP =0x20,
173     SV_HIT      =0x80
174     };
175   class StatVec
176     {
177     public:
178     BYTE children;  //children-1
179     BYTE prevexcls;
180     BYTE mcpmisses;
181     U32 clen;
182     U32 mcpp;
183     U32 mcpmissp;
184 
Clear()185     void Clear()
186       {children=prevexcls=mcpmisses=0;clen=mcpp=mcpmissp=0;}
187     void Write(FILE *f,bool hit);
188     };
189 
190   INode *NewIBlock();
191 
NewI(int depth,int pos)192   INode *NewI(int depth,int pos)
193     {
194     INode *ret;
195     if (ret=m_ifreelist)
196       m_ifreelist=(INode *)ret->next;
197     else
198       ret=NewIBlock();
199     ret->count=0;
200     ret->next=0;
201     ret->parent=0;
202     ret->pos=pos;
203     ret->depth=depth;
204     ret->suffix=0;
205     ret->childcount=(BYTE)-1;
206     ret->contextP1=0;
207     ret->firstchild=0;
208     ret->isleaf=0;
209     ret->cred=1;
210     return(ret);
211     }
212 
DelI(LNode * n)213   void DelI(LNode *n)
214     {
215     n->next=m_ifreelist;
216     m_ifreelist=(INode *)n;
217     }
218 
219   LNode *GetChild(INode *parent,BYTE c);
220 
221   void CreateEdge(INode *parent,LNode *child,BYTE c,U32 inicounts);
222   U32 DeleteEdge(INode *parent,LNode *child);
223   U32 IncMPC(INode *parent,LNode *child);
224 
225 
226   void Canonize(SuffixTreeModel::Point *p);
227   void UpdateCredits(INode *v,int i);
228   void AdvanceFront();
229   BYTE Walk(BYTE c,ArithmeticEncoder *dest,ArithmeticDecoder *src);
230   void AdvanceTail();
231 
232   int m_maxSize;
233   int m_currentSize;
234 
235   Point ap; //active point
236   int front, tail;         /* limits of window.*/
237   ExclusionSet m_excl;
238   SimpleAdaptiveModel order0;
239   FILE *m_statfile;
240   StatVec sv;
241   BitPred escapepredictors[63][ESCORDERS],esc2pred[63][ESCORDERS],detpred[20][DETLEAVES];
242   INodeBlock *m_iblocks;
243   INode *m_ifreelist;
244   INode *m_rootnode;
245   LNode *m_leafbuf;  //leaves and buffer
246   LNode *m_extracountnode;
247   };
248 
249 
250 #endif
251 
252