1 //=========================================================================== 2 // Copyright (C) 2000 Matt Timmermans 3 // Free for non-commercial purposes as long as this notice remains intact. 4 // To discuss commercial use, mail me at matt@timmermans.org 5 // 6 // sufftree.h 7 // 8 // This file started life as Jesper Larsson's excellent sliding window 9 // suffix tree. Now it implements a modified PPM* model for the bicom 10 // bijective compressor. Much of the original elegance of Jesper's code 11 // has been destroyed with the addition of new functionality. In particular, 12 // the clever hashing scheme for children was ditched, because I had to keep 13 // linked lists anyway. 14 // 15 // This model does a few things differently: 16 // 17 // Contexts are unbounded. We start at the longest context and escape 18 // through shorter ones. The first non-deterministic context we look at is 19 // the one with the highest P(MPS), i.e., we use Charle's blooks Local Order 20 // Estimation. 21 // To avoid being too slow, we look at onlythe longest determinstic context, 22 // and at most SuffixTreeModel::SuffixTreeModel::MAXWALK non-deterministic 23 // contexts. We will only escape through SuffixTreeModel::MAXNONDET 24 // non-deterministic contexts. 25 // We also stop escaping when we hit a context that could result in more than 26 // MAXEXCLS exclusions. These don't tend to be too useful anyway. 27 // 28 // In non-deterministic contexts, escape probabilities are estimated 29 // originally using PPM method C. In deterministic contexts, the initial 30 // estimate is based on the context length. 31 // 32 // In both cases, this estimate is used only as an index into a table of 33 // adaptive binary contexts that produce the probability we actually use. 34 // 35 // The result is like Charles Bloom's SEE, but not as good -- might change 36 // that later. 37 // 38 // This is Jesper's original copyright notice: 39 // 40 //============================================================================ 41 // Copyright 1999, N. Jesper Larsson, all rights reserved. 42 // 43 // This file contains an implementation of a sliding window index using a 44 // suffix tree. It corresponds to the algorithms and representation presented 45 // in the Ph.D. thesis "Structures of String Matching and Data Compression", 46 // Lund University, 1999. 47 // 48 // This software may be used freely for any purpose. However, when distributed, 49 // the original source must be clearly stated, and, when the source code is 50 // distributed, the copyright notice must be retained and any alterations in 51 // the code must be clearly marked. No warranty is given regarding the quality 52 // of this software. 53 //=========================================================================== 54 // (note: ALL of Jesper's code has been altered, so consider this a clear 55 // marking of that fact ;-) 56 57 #ifndef n43f4esskjqa2rfs4ejr1oxj4w5slu4h 58 #define n43f4esskjqa2rfs4ejr1oxj4w5slu4h 59 60 #include "arithmetic.h" 61 #include "simplemodel.h" 62 #include "exclude.h" 63 #include "stdio.h" 64 65 static const int SIGNBIT=(int)(((int)-1)^(((unsigned)(int)(-1))>>1)); 66 typedef unsigned long U32; 67 68 class SuffixTreeModel : public ArithmeticModel 69 { 70 public: 71 SuffixTreeModel(int maxsize,FILE *statfileout=0); 72 ~SuffixTreeModel(); 73 74 void Reset(); 75 76 void Encode(BYTE symbol,ArithmeticEncoder *dest); 77 BYTE Decode(ArithmeticDecoder *src); 78 Update(BYTE c)79 void Update(BYTE c) 80 { 81 if (m_currentSize==m_maxSize) 82 AdvanceTail(); 83 else 84 ++m_currentSize; 85 m_leafbuf[front].bufsym=c; 86 AdvanceFront(); 87 } 88 enum {MAXEXCLS=192,MAXWALK=70,MAXNONDET=40,ESCADJUST=128,ESCORDERS=7,DETLEAVES=6}; 89 private: 90 91 struct INode; 92 struct LNode; 93 94 class Point 95 { 96 public: InEdge()97 bool InEdge() 98 {return(r!=NULL);} 99 //After Canonize()ing a point, r >0 <=> proj!=0, i.e., point 100 //is inside an edge 101 INode *ins; //parent of point 102 LNode *r; //if !=0, then child(ins,a) 103 int proj; //length of point - string(ins) 104 BYTE sym; 105 }; 106 107 struct LNode 108 { 109 U32 count; //PPM count in context of parent 110 LNode *next; //next child of the same parent 111 INode *parent; 112 BYTE childcount; //number of children minus one, except 113 //for the root which always has 114 //childcount==1. 115 //leaves always have childcount==0 116 BYTE sym; //first symbol on the edge to this node 117 BYTE isleaf; //this node is a leaf?1:0 118 union 119 { 120 BYTE cred; //for INodes, Fiala's credits 121 BYTE bufsym; //for LNodes, buffer symbol. We only put this 122 //in the LNode structure because we have a byte 123 //to spare here. 124 }; 125 }; 126 127 struct INode : public LNode 128 { 129 int pos; // start of edge label in m_leafbuf[..].bufsym 130 int depth; // string depth at this node 131 INode *suffix; // suffix link 132 U32 contextP1; // sum of ->count on all children 133 LNode *firstchild;// linked list of children, MPS first 134 }; 135 136 //we use these to manage allocation of internal nodes 137 struct INodeBlock 138 { 139 enum{SIZE=4000}; 140 INodeBlock *next; 141 INode data[4000]; 142 }; 143 144 145 class BitPred 146 { 147 public: Adjust(bool hit,U32 div)148 void Adjust(bool hit,U32 div) 149 { 150 div=((hitcount+misscount)/div)+1; 151 if (hit) 152 hitcount+=div; 153 else 154 misscount+=div; 155 if (hitcount+misscount>=4096) 156 { 157 hitcount=(hitcount+1)>>1; 158 misscount=(misscount+1)>>1; 159 } 160 } 161 U32 hitcount,misscount; 162 }; 163 164 165 enum //statfile vars 166 { 167 SV_CHILDREN =0x01, 168 SV_PREVEXCLS=0x02, 169 SV_MCPMISSES=0x04, 170 SV_CLEN =0x08, 171 SV_MCPP =0x10, 172 SV_MCPMISSP =0x20, 173 SV_HIT =0x80 174 }; 175 class StatVec 176 { 177 public: 178 BYTE children; //children-1 179 BYTE prevexcls; 180 BYTE mcpmisses; 181 U32 clen; 182 U32 mcpp; 183 U32 mcpmissp; 184 Clear()185 void Clear() 186 {children=prevexcls=mcpmisses=0;clen=mcpp=mcpmissp=0;} 187 void Write(FILE *f,bool hit); 188 }; 189 190 INode *NewIBlock(); 191 NewI(int depth,int pos)192 INode *NewI(int depth,int pos) 193 { 194 INode *ret; 195 if (ret=m_ifreelist) 196 m_ifreelist=(INode *)ret->next; 197 else 198 ret=NewIBlock(); 199 ret->count=0; 200 ret->next=0; 201 ret->parent=0; 202 ret->pos=pos; 203 ret->depth=depth; 204 ret->suffix=0; 205 ret->childcount=(BYTE)-1; 206 ret->contextP1=0; 207 ret->firstchild=0; 208 ret->isleaf=0; 209 ret->cred=1; 210 return(ret); 211 } 212 DelI(LNode * n)213 void DelI(LNode *n) 214 { 215 n->next=m_ifreelist; 216 m_ifreelist=(INode *)n; 217 } 218 219 LNode *GetChild(INode *parent,BYTE c); 220 221 void CreateEdge(INode *parent,LNode *child,BYTE c,U32 inicounts); 222 U32 DeleteEdge(INode *parent,LNode *child); 223 U32 IncMPC(INode *parent,LNode *child); 224 225 226 void Canonize(SuffixTreeModel::Point *p); 227 void UpdateCredits(INode *v,int i); 228 void AdvanceFront(); 229 BYTE Walk(BYTE c,ArithmeticEncoder *dest,ArithmeticDecoder *src); 230 void AdvanceTail(); 231 232 int m_maxSize; 233 int m_currentSize; 234 235 Point ap; //active point 236 int front, tail; /* limits of window.*/ 237 ExclusionSet m_excl; 238 SimpleAdaptiveModel order0; 239 FILE *m_statfile; 240 StatVec sv; 241 BitPred escapepredictors[63][ESCORDERS],esc2pred[63][ESCORDERS],detpred[20][DETLEAVES]; 242 INodeBlock *m_iblocks; 243 INode *m_ifreelist; 244 INode *m_rootnode; 245 LNode *m_leafbuf; //leaves and buffer 246 LNode *m_extracountnode; 247 }; 248 249 250 #endif 251 252