1 // 2 // WordBitCompress.h 3 // 4 // NAME 5 // 6 // Read and write objects in a bit stream, possibly compressing them. 7 // 8 // SYNOPSIS 9 // 10 // #include <WordBitStream.h> 11 // 12 // unsigned int value = 200; 13 // WordBitCompress stream; 14 // stream.PutUint(value, 8); 15 // stream.Rewind(); 16 // stream.GetUint(value); 17 // 18 // DESCRIPTION 19 // 20 // A <b>WordBitCompress</b> object is a variable size string with methods 21 // to write and read numerical objects using a controlled amount of bits. 22 // Some methods implement compression such as custom Shannon-Fano or 23 // static compression similar to Golomb or Gamma. 24 // 25 // It is not possible to read a stream while writing into it and vice 26 // versa. Another limitation is that the largest number is of type unsigned 27 // int. 28 // 29 // For examples of use, check the <b>WordDBCompress</b> implementation. 30 // 31 // 32 // END 33 // 34 // Part of the ht://Dig package <http://www.htdig.org/> 35 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group 36 // For copyright details, see the file COPYING in your distribution 37 // or the GNU General Public License version 2 or later 38 // <http://www.gnu.org/copyleft/gpl.html> 39 // 40 // $Id: WordBitCompress.h,v 1.9 2001/06/29 14:14:08 loic Exp $ 41 // 42 43 #ifndef _WordBitCompress_h 44 #define _WordBitCompress_h 45 46 #include <stdio.h> 47 #include <stdlib.h> 48 #include <string.h> 49 50 class WordBitStream { 51 public: WordBitStream(unsigned int nbuff_size)52 inline WordBitStream(unsigned int nbuff_size) { 53 Init(); 54 Allocate(nbuff_size); 55 } WordBitStream()56 inline WordBitStream() { 57 Init(); 58 } ~WordBitStream()59 inline ~WordBitStream() { 60 free(buff); 61 } 62 63 // 64 // Build and initialize stream 65 // Init()66 inline void Init() { 67 buff_size = 1024; 68 buff = (unsigned char*)malloc(buff_size); 69 Clear(); 70 } 71 72 // 73 // Reset stream to empty state 74 // Clear()75 inline void Clear() { 76 bitpos = 0; 77 buff_length = 1; 78 buff_idx = 0; 79 buff[buff_idx] = '\0'; 80 freezeon = 0; 81 freeze_bitcount = 0; 82 } 83 Allocate(unsigned int nbuff_size)84 inline void Allocate(unsigned int nbuff_size) { 85 buff_size = nbuff_size; 86 buff = (unsigned char*)realloc(buff, buff_size); 87 } 88 89 // 90 // Make sure the buffer is large enough to safely access the 91 // byte located at buff[index] 92 // Check(int index)93 inline void Check(int index) { 94 while(index >= buff_size) 95 Allocate(buff_size * 2); 96 } 97 98 // 99 // Advance bitpos by adding incr to it, take care of space allocation. 100 // BitposIncr(int incr)101 inline void BitposIncr(int incr) { 102 bitpos += incr; 103 if(!(bitpos & 0x07)) { 104 Check(++buff_idx); 105 buff[buff_idx] = 0; 106 buff_length++; 107 } 108 } 109 110 // 111 // Append a bit : 0 if v null, 1 otherwise 112 // Put(unsigned int v)113 inline void Put(unsigned int v) { 114 if(freezeon) { 115 freeze_bitcount++; 116 return; 117 } 118 119 if(v) buff[buff_idx] |= 1 << (bitpos & 0x07); 120 BitposIncr(1); 121 } 122 // 123 // Return 0 if current bit in stream is not set, and not 0 if it is set. The 124 // bit position is incremented. 125 // Get()126 inline unsigned char Get() { 127 if(bitpos >= (buff_length << 3)) { 128 fprintf(stderr, "BitStream::get reading past end of BitStream.\n"); 129 } 130 131 unsigned char res = buff[bitpos >> 3] & (1 << (bitpos & 0x07)); 132 bitpos++; 133 return res; 134 } 135 136 // 137 // Put in stream the lowest <b>n</b> bits from the value <b>v</b> 138 // 139 void PutUint(unsigned int v, int n); 140 // 141 // Return <b>n</b> bits from the stream. 142 // 143 unsigned int GetUint(int n); 144 145 // 146 // Put in stream <b>n</b> bits from the array <b>vals</b>, each char 147 // considered as a storage for 8 bits. 148 // 149 void PutZone(unsigned char *vals, int n); 150 // 151 // Get from stream <b>n</b> bits and move them to the array <b>vals</b>. 152 // The size of <b>vals</b> must be at least <b>n</b>/8 long. 153 // 154 void GetZone(unsigned char *vals, int n); 155 156 // 157 // Return the length of the stream, in bits. 158 // Length()159 inline int Length() { return freezeon ? freeze_bitcount : bitpos; } 160 // 161 // Return the length of the stream, in bytes. 162 // BuffLength()163 inline int BuffLength() { return buff_length; } 164 // 165 // Return a pointer to the beginning of the stream. 166 // Buff()167 inline unsigned char* Buff() { return buff; } 168 // 169 // Return a pointer to a copy of the stream allocated with malloc. 170 // BuffCopy()171 inline unsigned char* BuffCopy() { 172 unsigned char* copy = (unsigned char*)malloc(buff_length); 173 memcpy(copy, buff, buff_length); 174 return copy; 175 } 176 // 177 // Copy <b>nbuff_length</b> bytes from <b>nbuff</b> into the stream 178 // BuffSet(const unsigned char * nbuff,int nbuff_length)179 inline void BuffSet(const unsigned char* nbuff, int nbuff_length) { 180 bitpos = 0; 181 buff_length = nbuff_length; 182 buff_idx = buff_length - 1; 183 Check(buff_length); 184 memcpy(buff, nbuff, buff_length); 185 } 186 // 187 // Move the stream cursor to the beginning of the stream. 188 // Rewind()189 void Rewind() { bitpos = 0; } 190 191 // 192 // Stop insertion into the stream. Only the stream cursor is updated 193 // to accurately reflect the size of the stream. All the Get* and Put* 194 // methods behaviour is modified. 195 // Freeze()196 void Freeze() { 197 if(freezeon) { 198 fprintf(stderr, "WordBitCompress::Freeze: recursive call not permitted\n"); 199 } 200 freeze_bitcount = 0; 201 freezeon = 1; 202 } 203 // 204 // Enable insertion into the stream, this is the normal behaviour. See 205 // the <b>Freeze</b> method. 206 // UnFreeze()207 void UnFreeze() { 208 freezeon = 0; 209 } 210 211 private: 212 213 // the buffer were the bitstream is stored 214 unsigned char* buff; 215 int buff_length; 216 int buff_size; 217 int buff_idx; 218 219 // current bit position within the buffer 220 int bitpos; 221 222 // freezing the bitstream 223 int freeze_bitcount; 224 int freezeon; 225 }; 226 227 // Constants used by WordBitCompress 228 // number of bits to code the number of values in an array 229 #define WORD_CMPR_NBITS_NVALS 16 230 231 // 232 // Number of bits encoding the log2 of 32 bits value 233 // 5 == log2(32) == log2(log2(0xffffffff)) 234 // 235 #define WORD_CMPR_LOG32_BITS 5 236 237 // 238 // Number of bits encoding the log2 of 8 bits value 239 // 3 == log2(8) == log2(log2(0xff)) 240 // (currently set to 4 for unknown reasons, never tried with 241 // 3. The fact that the above LOG32_BITS = 5 apparenlty works 242 // is not a good hint to guess that LOG8_BITS = 3 would work 243 // since the coding functions refuse values with bit 32 set 244 // so the number of bits really encoded is really 31. To 245 // make sure this works properly unary tests should be 246 // written). 247 // 248 #define WORD_CMPR_LOG8_BITS 4 249 250 // 251 // Number of bits encoding the compression model 252 // 253 #define WORD_CMPR_MODEL_BITS 2 254 // 255 // Compression model value for Decr 256 // 257 #define WORD_CMPR_MODEL_DECR 0 258 // 259 // Compression model value for Fixed 260 // 261 #define WORD_CMPR_MODEL_FIXED 1 262 263 class WordBitCompress : public WordBitStream { 264 public: 265 //- 266 // Constructor. Create a empty stream. 267 // WordBitCompress()268 WordBitCompress() { } 269 //- 270 // Constructor. Create a empty stream and pre-allocate <b>size</b> 271 // bytes. 272 // WordBitCompress(int size)273 WordBitCompress(int size) : WordBitStream(size) { } 274 275 //- 276 // Put in bitstream integer value <b>v</b> (coded on <b>n</b> bits). 277 // 278 // The encoding is : 279 // <ul> 280 // <li> WordBitStream::PutUint(log2(v), log2(n)) 281 // <li> WordBitStream::PutUint(v, log2(v)) 282 // </ul> 283 // 284 void PutUint(unsigned int v, int n); 285 //- 286 // Get integer value from bitstream and return it (coded on <b>n</b> bits). 287 // 288 // The decoding is : 289 // <ul> 290 // <li> nbits = WordBitStream::GetUint(log2(n)) 291 // <li> return WordBitStream::GetUint(nbits)) 292 // </ul> 293 // 294 unsigned int GetUint(int n); 295 296 //- 297 // Put in bitstream the array of <b>vals</b> integer values 298 // of size <b>n</b> and return the number of bits used in the bitstream. 299 // 300 // The encoding is : 301 // <ul> 302 // <li> PutUint(n, WORD_CMPR_NBITS_NVALS) 303 // <ul> 304 // If Decr preforms better than Fixed 305 // <ul> 306 // <li> PutUint(WORD_CMPR_MODEL_DECR, WORD_COMPRESS_MODEL_BITS) 307 // <li> PutUintsDecr(vals, n) 308 // </ul> 309 // Otherwise, if Fixed preforms better than Decr 310 // <ul> 311 // <li> PutUint(WORD_CMPR_MODEL_FIXED, WORD_COMPRESS_MODEL_BITS) 312 // <li> PutUintsDecr(vals, n) 313 // </ul> 314 // 315 int PutUints(unsigned int *vals, int n); 316 //- 317 // Get list of integer values from bitstream and return them in 318 // the <b>valsp</b> argument. The length of the <b>vals</b> array 319 // is returned. If 0 is returned, the <b>valsp</b> is set to null. 320 // The <b>valsp</b> argument is allocated with <i>new</i>, it is 321 // the responsibility of the caller to free it. 322 // 323 // The decoding is : 324 // <ul> 325 // <li> count = GetUint(WORD_CMPR_NBITS_NVALS) 326 // <li> model = GetUint(WORD_COMPRESS_MODEL_BITS) 327 // <li> GetUintsFixed(vals, count) (if model == WORD_COMPRESS_MODEL_FIXED) 328 // <li> GetUintsDecr(vals, count) (if model == WORD_COMPRESS_MODEL_DECR) 329 // <ul> 330 // 331 int GetUints(unsigned int **valsp); 332 //- 333 // Alias for GetUints(unsigned int **valsp). The <b>valsp</b> argument 334 // must point to an allocated array of <b>valsp_size</b> bytes. 335 // 336 int GetUints(unsigned int **valsp, int *valsp_size); 337 338 //- 339 // Put in bitstream the array of <b>vals</b> unsigned char values 340 // of size <b>n</b> and return the number of bits used in the bitstream. 341 // 342 // The encoding is : 343 // <ul> 344 // <li> PutUint(n, WORD_CMPR_NBITS_NVALS) 345 // <li> PutUint(log2(max of all vals), WORD_CMPR_LOG8_BITS) 346 // <li> foreach val in vals : PutUint(val, log2(max of all vals)) 347 // <ul> 348 // 349 int PutUchars(unsigned char *vals, int n); 350 //- 351 // Get list of unsigned char values from bitstream and return them in 352 // the <b>valsp</b> argument. The length of the <b>vals</b> array 353 // is returned. If 0 is returned, the <b>valsp</b> is set to null. 354 // The <b>valsp</b> argument is allocated with <i>new</i>, it is 355 // the responsibility of the caller to free it. 356 // 357 // The decoding is : 358 // <ul> 359 // <li> count = GetUint(WORD_CMPR_NBITS_NVALS) 360 // <li> log2(max of all vals) = GetUint(WORD_CMPR_LOG8_BITS) 361 // <li> count times : GetUint(log2(max of all vals)) 362 // </ul> 363 // 364 int GetUchars(unsigned char **valsp); 365 //- 366 // Alias for GetUchars(unsigned int **valsp). The <b>valsp</b> argument 367 // must point to an allocated array of <b>valsp_size</b> bytes. 368 // 369 int GetUchars(unsigned char **valsp, int *valsp_size); 370 371 //- 372 // Put in bitstream the array of <b>vals</b> integer values 373 // of size <b>n</b>. 374 // 375 // The encoding is : 376 // <ul> 377 // <li> PutUint(log2(max of all vals), WORD_CMPR_LOG32_BITS) 378 // <li> foreach val in vals : PutUint(val, log2(max of all vals)) 379 // </ul> 380 // 381 void PutUintsFixed(unsigned int *vals, int n); 382 //- 383 // Get list of integer values from bitstream and return them in 384 // the <b>vals</b> argument. The array pointed by the <b>vals</b> 385 // argument must be large enough to hold <b>n</b> elements. 386 // 387 // The decoding is : 388 // <ul> 389 // <li> bits = GetUint(WORD_CMPR_LOG32_BITS) 390 // <li> count times : GetUint(bits) 391 // </ul> 392 // 393 void GetUintsFixed(unsigned int *vals, int n); 394 395 //- 396 // Put in bitstream the array of <b>vals</b> integer values 397 // of size <b>n</b>. 398 // 399 // The encoding uses an algorithm inspired by Shannon-Fano. 400 // The <b>vals</b> array is divided in chunks of equal size 401 // and each number in a chunck is coded by the chunk number 402 // and its offset within the chunk. See VLengthCoder implementation 403 // in WordBitCompress.cc for more information. 404 // 405 void PutUintsDecr(unsigned int *vals, int n); 406 //- 407 // Get list of integer values from bitstream and return them in 408 // the <b>vals</b> argument. The array pointed by the <b>vals</b> 409 // argument must be large enough to hold <b>n</b> elements. 410 // 411 void GetUintsDecr(unsigned int *vals, int n); 412 }; 413 414 #endif /* _WordBitCompress_h */ 415 416