1 //
2 // WordBitCompress.h
3 //
4 // NAME
5 //
6 // Read and write objects in a bit stream, possibly compressing them.
7 //
8 // SYNOPSIS
9 //
10 // #include <WordBitStream.h>
11 //
12 // unsigned int value = 200;
13 // WordBitCompress stream;
14 // stream.PutUint(value, 8);
15 // stream.Rewind();
16 // stream.GetUint(value);
17 //
18 // DESCRIPTION
19 //
20 // A <b>WordBitCompress</b> object is a variable size string with methods
21 // to write and read numerical objects using a controlled amount of bits.
22 // Some methods implement compression such as custom Shannon-Fano or
23 // static compression similar to Golomb or Gamma.
24 //
25 // It is not possible to read a stream while writing into it and vice
26 // versa. Another limitation is that the largest number is of type unsigned
27 // int.
28 //
29 // For examples of use, check the <b>WordDBCompress</b> implementation.
30 //
31 //
32 // END
33 //
34 // Part of the ht://Dig package   <http://www.htdig.org/>
35 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
36 // For copyright details, see the file COPYING in your distribution
37 // or the GNU General Public License version 2 or later
38 // <http://www.gnu.org/copyleft/gpl.html>
39 //
40 // $Id: WordBitCompress.h,v 1.9 2001/06/29 14:14:08 loic Exp $
41 //
42 
43 #ifndef _WordBitCompress_h
44 #define _WordBitCompress_h
45 
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <string.h>
49 
50 class WordBitStream {
51 public:
WordBitStream(unsigned int nbuff_size)52   inline WordBitStream(unsigned int nbuff_size) {
53     Init();
54     Allocate(nbuff_size);
55   }
WordBitStream()56   inline WordBitStream() {
57     Init();
58   }
~WordBitStream()59   inline ~WordBitStream() {
60     free(buff);
61   }
62 
63   //
64   // Build and initialize stream
65   //
Init()66   inline void Init() {
67     buff_size = 1024;
68     buff = (unsigned char*)malloc(buff_size);
69     Clear();
70   }
71 
72   //
73   // Reset stream to empty state
74   //
Clear()75   inline void Clear() {
76     bitpos = 0;
77     buff_length = 1;
78     buff_idx = 0;
79     buff[buff_idx] = '\0';
80     freezeon = 0;
81     freeze_bitcount = 0;
82   }
83 
Allocate(unsigned int nbuff_size)84   inline void Allocate(unsigned int nbuff_size) {
85     buff_size = nbuff_size;
86     buff = (unsigned char*)realloc(buff, buff_size);
87   }
88 
89   //
90   // Make sure the buffer is large enough to safely access the
91   // byte located at buff[index]
92   //
Check(int index)93   inline void Check(int index) {
94     while(index >= buff_size)
95       Allocate(buff_size * 2);
96   }
97 
98   //
99   // Advance bitpos by adding incr to it, take care of space allocation.
100   //
BitposIncr(int incr)101   inline void BitposIncr(int incr) {
102     bitpos += incr;
103     if(!(bitpos & 0x07)) {
104       Check(++buff_idx);
105       buff[buff_idx] = 0;
106       buff_length++;
107     }
108   }
109 
110   //
111   // Append a bit : 0 if v null, 1 otherwise
112   //
Put(unsigned int v)113   inline void Put(unsigned int v) {
114     if(freezeon) {
115       freeze_bitcount++;
116       return;
117     }
118 
119     if(v) buff[buff_idx] |= 1 << (bitpos & 0x07);
120     BitposIncr(1);
121   }
122   //
123   // Return 0 if current bit in stream is not set, and not 0 if it is set. The
124   // bit position is incremented.
125   //
Get()126   inline unsigned char Get() {
127     if(bitpos >= (buff_length << 3)) {
128       fprintf(stderr, "BitStream::get reading past end of BitStream.\n");
129     }
130 
131     unsigned char res = buff[bitpos >> 3] & (1 << (bitpos & 0x07));
132     bitpos++;
133     return res;
134   }
135 
136   //
137   // Put in stream the lowest <b>n</b> bits from the value <b>v</b>
138   //
139   void PutUint(unsigned int v, int n);
140   //
141   // Return <b>n</b> bits from the stream.
142   //
143   unsigned int GetUint(int n);
144 
145   //
146   // Put in stream <b>n</b> bits from the array <b>vals</b>, each char
147   // considered as a storage for 8 bits.
148   //
149   void PutZone(unsigned char *vals, int n);
150   //
151   // Get from stream <b>n</b> bits and move them to the array <b>vals</b>.
152   // The size of <b>vals</b> must be at least <b>n</b>/8 long.
153   //
154   void GetZone(unsigned char *vals, int n);
155 
156   //
157   // Return the length of the stream, in bits.
158   //
Length()159   inline int Length() { return freezeon ? freeze_bitcount : bitpos; }
160   //
161   // Return the length of the stream, in bytes.
162   //
BuffLength()163   inline int BuffLength() { return buff_length; }
164   //
165   // Return a pointer to the beginning of the stream.
166   //
Buff()167   inline unsigned char* Buff() { return buff; }
168   //
169   // Return a pointer to a copy of the stream allocated with malloc.
170   //
BuffCopy()171   inline unsigned char* BuffCopy() {
172     unsigned char* copy = (unsigned char*)malloc(buff_length);
173     memcpy(copy, buff, buff_length);
174     return copy;
175   }
176   //
177   // Copy <b>nbuff_length</b> bytes from <b>nbuff</b> into the stream
178   //
BuffSet(const unsigned char * nbuff,int nbuff_length)179   inline void BuffSet(const unsigned char* nbuff, int nbuff_length) {
180     bitpos = 0;
181     buff_length = nbuff_length;
182     buff_idx = buff_length - 1;
183     Check(buff_length);
184     memcpy(buff, nbuff, buff_length);
185   }
186   //
187   // Move the stream cursor to the beginning of the stream.
188   //
Rewind()189   void Rewind() { bitpos = 0; }
190 
191   //
192   // Stop insertion into the stream. Only the stream cursor is updated
193   // to accurately reflect the size of the stream. All the Get* and Put*
194   // methods behaviour is modified.
195   //
Freeze()196   void Freeze() {
197     if(freezeon) {
198       fprintf(stderr, "WordBitCompress::Freeze: recursive call not permitted\n");
199     }
200     freeze_bitcount = 0;
201     freezeon = 1;
202   }
203   //
204   // Enable insertion into the stream, this is the normal behaviour. See
205   // the <b>Freeze</b> method.
206   //
UnFreeze()207   void UnFreeze() {
208     freezeon = 0;
209   }
210 
211 private:
212 
213   // the buffer were the bitstream is stored
214   unsigned char* buff;
215   int buff_length;
216   int buff_size;
217   int buff_idx;
218 
219   // current bit position within the buffer
220   int bitpos;
221 
222   // freezing the bitstream
223   int freeze_bitcount;
224   int freezeon;
225 };
226 
227 // Constants used by WordBitCompress
228 // number of bits to code the number of values in an array
229 #define WORD_CMPR_NBITS_NVALS 16
230 
231 //
232 // Number of bits encoding the log2 of 32 bits value
233 // 5 == log2(32) == log2(log2(0xffffffff))
234 //
235 #define WORD_CMPR_LOG32_BITS	5
236 
237 //
238 // Number of bits encoding the log2 of 8 bits value
239 // 3 == log2(8) == log2(log2(0xff))
240 // (currently set to 4 for unknown reasons, never tried with
241 //  3. The fact that the above LOG32_BITS = 5 apparenlty works
242 //  is not a good hint to guess that LOG8_BITS = 3 would work
243 //  since the coding functions refuse values with bit 32 set
244 //  so the number of bits really encoded is really 31. To
245 //  make sure this works properly unary tests should be
246 //  written).
247 //
248 #define WORD_CMPR_LOG8_BITS	4
249 
250 //
251 // Number of bits encoding the compression model
252 //
253 #define WORD_CMPR_MODEL_BITS	2
254 //
255 // Compression model value for Decr
256 //
257 #define WORD_CMPR_MODEL_DECR	0
258 //
259 // Compression model value for Fixed
260 //
261 #define WORD_CMPR_MODEL_FIXED	1
262 
263 class WordBitCompress : public WordBitStream {
264  public:
265   //-
266   // Constructor. Create a empty stream.
267   //
WordBitCompress()268   WordBitCompress() { }
269   //-
270   // Constructor. Create a empty stream and pre-allocate <b>size</b>
271   // bytes.
272   //
WordBitCompress(int size)273   WordBitCompress(int size) : WordBitStream(size) { }
274 
275   //-
276   // Put in bitstream integer value <b>v</b> (coded on <b>n</b> bits).
277   //
278   // The encoding is :
279   // <ul>
280   // <li> WordBitStream::PutUint(log2(v), log2(n))
281   // <li> WordBitStream::PutUint(v, log2(v))
282   // </ul>
283   //
284   void         PutUint(unsigned int v, int n);
285   //-
286   // Get integer value from bitstream and return it (coded on <b>n</b> bits).
287   //
288   // The decoding is :
289   // <ul>
290   // <li> nbits = WordBitStream::GetUint(log2(n))
291   // <li> return WordBitStream::GetUint(nbits))
292   // </ul>
293   //
294   unsigned int GetUint(int n);
295 
296   //-
297   // Put in bitstream the array of <b>vals</b> integer values
298   // of size <b>n</b> and return the number of bits used in the bitstream.
299   //
300   // The encoding is :
301   // <ul>
302   // <li> PutUint(n, WORD_CMPR_NBITS_NVALS)
303   // <ul>
304   // If Decr preforms better than Fixed
305   // <ul>
306   // <li> PutUint(WORD_CMPR_MODEL_DECR, WORD_COMPRESS_MODEL_BITS)
307   // <li> PutUintsDecr(vals, n)
308   // </ul>
309   // Otherwise, if Fixed preforms better than Decr
310   // <ul>
311   // <li> PutUint(WORD_CMPR_MODEL_FIXED, WORD_COMPRESS_MODEL_BITS)
312   // <li> PutUintsDecr(vals, n)
313   // </ul>
314   //
315   int PutUints(unsigned int *vals, int n);
316   //-
317   // Get list of integer values from bitstream and return them in
318   // the <b>valsp</b> argument. The length of the <b>vals</b> array
319   // is returned. If 0 is returned, the <b>valsp</b> is set to null.
320   // The <b>valsp</b> argument is allocated with <i>new</i>, it is
321   // the responsibility of the caller to free it.
322   //
323   // The decoding is :
324   // <ul>
325   // <li> count = GetUint(WORD_CMPR_NBITS_NVALS)
326   // <li> model = GetUint(WORD_COMPRESS_MODEL_BITS)
327   // <li> GetUintsFixed(vals, count) (if model == WORD_COMPRESS_MODEL_FIXED)
328   // <li> GetUintsDecr(vals, count) (if model == WORD_COMPRESS_MODEL_DECR)
329   // <ul>
330   //
331   int GetUints(unsigned int **valsp);
332   //-
333   // Alias for GetUints(unsigned int **valsp). The <b>valsp</b> argument
334   // must point to an allocated array of <b>valsp_size</b> bytes.
335   //
336   int GetUints(unsigned int **valsp, int *valsp_size);
337 
338   //-
339   // Put in bitstream the array of <b>vals</b> unsigned char values
340   // of size <b>n</b> and return the number of bits used in the bitstream.
341   //
342   // The encoding is :
343   // <ul>
344   // <li> PutUint(n, WORD_CMPR_NBITS_NVALS)
345   // <li> PutUint(log2(max of all vals), WORD_CMPR_LOG8_BITS)
346   // <li> foreach val in vals : PutUint(val, log2(max of all vals))
347   // <ul>
348   //
349   int PutUchars(unsigned char *vals, int n);
350   //-
351   // Get list of unsigned char values from bitstream and return them in
352   // the <b>valsp</b> argument. The length of the <b>vals</b> array
353   // is returned. If 0 is returned, the <b>valsp</b> is set to null.
354   // The <b>valsp</b> argument is allocated with <i>new</i>, it is
355   // the responsibility of the caller to free it.
356   //
357   // The decoding is :
358   // <ul>
359   // <li> count = GetUint(WORD_CMPR_NBITS_NVALS)
360   // <li> log2(max of all vals) = GetUint(WORD_CMPR_LOG8_BITS)
361   // <li> count times : GetUint(log2(max of all vals))
362   // </ul>
363   //
364   int GetUchars(unsigned char **valsp);
365   //-
366   // Alias for GetUchars(unsigned int **valsp). The <b>valsp</b> argument
367   // must point to an allocated array of <b>valsp_size</b> bytes.
368   //
369   int GetUchars(unsigned char **valsp, int *valsp_size);
370 
371   //-
372   // Put in bitstream the array of <b>vals</b> integer values
373   // of size <b>n</b>.
374   //
375   // The encoding is :
376   // <ul>
377   // <li> PutUint(log2(max of all vals), WORD_CMPR_LOG32_BITS)
378   // <li> foreach val in vals : PutUint(val, log2(max of all vals))
379   // </ul>
380   //
381   void PutUintsFixed(unsigned int *vals, int n);
382   //-
383   // Get list of integer values from bitstream and return them in
384   // the <b>vals</b> argument. The array pointed by the <b>vals</b>
385   // argument must be large enough to hold <b>n</b> elements.
386   //
387   // The decoding is :
388   // <ul>
389   // <li> bits = GetUint(WORD_CMPR_LOG32_BITS)
390   // <li> count times : GetUint(bits)
391   // </ul>
392   //
393   void GetUintsFixed(unsigned int *vals, int n);
394 
395   //-
396   // Put in bitstream the array of <b>vals</b> integer values
397   // of size <b>n</b>.
398   //
399   // The encoding uses an algorithm inspired by Shannon-Fano.
400   // The <b>vals</b> array is divided in chunks of equal size
401   // and each number in a chunck is coded by the chunk number
402   // and its offset within the chunk. See VLengthCoder implementation
403   // in WordBitCompress.cc for more information.
404   //
405   void PutUintsDecr(unsigned int *vals, int n);
406   //-
407   // Get list of integer values from bitstream and return them in
408   // the <b>vals</b> argument. The array pointed by the <b>vals</b>
409   // argument must be large enough to hold <b>n</b> elements.
410   //
411   void GetUintsDecr(unsigned int *vals, int n);
412 };
413 
414 #endif /* _WordBitCompress_h */
415 
416