1 //
2 // WordDBCompress.cc
3 //
4 // Part of the ht://Dig package <http://www.htdig.org/>
5 // Copyright (c) 1999 The ht://Dig Group
6 // For copyright details, see the file COPYING in your distribution
7 // or the GNU Public License version 2 or later
8 // <http://www.gnu.org/copyleft/gpl.html>
9 //
10 // $Id: WordDBCompress.cc,v 1.15 2000/10/25 12:32:19 loic Exp $
11 //
12 /*
13
14 BTREE original page layout in Berkeley DB
15
16 * +-----------------------------------+
17 * | lsn | pgno | prev pgno |
18 * +-----------------------------------+
19 * | next pgno | entries | hf offset |
20 * +-----------------------------------+
21 * | level | type | index |
22 * +-----------------------------------+
23 * | index | free --> |
24 * +-----------+-----------------------+
25 * | F R E E A R E A |
26 * +-----------------------------------+
27 * | <-- free | item |
28 * +-----------------------------------+
29 * | item | item | item |
30 * +-----------------------------------+
31 */
32 #ifdef HAVE_CONFIG_H
33 #include "config.h"
34 #endif /* HAVE_CONFIG_H */
35
36 #include <stdlib.h>
37 #include <errno.h>
38
39 extern "C" {
40 #include "db_int.h"
41 #include "db_page.h"
42 }
43
44 #include "lib.h"
45 #include "WordDBCompress.h"
46 #include "WordBitCompress.h"
47 #include "WordKeyInfo.h"
48 #include "WordKey.h"
49 #include "WordRecord.h"
50 #include "WordDB.h"
51 #include "HtMaxMin.h"
52
53 /*
54 * WordDBCompress: C-callbacks, actually called by Berkeley-DB
55 * they just call their WordDBCompress equivalents (by using user_data)
56 */
57 extern "C"
58 {
59
WordDBCompress_compress_c(DB_ENV *,const u_int8_t * inbuff,int inbuff_length,u_int8_t ** outbuffp,int * outbuff_lengthp,void * user_data)60 int WordDBCompress_compress_c(DB_ENV*, const u_int8_t* inbuff, int inbuff_length, u_int8_t** outbuffp, int* outbuff_lengthp, void *user_data)
61 {
62 if(!user_data) {
63 fprintf(stderr, "WordDBCompress_compress_c:: user_data is NULL");
64 return NOTOK;
65 }
66 return ((WordDBCompress *)user_data)->Compress((unsigned char*)inbuff, inbuff_length, (unsigned char**)outbuffp, outbuff_lengthp);
67 }
68
WordDBCompress_uncompress_c(DB_ENV *,const u_int8_t * inbuff,int inbuff_length,u_int8_t * outbuff,int outbuff_length,void * user_data)69 int WordDBCompress_uncompress_c(DB_ENV*, const u_int8_t* inbuff, int inbuff_length, u_int8_t* outbuff, int outbuff_length, void *user_data)
70 {
71 if(!user_data) {
72 fprintf(stderr, "WordDBCompress_uncompress_c:: user_data is NULL");
73 return NOTOK;
74 }
75 return ((WordDBCompress *)user_data)->Uncompress((unsigned char *)inbuff, inbuff_length, (unsigned char*)outbuff, outbuff_length);
76 }
77
78 }
79
80 //
81 // Symbolic names for the data in WordDBEncoded.values[..] arrays
82 //
83 //
84 // Each entry in a leaf/internal page has a flag that shows how
85 // the entry is coded.
86 //
87 #define WORD_CMPR_VAL_FLAGS 0
88 //
89 // Btree: Numerical fields of WordKey
90 //
91 #define WORD_CMPR_VAL_FIELDS 1
92 //
93 // Btree/Internal: BINTERNAL.pgno
94 //
95 #define WORD_CMPR_VAL_PGNO (WORD_CMPR_VAL_FIELDS + WORD_KEY_MAX_NFIELDS)
96 //
97 // Btree/Leaf: Length of record string
98 //
99 #define WORD_CMPR_VAL_RLENGTH WORD_CMPR_VAL_PGNO
100 //
101 // Btree/Internal: BINTERNAL.nrecs
102 //
103 #define WORD_CMPR_VAL_NRECS (WORD_CMPR_VAL_PGNO + 1)
104 //
105 // Btree/Leaf: Value of integer if record contains a single integer value
106 //
107 #define WORD_CMPR_VAL_RVALUE WORD_CMPR_VAL_NRECS
108 //
109 // Btree: length of WordKey.GetWord() prefix shared with previous entry
110 //
111 #define WORD_CMPR_VAL_PREFIX (WORD_CMPR_VAL_PGNO + 2)
112 //
113 // Btree: length of WordKey.GetWord() suffix to add to previous key
114 // shared prefix.
115 //
116 #define WORD_CMPR_VAL_SUFFIX (WORD_CMPR_VAL_PGNO + 3)
117 //
118 // Symbolic name for the last index
119 //
120 #define WORD_CMPR_VAL_LAST WORD_CMPR_VAL_SUFFIX
121
122 #define WORD_CMPR_VAL_ARRAY_SIZE (WORD_CMPR_VAL_LAST + 1)
123 //
124 // Number of bits needed to encode the maximum array size value
125 //
126 #define WORD_CMPR_VAL_ARRAY_SIZE_BITS 8
127
128 //
129 // Number of bits used in flags
130 //
131 #define WORD_CMPR_VAL_FLAGS_BITS (WORD_CMPR_VAL_LAST + 1)
132 //
133 // Values for the WORD_CMPR_VAL_FLAGS bit field
134 //
135 #define WORD_CMPR_VAL_FLAGS_FIELD(n) (1 << (WORD_CMPR_VAL_FIELDS + (n)))
136 #define WORD_CMPR_VAL_FLAGS_PGNO (1 << (WORD_CMPR_VAL_PGNO))
137 #define WORD_CMPR_VAL_FLAGS_RLENGTH WORD_CMPR_VAL_FLAGS_PGNO
138 #define WORD_CMPR_VAL_FLAGS_NRECS (1 << (WORD_CMPR_VAL_NRECS))
139 #define WORD_CMPR_VAL_FLAGS_RVALUE WORD_CMPR_VAL_FLAGS_NRECS
140 #define WORD_CMPR_VAL_FLAGS_PREFIX (1 << (WORD_CMPR_VAL_PREFIX))
141 #define WORD_CMPR_VAL_FLAGS_SUFFIX (1 << (WORD_CMPR_VAL_SUFFIX))
142 #define WORD_CMPR_VAL_FLAGS_STRING (1 << (WORD_CMPR_VAL_LAST + 1))
143 #define WORD_CMPR_VAL_FLAGS_EMPTY (1 << (WORD_CMPR_VAL_LAST + 2))
144 #define WORD_CMPR_VAL_FLAGS_RECORD_EQ (1 << (WORD_CMPR_VAL_LAST + 3))
145 #define WORD_CMPR_VAL_FLAGS_RECORD_NO (1 << (WORD_CMPR_VAL_LAST + 4))
146 #define WORD_CMPR_VAL_FLAGS_RECORD_STR (1 << (WORD_CMPR_VAL_LAST + 5))
147
148 //
149 // Storage for values extracted from a PAGE structure
150 //
151 class WordDBEncoded {
152 public:
WordDBEncoded()153 inline WordDBEncoded() {
154 Init();
155 }
~WordDBEncoded()156 inline ~WordDBEncoded() {
157 free(strings);
158 for(int i = 0; i < WORD_CMPR_VAL_ARRAY_SIZE; i++)
159 free(values[i]);
160 }
161
Init()162 inline void Init() {
163 strings_size = 32;
164 strings = (unsigned char*)malloc(strings_size);
165
166 for(int i = 0; i < WORD_CMPR_VAL_ARRAY_SIZE; i++) {
167 values_size[i] = 32;
168 values[i] = (unsigned int*)malloc(values_size[i] * sizeof(unsigned int));
169 }
170
171 Clear();
172 }
173
Clear()174 inline void Clear() {
175 strings_length = 0;
176 strings_idx = 0;
177
178 for(int i = 0; i < WORD_CMPR_VAL_ARRAY_SIZE; i++) {
179 values_length[i] = 0;
180 values_idx[i] = 0;
181 }
182 }
183
AllocateStrings(unsigned int size)184 inline void AllocateStrings(unsigned int size) {
185 strings_size = size;
186 strings = (unsigned char*)realloc(strings, strings_size);
187 }
188
CheckStrings(int index)189 inline void CheckStrings(int index) {
190 while(index >= strings_size)
191 AllocateStrings(strings_size * 2);
192 }
193
AllocateValues(int what,unsigned int size)194 inline void AllocateValues(int what, unsigned int size) {
195 values_size[what] = size;
196 values[what] = (unsigned int*)realloc(values[what], values_size[what] * sizeof(unsigned int));
197 }
198
CheckValues(int what,int index)199 inline void CheckValues(int what, int index) {
200 while(index >= values_size[what])
201 AllocateValues(what, values_size[what] * 2);
202 }
203
PushValue(int what,unsigned int value)204 inline void PushValue(int what, unsigned int value) {
205 CheckValues(what, values_length[what]);
206 values[what][values_length[what]] = value;
207 values_length[what]++;
208 }
209
PushString(const unsigned char * string,int length)210 inline void PushString(const unsigned char* string, int length) {
211 CheckStrings(strings_length + length);
212 memcpy(strings + strings_length, string, length);
213 strings_length += length;
214 }
215
ShiftValue(int what)216 inline unsigned int ShiftValue(int what) {
217 if(values_idx[what] >= values_length[what]) {
218 fprintf(stderr, "WordDBEncoded::ShiftValue: what = %d, (idx = %d) >= (length = %d)\n", what, values_idx[what], values_length[what]);
219 abort();
220 }
221 return values[what][values_idx[what]++];
222 }
223
ShiftString(int length)224 inline unsigned char* ShiftString(int length) {
225 if(strings_idx + length > strings_length) {
226 fprintf(stderr, "WordDBEncoded::ShiftString: (idx + length = %d) >= (length = %d)\n", strings_idx + length, strings_length);
227 abort();
228 }
229 unsigned char* string = strings + strings_idx;
230 strings_idx += length;
231 return string;
232 }
233
234 void Put(WordBitCompress& stream);
235 void Get(WordBitCompress& stream);
236
237 unsigned int *values[WORD_CMPR_VAL_ARRAY_SIZE];
238 int values_length[WORD_CMPR_VAL_ARRAY_SIZE];
239 int values_idx[WORD_CMPR_VAL_ARRAY_SIZE];
240 int values_size[WORD_CMPR_VAL_ARRAY_SIZE];
241
242 unsigned char *strings;
243 int strings_length;
244 int strings_idx;
245 int strings_size;
246 };
247
Put(WordBitCompress & stream)248 void WordDBEncoded::Put(WordBitCompress& stream)
249 {
250 unsigned int count = 0;
251 for(int i = 0; i < WORD_CMPR_VAL_ARRAY_SIZE; i++) {
252 if(values_length[i] > 0) count++;
253 }
254 stream.WordBitStream::PutUint(count, WORD_CMPR_VAL_ARRAY_SIZE_BITS);
255
256 for(int i = 0; i < WORD_CMPR_VAL_ARRAY_SIZE; i++) {
257 if(values_length[i] > 0) {
258 stream.WordBitStream::PutUint(i, WORD_CMPR_VAL_ARRAY_SIZE_BITS);
259 stream.PutUints(values[i], values_length[i]);
260 }
261 }
262
263 stream.PutUchars(strings, strings_length);
264 }
265
Get(WordBitCompress & stream)266 void WordDBEncoded::Get(WordBitCompress& stream)
267 {
268 Clear();
269
270 unsigned int count = 0;
271 count = stream.WordBitStream::GetUint(WORD_CMPR_VAL_ARRAY_SIZE_BITS);
272
273 for(unsigned int i = 0; i < count; i++) {
274 unsigned int index = stream.WordBitStream::GetUint(WORD_CMPR_VAL_ARRAY_SIZE_BITS);
275 values_length[index] = stream.GetUints(&values[index], &values_size[index]);
276 }
277
278 strings_length = stream.GetUchars(&strings, &strings_size);
279 }
280
281 //
282 // ******************** WordDBCompress implementation **************
283 //
284
285 #define WORD_CMPR_OVERHEAD ((int)sizeof(unsigned char))
286
WordDBCompress(WordContext * ncontext)287 WordDBCompress::WordDBCompress(WordContext* ncontext)
288 {
289 cmprInfo = 0;
290 context = ncontext;
291 encoded = new WordDBEncoded();
292
293 const Configuration& config = context->GetConfiguration();
294 debug = config.Boolean("wordlist_compress_debug", 0);
295 verbose = config.Value("wordlist_compress_verbose", 0);
296 }
297
~WordDBCompress()298 WordDBCompress::~WordDBCompress()
299 {
300 delete encoded;
301 }
302
303 int
Compress(const unsigned char * inbuff,int inbuff_length,unsigned char ** outbuffp,int * outbuff_lengthp)304 WordDBCompress::Compress(const unsigned char *inbuff, int inbuff_length, unsigned char **outbuffp, int *outbuff_lengthp)
305 {
306 //
307 // Maximum output length can be WORD_CMPR_OVERHEAD more than input
308 //
309 int outbuff_length = inbuff_length + WORD_CMPR_OVERHEAD;
310 unsigned char* outbuff = (unsigned char*)malloc(sizeof(unsigned char) * outbuff_length);
311
312 *outbuffp = 0;
313 *outbuff_lengthp = outbuff_length;
314
315 if(outbuff == 0)
316 return ENOMEM;
317
318 int ret = 0;
319 PAGE* pp = (PAGE*)inbuff;
320
321 *outbuff = TYPE_TAGS(pp);
322
323 switch(TYPE_TAGS(pp)) {
324 case P_IBTREE|WORD_DB_INDEX:
325 case P_LBTREE|WORD_DB_INDEX:
326 ret = CompressBtree(inbuff, inbuff_length, outbuff, &outbuff_length);
327 break;
328 default:
329 memcpy((char*)(outbuff + WORD_CMPR_OVERHEAD), (char*)inbuff, inbuff_length);
330 break;
331 }
332
333 if(ret == 0) {
334 *outbuffp = outbuff;
335 *outbuff_lengthp = outbuff_length;
336 } else {
337 free(outbuff);
338 }
339
340 return ret;
341 }
342
343 int
Uncompress(const unsigned char * inbuff,int inbuff_length,unsigned char * outbuff,int outbuff_length)344 WordDBCompress::Uncompress(const unsigned char *inbuff, int inbuff_length, unsigned char *outbuff, int outbuff_length)
345 {
346 int ret = 0;
347
348 switch(*inbuff) {
349 case P_IBTREE|WORD_DB_INDEX:
350 case P_LBTREE|WORD_DB_INDEX:
351 ret = UncompressBtree(inbuff, inbuff_length, outbuff, outbuff_length);
352 break;
353 default:
354 memcpy((char*)outbuff, (char*)(inbuff + WORD_CMPR_OVERHEAD), outbuff_length);
355 break;
356 }
357
358 return ret;
359 }
360
361 int
CompressBtree(const unsigned char * inbuff,int inbuff_length,unsigned char * outbuff,int * outbuff_lengthp)362 WordDBCompress::CompressBtree(const unsigned char *inbuff, int inbuff_length, unsigned char *outbuff, int *outbuff_lengthp)
363 {
364 int ret = 0;
365 PAGE *pp = (PAGE *)inbuff;
366
367 if(verbose) fprintf(stderr, "WordDBCompress::CompressBtree: page %d\n", PGNO(pp));
368
369 switch(TYPE(pp)) {
370 case P_IBTREE:
371 ret = CompressIBtree(inbuff, inbuff_length, outbuff, outbuff_lengthp);
372 break;
373 case P_LBTREE:
374 ret = CompressLBtree(inbuff, inbuff_length, outbuff, outbuff_lengthp);
375 break;
376 }
377
378 return ret;
379 }
380
381 //
382 // Return the length of the chars in b that are not common with a
383 // a = abcdef
384 // b = abcghi
385 // ---
386 // return 3
387 //
suffixlength(const String & a,const String & b)388 static inline unsigned int suffixlength(const String& a, const String& b)
389 {
390 const unsigned char* ap = (const unsigned char*)a.get();
391 const unsigned char* bp = (const unsigned char*)b.get();
392 int length = HtMIN(a.length(), b.length());
393
394 int i;
395 for(i = 0; i < length && ap[i] == bp[i]; i++)
396 ;
397
398 return b.length() - i;
399 }
400
401 int
CompressIBtree(const unsigned char * inbuff,int inbuff_length,unsigned char * outbuff,int * outbuff_lengthp)402 WordDBCompress::CompressIBtree(const unsigned char *inbuff, int inbuff_length, unsigned char *outbuff, int *outbuff_lengthp)
403 {
404 PAGE *pp = (PAGE *)inbuff;
405
406 if(verbose > 5) DumpPage(inbuff);
407
408 WordBitCompress stream(inbuff_length);
409
410 stream.PutUint(pp->lsn.file, sizeof(pp->lsn.file) * 8);
411 stream.PutUint(pp->lsn.offset, sizeof(pp->lsn.file) * 8);
412 stream.PutUint(PGNO(pp), sizeof(PGNO(pp)) * 8);
413 stream.PutUint(NUM_ENT(pp), sizeof(NUM_ENT(pp)) * 8);
414 stream.PutUint(LEVEL(pp), sizeof(LEVEL(pp)) * 8);
415
416 if(NUM_ENT(pp) > 0) {
417 int i;
418 WordKey key(context);
419 WordKey previous_key(context);
420 BINTERNAL* previous_e = 0;
421
422 encoded->Clear();
423
424 for (i = 0; i < NUM_ENT(pp); i++) {
425 BINTERNAL* e = GET_BINTERNAL(pp, i);
426 if(debug) {
427 if(e->type != B_KEYDATA)
428 fprintf(stderr, "WordDBCompress::EncodeIBtree: unexpected type 0x%02x\n", e->type);
429 }
430 unsigned int changes = 0;
431
432 if(e->len <= 0) {
433 changes = WORD_CMPR_VAL_FLAGS_EMPTY;
434 //
435 // Structural data
436 //
437 encoded->PushValue(WORD_CMPR_VAL_PGNO, e->pgno);
438 encoded->PushValue(WORD_CMPR_VAL_NRECS, e->nrecs);
439 } else {
440
441 key.Unpack((const char*)e->data, (int)e->len);
442
443 if(previous_e != 0) {
444 int is_first_change = 1;
445 #if 0
446 //
447 // String
448 //
449 const String& word = key.GetWord();
450 const String& previous_word = previous_key.GetWord();
451 if(word != previous_word) {
452 unsigned int suffix_length = suffixlength(previous_word, word);
453 unsigned int prefix_length = word.length() - suffix_length;
454 encoded->PushValue(WORD_CMPR_VAL_PREFIX, prefix_length);
455 encoded->PushValue(WORD_CMPR_VAL_SUFFIX, suffix_length);
456 const char* suffix = word.get() + prefix_length;
457 encoded->PushString((const unsigned char*)suffix, suffix_length);
458 changes |= WORD_CMPR_VAL_FLAGS_STRING;
459 is_first_change = 0;
460 }
461 #endif
462 //
463 // Fields
464 //
465 int nfields = key.NFields();
466 for(int i = 0; i < nfields; i++) {
467 unsigned int value = key.Get(i);
468 unsigned int previous_value = previous_key.Get(i);
469 if(value != previous_value) {
470 if(is_first_change) {
471 value -= previous_value;
472 is_first_change = 0;
473 }
474 encoded->PushValue(i + WORD_CMPR_VAL_FIELDS, value);
475 changes |= WORD_CMPR_VAL_FLAGS_FIELD(i);
476 }
477 }
478 //
479 // Structural data
480 //
481 if(e->pgno != previous_e->pgno) {
482 encoded->PushValue(WORD_CMPR_VAL_PGNO, e->pgno);
483 changes |= WORD_CMPR_VAL_FLAGS_PGNO;
484 }
485 if(e->nrecs != previous_e->nrecs) {
486 encoded->PushValue(WORD_CMPR_VAL_NRECS, e->nrecs);
487 changes |= WORD_CMPR_VAL_FLAGS_NRECS;
488 }
489 } else {
490 #if 0
491 //
492 // String
493 //
494 const String& word = key.GetWord();
495 encoded->PushValue(WORD_CMPR_VAL_SUFFIX, word.length());
496 encoded->PushString((const unsigned char*)word.get(), word.length());
497 #endif
498 //
499 // Fields
500 //
501 int nfields = key.NFields();
502 for(int i = 0; i < nfields; i++) {
503 encoded->PushValue(i + WORD_CMPR_VAL_FIELDS, key.Get(i));
504 }
505 //
506 // Structural data
507 //
508 encoded->PushValue(WORD_CMPR_VAL_PGNO, e->pgno);
509 encoded->PushValue(WORD_CMPR_VAL_NRECS, e->nrecs);
510 }
511
512 previous_e = e;
513 previous_key = key;
514 }
515
516 encoded->PushValue(WORD_CMPR_VAL_FLAGS, changes);
517 }
518
519 encoded->Put(stream);
520 }
521
522 int outbuff_length = *outbuff_lengthp;
523 if(stream.BuffLength() > (outbuff_length - WORD_CMPR_OVERHEAD)) {
524 fprintf(stderr, "WordDBCompress::CompressIBtree: compressed length = %d > available length = %d\n", stream.BuffLength(), outbuff_length - WORD_CMPR_OVERHEAD);
525 abort();
526 }
527 memcpy(outbuff + WORD_CMPR_OVERHEAD, stream.Buff(), stream.BuffLength());
528 outbuff_length = WORD_CMPR_OVERHEAD + stream.BuffLength();
529
530 if(debug) {
531 unsigned char* tmp = (unsigned char*)malloc(inbuff_length);
532 memset((char*)tmp, '\0', inbuff_length);
533 UncompressIBtree(outbuff, outbuff_length, tmp, inbuff_length);
534 if(DiffPage(inbuff, tmp)) {
535 fprintf(stderr, "WordDBCompress::CompressIBtree: failed\n");
536 DumpPage(inbuff);
537 DumpPage(tmp);
538 exit(1);
539 }
540 free(tmp);
541 }
542
543 *outbuff_lengthp = outbuff_length;
544
545 return 0;
546 }
547
548 int
CompressLBtree(const unsigned char * inbuff,int inbuff_length,unsigned char * outbuff,int * outbuff_lengthp)549 WordDBCompress::CompressLBtree(const unsigned char *inbuff, int inbuff_length, unsigned char *outbuff, int *outbuff_lengthp)
550 {
551 PAGE *pp = (PAGE *)inbuff;
552
553 if(verbose > 5) DumpPage(inbuff);
554
555 WordBitCompress stream(inbuff_length);
556
557 stream.PutUint(pp->lsn.file, sizeof(pp->lsn.file) * 8);
558 stream.PutUint(pp->lsn.offset, sizeof(pp->lsn.file) * 8);
559 stream.PutUint(PGNO(pp), sizeof(PGNO(pp)) * 8);
560 stream.PutUint(PREV_PGNO(pp), sizeof(PREV_PGNO(pp)) * 8);
561 stream.PutUint(NEXT_PGNO(pp), sizeof(NEXT_PGNO(pp)) * 8);
562 stream.PutUint(NUM_ENT(pp), sizeof(NUM_ENT(pp)) * 8);
563 stream.PutUint(LEVEL(pp), sizeof(LEVEL(pp)) * 8);
564
565 if(NUM_ENT(pp) > 0) {
566 int i;
567 WordKey key(context);
568 WordKey previous_key(context);
569 WordRecord record(context);
570 WordRecord previous_record(context);
571 BKEYDATA* previous_key_e = 0;
572 BKEYDATA* previous_record_e = 0;
573
574 encoded->Clear();
575
576 for (i = 0; i < NUM_ENT(pp); i += P_INDX) {
577 unsigned int changes = 0;
578
579 {
580 BKEYDATA* key_e = GET_BKEYDATA(pp, i);
581 if(debug) {
582 if(key_e->type != B_KEYDATA)
583 fprintf(stderr, "WordDBCompress::EncodeLBtree: unexpected type 0x%02x\n", key_e->type);
584 if(key_e->len <= 0)
585 fprintf(stderr, "WordDBCompress::EncodeLBtree: unexpected length <= 0\n");
586 }
587
588 key.Unpack((const char*)key_e->data, (int)key_e->len);
589
590 if(previous_key_e != 0) {
591 int is_first_change = 1;
592 #if 0
593 //
594 // String
595 //
596 const String& word = key.GetWord();
597 const String& previous_word = previous_key.GetWord();
598 if(word != previous_word) {
599 unsigned int suffix_length = suffixlength(previous_word, word);
600 unsigned int prefix_length = word.length() - suffix_length;
601 encoded->PushValue(WORD_CMPR_VAL_PREFIX, prefix_length);
602 encoded->PushValue(WORD_CMPR_VAL_SUFFIX, suffix_length);
603 const char* suffix = word.get() + prefix_length;
604 encoded->PushString((const unsigned char*)suffix, suffix_length);
605 changes |= WORD_CMPR_VAL_FLAGS_STRING;
606 is_first_change = 0;
607 }
608 #endif
609 //
610 // Fields
611 //
612 int nfields = key.NFields();
613 for(int i = 0; i < nfields; i++) {
614 unsigned int value = key.Get(i);
615 unsigned int previous_value = previous_key.Get(i);
616 if(value != previous_value) {
617 if(is_first_change) {
618 value -= previous_value;
619 is_first_change = 0;
620 }
621 encoded->PushValue(WORD_CMPR_VAL_FIELDS + i, value);
622 changes |= WORD_CMPR_VAL_FLAGS_FIELD(i);
623 }
624 }
625 } else {
626 #if 0
627 //
628 // String
629 //
630 const String& word = key.GetWord();
631 encoded->PushValue(WORD_CMPR_VAL_SUFFIX, word.length());
632 encoded->PushString((const unsigned char*)word.get(), word.length());
633 #endif
634 //
635 // Fields
636 //
637 int nfields = key.NFields();
638 for(int i = 0; i < nfields; i++) {
639 encoded->PushValue(i + WORD_CMPR_VAL_FIELDS, key.Get(i));
640 }
641 }
642
643 previous_key_e = key_e;
644 previous_key = key;
645 }
646 {
647 BKEYDATA* record_e = GET_BKEYDATA(pp, i + 1);
648 if(debug) {
649 if(record_e->type != B_KEYDATA)
650 fprintf(stderr, "WordDBCompress::EncodeLBtree: unexpected type 0x%02x\n", record_e->type);
651 }
652
653 if(record_e->len <= 0) {
654 changes |= WORD_CMPR_VAL_FLAGS_RECORD_NO;
655 } else {
656 record.Unpack((const char*)record_e->data, (int)record_e->len);
657
658 switch(record.type) {
659 case WORD_RECORD_DATA:
660 if(previous_record_e &&
661 record.info.data == previous_record.info.data) {
662 changes |= WORD_CMPR_VAL_FLAGS_RECORD_EQ;
663 } else {
664 encoded->PushValue(WORD_CMPR_VAL_RVALUE, record.info.data);
665 }
666 break;
667 case WORD_RECORD_STR:
668 changes |= WORD_CMPR_VAL_FLAGS_RECORD_STR;
669 if(previous_record_e &&
670 record.info.str == previous_record.info.str) {
671 changes |= WORD_CMPR_VAL_FLAGS_RECORD_EQ;
672 } else {
673 const String& string = record.info.str;
674 encoded->PushValue(WORD_CMPR_VAL_RLENGTH, string.length());
675 encoded->PushString((const unsigned char*)string.get(), string.length());
676 }
677 break;
678 default:
679 fprintf(stderr, "WordDBCompress::EncodeLBtree: unexpected record.type = %d\n", record.type);
680 break;
681 }
682
683 previous_record = record;
684 previous_record_e = record_e;
685 }
686
687 }
688 encoded->PushValue(WORD_CMPR_VAL_FLAGS, changes);
689 }
690
691 encoded->Put(stream);
692 }
693
694 int outbuff_length = *outbuff_lengthp;
695 if(stream.BuffLength() > (outbuff_length - WORD_CMPR_OVERHEAD)) {
696 fprintf(stderr, "WordDBCompress::CompressLBtree: compressed length = %d > available length = %d\n", stream.BuffLength(), outbuff_length - WORD_CMPR_OVERHEAD);
697 abort();
698 }
699 memcpy(outbuff + WORD_CMPR_OVERHEAD, stream.Buff(), stream.BuffLength());
700 outbuff_length = WORD_CMPR_OVERHEAD + stream.BuffLength();
701
702 if(debug) {
703 unsigned char* tmp = (unsigned char*)malloc(inbuff_length);
704 memset((char*)tmp, '\0', inbuff_length);
705 UncompressLBtree(outbuff, outbuff_length, tmp, inbuff_length);
706 if(DiffPage(inbuff, tmp)) {
707 fprintf(stderr, "WordDBCompress::CompressLBtree: failed\n");
708 DumpPage(inbuff);
709 DumpPage(tmp);
710 exit(1);
711 }
712 free(tmp);
713 }
714
715 *outbuff_lengthp = outbuff_length;
716
717 return 0;
718 }
719
720 int
UncompressBtree(const unsigned char * inbuff,int inbuff_length,unsigned char * outbuff,int outbuff_length)721 WordDBCompress::UncompressBtree(const unsigned char *inbuff, int inbuff_length, unsigned char *outbuff, int outbuff_length)
722 {
723 int ret = 0;
724 switch((*inbuff) & TYPE_MASK) {
725 case P_IBTREE:
726 ret = UncompressIBtree(inbuff, inbuff_length, outbuff, outbuff_length);
727 break;
728 case P_LBTREE:
729 ret = UncompressLBtree(inbuff, inbuff_length, outbuff, outbuff_length);
730 break;
731 }
732 if(verbose) fprintf(stderr, "WordDBCompress::UncompressBtree: page %d\n", PGNO((PAGE*)outbuff));
733 return ret;
734 }
735
736 /*
737 * cdb___db_pitem --
738 * Put an item on a page.
739 */
740 static void
cdb___db_pitem(PAGE * pagep,u_int32_t indx,u_int32_t nbytes,DBT * hdr,DBT * data)741 cdb___db_pitem(PAGE *pagep, u_int32_t indx, u_int32_t nbytes, DBT *hdr, DBT *data)
742 {
743 u_int8_t *p;
744
745 HOFFSET(pagep) -= nbytes;
746 pagep->inp[indx] = HOFFSET(pagep);
747 ++NUM_ENT(pagep);
748
749 p = P_ENTRY(pagep, indx);
750 memcpy(p, hdr->data, hdr->size);
751 if (data != NULL)
752 memcpy(p + hdr->size, data->data, data->size);
753 }
754
755 int
UncompressIBtree(const unsigned char * inbuff,int inbuff_length,unsigned char * outbuff,int outbuff_length)756 WordDBCompress::UncompressIBtree(const unsigned char *inbuff, int inbuff_length, unsigned char *outbuff, int outbuff_length)
757 {
758 memset((char*)outbuff, '\0', outbuff_length);
759
760 WordBitCompress stream(inbuff_length);
761 stream.BuffSet(inbuff + WORD_CMPR_OVERHEAD, inbuff_length - WORD_CMPR_OVERHEAD);
762
763 PAGE *pp = (PAGE *)outbuff;
764
765 TYPE_TAGS(pp) = *inbuff;
766 HOFFSET(pp) = outbuff_length;
767 short entry_count = 0;
768 pp->lsn.file = stream.GetUint(sizeof(pp->lsn.file) * 8);
769 pp->lsn.offset = stream.GetUint(sizeof(pp->lsn.offset) * 8);
770 PGNO(pp) = stream.GetUint(sizeof(PGNO(pp)) * 8);
771 entry_count = stream.GetUint(sizeof(NUM_ENT(pp)) * 8);
772 LEVEL(pp) = stream.GetUint(sizeof(LEVEL(pp)) * 8);
773 NEXT_PGNO(pp) = PREV_PGNO(pp) = 0;
774
775 if(entry_count > 0) {
776 int i;
777 String packed;
778 BINTERNAL previous_bi;
779 BINTERNAL bi;
780 DBT hdr;
781 hdr.data = &bi;
782 hdr.size = SSZA(BINTERNAL, data);
783 DBT data;
784 WordKey key(context);
785 WordKey previous_key(context);
786 String word;
787
788 encoded->Get(stream);
789
790 for(i = 0; i < entry_count; i++) {
791 unsigned int changes = encoded->ShiftValue(WORD_CMPR_VAL_FLAGS);
792
793 memset((char*)&bi, '\0', sizeof(BINTERNAL));
794
795 if(changes & WORD_CMPR_VAL_FLAGS_EMPTY) {
796 packed.trunc();
797 //
798 // Structural data
799 //
800 bi.pgno = encoded->ShiftValue(WORD_CMPR_VAL_PGNO);
801 bi.nrecs = encoded->ShiftValue(WORD_CMPR_VAL_NRECS);
802 } else {
803 key.Clear();
804
805 if(!previous_key.Empty()) {
806 int is_first_change = 1;
807 #if 0
808 //
809 // String
810 //
811 if(changes & WORD_CMPR_VAL_FLAGS_STRING) {
812 unsigned int suffix_length = encoded->ShiftValue(WORD_CMPR_VAL_SUFFIX);
813 unsigned int prefix_length = encoded->ShiftValue(WORD_CMPR_VAL_PREFIX);
814 unsigned char* suffix = encoded->ShiftString(suffix_length);
815 word.set(previous_key.GetWord().get(), prefix_length);
816 word.append((const char*)suffix, suffix_length);
817 key.SetWord(word);
818 is_first_change = 0;
819 } else {
820 key.SetWord(previous_key.GetWord());
821 }
822 #endif
823 //
824 // Fields
825 //
826 int nfields = key.NFields();
827 for(int j = 0; j < nfields; j++) {
828 if(changes & WORD_CMPR_VAL_FLAGS_FIELD(j)) {
829 unsigned int value = encoded->ShiftValue(j + WORD_CMPR_VAL_FIELDS);
830 if(is_first_change) {
831 value += previous_key.Get(j);
832 is_first_change = 0;
833 }
834 key.Set(j, value);
835 } else {
836 key.Set(j, previous_key.Get(j));
837 }
838 }
839 //
840 // Structural data
841 //
842 if(changes & WORD_CMPR_VAL_FLAGS_PGNO) {
843 bi.pgno = encoded->ShiftValue(WORD_CMPR_VAL_PGNO);
844 } else {
845 bi.pgno = previous_bi.pgno;
846 }
847 if(changes & WORD_CMPR_VAL_FLAGS_NRECS) {
848 bi.nrecs = encoded->ShiftValue(WORD_CMPR_VAL_NRECS);
849 } else {
850 bi.nrecs = previous_bi.nrecs;
851 }
852 } else {
853 #if 0
854 //
855 // String
856 //
857 unsigned int string_length = encoded->ShiftValue(WORD_CMPR_VAL_SUFFIX);
858 unsigned char* string = encoded->ShiftString(string_length);
859 key.SetWord((const char*)string, string_length);
860 #endif
861 //
862 // Fields
863 //
864 int nfields = key.NFields();
865 for(int i = 0; i < nfields; i++) {
866 key.Set(i, encoded->ShiftValue(i + WORD_CMPR_VAL_FIELDS));
867 }
868 //
869 // Structural data
870 //
871 bi.pgno = encoded->ShiftValue(WORD_CMPR_VAL_PGNO);
872 bi.nrecs = encoded->ShiftValue(WORD_CMPR_VAL_NRECS);
873 }
874
875 key.Pack(packed);
876 }
877 data.data = packed.get();
878 data.size = packed.length();
879 bi.len = packed.length();
880 bi.type = B_KEYDATA;
881
882 //
883 // Insert entry in page
884 //
885 cdb___db_pitem(pp, i, BINTERNAL_SIZE(bi.len), &hdr, &data);
886
887 //
888 // Current becomes previous
889 //
890 previous_bi = bi;
891 previous_key = key;
892 }
893 }
894
895 if(debug) {
896 if(entry_count != NUM_ENT(pp))
897 fprintf(stderr, "WordDBCompress::UncompressIBtree: pgno %d: NUM_ENT(pp) = %d is different than entry_count = %d\n", PGNO(pp), NUM_ENT(pp), entry_count);
898 }
899 return 0;
900 }
901
902 int
UncompressLBtree(const unsigned char * inbuff,int inbuff_length,unsigned char * outbuff,int outbuff_length)903 WordDBCompress::UncompressLBtree(const unsigned char *inbuff, int inbuff_length, unsigned char *outbuff, int outbuff_length)
904 {
905 memset((char*)outbuff, '\0', outbuff_length);
906
907 WordBitCompress stream(inbuff_length);
908 stream.BuffSet(inbuff + WORD_CMPR_OVERHEAD, inbuff_length - WORD_CMPR_OVERHEAD);
909
910 PAGE *pp = (PAGE *)outbuff;
911
912 TYPE_TAGS(pp) = *inbuff;
913 HOFFSET(pp) = outbuff_length;
914 short entry_count = 0;
915 pp->lsn.file = stream.GetUint(sizeof(pp->lsn.file) * 8);
916 pp->lsn.offset = stream.GetUint(sizeof(pp->lsn.offset) * 8);
917 PGNO(pp) = stream.GetUint(sizeof(PGNO(pp)) * 8);
918 PREV_PGNO(pp) = stream.GetUint(sizeof(PREV_PGNO(pp)) * 8);
919 NEXT_PGNO(pp) = stream.GetUint(sizeof(NEXT_PGNO(pp)) * 8);
920 entry_count = stream.GetUint(sizeof(NUM_ENT(pp)) * 8);
921 LEVEL(pp) = stream.GetUint(sizeof(LEVEL(pp)) * 8);
922
923 if(entry_count > 0) {
924 int i;
925 String packed;
926 BKEYDATA bk;
927 DBT hdr;
928 hdr.data = &bk;
929 hdr.size = SSZA(BKEYDATA, data);
930 DBT data;
931 WordKey key(context);
932 WordKey previous_key(context);
933 WordRecord record(context);
934 WordRecord previous_record(context);
935 int previous_record_p = 0;
936 String word;
937
938 encoded->Get(stream);
939
940 for(i = 0; i < entry_count; i += P_INDX) {
941 unsigned int changes = encoded->ShiftValue(WORD_CMPR_VAL_FLAGS);
942
943 //
944 // Key
945 //
946 {
947 memset((char*)&bk, '\0', sizeof(BKEYDATA));
948
949 key.Clear();
950
951 if(!previous_key.Empty()) {
952 int is_first_change = 1;
953 #if 0
954 //
955 // String
956 //
957 if(changes & WORD_CMPR_VAL_FLAGS_STRING) {
958 unsigned int suffix_length = encoded->ShiftValue(WORD_CMPR_VAL_SUFFIX);
959 unsigned int prefix_length = encoded->ShiftValue(WORD_CMPR_VAL_PREFIX);
960 unsigned char* suffix = encoded->ShiftString(suffix_length);
961 word.set(previous_key.GetWord().get(), prefix_length);
962 word.append((const char*)suffix, suffix_length);
963 key.SetWord(word);
964 is_first_change = 0;
965 } else {
966 key.SetWord(previous_key.GetWord());
967 }
968 #endif
969 //
970 // Fields
971 //
972 int nfields = key.NFields();
973 for(int j = 0; j < nfields; j++) {
974 if(changes & WORD_CMPR_VAL_FLAGS_FIELD(j)) {
975 unsigned int value = encoded->ShiftValue(j + WORD_CMPR_VAL_FIELDS);
976 if(is_first_change) {
977 value += previous_key.Get(j);
978 is_first_change = 0;
979 }
980 key.Set(j, value);
981 } else {
982 key.Set(j, previous_key.Get(j));
983 }
984 }
985 } else {
986 #if 0
987 //
988 // String
989 //
990 unsigned int string_length = encoded->ShiftValue(WORD_CMPR_VAL_SUFFIX);
991 unsigned char* string = encoded->ShiftString(string_length);
992 key.SetWord((const char*)string, string_length);
993 #endif
994 //
995 // Fields
996 //
997 int nfields = key.NFields();
998 for(int i = 0; i < nfields; i++) {
999 key.Set(i, encoded->ShiftValue(i + WORD_CMPR_VAL_FIELDS));
1000 }
1001 }
1002
1003 key.Pack(packed);
1004
1005 data.data = packed.get();
1006 data.size = packed.length();
1007 bk.len = packed.length();
1008 bk.type = B_KEYDATA;
1009
1010 //
1011 // Insert entry in page
1012 //
1013 cdb___db_pitem(pp, i, BKEYDATA_SIZE(bk.len), &hdr, &data);
1014
1015 previous_key = key;
1016 }
1017 //
1018 // Record
1019 //
1020 {
1021 memset((char*)&bk, '\0', sizeof(BKEYDATA));
1022
1023 if(changes & WORD_CMPR_VAL_FLAGS_RECORD_NO) {
1024 packed.trunc();
1025 } else {
1026 if(changes & WORD_CMPR_VAL_FLAGS_RECORD_STR) {
1027 record.type = WORD_RECORD_STR;
1028 if(previous_record_p &&
1029 (changes & WORD_CMPR_VAL_FLAGS_RECORD_EQ)) {
1030 record.info.str = previous_record.info.str;
1031 } else {
1032 unsigned int string_length = encoded->ShiftValue(WORD_CMPR_VAL_RLENGTH);
1033 unsigned char* string = encoded->ShiftString(string_length);
1034 record.info.str.set((const char*)string, string_length);
1035 }
1036 } else {
1037 record.type = WORD_RECORD_DATA;
1038
1039 if(previous_record_p &&
1040 (changes & WORD_CMPR_VAL_FLAGS_RECORD_EQ)) {
1041 record.info.data = previous_record.info.data;
1042 } else {
1043 record.info.data = encoded->ShiftValue(WORD_CMPR_VAL_RVALUE);
1044 }
1045 }
1046
1047 record.Pack(packed);
1048
1049 previous_record = record;
1050 previous_record_p = 1;
1051 }
1052 data.data = packed.get();
1053 data.size = packed.length();
1054 bk.len = packed.length();
1055 bk.type = B_KEYDATA;
1056
1057 //
1058 // Insert entry in page
1059 //
1060 cdb___db_pitem(pp, i + 1, BKEYDATA_SIZE(bk.len), &hdr, &data);
1061 }
1062 }
1063 }
1064
1065 if(debug) {
1066 if(entry_count != NUM_ENT(pp))
1067 fprintf(stderr, "WordDBCompress::UncompressLBtree: pgno %d: NUM_ENT(pp) = %d is different than entry_count = %d\n", PGNO(pp), NUM_ENT(pp), entry_count);
1068 }
1069
1070 return 0;
1071 }
1072
CmprInfo()1073 DB_CMPR_INFO* WordDBCompress::CmprInfo()
1074 {
1075 DB_CMPR_INFO *cmpr_info = new DB_CMPR_INFO;
1076
1077 cmpr_info->user_data = (void *)this;
1078 cmpr_info->compress = WordDBCompress_compress_c;
1079 cmpr_info->uncompress = WordDBCompress_uncompress_c;
1080 cmpr_info->coefficient = 3;
1081 cmpr_info->max_npages = 9;
1082 cmprInfo = cmpr_info;
1083
1084 return cmpr_info;
1085 }
1086
1087 void
DumpPage(const unsigned char * page) const1088 WordDBCompress::DumpPage(const unsigned char* page) const
1089 {
1090 PAGE* pp = (PAGE*)page;
1091
1092 fprintf(stderr, "--------------------------------------------------\n");
1093 fprintf(stderr, "pgno = %d ", PGNO(pp));
1094 fprintf(stderr, "lsn.file = %d ", pp->lsn.file);
1095 fprintf(stderr, "lsn.offset = %d ", pp->lsn.offset);
1096 fprintf(stderr, "prev_pgno = %d ", PREV_PGNO(pp));
1097 fprintf(stderr, "next_pgno = %d\n", NEXT_PGNO(pp));
1098 fprintf(stderr, "entries = %d ", NUM_ENT(pp));
1099 fprintf(stderr, "hf_offset = %d ", HOFFSET(pp));
1100 fprintf(stderr, "level = %d ", LEVEL(pp));
1101 fprintf(stderr, "type = %d\n", TYPE(pp));
1102 fprintf(stderr, "tags = 0x%02x\n", TAGS(pp));
1103 fprintf(stderr, "freespace = %d bytes from byte %d to byte %d\n", P_FREESPACE(pp), LOFFSET(pp), HOFFSET(pp));
1104
1105 int i;
1106 for(i = 0; i < NUM_ENT(pp); i++) {
1107 fprintf(stderr, "index = %d, ", pp->inp[i]);
1108 unsigned char* data = 0;
1109 int data_length = 0;
1110 switch(TYPE(pp)) {
1111 case P_IBTREE:
1112 {
1113 BINTERNAL* e = GET_BINTERNAL(pp, i);
1114 fprintf(stderr, "len = %d, type = %d, pgno = %d, nrecs = %d\n", e->len, e->type, e->pgno, e->nrecs);
1115 data = (unsigned char*)e->data;
1116 data_length = e->len;
1117 }
1118 break;
1119 case P_LBTREE:
1120 {
1121 BKEYDATA* e = GET_BKEYDATA(pp, i);
1122 fprintf(stderr, "len = %d, type = %d\n", e->len, e->type);
1123 data = (unsigned char*)e->data;
1124 data_length = e->len;
1125 }
1126 break;
1127 }
1128 if(data_length > 0) {
1129 int j;
1130 for(j = 0; j < data_length; j++) {
1131 fprintf(stderr, "(%d) ", data[j]);
1132 }
1133 fprintf(stderr, "\n");
1134 }
1135 }
1136 }
1137
1138 int
DiffPage(const unsigned char * first,const unsigned char * second) const1139 WordDBCompress::DiffPage(const unsigned char* first, const unsigned char* second) const
1140 {
1141 PAGE* p1 = (PAGE*)first;
1142 PAGE* p2 = (PAGE*)second;
1143
1144 if(TAGS(p1) != TAGS(p2)) return 1;
1145 if(TYPE(p1) != TYPE(p2)) return 1;
1146 if(PGNO(p1) != PGNO(p2)) return 1;
1147 if(p1->lsn.file != p2->lsn.file) return 1;
1148 if(p1->lsn.offset != p2->lsn.offset) return 1;
1149 if(TYPE(p1) == P_LBTREE) {
1150 if(PREV_PGNO(p1) != PREV_PGNO(p2)) return 1;
1151 if(NEXT_PGNO(p1) != NEXT_PGNO(p2)) return 1;
1152 }
1153 if(NUM_ENT(p1) != NUM_ENT(p2)) return 1;
1154 if(HOFFSET(p1) != HOFFSET(p2)) return 1;
1155 if(LEVEL(p1) != LEVEL(p2)) return 1;
1156
1157 int i;
1158 for(i = 0; i < NUM_ENT(p1); i++) {
1159 unsigned char* data1 = 0;
1160 int data1_length = 0;
1161 unsigned char* data2 = 0;
1162 int data2_length = 0;
1163 switch(TYPE(p1)) {
1164 case P_IBTREE:
1165 {
1166 BINTERNAL* e1 = GET_BINTERNAL(p1, i);
1167 BINTERNAL* e2 = GET_BINTERNAL(p2, i);
1168 if(e1->len != e2->len) return 1;
1169 if(e1->type != e2->type) return 1;
1170 if(e1->pgno != e2->pgno) return 1;
1171 if(e1->nrecs != e2->nrecs) return 1;
1172 data1 = (unsigned char*)e1->data;
1173 data1_length = e1->len;
1174 data2 = (unsigned char*)e2->data;
1175 data2_length = e2->len;
1176 }
1177 break;
1178 case P_LBTREE:
1179 {
1180 BKEYDATA* e1 = GET_BKEYDATA(p1, i);
1181 BKEYDATA* e2 = GET_BKEYDATA(p2, i);
1182 if(e1->len != e2->len) return 1;
1183 if(e1->type != e2->type) return 1;
1184 data1 = (unsigned char*)e1->data;
1185 data1_length = e1->len;
1186 data2 = (unsigned char*)e2->data;
1187 data2_length = e2->len;
1188 }
1189 break;
1190 }
1191 if(data1_length > 0) {
1192 int j;
1193 for(j = 0; j < data1_length; j++) {
1194 if(data1[j] != data2[j]) return 1;
1195 }
1196 }
1197 }
1198 return 0;
1199 }
1200