1 //
2 // WordDBPage.h
3 //
4 // WordDBPage: Implements specific compression scheme for
5 //                 Berkeley DB pages containing WordReferences objects.
6 //
7 // Part of the ht://Dig package   <http://www.htdig.org/>
8 // Copyright (c) 1999-2004 The ht://Dig Group
9 // For copyright details, see the file COPYING in your distribution
10 // or the GNU Library General Public License (LGPL) version 2 or later
11 // <http://www.gnu.org/copyleft/lgpl.html>
12 //
13 // $Id: WordDBPage.h,v 1.8 2004/05/28 13:15:26 lha Exp $
14 //
15 //
16 // Access to Berkeley DB internal
17 //
18 
19 #ifndef _WordDBPage_h_
20 #define _WordDBPage_h_
21 
22 extern "C"
23 {
24 #include "db_int.h"
25 #include "shqueue.h"
26 #include "db_shash.h"
27 #include "mp.h"
28 #include "db_page.h"
29 #include "common_ext.h"
30 }
31 
32 #include "WordDBCompress.h"
33 #include "WordBitCompress.h"
34 #include "WordRecord.h"
35 #include "WordKey.h"
36 
37 
38 #define WORD_ALIGN_TO(v,a) ( (v)%(a) ? (v+((a)-(v)%(a))) : v)
39 #define NBITS_KEYLEN 16
40 #define NBITS_DATALEN 16
41 
42 // ***********************************************
43 // *************** WordDBRecord  *****************
44 // ***********************************************
45 
46 // WordRecord with added functionalities to help with compression/decompression
47 class WordDBRecord : public WordRecord
48 {
49 public:
50 
51     // retreive WordRecord data/stats from coded numbers
set_decompress(unsigned int ** data,int * indexes,int i,int pdata,int pstat0,int pstat1)52     void set_decompress(unsigned int **data,int *indexes,int i,int pdata,int pstat0,int pstat1)
53     {
54 	if(i>=indexes[pstat0])
55 	{// were pas the end of coded stats, so this can't be a stat
56 	    type=DefaultType();
57 	    if(type==WORD_RECORD_DATA){info.data=data[pdata][i-indexes[pstat0]];}
58 	    else{info.data=0;}
59 	}
60 	else
61 	{// this is a stat
62 	    type=WORD_RECORD_STATS;
63 	    info.stats.noccurrence=data[pstat0][i];
64 	    info.stats.ndoc      =data[pstat1][i];
65 	}
66     }
WordDBRecord()67     WordDBRecord():WordRecord(){;}
WordDBRecord(byte * dat,int len,int rectyp)68     WordDBRecord(byte *dat,int len,int rectyp):WordRecord()
69     {
70 	type=(rectyp ? DefaultType() : WORD_RECORD_STATS);
71 	Unpack(String((char *)dat,len));
72     }
WordDBRecord(BKEYDATA * ndata,int rectyp)73     WordDBRecord(BKEYDATA *ndata,int rectyp):WordRecord()
74     {// typ: 0->stat 1->data
75 	type=(rectyp ? DefaultType() : WORD_RECORD_STATS);
76 	Unpack(String((char *)ndata->data,ndata->len));
77     }
78 };
79 
80 
81 // ***********************************************
82 // ****************  WordDBKey   *****************
83 // ***********************************************
84 
85 // WordKey with added functionalities to help with compression/decompression
86 class WordDBKey : public WordKey
87 {
88     BKEYDATA *key;
89 public:
90 
RecType()91     int RecType(){return (GetWord()[0]!=1 ? 1 :0);}
WordDBKey()92     WordDBKey():WordKey()
93     {
94 	key=NULL;
95     }
WordDBKey(BKEYDATA * nkey)96     WordDBKey(BKEYDATA *nkey):WordKey()
97     {
98 	key=nkey;
99 	Unpack(String((char *)key->data,key->len));
100     }
is_null()101     int is_null()
102     {
103 	errr("UNUSED");
104   	if(GetWord().length()==0)
105 	{
106 	    for(int j=1;j<NFields();j++)
107 	    {if(Get(j)!=0){errr("WordDBKey::is_null key has 0 len word but is not null");}}
108 	    return 1;
109 	}
110 	return 0;
111     }
WordDBKey(BINTERNAL * nkey)112     WordDBKey(BINTERNAL *nkey):WordKey()
113     {
114 	key=NULL;
115 	if(nkey->len==0)
116 	{
117 	    ;//	    errr("WordDBKey::WordDBKey(BINTERNAL) : nkey->len==0");
118 	}
119 	else{Unpack(String((char *)nkey->data,nkey->len));}
120     }
WordDBKey(byte * data,int len)121     WordDBKey(byte *data,int len):WordKey()
122     {
123 	key=NULL;
124 	if(!data || !len){errr("WordDBKey::WordDBKey(data,len) !data || !len");}
125 	Unpack(String((char *)data,len));
126     }
127 };
128 
129 
130 // ***********************************************
131 // ****************  WordDBPage  *****************
132 // ***********************************************
133 
134 // encapsulation of Berkeley DB BTREE page.
135 // this one knows how to compress/decompress itself
136 class WordDBPage
137 {
138  public:
139     int n;   // number of entries
140     int nk;  // number of keys
141     int type; // for now 3(btreeinternal) && 5(leave:normal case) are allowed
142     int pgsz;
143 
144     PAGE *pg;         // pointer to BerkeleyDB BTREE page structure
145 
146     // assert this page is a leave
isleave()147     void isleave()
148     {
149 	if(type!=P_LBTREE){errr("WordDBPage::isleave: trying leave specific on non leave");}
150     }
151 
152     // assert this page is an internal (non-leave) page
isintern()153     void isintern()
154     {
155 	if(type!=P_IBTREE){errr("WordDBPage::isintern: trying btreeinternal  specific on non btreeinternal page type");}
156 
157     }
158 
159     // get the i'th key stored in this page
get_WordDBKey(int i)160     WordDBKey get_WordDBKey(int i)
161     {
162 	if(type==P_LBTREE){return(WordDBKey(key(i)));}
163 	else
164 	if(type==P_IBTREE){return(WordDBKey(btikey(i)));}
165 	else
166 	{errr("WordDBPage:get_WordDBKey: bad page type");}
167 	return WordDBKey();
168     }
169 
170     // ******************* Accessors to packed entries ****************
171 
172     // get the i'th key stored in this (internal==nonleave) page. (ptr to packed)
btikey(int i)173     BINTERNAL *btikey(int i)
174     {
175 	if(i<0 || i>=pg->entries){printf("btikey:%d\n",i);errr("WordDBPage::btikey out iof bounds");}
176 	isintern();return(GET_BINTERNAL(pg,i    ));
177     }
178     // get the i'th entry stored in this (nonleave) page. (ptr to packed)
179     // an entry can either be a key or a data entry
entry(int i)180     BKEYDATA  *entry (int i)
181     {
182 	if(i<0 || i>=pg->entries){printf("entry:%d\n",i);errr("WordDBPage::entry out iof bounds");}
183 	isleave(); return(GET_BKEYDATA (pg,i    ));
184     }
185     // get the i'th key stored in this (leave) page. (ptr to packed)
key(int i)186     BKEYDATA  *key   (int i)
187     {
188 	if(i<0 || 2*i>=pg->entries){printf("key:%d\n",i);errr("WordDBPage::key out iof bounds");}
189 	isleave(); return(GET_BKEYDATA (pg,i*2  ));
190     }
191     // get the i'th data stored in this (leave) page. (ptr to packed)
data(int i)192     BKEYDATA  *data  (int i)
193     {
194 	if(i<0 || 2*i+1>=pg->entries){printf("data:%d\n",i);errr("WordDBPage::data out iof bounds");}
195 	isleave(); return(GET_BKEYDATA (pg,i*2+1));
196     }
197 
198 
199     // ********************* Inserting entries into a page ***************
200 
201     int insert_pos;   // offset in page of last inserted entry
202     int insert_indx;  // index of next entry to be inserted
203 
e_offset(int i)204     int e_offset(int i) {return((int)(pg->inp[i]));}
205 
206     // allocate space (in the db page) for adding an entry to this page
alloc_entry(int size)207     void *alloc_entry(int size)
208     {
209 	size=WORD_ALIGN_TO(size,4);
210 	int inp_pos=((byte *)&(pg->inp[insert_indx]))-(byte *)pg;
211 	insert_pos-=size;
212 	if(insert_pos<=inp_pos)
213 	{
214 	    show();
215 	    printf("alloc_entry: allocating size:%4d entrynum:insert_indx:%4d at:insert_pos:%4d\n",size,insert_indx,insert_pos);
216 	    errr("WordDBPage::alloc_entry: PAGE OVERFLOW");
217 	}
218 	pg->inp[insert_indx++]=insert_pos;
219 	return((void *)((byte *)pg+insert_pos));
220     }
221 
222 
223     // add a data entry to this page
insert_data(WordDBRecord & wrec)224     void insert_data(WordDBRecord &wrec)
225     {
226 	isleave();
227 	if(!(insert_indx%2)){errr("WordDBPage::insert_data data must be an odd number!");}
228 	String prec;
229 	wrec.Pack(prec);
230 	int len=prec.length();
231 	int size=len+(sizeof(BKEYDATA)-1);
232 
233 	BKEYDATA *dat=(BKEYDATA *)alloc_entry(size);
234 	dat->len=len;
235 	dat->type=1;//!!!!!!!!!!!!!
236 	memcpy((void *)dat->data,(void *)(char *)prec,len);
237     }
238     // add a key entry to this page
insert_key(WordDBKey & ky)239     void insert_key(WordDBKey &ky)
240     {
241 	isleave();
242 	if(insert_indx%2){errr("WordDBPage::insert_key key must be an even number!");}
243 	String pkey;
244 	ky.Pack(pkey);
245 	int keylen=pkey.length();
246 	int size=keylen+(sizeof(BKEYDATA)-1);
247 	BKEYDATA *bky=(BKEYDATA *)alloc_entry(size);
248 	bky->len=keylen;
249 	bky->type=1;// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
250 	memcpy((void *)bky->data,(void *)(char *)pkey,keylen);
251     }
252     // add a key entry to this internal page
253     void insert_btikey(WordDBKey &ky,BINTERNAL &bti,int empty=0)
254     {
255 	isintern();
256 	int keylen=0;
257 	String pkey;
258 	if(!empty)
259 	{
260 	    ky.Pack(pkey);
261 	    keylen=pkey.length();
262 	}
263 	int size=keylen+((byte *)&(bti.data))-((byte *)&bti);// pos of data field in BINTERNAL
264 	if(empty)
265 	{
266 	    if(verbose){printf("WordDBPage::insert_btikey: empty : BINTERNAL:%d datapos:%d keylen:%d size:%d alligned to:%d\n",(int)sizeof(BINTERNAL),
267 			       (int)(((byte *)&(bti.data))-((byte *)&bti)),
268 			       keylen,size,WORD_ALIGN_TO(size,4));}
269 	}
270 
271 	BINTERNAL *btik=(BINTERNAL *)alloc_entry(size);
272 	btik->len  =(empty ? 0 : keylen);
273 	btik->type=1;// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
274 	btik->pgno =bti.pgno;
275 	btik->nrecs=bti.nrecs;
276 	if(!empty){memcpy((void *)btik->data,(void *)(char *)pkey,keylen);}
277 //  	else
278 //  	{btik->data[0]=0;}// just to avoid uninit memory read
279     }
entry_struct_size()280     int entry_struct_size()
281     {
282 	return(type==P_IBTREE ? sizeof(BINTERNAL) : sizeof(BKEYDATA )   )-1;
283     }
entry_size(int i)284     int entry_size(int i)
285     {
286 	return entry_struct_size() + (type==P_IBTREE ? btikey(i)->len : key(i)->len   );
287     }
288 
289 
290 
291 
292 
293     // ************** Comrpession/Uncompression ***************************
294 
295     // The compression functions
296     void Compress_extract_vals_wordiffs(int *nums,int *nums_pos,int nnums,HtVector_byte &wordiffs);
297     void Compress_show_extracted(int *nums,int *nums_pos,int nnums,HtVector_byte &wordiffs);
298     void Compress_vals(Compressor &out,int *nums,int *nums_pos,int nnums);
299     void Compress_vals_changed_flags(Compressor &out,unsigned int *cflags,int n);
300     void Compress_header(Compressor &out);
301     int  Compress_main(Compressor &out);
302     Compressor *Compress(int debug=0, DB_CMPR_INFO *cmprInfo=NULL);
303 
304     // The uncompression functions
305     int  Uncompress(Compressor *pin,int debug=0, DB_CMPR_INFO *cmprInfo=NULL);
306     int  Uncompress_main(Compressor *pin);
307     void Uncompress_vals_chaged_flags(Compressor &in,unsigned int **pcflags,int *pn);
308     int  Uncompress_header(Compressor &in);
309     void Uncompress_rebuild(unsigned int **rnums,int *rnum_sizes,int nnums,byte *rworddiffs,int nrworddiffs);
310     void Uncompress_show_rebuild(unsigned int **rnums,int *rnum_sizes,int nnums,byte *rworddiffs,int nrworddiffs);
311 
312     int TestCompress(int debuglevel);
313     int Compare(WordDBPage &other);
314 
315     // the following functions are use to compress/uncompress
316     // keys/data directly
317     // This is necesary for the first key/data elements of the page
compress_key(Compressor & out,int i)318     void compress_key(Compressor &out,int i)
319     {
320 	if(type==P_IBTREE)
321 	{
322 	    int len=btikey(i)->len;
323 	    out.put_uint(len,NBITS_KEYLEN,label_str("seperatekey_len",i));
324 	    if(verbose){printf("WordDBPage::compress_key:compress(typ3):%d ::: sizeof(BINTERNAL):%d\n",len,(int)sizeof(BINTERNAL));}
325 	    out.put_uint(btikey(i)->len  ,sizeof(btikey(i)->len  )*8,label_str("seperatekey_bti_len"  ,i));
326 	    out.put_uint(btikey(i)->type ,sizeof(btikey(i)->type )*8,label_str("seperatekey_bti_type" ,i));
327 	    out.put_uint(btikey(i)->pgno ,sizeof(btikey(i)->pgno )*8,label_str("seperatekey_bti_pgno" ,i));
328 	    out.put_uint(btikey(i)->nrecs,sizeof(btikey(i)->nrecs)*8,label_str("seperatekey_bti_nrecs",i));
329 	    if(len){out.put_zone((byte *)btikey(i)->data,8*len,label_str("seperatekey_btidata",i));}
330 	}
331 	else
332 	{
333 	    int len=key(i)->len;
334 	    out.put_uint(len,NBITS_KEYLEN,label_str("seperatekey_len",i));
335 	    if(verbose){printf("WordDBPage::compress_key: compress(typ5):%d\n",len);}
336 	    out.put_zone((byte *)key(i)->data,8*len,label_str("seperatekey_data",i));
337 	}
338     }
compress_data(Compressor & out,int i)339     void compress_data(Compressor &out,int i)
340     {
341 	int len=data(i)->len;
342 	out.put_uint(len,NBITS_DATALEN,label_str("seperatedata_len",i));
343 	if(verbose){printf("WordDBPage::compress_data: compressdata(typ5):%d\n",len);}
344 	out.put_zone((byte *)data(i)->data,8*len,label_str("seperatedata_data",i));
345     }
uncompress_key(Compressor & in,int i)346     WordDBKey uncompress_key(Compressor &in,int i)
347     {
348 	WordDBKey res;
349 	int len=in.get_uint(NBITS_KEYLEN,label_str("seperatekey_len",i));
350 	if(verbose){printf("WordDBPage::uncompress_key: seperatekey:len:%d\n",len);}
351 
352 	if(type==P_IBTREE)
353 	{
354 	    if(len==0 && i!=0){errr("WordDBPage::uncompress_key: keylen=0 &&    i!=0");}
355 	    BINTERNAL bti;
356 	    bti.len  =in.get_uint(sizeof(bti.len  )*8,label_str("seperatekey_bti_len"  ,i));
357 	    bti.type =in.get_uint(sizeof(bti.type )*8,label_str("seperatekey_bti_type" ,i));
358 	    bti.pgno =in.get_uint(sizeof(bti.pgno )*8,label_str("seperatekey_bti_pgno" ,i));
359 	    bti.nrecs=in.get_uint(sizeof(bti.nrecs)*8,label_str("seperatekey_bti_nrecs",i));
360 	    if(len!=bti.len){errr("WordDBPage::uncompress_key: incoherence: len!=bti.len");}
361 	    if(len)
362 	    {
363 		byte *gotdata=new byte[len];
364 		CHECK_MEM(gotdata);
365 		in.get_zone(gotdata,8*len,label_str("seperatekey_btidata",i));
366 		res=WordDBKey(gotdata,len);
367 		delete [] gotdata;
368 	    }
369 	    insert_btikey(res,bti,(len==0 ? 1:0));
370 	}
371 	else
372 	{
373 	    byte *gotdata=new byte[len];
374 	    CHECK_MEM(gotdata);
375 	    in.get_zone(gotdata,8*len,label_str("seperatekey_data",i));
376 	    res=WordDBKey(gotdata,len);
377 	    insert_key(res);
378 	    delete [] gotdata;
379 	}
380 	return res;
381     }
uncompress_data(Compressor & in,int i,int rectyp)382     WordDBRecord uncompress_data(Compressor &in,int i,int rectyp)
383     {
384 	WordDBRecord res;
385 	int len=in.get_uint(NBITS_DATALEN,label_str("seperatedata_len",i));
386 	if(verbose)printf("uncompressdata:len:%d\n",len);
387 	byte *gotdata=new byte[len];
388 	CHECK_MEM(gotdata);
389 	in.get_zone(gotdata,8*len,label_str("seperatedata_data",i));
390 	res=WordDBRecord(gotdata,len,rectyp);
391 	insert_data(res);
392 	delete [] gotdata;
393 	return res;
394     }
395 
396 
397     // exctracted numerical fields
398 
number_field_label(int j)399     const char* number_field_label(int j)
400     {
401 	if(j>0 && j<WordKey::NFields()){return (char *)(WordKey::Info()->sort[j].name);}
402 	if( j==CNFLAGS        )return "CNFLAGS      "  ;
403 	if( j==CNDATASTATS0   )return "CNDATASTATS0 "  ;
404 	if( j==CNDATASTATS1   )return "CNDATASTATS1 "  ;
405 	if( j==CNDATADATA     )return "CNDATADATA   "  ;
406 	if( j==CNBTIPGNO      )return "CNBTIPGNO    "  ;
407 	if( j==CNBTINRECS     )return "CNBTINRECS   "  ;
408 	if( j==CNWORDDIFFPOS  )return "CNWORDDIFFPOS"  ;
409 	if( j==CNWORDDIFFLEN  )return "CNWORDDIFFLEN"  ;
410 	return "BADFIELD";
411     }
412     // positions of different fileds in
413     // number arrays that are extracted
414     int CNFLAGS        ;// FLAGS: which key-fields have changed
415     int CNFIELDS       ;// first numerical field
416     int CNDATASTATS0   ;// word record - stats element 0
417     int CNDATASTATS1   ;// word record - stats element 1
418     int CNDATADATA     ;// word record - data
419     int CNBTIPGNO      ;// internal page: page pointed at by node
420     int CNBTINRECS     ;// internal page: ??
421     int CNWORDDIFFPOS  ;// position of first caracter that changed in word
422     int CNWORDDIFFLEN  ;// number of chars that  changed in word
423     int nnums      ;
424 
425 
426     // ************** DEBUGING/BENCHMARKING  ***************
427     void show();
428     int verbose;
429     int debug;
430 
431 
432     // ************** Initialization/Destruction *****************
433 
434     // initialize when header is valid
init()435     void init()
436     {
437 	type=pg->type;
438 	n=pg->entries;
439 	nk=(type==P_LBTREE ? n/2 : n);
440 	insert_pos=pgsz;
441 	insert_indx=0;
442     }
443 
init0()444     void init0()
445     {
446 	CNFLAGS        =0;
447 	CNFIELDS       =1;
448 	CNDATASTATS0   = WordKey::NFields()    ;
449 	CNDATASTATS1   = WordKey::NFields() + 1;
450 	CNDATADATA     = WordKey::NFields() + 2;
451 	CNBTIPGNO      = WordKey::NFields() + 3;
452 	CNBTINRECS     = WordKey::NFields() + 4;
453 	CNWORDDIFFPOS  = WordKey::NFields() + 5;
454 	CNWORDDIFFLEN  = WordKey::NFields() + 6;
455 	nnums=(CNWORDDIFFLEN+1);
456 
457 	pg=NULL;
458 	pgsz=0;
459 	n=0;
460 	nk=0;
461 	type=-1;
462 	verbose=0;
463 	debug=0;
464 	insert_pos=pgsz;
465 	insert_indx=0;
466     }
467 
468     // db page was created here, destroy it
delete_page()469     void delete_page()
470     {
471        if(!pg){errr("WordDBPage::delete_page: pg==NULL");}
472        delete [] pg;
473        pg=NULL;
474     }
475     // unlink db page from this encapsulation
unset_page()476     void unset_page()
477     {
478        if(!pg){errr("WordDBPage::unset_page: pg==NULL");}
479        pg=NULL;
480     }
481     // the DB page must be unset or deleted
482     // before destroying this encapsulation
~WordDBPage()483     ~WordDBPage()
484     {
485 	if(pg){errr("WordDBPage::~WordDBPage: page not empty");}
486     }
WordDBPage(int npgsz)487     WordDBPage(int npgsz)
488     {
489 	init0();
490 	pgsz=npgsz;
491 	pg=(PAGE *)(new byte[pgsz]);
492 	CHECK_MEM(pg);
493 	insert_pos=pgsz;
494 	insert_indx=0;
495     }
WordDBPage(const u_int8_t * buff,int buff_length)496     WordDBPage(const u_int8_t* buff,int buff_length)
497     {
498 	init0();
499 	pg=(PAGE *)buff;
500 	pgsz=buff_length;
501 	insert_pos=pgsz;
502 	insert_indx=0;
503 	init();
504     }
505 };
506 
507 
508 #endif// _WordDBPage_h_
509