1 // 2 // WordDBPage.h 3 // 4 // WordDBPage: Implements specific compression scheme for 5 // Berkeley DB pages containing WordReferences objects. 6 // 7 // Part of the ht://Dig package <http://www.htdig.org/> 8 // Copyright (c) 1999-2004 The ht://Dig Group 9 // For copyright details, see the file COPYING in your distribution 10 // or the GNU Library General Public License (LGPL) version 2 or later 11 // <http://www.gnu.org/copyleft/lgpl.html> 12 // 13 // $Id: WordDBPage.h,v 1.8 2004/05/28 13:15:26 lha Exp $ 14 // 15 // 16 // Access to Berkeley DB internal 17 // 18 19 #ifndef _WordDBPage_h_ 20 #define _WordDBPage_h_ 21 22 extern "C" 23 { 24 #include "db_int.h" 25 #include "shqueue.h" 26 #include "db_shash.h" 27 #include "mp.h" 28 #include "db_page.h" 29 #include "common_ext.h" 30 } 31 32 #include "WordDBCompress.h" 33 #include "WordBitCompress.h" 34 #include "WordRecord.h" 35 #include "WordKey.h" 36 37 38 #define WORD_ALIGN_TO(v,a) ( (v)%(a) ? (v+((a)-(v)%(a))) : v) 39 #define NBITS_KEYLEN 16 40 #define NBITS_DATALEN 16 41 42 // *********************************************** 43 // *************** WordDBRecord ***************** 44 // *********************************************** 45 46 // WordRecord with added functionalities to help with compression/decompression 47 class WordDBRecord : public WordRecord 48 { 49 public: 50 51 // retreive WordRecord data/stats from coded numbers set_decompress(unsigned int ** data,int * indexes,int i,int pdata,int pstat0,int pstat1)52 void set_decompress(unsigned int **data,int *indexes,int i,int pdata,int pstat0,int pstat1) 53 { 54 if(i>=indexes[pstat0]) 55 {// were pas the end of coded stats, so this can't be a stat 56 type=DefaultType(); 57 if(type==WORD_RECORD_DATA){info.data=data[pdata][i-indexes[pstat0]];} 58 else{info.data=0;} 59 } 60 else 61 {// this is a stat 62 type=WORD_RECORD_STATS; 63 info.stats.noccurrence=data[pstat0][i]; 64 info.stats.ndoc =data[pstat1][i]; 65 } 66 } WordDBRecord()67 WordDBRecord():WordRecord(){;} WordDBRecord(byte * dat,int len,int rectyp)68 WordDBRecord(byte *dat,int len,int rectyp):WordRecord() 69 { 70 type=(rectyp ? DefaultType() : WORD_RECORD_STATS); 71 Unpack(String((char *)dat,len)); 72 } WordDBRecord(BKEYDATA * ndata,int rectyp)73 WordDBRecord(BKEYDATA *ndata,int rectyp):WordRecord() 74 {// typ: 0->stat 1->data 75 type=(rectyp ? DefaultType() : WORD_RECORD_STATS); 76 Unpack(String((char *)ndata->data,ndata->len)); 77 } 78 }; 79 80 81 // *********************************************** 82 // **************** WordDBKey ***************** 83 // *********************************************** 84 85 // WordKey with added functionalities to help with compression/decompression 86 class WordDBKey : public WordKey 87 { 88 BKEYDATA *key; 89 public: 90 RecType()91 int RecType(){return (GetWord()[0]!=1 ? 1 :0);} WordDBKey()92 WordDBKey():WordKey() 93 { 94 key=NULL; 95 } WordDBKey(BKEYDATA * nkey)96 WordDBKey(BKEYDATA *nkey):WordKey() 97 { 98 key=nkey; 99 Unpack(String((char *)key->data,key->len)); 100 } is_null()101 int is_null() 102 { 103 errr("UNUSED"); 104 if(GetWord().length()==0) 105 { 106 for(int j=1;j<NFields();j++) 107 {if(Get(j)!=0){errr("WordDBKey::is_null key has 0 len word but is not null");}} 108 return 1; 109 } 110 return 0; 111 } WordDBKey(BINTERNAL * nkey)112 WordDBKey(BINTERNAL *nkey):WordKey() 113 { 114 key=NULL; 115 if(nkey->len==0) 116 { 117 ;// errr("WordDBKey::WordDBKey(BINTERNAL) : nkey->len==0"); 118 } 119 else{Unpack(String((char *)nkey->data,nkey->len));} 120 } WordDBKey(byte * data,int len)121 WordDBKey(byte *data,int len):WordKey() 122 { 123 key=NULL; 124 if(!data || !len){errr("WordDBKey::WordDBKey(data,len) !data || !len");} 125 Unpack(String((char *)data,len)); 126 } 127 }; 128 129 130 // *********************************************** 131 // **************** WordDBPage ***************** 132 // *********************************************** 133 134 // encapsulation of Berkeley DB BTREE page. 135 // this one knows how to compress/decompress itself 136 class WordDBPage 137 { 138 public: 139 int n; // number of entries 140 int nk; // number of keys 141 int type; // for now 3(btreeinternal) && 5(leave:normal case) are allowed 142 int pgsz; 143 144 PAGE *pg; // pointer to BerkeleyDB BTREE page structure 145 146 // assert this page is a leave isleave()147 void isleave() 148 { 149 if(type!=P_LBTREE){errr("WordDBPage::isleave: trying leave specific on non leave");} 150 } 151 152 // assert this page is an internal (non-leave) page isintern()153 void isintern() 154 { 155 if(type!=P_IBTREE){errr("WordDBPage::isintern: trying btreeinternal specific on non btreeinternal page type");} 156 157 } 158 159 // get the i'th key stored in this page get_WordDBKey(int i)160 WordDBKey get_WordDBKey(int i) 161 { 162 if(type==P_LBTREE){return(WordDBKey(key(i)));} 163 else 164 if(type==P_IBTREE){return(WordDBKey(btikey(i)));} 165 else 166 {errr("WordDBPage:get_WordDBKey: bad page type");} 167 return WordDBKey(); 168 } 169 170 // ******************* Accessors to packed entries **************** 171 172 // get the i'th key stored in this (internal==nonleave) page. (ptr to packed) btikey(int i)173 BINTERNAL *btikey(int i) 174 { 175 if(i<0 || i>=pg->entries){printf("btikey:%d\n",i);errr("WordDBPage::btikey out iof bounds");} 176 isintern();return(GET_BINTERNAL(pg,i )); 177 } 178 // get the i'th entry stored in this (nonleave) page. (ptr to packed) 179 // an entry can either be a key or a data entry entry(int i)180 BKEYDATA *entry (int i) 181 { 182 if(i<0 || i>=pg->entries){printf("entry:%d\n",i);errr("WordDBPage::entry out iof bounds");} 183 isleave(); return(GET_BKEYDATA (pg,i )); 184 } 185 // get the i'th key stored in this (leave) page. (ptr to packed) key(int i)186 BKEYDATA *key (int i) 187 { 188 if(i<0 || 2*i>=pg->entries){printf("key:%d\n",i);errr("WordDBPage::key out iof bounds");} 189 isleave(); return(GET_BKEYDATA (pg,i*2 )); 190 } 191 // get the i'th data stored in this (leave) page. (ptr to packed) data(int i)192 BKEYDATA *data (int i) 193 { 194 if(i<0 || 2*i+1>=pg->entries){printf("data:%d\n",i);errr("WordDBPage::data out iof bounds");} 195 isleave(); return(GET_BKEYDATA (pg,i*2+1)); 196 } 197 198 199 // ********************* Inserting entries into a page *************** 200 201 int insert_pos; // offset in page of last inserted entry 202 int insert_indx; // index of next entry to be inserted 203 e_offset(int i)204 int e_offset(int i) {return((int)(pg->inp[i]));} 205 206 // allocate space (in the db page) for adding an entry to this page alloc_entry(int size)207 void *alloc_entry(int size) 208 { 209 size=WORD_ALIGN_TO(size,4); 210 int inp_pos=((byte *)&(pg->inp[insert_indx]))-(byte *)pg; 211 insert_pos-=size; 212 if(insert_pos<=inp_pos) 213 { 214 show(); 215 printf("alloc_entry: allocating size:%4d entrynum:insert_indx:%4d at:insert_pos:%4d\n",size,insert_indx,insert_pos); 216 errr("WordDBPage::alloc_entry: PAGE OVERFLOW"); 217 } 218 pg->inp[insert_indx++]=insert_pos; 219 return((void *)((byte *)pg+insert_pos)); 220 } 221 222 223 // add a data entry to this page insert_data(WordDBRecord & wrec)224 void insert_data(WordDBRecord &wrec) 225 { 226 isleave(); 227 if(!(insert_indx%2)){errr("WordDBPage::insert_data data must be an odd number!");} 228 String prec; 229 wrec.Pack(prec); 230 int len=prec.length(); 231 int size=len+(sizeof(BKEYDATA)-1); 232 233 BKEYDATA *dat=(BKEYDATA *)alloc_entry(size); 234 dat->len=len; 235 dat->type=1;//!!!!!!!!!!!!! 236 memcpy((void *)dat->data,(void *)(char *)prec,len); 237 } 238 // add a key entry to this page insert_key(WordDBKey & ky)239 void insert_key(WordDBKey &ky) 240 { 241 isleave(); 242 if(insert_indx%2){errr("WordDBPage::insert_key key must be an even number!");} 243 String pkey; 244 ky.Pack(pkey); 245 int keylen=pkey.length(); 246 int size=keylen+(sizeof(BKEYDATA)-1); 247 BKEYDATA *bky=(BKEYDATA *)alloc_entry(size); 248 bky->len=keylen; 249 bky->type=1;// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 250 memcpy((void *)bky->data,(void *)(char *)pkey,keylen); 251 } 252 // add a key entry to this internal page 253 void insert_btikey(WordDBKey &ky,BINTERNAL &bti,int empty=0) 254 { 255 isintern(); 256 int keylen=0; 257 String pkey; 258 if(!empty) 259 { 260 ky.Pack(pkey); 261 keylen=pkey.length(); 262 } 263 int size=keylen+((byte *)&(bti.data))-((byte *)&bti);// pos of data field in BINTERNAL 264 if(empty) 265 { 266 if(verbose){printf("WordDBPage::insert_btikey: empty : BINTERNAL:%d datapos:%d keylen:%d size:%d alligned to:%d\n",(int)sizeof(BINTERNAL), 267 (int)(((byte *)&(bti.data))-((byte *)&bti)), 268 keylen,size,WORD_ALIGN_TO(size,4));} 269 } 270 271 BINTERNAL *btik=(BINTERNAL *)alloc_entry(size); 272 btik->len =(empty ? 0 : keylen); 273 btik->type=1;// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 274 btik->pgno =bti.pgno; 275 btik->nrecs=bti.nrecs; 276 if(!empty){memcpy((void *)btik->data,(void *)(char *)pkey,keylen);} 277 // else 278 // {btik->data[0]=0;}// just to avoid uninit memory read 279 } entry_struct_size()280 int entry_struct_size() 281 { 282 return(type==P_IBTREE ? sizeof(BINTERNAL) : sizeof(BKEYDATA ) )-1; 283 } entry_size(int i)284 int entry_size(int i) 285 { 286 return entry_struct_size() + (type==P_IBTREE ? btikey(i)->len : key(i)->len ); 287 } 288 289 290 291 292 293 // ************** Comrpession/Uncompression *************************** 294 295 // The compression functions 296 void Compress_extract_vals_wordiffs(int *nums,int *nums_pos,int nnums,HtVector_byte &wordiffs); 297 void Compress_show_extracted(int *nums,int *nums_pos,int nnums,HtVector_byte &wordiffs); 298 void Compress_vals(Compressor &out,int *nums,int *nums_pos,int nnums); 299 void Compress_vals_changed_flags(Compressor &out,unsigned int *cflags,int n); 300 void Compress_header(Compressor &out); 301 int Compress_main(Compressor &out); 302 Compressor *Compress(int debug=0, DB_CMPR_INFO *cmprInfo=NULL); 303 304 // The uncompression functions 305 int Uncompress(Compressor *pin,int debug=0, DB_CMPR_INFO *cmprInfo=NULL); 306 int Uncompress_main(Compressor *pin); 307 void Uncompress_vals_chaged_flags(Compressor &in,unsigned int **pcflags,int *pn); 308 int Uncompress_header(Compressor &in); 309 void Uncompress_rebuild(unsigned int **rnums,int *rnum_sizes,int nnums,byte *rworddiffs,int nrworddiffs); 310 void Uncompress_show_rebuild(unsigned int **rnums,int *rnum_sizes,int nnums,byte *rworddiffs,int nrworddiffs); 311 312 int TestCompress(int debuglevel); 313 int Compare(WordDBPage &other); 314 315 // the following functions are use to compress/uncompress 316 // keys/data directly 317 // This is necesary for the first key/data elements of the page compress_key(Compressor & out,int i)318 void compress_key(Compressor &out,int i) 319 { 320 if(type==P_IBTREE) 321 { 322 int len=btikey(i)->len; 323 out.put_uint(len,NBITS_KEYLEN,label_str("seperatekey_len",i)); 324 if(verbose){printf("WordDBPage::compress_key:compress(typ3):%d ::: sizeof(BINTERNAL):%d\n",len,(int)sizeof(BINTERNAL));} 325 out.put_uint(btikey(i)->len ,sizeof(btikey(i)->len )*8,label_str("seperatekey_bti_len" ,i)); 326 out.put_uint(btikey(i)->type ,sizeof(btikey(i)->type )*8,label_str("seperatekey_bti_type" ,i)); 327 out.put_uint(btikey(i)->pgno ,sizeof(btikey(i)->pgno )*8,label_str("seperatekey_bti_pgno" ,i)); 328 out.put_uint(btikey(i)->nrecs,sizeof(btikey(i)->nrecs)*8,label_str("seperatekey_bti_nrecs",i)); 329 if(len){out.put_zone((byte *)btikey(i)->data,8*len,label_str("seperatekey_btidata",i));} 330 } 331 else 332 { 333 int len=key(i)->len; 334 out.put_uint(len,NBITS_KEYLEN,label_str("seperatekey_len",i)); 335 if(verbose){printf("WordDBPage::compress_key: compress(typ5):%d\n",len);} 336 out.put_zone((byte *)key(i)->data,8*len,label_str("seperatekey_data",i)); 337 } 338 } compress_data(Compressor & out,int i)339 void compress_data(Compressor &out,int i) 340 { 341 int len=data(i)->len; 342 out.put_uint(len,NBITS_DATALEN,label_str("seperatedata_len",i)); 343 if(verbose){printf("WordDBPage::compress_data: compressdata(typ5):%d\n",len);} 344 out.put_zone((byte *)data(i)->data,8*len,label_str("seperatedata_data",i)); 345 } uncompress_key(Compressor & in,int i)346 WordDBKey uncompress_key(Compressor &in,int i) 347 { 348 WordDBKey res; 349 int len=in.get_uint(NBITS_KEYLEN,label_str("seperatekey_len",i)); 350 if(verbose){printf("WordDBPage::uncompress_key: seperatekey:len:%d\n",len);} 351 352 if(type==P_IBTREE) 353 { 354 if(len==0 && i!=0){errr("WordDBPage::uncompress_key: keylen=0 && i!=0");} 355 BINTERNAL bti; 356 bti.len =in.get_uint(sizeof(bti.len )*8,label_str("seperatekey_bti_len" ,i)); 357 bti.type =in.get_uint(sizeof(bti.type )*8,label_str("seperatekey_bti_type" ,i)); 358 bti.pgno =in.get_uint(sizeof(bti.pgno )*8,label_str("seperatekey_bti_pgno" ,i)); 359 bti.nrecs=in.get_uint(sizeof(bti.nrecs)*8,label_str("seperatekey_bti_nrecs",i)); 360 if(len!=bti.len){errr("WordDBPage::uncompress_key: incoherence: len!=bti.len");} 361 if(len) 362 { 363 byte *gotdata=new byte[len]; 364 CHECK_MEM(gotdata); 365 in.get_zone(gotdata,8*len,label_str("seperatekey_btidata",i)); 366 res=WordDBKey(gotdata,len); 367 delete [] gotdata; 368 } 369 insert_btikey(res,bti,(len==0 ? 1:0)); 370 } 371 else 372 { 373 byte *gotdata=new byte[len]; 374 CHECK_MEM(gotdata); 375 in.get_zone(gotdata,8*len,label_str("seperatekey_data",i)); 376 res=WordDBKey(gotdata,len); 377 insert_key(res); 378 delete [] gotdata; 379 } 380 return res; 381 } uncompress_data(Compressor & in,int i,int rectyp)382 WordDBRecord uncompress_data(Compressor &in,int i,int rectyp) 383 { 384 WordDBRecord res; 385 int len=in.get_uint(NBITS_DATALEN,label_str("seperatedata_len",i)); 386 if(verbose)printf("uncompressdata:len:%d\n",len); 387 byte *gotdata=new byte[len]; 388 CHECK_MEM(gotdata); 389 in.get_zone(gotdata,8*len,label_str("seperatedata_data",i)); 390 res=WordDBRecord(gotdata,len,rectyp); 391 insert_data(res); 392 delete [] gotdata; 393 return res; 394 } 395 396 397 // exctracted numerical fields 398 number_field_label(int j)399 const char* number_field_label(int j) 400 { 401 if(j>0 && j<WordKey::NFields()){return (char *)(WordKey::Info()->sort[j].name);} 402 if( j==CNFLAGS )return "CNFLAGS " ; 403 if( j==CNDATASTATS0 )return "CNDATASTATS0 " ; 404 if( j==CNDATASTATS1 )return "CNDATASTATS1 " ; 405 if( j==CNDATADATA )return "CNDATADATA " ; 406 if( j==CNBTIPGNO )return "CNBTIPGNO " ; 407 if( j==CNBTINRECS )return "CNBTINRECS " ; 408 if( j==CNWORDDIFFPOS )return "CNWORDDIFFPOS" ; 409 if( j==CNWORDDIFFLEN )return "CNWORDDIFFLEN" ; 410 return "BADFIELD"; 411 } 412 // positions of different fileds in 413 // number arrays that are extracted 414 int CNFLAGS ;// FLAGS: which key-fields have changed 415 int CNFIELDS ;// first numerical field 416 int CNDATASTATS0 ;// word record - stats element 0 417 int CNDATASTATS1 ;// word record - stats element 1 418 int CNDATADATA ;// word record - data 419 int CNBTIPGNO ;// internal page: page pointed at by node 420 int CNBTINRECS ;// internal page: ?? 421 int CNWORDDIFFPOS ;// position of first caracter that changed in word 422 int CNWORDDIFFLEN ;// number of chars that changed in word 423 int nnums ; 424 425 426 // ************** DEBUGING/BENCHMARKING *************** 427 void show(); 428 int verbose; 429 int debug; 430 431 432 // ************** Initialization/Destruction ***************** 433 434 // initialize when header is valid init()435 void init() 436 { 437 type=pg->type; 438 n=pg->entries; 439 nk=(type==P_LBTREE ? n/2 : n); 440 insert_pos=pgsz; 441 insert_indx=0; 442 } 443 init0()444 void init0() 445 { 446 CNFLAGS =0; 447 CNFIELDS =1; 448 CNDATASTATS0 = WordKey::NFields() ; 449 CNDATASTATS1 = WordKey::NFields() + 1; 450 CNDATADATA = WordKey::NFields() + 2; 451 CNBTIPGNO = WordKey::NFields() + 3; 452 CNBTINRECS = WordKey::NFields() + 4; 453 CNWORDDIFFPOS = WordKey::NFields() + 5; 454 CNWORDDIFFLEN = WordKey::NFields() + 6; 455 nnums=(CNWORDDIFFLEN+1); 456 457 pg=NULL; 458 pgsz=0; 459 n=0; 460 nk=0; 461 type=-1; 462 verbose=0; 463 debug=0; 464 insert_pos=pgsz; 465 insert_indx=0; 466 } 467 468 // db page was created here, destroy it delete_page()469 void delete_page() 470 { 471 if(!pg){errr("WordDBPage::delete_page: pg==NULL");} 472 delete [] pg; 473 pg=NULL; 474 } 475 // unlink db page from this encapsulation unset_page()476 void unset_page() 477 { 478 if(!pg){errr("WordDBPage::unset_page: pg==NULL");} 479 pg=NULL; 480 } 481 // the DB page must be unset or deleted 482 // before destroying this encapsulation ~WordDBPage()483 ~WordDBPage() 484 { 485 if(pg){errr("WordDBPage::~WordDBPage: page not empty");} 486 } WordDBPage(int npgsz)487 WordDBPage(int npgsz) 488 { 489 init0(); 490 pgsz=npgsz; 491 pg=(PAGE *)(new byte[pgsz]); 492 CHECK_MEM(pg); 493 insert_pos=pgsz; 494 insert_indx=0; 495 } WordDBPage(const u_int8_t * buff,int buff_length)496 WordDBPage(const u_int8_t* buff,int buff_length) 497 { 498 init0(); 499 pg=(PAGE *)buff; 500 pgsz=buff_length; 501 insert_pos=pgsz; 502 insert_indx=0; 503 init(); 504 } 505 }; 506 507 508 #endif// _WordDBPage_h_ 509