//
// WordDBPage.h
//
// WordDBPage: Implements specific compression scheme for
// Berkeley DB pages containing WordReferences objects.
//
// Part of the ht://Dig package
// Copyright (c) 1999-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
//
//
// $Id: WordDBPage.h,v 1.8 2004/05/28 13:15:26 lha Exp $
//
//
// Access to Berkeley DB internal
//
#ifndef _WordDBPage_h_
#define _WordDBPage_h_
extern "C"
{
#include "db_int.h"
#include "shqueue.h"
#include "db_shash.h"
#include "mp.h"
#include "db_page.h"
#include "common_ext.h"
}
#include "WordDBCompress.h"
#include "WordBitCompress.h"
#include "WordRecord.h"
#include "WordKey.h"
// Round v up to the next multiple of a; v is returned unchanged when it is
// already a multiple of a.
// Every use of an argument is parenthesised so that compound expressions
// (e.g. WORD_ALIGN_TO(x|y,4)) expand with the intended precedence.
// Both arguments are still evaluated more than once: do not pass
// expressions with side effects.
#define WORD_ALIGN_TO(v,a) ( (v)%(a) ? ((v)+((a)-(v)%(a))) : (v) )
// Number of bits used to store the length of a separately compressed key
#define NBITS_KEYLEN 16
// Number of bits used to store the length of a separately compressed data item
#define NBITS_DATALEN 16
// ***********************************************
// *************** WordDBRecord *****************
// ***********************************************
// WordRecord extended with helpers used by the page compression and
// decompression code.
class WordDBRecord : public WordRecord
{
public:
    // Rebuild this record from the decoded number arrays.
    //   data    : the decoded arrays, one per extracted field
    //   indexes : per-array counts; indexes[pstat0] is the number of coded stats
    //   i       : position of the record being rebuilt
    //   pdata, pstat0, pstat1 : which arrays hold the data value and the
    //                           two statistics values
    void set_decompress(unsigned int **data,int *indexes,int i,int pdata,int pstat0,int pstat1)
    {
        const int nstats=indexes[pstat0];
        if(i<nstats)
        {
            // still inside the coded stats: this is a statistics record
            type=WORD_RECORD_STATS;
            info.stats.noccurrence=data[pstat0][i];
            info.stats.ndoc=data[pstat1][i];
            return;
        }
        // past the end of the coded stats, so this cannot be a stat
        type=DefaultType();
        if(type!=WORD_RECORD_DATA)
        {
            info.data=0;
        }
        else
        {
            info.data=data[pdata][i-nstats];
        }
    }

    // Empty record
    WordDBRecord():WordRecord(){;}

    // Unpack a record from len bytes starting at dat.
    // rectyp: 0 -> stat, anything else -> DefaultType()
    WordDBRecord(byte *dat,int len,int rectyp):WordRecord()
    {
        if(rectyp){type=DefaultType();}
        else{type=WORD_RECORD_STATS;}
        Unpack(String((char *)dat,len));
    }

    // Unpack a record from a packed page entry.
    // rectyp: 0 -> stat, anything else -> DefaultType()
    WordDBRecord(BKEYDATA *ndata,int rectyp):WordRecord()
    {
        if(rectyp){type=DefaultType();}
        else{type=WORD_RECORD_STATS;}
        Unpack(String((char *)ndata->data,ndata->len));
    }
};
// ***********************************************
// **************** WordDBKey *****************
// ***********************************************
// WordKey with added functionalities to help with compression/decompression
class WordDBKey : public WordKey
{
BKEYDATA *key;
public:
int RecType(){return (GetWord()[0]!=1 ? 1 :0);}
WordDBKey():WordKey()
{
key=NULL;
}
WordDBKey(BKEYDATA *nkey):WordKey()
{
key=nkey;
Unpack(String((char *)key->data,key->len));
}
int is_null()
{
errr("UNUSED");
if(GetWord().length()==0)
{
for(int j=1;jlen==0)
{
;// errr("WordDBKey::WordDBKey(BINTERNAL) : nkey->len==0");
}
else{Unpack(String((char *)nkey->data,nkey->len));}
}
WordDBKey(byte *data,int len):WordKey()
{
key=NULL;
if(!data || !len){errr("WordDBKey::WordDBKey(data,len) !data || !len");}
Unpack(String((char *)data,len));
}
};
// ***********************************************
// **************** WordDBPage *****************
// ***********************************************
// encapsulation of Berkeley DB BTREE page.
// this one knows how to compress/decompress itself
class WordDBPage
{
public:
int n; // number of entries on the page (copied from pg->entries by init())
int nk; // number of keys: n/2 on a leaf page, n on any other page (see init())
int type; // page type copied from pg->type; for now 3(btreeinternal) and 5(leaf: normal case) are allowed
int pgsz; // size in bytes of the buffer pg points to
PAGE *pg; // pointer to BerkeleyDB BTREE page structure
// check that this page is a leaf (P_LBTREE); report through errr() otherwise
void isleave()
{
    const bool is_leaf=(type==P_LBTREE);
    if(!is_leaf){errr("WordDBPage::isleave: trying leave specific on non leave");}
}
// check that this page is an internal (non-leaf, P_IBTREE) page;
// report through errr() otherwise
void isintern()
{
    const bool is_internal=(type==P_IBTREE);
    if(!is_internal){errr("WordDBPage::isintern: trying btreeinternal specific on non btreeinternal page type");}
}
// Unpack and return the i'th key of this page, whatever the page type.
// Any page type other than leaf or internal is reported through errr()
// and an empty key is returned.
WordDBKey get_WordDBKey(int i)
{
    switch(type)
    {
    case P_LBTREE:
        return WordDBKey(key(i));
    case P_IBTREE:
        return WordDBKey(btikey(i));
    default:
        break;
    }
    errr("WordDBPage:get_WordDBKey: bad page type");
    return WordDBKey();
}
// ******************* Accessors to packed entries ****************
// Pointer to the packed i'th key of this internal (non-leaf) page.
// An out-of-range index is printed and reported through errr().
BINTERNAL *btikey(int i)
{
    const bool in_range=(i>=0 && i<pg->entries);
    if(!in_range)
    {
        printf("btikey:%d\n",i);
        errr("WordDBPage::btikey out iof bounds");
    }
    isintern();
    return GET_BINTERNAL(pg,i);
}
// Pointer to the packed i'th entry of this leaf page.
// An entry can be either a key or a data item.
// An out-of-range index is printed and reported through errr().
BKEYDATA *entry(int i)
{
    const bool in_range=(i>=0 && i<pg->entries);
    if(!in_range)
    {
        printf("entry:%d\n",i);
        errr("WordDBPage::entry out iof bounds");
    }
    isleave();
    return GET_BKEYDATA(pg,i);
}
// Pointer to the packed i'th key of this leaf page.
// Keys sit in the even entries, so key i is entry 2*i.
// An out-of-range index is printed and reported through errr().
BKEYDATA *key(int i)
{
    const bool in_range=(i>=0 && 2*i<pg->entries);
    if(!in_range)
    {
        printf("key:%d\n",i);
        errr("WordDBPage::key out iof bounds");
    }
    isleave();
    return GET_BKEYDATA(pg,i*2);
}
// Pointer to the packed i'th data item of this leaf page.
// Data items sit in the odd entries, so data i is entry 2*i+1.
// An out-of-range index is printed and reported through errr().
BKEYDATA *data(int i)
{
    const bool in_range=(i>=0 && 2*i+1<pg->entries);
    if(!in_range)
    {
        printf("data:%d\n",i);
        errr("WordDBPage::data out iof bounds");
    }
    isleave();
    return GET_BKEYDATA(pg,i*2+1);
}
// ********************* Inserting entries into a page ***************
// Entries are allocated downwards from the end of the page: insert_pos
// starts at pgsz and alloc_entry() decreases it by each aligned entry size.
int insert_pos; // offset in page of last inserted entry
// Slot of pg->inp[] that the next alloc_entry() call fills in
int insert_indx; // index of next entry to be inserted
// offset stored in the i'th slot of the page's index array (pg->inp)
int e_offset(int i)
{
    return static_cast<int>(pg->inp[i]);
}
// allocate space (in the db page) for adding an entry to this page
void *alloc_entry(int size)
{
size=WORD_ALIGN_TO(size,4);
int inp_pos=((byte *)&(pg->inp[insert_indx]))-(byte *)pg;
insert_pos-=size;
if(insert_pos<=inp_pos)
{
show();
printf("alloc_entry: allocating size:%4d entrynum:insert_indx:%4d at:insert_pos:%4d\n",size,insert_indx,insert_pos);
errr("WordDBPage::alloc_entry: PAGE OVERFLOW");
}
pg->inp[insert_indx++]=insert_pos;
return((void *)((byte *)pg+insert_pos));
}
// Append a data entry to this leaf page.
// Data entries occupy the odd slots, so a key must have been inserted
// just before; an even insert_indx is reported through errr().
void insert_data(WordDBRecord &wrec)
{
    isleave();
    const bool on_key_slot=(insert_indx%2==0);
    if(on_key_slot){errr("WordDBPage::insert_data data must be an odd number!");}
    String packed;
    wrec.Pack(packed);
    const int nbytes=packed.length();
    // room for the BKEYDATA structure (minus 1) plus the packed bytes
    BKEYDATA *slot=(BKEYDATA *)alloc_entry(nbytes+(sizeof(BKEYDATA)-1));
    slot->len=nbytes;
    slot->type=1; // hard-coded entry type
    memcpy((void *)slot->data,(void *)(char *)packed,nbytes);
}
// Append a key entry to this leaf page.
// Key entries occupy the even slots; an odd insert_indx is reported
// through errr().
void insert_key(WordDBKey &ky)
{
    isleave();
    const bool on_data_slot=(insert_indx%2!=0);
    if(on_data_slot){errr("WordDBPage::insert_key key must be an even number!");}
    String packed;
    ky.Pack(packed);
    const int nbytes=packed.length();
    // room for the BKEYDATA structure (minus 1) plus the packed bytes
    BKEYDATA *slot=(BKEYDATA *)alloc_entry(nbytes+(sizeof(BKEYDATA)-1));
    slot->len=nbytes;
    slot->type=1; // hard-coded entry type
    memcpy((void *)slot->data,(void *)(char *)packed,nbytes);
}
// Append a key entry to this internal page.
//   ky    : key to pack into the entry (ignored when empty is non-zero)
//   bti   : supplies the pgno and nrecs fields copied into the new entry
//   empty : non-zero inserts an entry with len 0 and no key bytes
void insert_btikey(WordDBKey &ky,BINTERNAL &bti,int empty=0)
{
    isintern();
    String packed;
    int keylen=0;
    if(!empty)
    {
        ky.Pack(packed);
        keylen=packed.length();
    }
    // position of the data field inside BINTERNAL
    const int datapos=(int)(((byte *)&(bti.data))-((byte *)&bti));
    const int size=keylen+datapos;
    if(empty && verbose)
    {
        printf("WordDBPage::insert_btikey: empty : BINTERNAL:%d datapos:%d keylen:%d size:%d alligned to:%d\n",(int)sizeof(BINTERNAL),
               datapos,
               keylen,size,WORD_ALIGN_TO(size,4));
    }
    BINTERNAL *slot=(BINTERNAL *)alloc_entry(size);
    slot->len=keylen; // keylen is still 0 for an empty entry
    slot->type=1; // hard-coded entry type
    slot->pgno=bti.pgno;
    slot->nrecs=bti.nrecs;
    if(!empty){memcpy((void *)slot->data,(void *)(char *)packed,keylen);}
    // for an empty entry the data bytes are left untouched
}
// size of the entry structure used by this page type (BINTERNAL on an
// internal page, BKEYDATA otherwise), minus 1
int entry_struct_size()
{
    if(type==P_IBTREE){return sizeof(BINTERNAL)-1;}
    return sizeof(BKEYDATA)-1;
}
// entry_struct_size() plus the length of the i'th key of this page
int entry_size(int i)
{
    int keylen;
    if(type==P_IBTREE){keylen=btikey(i)->len;}
    else{keylen=key(i)->len;}
    return entry_struct_size()+keylen;
}
// ************** Comrpession/Uncompression ***************************
// The compression functions
// (declarations only: the bodies are not in this header, so the notes
// below are taken from the names and signatures - confirm against the
// implementation)
// presumably fills nums/nums_pos with the numerical fields and wordiffs
// with the word differences extracted from the page
void Compress_extract_vals_wordiffs(int *nums,int *nums_pos,int nnums,HtVector_byte &wordiffs);
// presumably a debugging dump of what the extraction produced
void Compress_show_extracted(int *nums,int *nums_pos,int nnums,HtVector_byte &wordiffs);
void Compress_vals(Compressor &out,int *nums,int *nums_pos,int nnums);
void Compress_vals_changed_flags(Compressor &out,unsigned int *cflags,int n);
void Compress_header(Compressor &out);
int Compress_main(Compressor &out);
// entry point: returns a pointer to a Compressor
// NOTE(review): ownership of the returned object is not visible here
Compressor *Compress(int debug=0, DB_CMPR_INFO *cmprInfo=NULL);
// The uncompression functions
// entry point: reads from the Compressor pointed to by pin
int Uncompress(Compressor *pin,int debug=0, DB_CMPR_INFO *cmprInfo=NULL);
int Uncompress_main(Compressor *pin);
// ("chaged" is part of the declared name and must match the definition)
void Uncompress_vals_chaged_flags(Compressor &in,unsigned int **pcflags,int *pn);
int Uncompress_header(Compressor &in);
void Uncompress_rebuild(unsigned int **rnums,int *rnum_sizes,int nnums,byte *rworddiffs,int nrworddiffs);
void Uncompress_show_rebuild(unsigned int **rnums,int *rnum_sizes,int nnums,byte *rworddiffs,int nrworddiffs);
// presumably a self test of compression/uncompression
int TestCompress(int debuglevel);
// presumably compares this page with other; the meaning of the result
// is not visible here
int Compare(WordDBPage &other);
// The following functions compress/uncompress keys and data directly.
// This is necessary for the first key/data elements of the page.
//
// Write the i'th key of this page to out.
// Internal page: the key length on NBITS_KEYLEN bits, then the len, type,
// pgno and nrecs fields of the BINTERNAL entry at their full width, then
// the key bytes when the length is not 0.
// Any other page: the key length on NBITS_KEYLEN bits, then the key bytes.
// The entry is fetched once, so the accessor's bounds check runs a single
// time per call instead of once per field.
void compress_key(Compressor &out,int i)
{
    if(type==P_IBTREE)
    {
        BINTERNAL *bti=btikey(i);
        int len=bti->len;
        out.put_uint(len,NBITS_KEYLEN,label_str("seperatekey_len",i));
        if(verbose){printf("WordDBPage::compress_key:compress(typ3):%d ::: sizeof(BINTERNAL):%d\n",len,(int)sizeof(BINTERNAL));}
        out.put_uint(bti->len  ,sizeof(bti->len  )*8,label_str("seperatekey_bti_len"  ,i));
        out.put_uint(bti->type ,sizeof(bti->type )*8,label_str("seperatekey_bti_type" ,i));
        out.put_uint(bti->pgno ,sizeof(bti->pgno )*8,label_str("seperatekey_bti_pgno" ,i));
        out.put_uint(bti->nrecs,sizeof(bti->nrecs)*8,label_str("seperatekey_bti_nrecs",i));
        if(len){out.put_zone((byte *)bti->data,8*len,label_str("seperatekey_btidata",i));}
    }
    else
    {
        BKEYDATA *ky=key(i);
        int len=ky->len;
        out.put_uint(len,NBITS_KEYLEN,label_str("seperatekey_len",i));
        if(verbose){printf("WordDBPage::compress_key: compress(typ5):%d\n",len);}
        out.put_zone((byte *)ky->data,8*len,label_str("seperatekey_data",i));
    }
}
// Write the i'th data item of this leaf page to out: its length on
// NBITS_DATALEN bits followed by its bytes.
// The entry is fetched once, so the accessor's bounds check runs a single
// time per call.
void compress_data(Compressor &out,int i)
{
    BKEYDATA *rec=data(i);
    int len=rec->len;
    out.put_uint(len,NBITS_DATALEN,label_str("seperatedata_len",i));
    if(verbose){printf("WordDBPage::compress_data: compressdata(typ5):%d\n",len);}
    out.put_zone((byte *)rec->data,8*len,label_str("seperatedata_data",i));
}
// Read the i'th key from in (the counterpart of compress_key), insert it
// into this page and return it unpacked.
// On an internal page a zero-length key is only accepted for i==0, and the
// stored BINTERNAL len field must agree with the length read first; both
// violations are reported through errr().
WordDBKey uncompress_key(Compressor &in,int i)
{
    WordDBKey res;
    int len=in.get_uint(NBITS_KEYLEN,label_str("seperatekey_len",i));
    if(verbose){printf("WordDBPage::uncompress_key: seperatekey:len:%d\n",len);}

    if(type!=P_IBTREE)
    {
        // leaf page: the key bytes follow the length directly
        byte *gotdata=new byte[len];
        CHECK_MEM(gotdata);
        in.get_zone(gotdata,8*len,label_str("seperatekey_data",i));
        res=WordDBKey(gotdata,len);
        insert_key(res);
        delete [] gotdata;
        return res;
    }

    // internal page: the BINTERNAL fields come first, then the key bytes
    if(len==0 && i!=0){errr("WordDBPage::uncompress_key: keylen=0 && i!=0");}
    BINTERNAL bti;
    bti.len  =in.get_uint(sizeof(bti.len  )*8,label_str("seperatekey_bti_len"  ,i));
    bti.type =in.get_uint(sizeof(bti.type )*8,label_str("seperatekey_bti_type" ,i));
    bti.pgno =in.get_uint(sizeof(bti.pgno )*8,label_str("seperatekey_bti_pgno" ,i));
    bti.nrecs=in.get_uint(sizeof(bti.nrecs)*8,label_str("seperatekey_bti_nrecs",i));
    if(len!=bti.len){errr("WordDBPage::uncompress_key: incoherence: len!=bti.len");}
    if(len)
    {
        byte *gotdata=new byte[len];
        CHECK_MEM(gotdata);
        in.get_zone(gotdata,8*len,label_str("seperatekey_btidata",i));
        res=WordDBKey(gotdata,len);
        delete [] gotdata;
    }
    const int empty=(len==0 ? 1 : 0);
    insert_btikey(res,bti,empty);
    return res;
}
// Read the i'th data item from in (the counterpart of compress_data),
// insert it into this page and return it unpacked.
// rectyp is forwarded to the WordDBRecord constructor.
WordDBRecord uncompress_data(Compressor &in,int i,int rectyp)
{
    int len=in.get_uint(NBITS_DATALEN,label_str("seperatedata_len",i));
    if(verbose){printf("uncompressdata:len:%d\n",len);}
    byte *gotdata=new byte[len];
    CHECK_MEM(gotdata);
    in.get_zone(gotdata,8*len,label_str("seperatedata_data",i));
    WordDBRecord res;
    res=WordDBRecord(gotdata,len,rectyp);
    insert_data(res);
    delete [] gotdata;
    return res;
}
// exctracted numerical fields
const char* number_field_label(int j)
{
if(j>0 && jsort[j].name);}
if( j==CNFLAGS )return "CNFLAGS " ;
if( j==CNDATASTATS0 )return "CNDATASTATS0 " ;
if( j==CNDATASTATS1 )return "CNDATASTATS1 " ;
if( j==CNDATADATA )return "CNDATADATA " ;
if( j==CNBTIPGNO )return "CNBTIPGNO " ;
if( j==CNBTINRECS )return "CNBTINRECS " ;
if( j==CNWORDDIFFPOS )return "CNWORDDIFFPOS" ;
if( j==CNWORDDIFFLEN )return "CNWORDDIFFLEN" ;
return "BADFIELD";
}
// positions of the different fields in
// the number arrays that are extracted
// (all of them are assigned by init0(): CNFLAGS is 0, CNFIELDS is 1 and
// the others follow from WordKey::NFields())
int CNFLAGS ;// FLAGS: which key-fields have changed
int CNFIELDS ;// first numerical field
int CNDATASTATS0 ;// word record - stats element 0
int CNDATASTATS1 ;// word record - stats element 1
int CNDATADATA ;// word record - data
int CNBTIPGNO ;// internal page: page pointed at by node
int CNBTINRECS ;// internal page: presumably the nrecs field of the node - TODO confirm
int CNWORDDIFFPOS ;// position of first character that changed in word
int CNWORDDIFFLEN ;// number of chars that changed in word
int nnums ;// number of extracted number arrays (CNWORDDIFFLEN+1)
// ************** DEBUGGING/BENCHMARKING ***************
// declared here, defined elsewhere; alloc_entry() calls it just before
// reporting a page overflow
void show();
// non-zero enables the printf traces of the inline functions of this class
int verbose;
// reset to 0 by init0(); not read anywhere in this header
int debug;
// ************** Initialization/Destruction *****************
// Initialise from the page header; pg must point to a page whose type and
// entries fields are valid. Also rewinds the insertion state to an empty
// page.
void init()
{
    insert_indx=0;
    insert_pos=pgsz;
    type=pg->type;
    n=pg->entries;
    // a leaf holds key/data pairs, so it has half as many keys as entries
    nk=n;
    if(type==P_LBTREE){nk=n/2;}
}
// Reset every member to its "no page attached" value and compute the
// positions of the extracted number arrays.
void init0()
{
    // no page attached yet
    pg=NULL;
    pgsz=0;
    type=-1;
    n=0;
    nk=0;
    insert_indx=0;
    insert_pos=pgsz;
    verbose=0;
    debug=0;

    // layout of the number arrays: flags, then the key fields, then the
    // record, internal-page and word-difference values
    const int nfields=WordKey::NFields();
    CNFLAGS=0;
    CNFIELDS=1;
    CNDATASTATS0=nfields;
    CNDATASTATS1=nfields+1;
    CNDATADATA=nfields+2;
    CNBTIPGNO=nfields+3;
    CNBTINRECS=nfields+4;
    CNWORDDIFFPOS=nfields+5;
    CNWORDDIFFLEN=nfields+6;
    nnums=(CNWORDDIFFLEN+1);
}
// db page was created here, destroy it
void delete_page()
{
if(!pg){errr("WordDBPage::delete_page: pg==NULL");}
delete [] pg;
pg=NULL;
}
// Detach the db page from this encapsulation without freeing it.
// A NULL page is reported through errr().
void unset_page()
{
    if(pg==NULL){errr("WordDBPage::unset_page: pg==NULL");}
    pg=NULL;
}
// The db page must have been detached (unset_page) or destroyed
// (delete_page) before this encapsulation is destroyed; a page that is
// still attached is reported through errr().
~WordDBPage()
{
    if(pg!=NULL){errr("WordDBPage::~WordDBPage: page not empty");}
}
// Create an encapsulation that allocates its own page buffer of npgsz
// bytes. The buffer contents are not initialised here; insertion starts
// from the end of the buffer.
WordDBPage(int npgsz)
{
    init0();
    pgsz=npgsz;
    insert_pos=npgsz;
    insert_indx=0;
    pg=(PAGE *)(new byte[npgsz]);
    CHECK_MEM(pg);
}
// Wrap an existing page of buff_length bytes located at buff.
// The buffer is not copied, only pointed to (its constness is cast away),
// and its header is read by init().
WordDBPage(const u_int8_t* buff,int buff_length)
{
    init0();
    pgsz=buff_length;
    pg=(PAGE *)buff;
    // init() reads the header and sets insert_pos=pgsz, insert_indx=0
    init();
}
};
#endif// _WordDBPage_h_