1 /*===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 #ifndef _h_page_map_
28 #define _h_page_map_
29 
30 #include <assert.h>
31 
32 #ifndef _h_vdb_extern_
33 #include <vdb/extern.h>
34 #endif
35 
36 #ifndef _h_klib_defs_
37 #include <klib/defs.h>
38 #endif
39 
40 #include <klib/data-buffer.h>
41 #include <klib/refcount.h>
42 
43 #if _DEBUGGING
44 #define _HEAVY_PAGEMAP_DEBUGGING 0
45 #endif
46 
47 struct KDataBuffer;
48 
49 typedef uint32_t pm_size_t;
50 typedef uint32_t row_count_t;
51 typedef uint32_t elem_count_t;
52 
53 typedef enum {  PM_REGION_EXPAND_UNKNOWN=0,     /** notset **/
54                 PM_REGION_EXPAND_FULL,          /** full expansion     - 2 arrays: offset,length**/
55                 PM_REGION_EXPAND_SAMELEN,       /** partial expansion  - 1 array: offset **/
56                 PM_REGION_EXPAND_EQUIDISTANT,   /** data have same length and always unique - no expansion needed **/
57                 PM_REGION_EXPAND_SAMEDATA       /** same data - no expansion needed ***/
58 } pm_expand_region_type_t;
59 
60 typedef struct PageMapRegion {
61 	row_count_t	start_row;
62 	row_count_t	numrows;
63 	elem_count_t	data_offset;/** for unexpanded regions first direct offset into data**/
64 				    /** for expanded regions - offset into expanded storage **/
65 	elem_count_t	length;	    /** first length of the region ***/
66 	uint8_t		type;       /** one of the types from pm_expand_region_type_t*/
67 	bool		expanded;   /** if expandable storage is being used ***/
68 } PageMapRegion;
69 
70 
71 
72 
73 typedef struct PageMap {
74     /* memory allocation object for length[], leng_run[], data_run[] */
75     KDataBuffer cstorage;
76 
77     /* array of row lengths
78      * has leng_recs elements
79      * is sized to reserve_leng elements
80      * == storage.base
81      */
82     bool   random_access;
83     enum { eBlobPageMapOptimizedNone, eBlobPageMapOptimizedSucceeded, eBlobPageMapOptimizedFailed}  optimized;
84     elem_count_t *length;
85 
86     /* array of run lengths of row lengths
87      * has leng_recs elements
88      * is sized to reserve_leng elements
89      * == length + reserve_leng
90      */
91     row_count_t *leng_run;
92 
93     /* array of repeat counts of data
94      * has data_recs elements
95      * is sized to reserve_data elements
96      * == leng_run + reserve_leng
97      * OPTIONAL
98      */
99     row_count_t *data_run;
100     /** expanded offsets into data - needed for random access ***/
101     /** only valid when random_access is true **/
102     elem_count_t *data_offset;
103 
104 /******* DYNAMIC EXPANSION CONTROL *****************/
105     PageMapRegion		*exp_rgn_last;
106     row_count_t			exp_row_last;   /* last row analyzed for region expansion */
107     row_count_t			exp_lr_used;    /* how much leng_run was used from the */
108     pm_size_t			exp_lr_last;	/* index of last leng_run expanded */
109     pm_size_t			exp_dr_last;	/* index of last data_run expanded */
110     pm_size_t			exp_rgn_cnt;    /* current number of expanded regions */
111     elem_count_t		exp_data_offset_last;/* last offset into data */
112 
113 
114     KDataBuffer			istorage;	/* binary searchable storage for expansion regions */
115     KDataBuffer			dstorage;	/* storage for expanded data */
116 /** LAST SEARCH CONTROL *****/
117     pm_size_t			i_rgn_last; 	/* region index found in previous lookup **/
118     PageMapRegion*		rgn_last; 	/* redundant - region found in previous lookup **/
119 
120 /****************************/
121 
122     pm_size_t leng_recs;     /* number of valid elements in length[] and leng_run[] */
123     pm_size_t data_recs;     /* number of valid elements in data_run[] */
124     pm_size_t reserve_leng;  /* number of allocated elements in length[] and leng_run[] */
125     pm_size_t reserve_data;  /* number of allocated elements in data_run[] */
126     pm_size_t start_valid;   /* the expanded array contains valid data upto start_valid */
127     row_count_t row_count;   /* total number of rows in page map */
128     row_count_t pre_exp_row_count; /* number of rows pre-expanded */
129     KRefcount refcount;
130 } PageMap;
131 
132 
133 /* a pessimistic estimate - actual size will *always* be less than or equal */
134 size_t PageMapGetMaxBufferSize(const PageMap *self);
135 
136 rc_t PageMapSerialize(const PageMap *self, struct KDataBuffer *buffer, uint64_t byte_offset, uint64_t *actual_bytes);
137 
138 rc_t PageMapDeserialize(PageMap **lhs, const void *src, uint64_t src_bytes, uint64_t row_count);
139 
140 rc_t PageMapRelease(const PageMap *self);
141 
142 rc_t PageMapAddRef(const PageMap *self);
143 
144 /*  PageMapGetIdxRowInfo
145  *
146  *  Get row length and starting element number for a row.
147  *  This is a potentially expensive operation; the first time
148  *  it is called for any page map, the memory used by the page
149  *  map nearly doubles and an index is built on the row length
150  *  runs and the data runs.  Subsequent calls are O(N log N).
151  *  However, asking for the information about the first row
152  *  (i.e. idx = 0) never causes allocation or indexing and is
153  *  always O(1).
154  *
155  *  Params:
156  *      self: [IN] the page map
157  *      idx:  the row number starting from 0
158  *      starting_element: [OUT, OPTIONAL]
159  *
160  *  Returns:
161  *      the length of the row
162  *      or 0 if not found
163  *      repeat_count - how may times this row is repeated from idx
164  */
165 uint32_t PageMapGetIdxRowInfo(const PageMap *self, uint32_t idx, uint32_t *starting_element, uint32_t * repeat_count);
166 
167 rc_t PageMapNew(PageMap **lhs, uint32_t reserve);
168 
169 rc_t PageMapNewSingle(PageMap **lhs, uint64_t row_count, uint64_t row_length);
170 
171 rc_t PageMapNewFixedRowLength(PageMap **lhs, uint64_t row_count, uint64_t row_len);
172 
173 rc_t PageMapToRandomAccess(PageMap **rslt, PageMap * src,uint32_t *data_offset);
174 
175 uint32_t PageMapFixedRowLength(const PageMap *self);
176 
177 rc_t	PageMapRowLengthRange(const PageMap *self, elem_count_t *min,elem_count_t *max);
178 
179 uint32_t PageMapHasSimpleStructure(const PageMap *self);
180 
181 rc_t PageMapAppendRows(PageMap *self, uint64_t row_length, uint64_t run_length, bool same_data);
182 
183 #define PageMapAppendRow(SELF, ROW_LENGTH, SAME_DATA) (PageMapAppendRows((SELF), (ROW_LENGTH), 1, SAME_DATA))
184 
185 /* append some rows of the same data */
186 #define PageMapAppendSomeRows(SELF, ROW_LENGTH, RUN_LENGTH) (PageMapAppendRows((SELF), (ROW_LENGTH), (RUN_LENGTH), false))
187 
188 /* concatenate two page maps */
189 rc_t PageMapAppend(PageMap *self, const PageMap *other);
190 
191 /** Find data using pagemap ***/
192 rc_t PageMapFindRow(const PageMap *cself,uint64_t row,uint32_t * data_offset,uint32_t * data_length,uint32_t * repeat_count);
193 
194 
195 /*
196  -1: error
197  0: not same
198  1: compatible (i.e. all rows same length)
199  else: identical
200  */
201 int PageMapCompare(const PageMap *a, const PageMap *b);
202 /* same but static columns */
203 int PageMapCompareStatic(const PageMap *a, const PageMap *b);
204 
205 typedef struct PageMapIterator PageMapIterator;
206 struct PageMapIterator {
207     row_count_t		last_row;
208     row_count_t		cur_row;
209     PageMapRegion**	rgns;	  /** all regions from the pagemap **/
210     pm_size_t		cur_rgn;  /** offset of the current region **/
211     row_count_t 	cur_rgn_row; /** row relative to offset of the region **/
212     elem_count_t	**exp_base; /*** exp buffer ***/
213     row_count_t		repeat_count; /** remaining repeat count **/
214     elem_count_t	static_datalen;
215 #if _HEAVY_PAGEMAP_DEBUGGING
216     PageMap *parent;
217 #endif
218 };
219 
220 VDB_EXTERN rc_t PageMapNewIterator(const PageMap *self, PageMapIterator *lhs, uint64_t first_row, uint64_t num_rows);
221 
PageMapIteratorAdvance(PageMapIterator * self,row_count_t rows)222 static __inline__ bool PageMapIteratorAdvance(PageMapIterator *self, row_count_t rows)
223 {
224     if (self->cur_row + rows < self->last_row) {
225 	self->cur_row += rows;
226 	self->cur_rgn_row += rows;
227 	if(self->repeat_count > rows) self->repeat_count-= rows;
228 	else self->repeat_count = 0;
229 	if(self->rgns){/** non-static, non simple random access**/
230 		while((*self->rgns)[self->cur_rgn].numrows <= self->cur_rgn_row){
231 			self->cur_rgn_row -= (*self->rgns)[self->cur_rgn].numrows;
232 			self->cur_rgn++;
233 		}
234         }
235         return true;
236     }
237     return false;
238 }
239 
240 #define PageMapIteratorNext(SELF) PageMapIteratorAdvance(SELF, 1)
241 
PageMapIteratorDataLength(const PageMapIterator * self)242 static __inline__ elem_count_t PageMapIteratorDataLength(const PageMapIterator *self)
243 {
244     elem_count_t datalen=0;
245     if(self->rgns == NULL) {/* static or simple random-access*/
246 	return  self->static_datalen;
247     }
248     switch ((*self->rgns)[self->cur_rgn].type){
249 	case PM_REGION_EXPAND_FULL:
250 		if((*self->rgns)[self->cur_rgn].expanded){
251 			datalen = (*self->exp_base)[(*self->rgns)[self->cur_rgn].data_offset + 2*self->cur_rgn_row];
252 		} else {
253 			datalen = (*self->rgns)[self->cur_rgn].length;
254 		}
255                 break;
256 	case PM_REGION_EXPAND_SAMELEN:
257 	case PM_REGION_EXPAND_EQUIDISTANT:
258 	case PM_REGION_EXPAND_SAMEDATA:
259 		datalen = (*self->rgns)[self->cur_rgn].length;
260 		break;
261 	default:
262 		assert(0);
263 		break;
264    }
265 #if _HEAVY_PAGEMAP_DEBUGGING
266    {
267 	elem_count_t dtl,dto,dtr;
268 	PageMapFindRow(self->parent,self->cur_row,&dto,&dtl,&dtr);
269 	assert(dtl==datalen);
270    }
271 #endif
272    /*printf("DATA_LEN=%d\n",datalen);*/
273    return datalen;
274 }
275 
PageMapIteratorDataOffset(const PageMapIterator * self)276 static __inline__ elem_count_t PageMapIteratorDataOffset(const PageMapIterator *self)
277 {
278     elem_count_t dataoff=0;
279     if(self->rgns == NULL){ /** static or simple random **/
280 	if(self->exp_base != NULL)  /** simple random access */
281 		dataoff= (*self->exp_base)[self->cur_row];
282 	return dataoff;
283     }
284     switch ((*self->rgns)[self->cur_rgn].type){
285         case PM_REGION_EXPAND_FULL:
286                 if((*self->rgns)[self->cur_rgn].expanded){
287                         dataoff = (*self->exp_base)[(*self->rgns)[self->cur_rgn].data_offset + 2*self->cur_rgn_row + 1];
288                 } else {
289                         dataoff = (*self->rgns)[self->cur_rgn].data_offset;
290                 }
291                 break;
292         case PM_REGION_EXPAND_SAMELEN:
293 		if((*self->rgns)[self->cur_rgn].expanded){
294                         dataoff = (*self->exp_base)[(*self->rgns)[self->cur_rgn].data_offset + self->cur_rgn_row];
295                 } else {
296                         dataoff = (*self->rgns)[self->cur_rgn].data_offset;
297                 }
298                 break;
299         case PM_REGION_EXPAND_EQUIDISTANT:
300 		dataoff = (*self->rgns)[self->cur_rgn].data_offset + (*self->rgns)[self->cur_rgn].length * self->cur_rgn_row;
301 		break;
302         case PM_REGION_EXPAND_SAMEDATA:
303                 dataoff = (*self->rgns)[self->cur_rgn].data_offset;
304                 break;
305         default:
306                 assert(0);
307                 break;
308     }
309 #if _HEAVY_PAGEMAP_DEBUGGING
310    {
311         elem_count_t dtl,dto,dtr;
312         PageMapFindRow(self->parent,self->cur_row,&dto,&dtl,&dtr);
313         assert(dto==dataoff);
314    }
315 #endif
316     return dataoff;
317 }
318 
PageMapIteratorRepeatCount(const PageMapIterator * cself)319 static __inline__ row_count_t PageMapIteratorRepeatCount(const PageMapIterator *cself)
320 {
321     assert( cself );
322     if(cself->repeat_count==0){
323 	PageMapIterator *self = (PageMapIterator*) cself;
324 	if(self->rgns==NULL){ /** must be simple random access **/
325 		uint64_t i;
326         assert( ( ( self->exp_base == NULL ) || ( *self->exp_base == NULL ) ) ? self->cur_row+1 >= self->last_row : true );
327 		for(i=self->cur_row+1,self->repeat_count=1;
328                     i< self->last_row && (*self->exp_base)[i]==(*self->exp_base)[self->cur_row];
329                     i++,self->repeat_count++){}
330 	} else {
331 		switch ((*self->rgns)[self->cur_rgn].type){
332 		 case PM_REGION_EXPAND_FULL:
333 			if((*self->rgns)[self->cur_rgn].expanded){
334 				row_count_t i;
335 				elem_count_t* base = (*self->exp_base) + (*self->rgns)[self->cur_rgn].data_offset;
336 				self->repeat_count = 1;
337 				for(i=self->cur_rgn_row+1;i<(*self->rgns)[self->cur_rgn].numrows;i++){
338 					if(base[2*self->cur_rgn_row]== base[2*i] && base[2*self->cur_rgn_row+1]== base[2*i+1]) self->repeat_count++;
339 					else break;
340 				}
341 			} else {
342 				self->repeat_count =  (*self->rgns)[self->cur_rgn].numrows - self->cur_rgn_row;
343 			}
344 			break;
345 		 case PM_REGION_EXPAND_SAMELEN:
346 			if((*self->rgns)[self->cur_rgn].expanded){
347 				row_count_t i;
348 				elem_count_t* base = (*self->exp_base) + (*self->rgns)[self->cur_rgn].data_offset;
349 				self->repeat_count = 1;
350 				for(i=self->cur_rgn_row+1;i<(*self->rgns)[self->cur_rgn].numrows;i++){
351 					if(base[self->cur_rgn_row] == base[i]) self->repeat_count++;
352 					else break;
353 				}
354 			} else {
355 				self->repeat_count = (*self->rgns)[self->cur_rgn].numrows - self->cur_rgn_row;
356 			}
357 			break;
358 		 case PM_REGION_EXPAND_EQUIDISTANT:
359 			self->repeat_count = 1;
360 			break;
361 		 case PM_REGION_EXPAND_SAMEDATA:
362 			self->repeat_count =  (*self->rgns)[self->cur_rgn].numrows - self->cur_rgn_row;
363 			break;
364 		 default:
365 			assert(0);
366 			break;
367 		}
368 	}
369     }
370 #if _HEAVY_PAGEMAP_DEBUGGING
371    {
372         elem_count_t dtl,dto,dtr;
373         PageMapFindRow(cself->parent,cself->cur_row,&dto,&dtl,&dtr);
374         assert(dtr==cself->repeat_count);
375    }
376 #endif
377     return cself->repeat_count;
378 }
379 
380 elem_count_t PageMapLastLength(const PageMap *cself);
381 bool PageMapHasRows(const PageMap *self);
382 rc_t PageMapExpand(const PageMap *cself, row_count_t upto);
383 rc_t PageMapExpandFull(const PageMap *cself);
384 rc_t PageMapPreExpandFull(const PageMap *cself, row_count_t upto);
385 
386 #endif /* _h_page_map_ */
387