1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #ifndef _h_page_map_
28 #define _h_page_map_
29
30 #include <assert.h>
31
32 #ifndef _h_vdb_extern_
33 #include <vdb/extern.h>
34 #endif
35
36 #ifndef _h_klib_defs_
37 #include <klib/defs.h>
38 #endif
39
40 #include <klib/data-buffer.h>
41 #include <klib/refcount.h>
42
43 #if _DEBUGGING
44 #define _HEAVY_PAGEMAP_DEBUGGING 0
45 #endif
46
47 struct KDataBuffer;
48
49 typedef uint32_t pm_size_t;
50 typedef uint32_t row_count_t;
51 typedef uint32_t elem_count_t;
52
53 typedef enum { PM_REGION_EXPAND_UNKNOWN=0, /** notset **/
54 PM_REGION_EXPAND_FULL, /** full expansion - 2 arrays: offset,length**/
55 PM_REGION_EXPAND_SAMELEN, /** partial expansion - 1 array: offset **/
56 PM_REGION_EXPAND_EQUIDISTANT, /** data have same length and always unique - no expansion needed **/
57 PM_REGION_EXPAND_SAMEDATA /** same data - no expansion needed ***/
58 } pm_expand_region_type_t;
59
60 typedef struct PageMapRegion {
61 row_count_t start_row;
62 row_count_t numrows;
63 elem_count_t data_offset;/** for unexpanded regions first direct offset into data**/
64 /** for expanded regions - offset into expanded storage **/
65 elem_count_t length; /** first length of the region ***/
66 uint8_t type; /** one of the types from pm_expand_region_type_t*/
67 bool expanded; /** if expandable storage is being used ***/
68 } PageMapRegion;
69
70
71
72
73 typedef struct PageMap {
74 /* memory allocation object for length[], leng_run[], data_run[] */
75 KDataBuffer cstorage;
76
77 /* array of row lengths
78 * has leng_recs elements
79 * is sized to reserve_leng elements
80 * == storage.base
81 */
82 bool random_access;
83 enum { eBlobPageMapOptimizedNone, eBlobPageMapOptimizedSucceeded, eBlobPageMapOptimizedFailed} optimized;
84 elem_count_t *length;
85
86 /* array of run lengths of row lengths
87 * has leng_recs elements
88 * is sized to reserve_leng elements
89 * == length + reserve_leng
90 */
91 row_count_t *leng_run;
92
93 /* array of repeat counts of data
94 * has data_recs elements
95 * is sized to reserve_data elements
96 * == leng_run + reserve_leng
97 * OPTIONAL
98 */
99 row_count_t *data_run;
100 /** expanded offsets into data - needed for random access ***/
101 /** only valid when random_access is true **/
102 elem_count_t *data_offset;
103
104 /******* DYNAMIC EXPANSION CONTROL *****************/
105 PageMapRegion *exp_rgn_last;
106 row_count_t exp_row_last; /* last row analyzed for region expansion */
107 row_count_t exp_lr_used; /* how much leng_run was used from the */
108 pm_size_t exp_lr_last; /* index of last leng_run expanded */
109 pm_size_t exp_dr_last; /* index of last data_run expanded */
110 pm_size_t exp_rgn_cnt; /* current number of expanded regions */
111 elem_count_t exp_data_offset_last;/* last offset into data */
112
113
114 KDataBuffer istorage; /* binary searchable storage for expansion regions */
115 KDataBuffer dstorage; /* storage for expanded data */
116 /** LAST SEARCH CONTROL *****/
117 pm_size_t i_rgn_last; /* region index found in previous lookup **/
118 PageMapRegion* rgn_last; /* redundant - region found in previous lookup **/
119
120 /****************************/
121
122 pm_size_t leng_recs; /* number of valid elements in length[] and leng_run[] */
123 pm_size_t data_recs; /* number of valid elements in data_run[] */
124 pm_size_t reserve_leng; /* number of allocated elements in length[] and leng_run[] */
125 pm_size_t reserve_data; /* number of allocated elements in data_run[] */
126 pm_size_t start_valid; /* the expanded array contains valid data upto start_valid */
127 row_count_t row_count; /* total number of rows in page map */
128 row_count_t pre_exp_row_count; /* number of rows pre-expanded */
129 KRefcount refcount;
130 } PageMap;
131
132
133 /* a pessimistic estimate - actual size will *always* be less than or equal */
134 size_t PageMapGetMaxBufferSize(const PageMap *self);
135
136 rc_t PageMapSerialize(const PageMap *self, struct KDataBuffer *buffer, uint64_t byte_offset, uint64_t *actual_bytes);
137
138 rc_t PageMapDeserialize(PageMap **lhs, const void *src, uint64_t src_bytes, uint64_t row_count);
139
140 rc_t PageMapRelease(const PageMap *self);
141
142 rc_t PageMapAddRef(const PageMap *self);
143
144 /* PageMapGetIdxRowInfo
145 *
146 * Get row length and starting element number for a row.
147 * This is a potentially expensive operation; the first time
148 * it is called for any page map, the memory used by the page
149 * map nearly doubles and an index is built on the row length
150 * runs and the data runs. Subsequent calls are O(N log N).
151 * However, asking for the information about the first row
152 * (i.e. idx = 0) never causes allocation or indexing and is
153 * always O(1).
154 *
155 * Params:
156 * self: [IN] the page map
157 * idx: the row number starting from 0
158 * starting_element: [OUT, OPTIONAL]
159 *
160 * Returns:
161 * the length of the row
162 * or 0 if not found
163 * repeat_count - how may times this row is repeated from idx
164 */
165 uint32_t PageMapGetIdxRowInfo(const PageMap *self, uint32_t idx, uint32_t *starting_element, uint32_t * repeat_count);
166
167 rc_t PageMapNew(PageMap **lhs, uint32_t reserve);
168
169 rc_t PageMapNewSingle(PageMap **lhs, uint64_t row_count, uint64_t row_length);
170
171 rc_t PageMapNewFixedRowLength(PageMap **lhs, uint64_t row_count, uint64_t row_len);
172
173 rc_t PageMapToRandomAccess(PageMap **rslt, PageMap * src,uint32_t *data_offset);
174
175 uint32_t PageMapFixedRowLength(const PageMap *self);
176
177 rc_t PageMapRowLengthRange(const PageMap *self, elem_count_t *min,elem_count_t *max);
178
179 uint32_t PageMapHasSimpleStructure(const PageMap *self);
180
181 rc_t PageMapAppendRows(PageMap *self, uint64_t row_length, uint64_t run_length, bool same_data);
182
183 #define PageMapAppendRow(SELF, ROW_LENGTH, SAME_DATA) (PageMapAppendRows((SELF), (ROW_LENGTH), 1, SAME_DATA))
184
185 /* append some rows of the same data */
186 #define PageMapAppendSomeRows(SELF, ROW_LENGTH, RUN_LENGTH) (PageMapAppendRows((SELF), (ROW_LENGTH), (RUN_LENGTH), false))
187
188 /* concatenate two page maps */
189 rc_t PageMapAppend(PageMap *self, const PageMap *other);
190
191 /** Find data using pagemap ***/
192 rc_t PageMapFindRow(const PageMap *cself,uint64_t row,uint32_t * data_offset,uint32_t * data_length,uint32_t * repeat_count);
193
194
195 /*
196 -1: error
197 0: not same
198 1: compatible (i.e. all rows same length)
199 else: identical
200 */
201 int PageMapCompare(const PageMap *a, const PageMap *b);
202 /* same but static columns */
203 int PageMapCompareStatic(const PageMap *a, const PageMap *b);
204
205 typedef struct PageMapIterator PageMapIterator;
206 struct PageMapIterator {
207 row_count_t last_row;
208 row_count_t cur_row;
209 PageMapRegion** rgns; /** all regions from the pagemap **/
210 pm_size_t cur_rgn; /** offset of the current region **/
211 row_count_t cur_rgn_row; /** row relative to offset of the region **/
212 elem_count_t **exp_base; /*** exp buffer ***/
213 row_count_t repeat_count; /** remaining repeat count **/
214 elem_count_t static_datalen;
215 #if _HEAVY_PAGEMAP_DEBUGGING
216 PageMap *parent;
217 #endif
218 };
219
220 VDB_EXTERN rc_t PageMapNewIterator(const PageMap *self, PageMapIterator *lhs, uint64_t first_row, uint64_t num_rows);
221
PageMapIteratorAdvance(PageMapIterator * self,row_count_t rows)222 static __inline__ bool PageMapIteratorAdvance(PageMapIterator *self, row_count_t rows)
223 {
224 if (self->cur_row + rows < self->last_row) {
225 self->cur_row += rows;
226 self->cur_rgn_row += rows;
227 if(self->repeat_count > rows) self->repeat_count-= rows;
228 else self->repeat_count = 0;
229 if(self->rgns){/** non-static, non simple random access**/
230 while((*self->rgns)[self->cur_rgn].numrows <= self->cur_rgn_row){
231 self->cur_rgn_row -= (*self->rgns)[self->cur_rgn].numrows;
232 self->cur_rgn++;
233 }
234 }
235 return true;
236 }
237 return false;
238 }
239
240 #define PageMapIteratorNext(SELF) PageMapIteratorAdvance(SELF, 1)
241
PageMapIteratorDataLength(const PageMapIterator * self)242 static __inline__ elem_count_t PageMapIteratorDataLength(const PageMapIterator *self)
243 {
244 elem_count_t datalen=0;
245 if(self->rgns == NULL) {/* static or simple random-access*/
246 return self->static_datalen;
247 }
248 switch ((*self->rgns)[self->cur_rgn].type){
249 case PM_REGION_EXPAND_FULL:
250 if((*self->rgns)[self->cur_rgn].expanded){
251 datalen = (*self->exp_base)[(*self->rgns)[self->cur_rgn].data_offset + 2*self->cur_rgn_row];
252 } else {
253 datalen = (*self->rgns)[self->cur_rgn].length;
254 }
255 break;
256 case PM_REGION_EXPAND_SAMELEN:
257 case PM_REGION_EXPAND_EQUIDISTANT:
258 case PM_REGION_EXPAND_SAMEDATA:
259 datalen = (*self->rgns)[self->cur_rgn].length;
260 break;
261 default:
262 assert(0);
263 break;
264 }
265 #if _HEAVY_PAGEMAP_DEBUGGING
266 {
267 elem_count_t dtl,dto,dtr;
268 PageMapFindRow(self->parent,self->cur_row,&dto,&dtl,&dtr);
269 assert(dtl==datalen);
270 }
271 #endif
272 /*printf("DATA_LEN=%d\n",datalen);*/
273 return datalen;
274 }
275
PageMapIteratorDataOffset(const PageMapIterator * self)276 static __inline__ elem_count_t PageMapIteratorDataOffset(const PageMapIterator *self)
277 {
278 elem_count_t dataoff=0;
279 if(self->rgns == NULL){ /** static or simple random **/
280 if(self->exp_base != NULL) /** simple random access */
281 dataoff= (*self->exp_base)[self->cur_row];
282 return dataoff;
283 }
284 switch ((*self->rgns)[self->cur_rgn].type){
285 case PM_REGION_EXPAND_FULL:
286 if((*self->rgns)[self->cur_rgn].expanded){
287 dataoff = (*self->exp_base)[(*self->rgns)[self->cur_rgn].data_offset + 2*self->cur_rgn_row + 1];
288 } else {
289 dataoff = (*self->rgns)[self->cur_rgn].data_offset;
290 }
291 break;
292 case PM_REGION_EXPAND_SAMELEN:
293 if((*self->rgns)[self->cur_rgn].expanded){
294 dataoff = (*self->exp_base)[(*self->rgns)[self->cur_rgn].data_offset + self->cur_rgn_row];
295 } else {
296 dataoff = (*self->rgns)[self->cur_rgn].data_offset;
297 }
298 break;
299 case PM_REGION_EXPAND_EQUIDISTANT:
300 dataoff = (*self->rgns)[self->cur_rgn].data_offset + (*self->rgns)[self->cur_rgn].length * self->cur_rgn_row;
301 break;
302 case PM_REGION_EXPAND_SAMEDATA:
303 dataoff = (*self->rgns)[self->cur_rgn].data_offset;
304 break;
305 default:
306 assert(0);
307 break;
308 }
309 #if _HEAVY_PAGEMAP_DEBUGGING
310 {
311 elem_count_t dtl,dto,dtr;
312 PageMapFindRow(self->parent,self->cur_row,&dto,&dtl,&dtr);
313 assert(dto==dataoff);
314 }
315 #endif
316 return dataoff;
317 }
318
PageMapIteratorRepeatCount(const PageMapIterator * cself)319 static __inline__ row_count_t PageMapIteratorRepeatCount(const PageMapIterator *cself)
320 {
321 assert( cself );
322 if(cself->repeat_count==0){
323 PageMapIterator *self = (PageMapIterator*) cself;
324 if(self->rgns==NULL){ /** must be simple random access **/
325 uint64_t i;
326 assert( ( ( self->exp_base == NULL ) || ( *self->exp_base == NULL ) ) ? self->cur_row+1 >= self->last_row : true );
327 for(i=self->cur_row+1,self->repeat_count=1;
328 i< self->last_row && (*self->exp_base)[i]==(*self->exp_base)[self->cur_row];
329 i++,self->repeat_count++){}
330 } else {
331 switch ((*self->rgns)[self->cur_rgn].type){
332 case PM_REGION_EXPAND_FULL:
333 if((*self->rgns)[self->cur_rgn].expanded){
334 row_count_t i;
335 elem_count_t* base = (*self->exp_base) + (*self->rgns)[self->cur_rgn].data_offset;
336 self->repeat_count = 1;
337 for(i=self->cur_rgn_row+1;i<(*self->rgns)[self->cur_rgn].numrows;i++){
338 if(base[2*self->cur_rgn_row]== base[2*i] && base[2*self->cur_rgn_row+1]== base[2*i+1]) self->repeat_count++;
339 else break;
340 }
341 } else {
342 self->repeat_count = (*self->rgns)[self->cur_rgn].numrows - self->cur_rgn_row;
343 }
344 break;
345 case PM_REGION_EXPAND_SAMELEN:
346 if((*self->rgns)[self->cur_rgn].expanded){
347 row_count_t i;
348 elem_count_t* base = (*self->exp_base) + (*self->rgns)[self->cur_rgn].data_offset;
349 self->repeat_count = 1;
350 for(i=self->cur_rgn_row+1;i<(*self->rgns)[self->cur_rgn].numrows;i++){
351 if(base[self->cur_rgn_row] == base[i]) self->repeat_count++;
352 else break;
353 }
354 } else {
355 self->repeat_count = (*self->rgns)[self->cur_rgn].numrows - self->cur_rgn_row;
356 }
357 break;
358 case PM_REGION_EXPAND_EQUIDISTANT:
359 self->repeat_count = 1;
360 break;
361 case PM_REGION_EXPAND_SAMEDATA:
362 self->repeat_count = (*self->rgns)[self->cur_rgn].numrows - self->cur_rgn_row;
363 break;
364 default:
365 assert(0);
366 break;
367 }
368 }
369 }
370 #if _HEAVY_PAGEMAP_DEBUGGING
371 {
372 elem_count_t dtl,dto,dtr;
373 PageMapFindRow(cself->parent,cself->cur_row,&dto,&dtl,&dtr);
374 assert(dtr==cself->repeat_count);
375 }
376 #endif
377 return cself->repeat_count;
378 }
379
380 elem_count_t PageMapLastLength(const PageMap *cself);
381 bool PageMapHasRows(const PageMap *self);
382 rc_t PageMapExpand(const PageMap *cself, row_count_t upto);
383 rc_t PageMapExpandFull(const PageMap *cself);
384 rc_t PageMapPreExpandFull(const PageMap *cself, row_count_t upto);
385
386 #endif /* _h_page_map_ */
387