1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #include <kdb/extern.h>
28 
29 #include "index-priv.h"
30 #include "idstats-priv.h"
31 
32 #include <kdb/index.h>
33 #include <klib/ptrie.h>
34 #include <klib/pack.h>
35 #include <klib/text.h>
36 #include <klib/log.h>
37 #include <klib/rc.h>
38 #include <sysalloc.h>
39 
40 #include <stdlib.h>
41 #include <limits.h>
42 #include <stdio.h>
43 #include <string.h>
44 #include <endian.h>
45 #include <byteswap.h>
46 #include <assert.h>
47 
48 
49 
50 /*--------------------------------------------------------------------------
51  * KPTrieIndexCCParms
52  *  consistency check parameter block
53  */
54 typedef struct KPTrieIndexCCParms_v2 KPTrieIndexCCParms_v2;
55 struct KPTrieIndexCCParms_v2
56 {
57     KIdStats stats;
58     rc_t rc;
59     const KPTrieIndex_v2 *self;
60     const KIndex *outer;
61     bool key2id;
62     bool id2key;
63     bool all_ids;
64     bool convertFromV1;
65     bool failed;
66 };
67 
68 /* Init
69  */
70 static
KPTrieIndexCCParmsInit_v2(KPTrieIndexCCParms_v2 * pb,const KPTrieIndex_v2 * self,const KIndex * outer,bool key2id,bool id2key,bool all_ids,bool convertFromV1)71 void KPTrieIndexCCParmsInit_v2 ( KPTrieIndexCCParms_v2 *pb,
72     const KPTrieIndex_v2 *self, const KIndex *outer, bool key2id, bool id2key, bool all_ids, bool convertFromV1 )
73 {
74     KIdStatsInit ( & pb -> stats );
75     pb -> rc = 0;
76     pb -> self = self;
77     pb -> outer = outer;
78     pb -> key2id = key2id;
79     pb -> id2key = self -> ord2node ? id2key : false;
80     pb -> all_ids = all_ids;
81     pb -> convertFromV1 = convertFromV1;
82     pb -> failed = false;
83 }
84 
85 /* Whack
86  */
87 static
KPTrieIndexCCParmsWhack_v2(KPTrieIndexCCParms_v2 * pb)88 void KPTrieIndexCCParmsWhack_v2 ( KPTrieIndexCCParms_v2 *pb )
89 {
90     KIdStatsWhack ( & pb -> stats );
91 }
92 
93 /*--------------------------------------------------------------------------
94  * KPTrieIndex_v2
95  *  persisted keymap
96  */
97 
98 /* CheckConsistency
99  *  runs a consistency check as efficiently as possible
100  *
101  *  in all cases, we make use of "PTrieForEach" to visit each node
102  *  using the natural storage order. each node returned is counted,
103  *  read and inserted into a BSTree whose nodes merge adjacent and
104  *  overlapping ids into existing nodes.
105  *
106  *  if running a deep "key->id" test, then the key is first regenerated
107  *  from the node, and then used to retrieve the id via the KIndex.
108  *
109  *  if the projection index exists, the id is tested against the node
110  *  to ensure that projection works. if "id->key" is true, then use
111  *  the KIndex to produce a key and compare it against the node.
112  */
113 static
KPTrieIndexCCVisit_v2(PTNode * n,void * data)114 bool CC KPTrieIndexCCVisit_v2 ( PTNode *n, void *data )
115 {
116     KPTrieIndexCCParms_v2 *pb = (KPTrieIndexCCParms_v2 *)data;
117     const KPTrieIndex_v2 *self = pb -> self;
118 
119     rc_t rc;
120     int64_t id;
121     size_t usize;
122     uint64_t span;
123     uint32_t i, ord;
124 
125     /* detect conversion from v1 */
126     if ( pb -> convertFromV1 && self -> id_bits == 0 )
127     {
128         uint32_t id32;
129 
130         /* payload of v1 PTNode is a 32-bit spot id */
131         assert ( n -> data . size == sizeof id32 );
132         memmove ( & id32, n -> data . addr, sizeof id32 );
133         id = self -> byteswap ? bswap_32 ( id32 ) : id32;
134     }
135     else
136     {
137         /* native v2 */
138         /* TBD - should this pass n -> data . size * 8 ??? */
139         if ( self -> id_bits != 0 )
140         {
141             rc = Unpack ( self -> id_bits, sizeof id * 8,
142                 n -> data . addr, 0, self -> id_bits, NULL,
143                 & id, sizeof id, & usize );
144             if ( rc != 0 )
145             {
146                 PLOGMSG ( klogWarn, ( klogWarn, "could not determine row id of v2 node $(nid)",
147                                  "nid=0x%08x", n -> id ));
148                 pb -> failed = true;
149                 return false;
150             }
151         }
152         else
153         {
154             id = 0;
155         }
156 
157         id += self -> first;
158     }
159 
160     /* convert from row-id to 1-based ordinal index */
161     ord = KPTrieIndexID2Ord_v2 ( self, id );
162     if ( ord == 0 )
163     {
164         /* 0 means not found */
165         PLOGMSG ( klogWarn, ( klogWarn, "v2 node $(nid): row id $(rid) not found in trie",
166                               "nid=0x%08x,rid=%ld", n -> id, id ));
167         pb -> failed = true;
168         return false;
169     }
170 
171     if ( self -> ord2node != NULL )
172     {
173         /* to calculate span of last entry, where
174            1-based "ord" == the number of nodes,
175            just find the distance between the last row-id
176            in index ( self->maxid ) and the current row-id */
177         if ( ord == self -> count )
178             span = self -> maxid - id + 1;
179         else
180         {
181             /* from here on, we will use "ord" to be the
182                ZERO-BASED index of the slot FOLLOWING the
183                one corresponding to id. we want to find the
184                row-id AFTER the current one and calculate distance */
185             switch ( self -> variant )
186             {
187             /* linear array */
188             case 0:
189                 /* starting with the FOLLOWING slot,
190                    count duplicate entries */
191                 for ( i = ord; i < self -> count; ++ i )
192                 {
193                     if ( n -> id != self -> ord2node [ i ] )
194                         break;
195                 }
196                 span = self -> first + i - id;
197                 break;
198 
199             /* packed ordered array of sparse row-ids from here on
200                we already have "id" for this node, so the span will
201                be the distance from NEXT id - 1-based ord is 0-based next */
202             case 1:
203                 span = self -> first + self -> id2ord . v8 [ ord ] - id;
204                 break;
205             case 2:
206                 span = self -> first + self -> id2ord . v16 [ ord ] - id;
207                 break;
208             case 3:
209                 span = self -> first + self -> id2ord . v32 [ ord ] - id;
210                 break;
211             case 4:
212                 span = self -> first + self -> id2ord . v64 [ ord ] - id;
213                 break;
214             default:
215                 PLOGMSG ( klogErr, ( klogErr, "PTrie v2 index has bad variant code: $(variant)",
216                                      "variant=%u", self -> variant ));
217                 pb -> rc = RC ( rcDB, rcIndex, rcValidating, rcIndex, rcCorrupt );
218                 return true;
219             }
220         }
221     }
222     else if ( self -> span_bits == 0 )
223         span = 1;
224     else
225     {
226         /* TBD - this case is never used in practice.
227            it would be an skey without a projection index */
228         rc = Unpack ( self -> span_bits, sizeof span * 8,
229                       n -> data . addr, 0, self -> id_bits, NULL,
230                       & span, sizeof span, & usize );
231         if ( rc != 0 )
232         {
233             PLOGMSG ( klogWarn, ( klogWarn, "could not determine span of v2 node $(nid), row id $(rid)",
234                                   "nid=0x%08x,rid=%ld", n -> id, id ));
235             pb -> failed = true;
236             return false;
237         }
238     }
239 
240     /* record the node, row id, span */
241     pb -> rc = KIdStatsInsert ( & pb -> stats, id, span );
242     if ( pb -> rc != 0 )
243     {
244         PLOGERR ( klogSys, ( klogSys, pb -> rc, "failed when examining node id $(nid) with row id $(rid), span $(span)",
245                              "nid=0x%08x,span=%u,rid=%ld", n -> id, span, id ));
246         return true;
247     }
248 
249     /* if we have a projection index, test it */
250     if ( self -> ord2node != NULL )
251     {
252         if ( id < self -> first || id > self -> last )
253         {
254             PLOGMSG ( klogWarn, ( klogWarn, "node id $(nid) with row id $(rid) is not within projection range of $(min_rid)..$(max_rid)",
255                                  "nid=0x%08x,rid=%ld,min_rid=%ld,max_rid=%ld",
256                                   n -> id, id, self -> first, self -> last ));
257             pb -> failed = true;
258             return false;
259         }
260         for ( i = 0; i < span; ++ i )
261         {
262             if ( self -> ord2node [ i + ord - 1 ] != n -> id )
263             {
264                 PLOGMSG ( klogWarn, ( klogWarn, "node id $(nid) with row id $(rid) does not match projection node id of $(pnid)",
265                                       "nid=0x%08x,rid=%ld,pnid=0x%08x", n -> id, id + 1, self -> ord2node [ i + ord - 1 ] ));
266                 pb -> failed = true;
267                 return false;
268             }
269             if ( ! pb -> all_ids || self -> variant != 0 )
270                 break;
271         }
272     }
273 
274     /* if performing deeper tests */
275     if ( pb -> key2id || pb -> id2key )
276     {
277         int64_t start_id;
278         uint64_t id_count;
279 
280         /* need to recover key from given node */
281         const String *orig;
282         pb -> rc = PTNodeMakeKey ( n, & orig );
283         if ( pb -> rc != 0 )
284         {
285             PLOGERR ( klogSys, ( klogSys, pb -> rc, "failed when retrieving text for node id $(nid) with row id $(rid)",
286                                  "nid=0x%08x,rid=%u", n -> id, id ));
287             return true;
288         }
289 
290         /* key->id test */
291         if ( pb -> key2id )
292         {
293             rc = KIndexFindText ( pb -> outer, orig -> addr, & start_id, & id_count, NULL, NULL );
294             if ( rc != 0 )
295             {
296                 PLOGERR ( klogWarn, ( klogWarn, rc, "failed to retrieve start id and count for key '$(key)', row id $(rid)",
297                                       "key=%S,rid=%u", orig, id ) );
298                 pb -> failed = true;
299             }
300             else if ( start_id != ( int64_t ) id || id_count != span )
301             {
302                 PLOGERR ( klogWarn, ( klogWarn, rc, "key '$(key)' maps to start id $(start_id), count $(id_count): expected id $(rid), count 1.",
303                                       "key=%S,rid=%u,start_id=%ld,id_count=%lu", orig, id, start_id, id_count ) );
304                 pb -> failed = true;
305             }
306         }
307 
308         /* id->key test */
309         if ( pb -> id2key )
310         {
311             char buffer [ 256 ], *key = buffer;
312             size_t key_size, bsize = sizeof buffer;
313             if ( sizeof buffer <= orig -> size )
314             {
315                 key = (char *)malloc ( bsize = orig -> size + 1 );
316                 if ( key == 0 )
317                 {
318                     pb -> rc = RC ( rcDB, rcIndex, rcValidating, rcMemory, rcExhausted );
319                     StringWhack ( ( String* ) orig );
320                     return true;
321                 }
322             }
323 
324             for ( i = 0; i < span; ++ i )
325             {
326                 rc = KIndexProjectText ( pb -> outer, id + i, & start_id, & id_count, key, bsize, & key_size );
327                 if ( rc != 0 )
328                 {
329                     PLOGERR ( klogWarn, ( klogWarn, rc, "failed to retrieve key, start id and count for row id $(rid)",
330                                           "rid=%u", id + i ) );
331                     pb -> failed = true;
332                     break;
333                 }
334 
335                 if ( orig -> size != key_size || memcmp ( orig -> addr, key, key_size ) != 0 )
336                 {
337                     PLOGERR ( klogWarn, ( klogWarn, rc, "row $(rid) maps to key '$(key)': expected key '$(orig)'.",
338                                           "rid=%u,key=%.*s,orig=%S", id + i, ( int ) key_size, key, orig ) );
339                     pb -> failed = true;
340                 }
341                 if ( start_id != id || id_count != ( uint64_t ) span )
342                 {
343                     PLOGERR ( klogWarn, ( klogWarn, rc, "row $(rid) maps to start id $(start_id), count $(id_count): expected $(row_start), $(span).",
344                                           "rid=%u,id_count=%lu,start_id=%ld,row_start=%ld,span=%u",
345                                           id, id_count, start_id, id, span ) );
346                     pb -> failed = true;
347                 }
348 
349                 if ( ! pb -> all_ids || pb -> failed )
350                     break;
351             }
352 
353             if ( key != buffer )
354                 free ( key );
355         }
356 
357         StringWhack ( ( String* ) orig );
358     }
359 
360     return false;
361 }
362 
KPTrieIndexCheckConsistency_v2(const KPTrieIndex_v2 * self,int64_t * start_id,uint64_t * id_range,uint64_t * num_keys,uint64_t * num_rows,uint64_t * num_holes,const KIndex * outer,bool key2id,bool id2key,bool all_ids,bool convertFromV1)363 rc_t KPTrieIndexCheckConsistency_v2 ( const KPTrieIndex_v2 *self,
364     int64_t *start_id, uint64_t *id_range, uint64_t *num_keys,
365     uint64_t *num_rows, uint64_t *num_holes,
366     const KIndex *outer, bool key2id, bool id2key, bool all_ids, bool convertFromV1 )
367 {
368     rc_t rc = 0;
369     KPTrieIndexCCParms_v2 pb;
370 
371     if ( self == NULL )
372         return RC ( rcDB, rcIndex, rcValidating, rcParam, rcNull );
373 
374     if ( ( key2id || id2key ) && outer == NULL )
375         return RC ( rcDB, rcIndex, rcValidating, rcSelf, rcNull );
376 
377     KPTrieIndexCCParmsInit_v2 ( & pb, self, outer, key2id, id2key, all_ids, convertFromV1 );
378     if ( PTrieDoUntil ( self -> key2id, KPTrieIndexCCVisit_v2, & pb ) )
379         rc = pb . rc;
380     else if ( pb . failed )
381         rc = RC ( rcDB, rcIndex, rcValidating, rcSelf, rcCorrupt );
382 
383     if ( start_id != NULL )
384         * start_id = pb . stats . i_min_id;
385     if ( id_range != NULL )
386         * id_range = pb . stats . x_max_id - pb . stats . i_min_id;
387     if ( num_keys != NULL )
388         * num_keys = pb . stats . num_entries;
389     if ( num_rows != NULL )
390         * num_rows = pb . stats . num_ids;
391     if ( num_holes != NULL )
392         * num_holes = pb . stats . num_holes;
393 
394     KPTrieIndexCCParmsWhack_v2 ( & pb );
395     return rc;
396 }
397