1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #ifndef _h_index_priv_
28 #define _h_index_priv_
29 
30 #ifndef _h_index_cmn_
31 #include "index-cmn.h"
32 #endif
33 
34 #ifndef _h_klib_trie_
35 #include <klib/trie.h>
36 #endif
37 
38 #ifdef __cplusplus
39 extern "C" {
40 #endif
41 
42 
43 /*--------------------------------------------------------------------------
44  * forwards
45  */
46 struct BSTNode;
47 struct KDirectory;
48 
49 
50 /*--------------------------------------------------------------------------
51  * V1
52  *  version 1 of the trie index was hard-coded to enforce uniqueness of
53  *  both the string, and a 32-bit id. furthermore, the id was unfortunately
54  *  assumed to occupy a mostly contiguous space, such that the projection
55  *  was always implemented as an array of ptrie node ids where the id was
56  *  used to index the array.
57  *
58  *  the introduction of highly sparse ids led to deprecation of this
59  *  implementation. see version 2 for further information.
60  */
61 
62 
63 /*--------------------------------------------------------------------------
64  * KTrieIdxNode_v1
65  */
66 typedef struct KTrieIdxNode_v1 KTrieIdxNode_v1;
67 struct KTrieIdxNode_v1
68 {
69     TNode n;
70     uint32_t id;
71     char key [ 1 ];
72 };
73 
74 /*--------------------------------------------------------------------------
75  * KTrieIndex_v1
76  */
77 struct KTrieIndex_v1
78 {
79     KPTrieIndex_v1 pt;
80     Trie key2id;
81     KTrieIdxNode_v1 **id2node;
82     uint32_t first;
83     uint32_t last;
84     uint32_t len;
85 };
86 
87 /* insert string into trie, mapping to 32 bit id */
88 rc_t KTrieIndexInsert_v1 ( KTrieIndex_v1 *self,
89     bool proj, const char *key, uint32_t id );
90 
91 /* drop string from trie and all mappings */
92 rc_t KTrieIndexDelete_v1 ( KTrieIndex_v1 *self,
93     bool proj, const char *key );
94 
95 /* persist index to file */
96 rc_t KTrieIndexPersist_v1 ( const KTrieIndex_v1 *self,
97     bool proj, struct KDirectory *dir, const char *path, bool use_md5 );
98 
99 
100 /*--------------------------------------------------------------------------
101  * V2
102  *  version 2 of the trie index was introduced to handle sparse ids,
103  *  and to recognize that ids may be 64 bits and/or negative.
104  *
105  *  v2 introduces strategy identifiers to handle various cases.
106  *
107  *  CONSTRAINTS
108  *  - both key and id are unique ( version 1 )
109  *  - key is not unique, but must map to a contiguous range of ids,
110  *    while ids are unique ( the main use case for SRA )
111  *
112  *  INSERTION
113  *  - ids are observed in increasing order
114  *
115  *  PROJECTION
116  *  - id range is contiguous or nearly so ( can use single array )
117  *  - id range is sparse
118  *
119  *  the implementation may be extended by adding new strategies,
120  *  but the moment the implementation supports 1 to many mappings,
121  *  inserted with ids in increasing order, and an unique constraint
122  *  on the ids themselves.
123  *
124  *  the general case for v2 is "key -> id range", where "id range -> key"
125  *  is via contiguous array if avg ( id range ) <= 2, and via sparse
126  *  array otherwise.
127  *
128  *  for the key -> id mappings, this means that the in-core node
129  *  either retains an id range ( when not using a projection index ),
130  *  or it retains a start-id only, since its range can be determined
131  *  from the projection index.
132  *
133  *  for id -> key mappings, the id is first converted to an ordinal, and
134  *  the ordinal to a node. when ids are contiguous, id -> ordinal is simply
135  *  derived by subtracting the initial start id. when sparse, an id -> ord
136  *  array is used in a binary search to produce the ordinal.
137  *
138  *  ids are assumed to be 64 bit, stored as id - start id, and packed to
139  *  a minimum number of bits to represent the id. ptrie node ids are still
140  *  32-bit entities.
141  */
142 
143 
144 /*--------------------------------------------------------------------------
145  * KTrieIdxNode_v2_s1
146  *  strategy 1 - store only start id, derive range from proj index
147  */
148 typedef struct KTrieIdxNode_v2_s1 KTrieIdxNode_v2_s1;
149 struct KTrieIdxNode_v2_s1
150 {
151     TNode n;
152     int64_t start_id;
153     char key [ 1 ];
154 };
155 
156 /*--------------------------------------------------------------------------
157  * KTrieIdxNode_v2_s2
158  *  strategy 2 - store complete range when not using proj index
159  */
160 typedef struct KTrieIdxNode_v2_s2 KTrieIdxNode_v2_s2;
161 struct KTrieIdxNode_v2_s2
162 {
163     TNode n;
164     int64_t start_id;
165     uint32_t span;
166     char key [ 1 ];
167 };
168 
169 /*--------------------------------------------------------------------------
170  * KTrieIndex_v2
171  */
172 struct KTrieIndex_v2
173 {
174     int64_t first, last;
175     KPTrieIndex_v2 pt;
176     Trie key2id;
177     KTrieIdxNode_v2_s1 **ord2node;
178     uint32_t count;
179     uint32_t max_span;
180 };
181 
182 /* cause persisted tree to be loaded into trie */
183 rc_t KTrieIndexAttach_v2 ( KTrieIndex_v2 *self, bool proj );
184 
185 /* insert string into trie, mapping to 64 bit id */
186 rc_t KTrieIndexInsert_v2 ( KTrieIndex_v2 *self,
187     bool proj, const char *key, int64_t id );
188 
189 /* drop string from trie and all mappings */
190 rc_t KTrieIndexDelete_v2 ( KTrieIndex_v2 *self,
191     bool proj, const char *key );
192 
193 /* persist index to file */
194 rc_t KTrieIndexPersist_v2 ( const KTrieIndex_v2 *self,
195     bool proj, struct KDirectory *dir, const char *path, bool use_md5 );
196 
197 
198 /*--------------------------------------------------------------------------
199  * KU64Index_v3
200  */
201 struct KU64Index_v3
202 {
203     BSTree tree;
204     rc_t rc;
205 };
206 
207 rc_t KU64IndexInsert_v3(KU64Index_v3* self, bool unique, uint64_t key, uint64_t key_size, int64_t id, uint64_t id_qty);
208 rc_t KU64IndexDelete_v3(KU64Index_v3* self, uint64_t key);
209 
210 rc_t KU64IndexPersist_v3(KU64Index_v3* self, bool proj, struct KDirectory *dir, const char *path, bool use_md5);
211 
212 
213 
214 /*--------------------------------------------------------------------------
215  * KIndex
216  *  represents an index
217  */
218 
219 /* Cmp
220  * Sort
221  */
222 int KIndexCmp ( const void *item, struct BSTNode const *n );
223 int KIndexSort ( struct BSTNode const *item, struct BSTNode const *n );
224 
225 
226 #ifdef __cplusplus
227 }
228 #endif
229 
230 #endif /* _h_index_priv_ */
231