1 /*=========================================================================== 2 * 3 * PUBLIC DOMAIN NOTICE 4 * National Center for Biotechnology Information 5 * 6 * This software/database is a "United States Government Work" under the 7 * terms of the United States Copyright Act. It was written as part of 8 * the author's official duties as a United States Government employee and 9 * thus cannot be copyrighted. This software/database is freely available 10 * to the public for use. The National Library of Medicine and the U.S. 11 * Government have not placed any restriction on its use or reproduction. 12 * 13 * Although all reasonable efforts have been taken to ensure the accuracy 14 * and reliability of the software and data, the NLM and the U.S. 15 * Government do not and cannot warrant the performance or results that 16 * may be obtained by using this software or data. The NLM and the U.S. 17 * Government disclaim all warranties, express or implied, including 18 * warranties of performance, merchantability or fitness for any particular 19 * purpose. 20 * 21 * Please cite the author in any work or product based on this material. 22 * 23 * =========================================================================== 24 * 25 */ 26 27 #ifndef _h_index_priv_ 28 #define _h_index_priv_ 29 30 #ifndef _h_index_cmn_ 31 #include "index-cmn.h" 32 #endif 33 34 #ifndef _h_klib_trie_ 35 #include <klib/trie.h> 36 #endif 37 38 #ifdef __cplusplus 39 extern "C" { 40 #endif 41 42 43 /*-------------------------------------------------------------------------- 44 * forwards 45 */ 46 struct BSTNode; 47 struct KDirectory; 48 49 50 /*-------------------------------------------------------------------------- 51 * V1 52 * version 1 of the trie index was hard-coded to enforce uniqueness of 53 * both the string, and a 32-bit id. furthermore, the id was unfortunately 54 * assumed to occupy a mostly contiguous space, such that the projection 55 * was always implemented as an array of ptrie node ids where the id was 56 * used to index the array. 57 * 58 * the introduction of highly sparse ids led to deprecation of this 59 * implementation. see version 2 for further information. 60 */ 61 62 63 /*-------------------------------------------------------------------------- 64 * KTrieIdxNode_v1 65 */ 66 typedef struct KTrieIdxNode_v1 KTrieIdxNode_v1; 67 struct KTrieIdxNode_v1 68 { 69 TNode n; 70 uint32_t id; 71 char key [ 1 ]; 72 }; 73 74 /*-------------------------------------------------------------------------- 75 * KTrieIndex_v1 76 */ 77 struct KTrieIndex_v1 78 { 79 KPTrieIndex_v1 pt; 80 Trie key2id; 81 KTrieIdxNode_v1 **id2node; 82 uint32_t first; 83 uint32_t last; 84 uint32_t len; 85 }; 86 87 /* insert string into trie, mapping to 32 bit id */ 88 rc_t KTrieIndexInsert_v1 ( KTrieIndex_v1 *self, 89 bool proj, const char *key, uint32_t id ); 90 91 /* drop string from trie and all mappings */ 92 rc_t KTrieIndexDelete_v1 ( KTrieIndex_v1 *self, 93 bool proj, const char *key ); 94 95 /* persist index to file */ 96 rc_t KTrieIndexPersist_v1 ( const KTrieIndex_v1 *self, 97 bool proj, struct KDirectory *dir, const char *path, bool use_md5 ); 98 99 100 /*-------------------------------------------------------------------------- 101 * V2 102 * version 2 of the trie index was introduced to handle sparse ids, 103 * and to recognize that ids may be 64 bits and/or negative. 104 * 105 * v2 introduces strategy identifiers to handle various cases. 106 * 107 * CONSTRAINTS 108 * - both key and id are unique ( version 1 ) 109 * - key is not unique, but must map to a contiguous range of ids, 110 * while ids are unique ( the main use case for SRA ) 111 * 112 * INSERTION 113 * - ids are observed in increasing order 114 * 115 * PROJECTION 116 * - id range is contiguous or nearly so ( can use single array ) 117 * - id range is sparse 118 * 119 * the implementation may be extended by adding new strategies, 120 * but the moment the implementation supports 1 to many mappings, 121 * inserted with ids in increasing order, and an unique constraint 122 * on the ids themselves. 123 * 124 * the general case for v2 is "key -> id range", where "id range -> key" 125 * is via contiguous array if avg ( id range ) <= 2, and via sparse 126 * array otherwise. 127 * 128 * for the key -> id mappings, this means that the in-core node 129 * either retains an id range ( when not using a projection index ), 130 * or it retains a start-id only, since its range can be determined 131 * from the projection index. 132 * 133 * for id -> key mappings, the id is first converted to an ordinal, and 134 * the ordinal to a node. when ids are contiguous, id -> ordinal is simply 135 * derived by subtracting the initial start id. when sparse, an id -> ord 136 * array is used in a binary search to produce the ordinal. 137 * 138 * ids are assumed to be 64 bit, stored as id - start id, and packed to 139 * a minimum number of bits to represent the id. ptrie node ids are still 140 * 32-bit entities. 141 */ 142 143 144 /*-------------------------------------------------------------------------- 145 * KTrieIdxNode_v2_s1 146 * strategy 1 - store only start id, derive range from proj index 147 */ 148 typedef struct KTrieIdxNode_v2_s1 KTrieIdxNode_v2_s1; 149 struct KTrieIdxNode_v2_s1 150 { 151 TNode n; 152 int64_t start_id; 153 char key [ 1 ]; 154 }; 155 156 /*-------------------------------------------------------------------------- 157 * KTrieIdxNode_v2_s2 158 * strategy 2 - store complete range when not using proj index 159 */ 160 typedef struct KTrieIdxNode_v2_s2 KTrieIdxNode_v2_s2; 161 struct KTrieIdxNode_v2_s2 162 { 163 TNode n; 164 int64_t start_id; 165 uint32_t span; 166 char key [ 1 ]; 167 }; 168 169 /*-------------------------------------------------------------------------- 170 * KTrieIndex_v2 171 */ 172 struct KTrieIndex_v2 173 { 174 int64_t first, last; 175 KPTrieIndex_v2 pt; 176 Trie key2id; 177 KTrieIdxNode_v2_s1 **ord2node; 178 uint32_t count; 179 uint32_t max_span; 180 }; 181 182 /* cause persisted tree to be loaded into trie */ 183 rc_t KTrieIndexAttach_v2 ( KTrieIndex_v2 *self, bool proj ); 184 185 /* insert string into trie, mapping to 64 bit id */ 186 rc_t KTrieIndexInsert_v2 ( KTrieIndex_v2 *self, 187 bool proj, const char *key, int64_t id ); 188 189 /* drop string from trie and all mappings */ 190 rc_t KTrieIndexDelete_v2 ( KTrieIndex_v2 *self, 191 bool proj, const char *key ); 192 193 /* persist index to file */ 194 rc_t KTrieIndexPersist_v2 ( const KTrieIndex_v2 *self, 195 bool proj, struct KDirectory *dir, const char *path, bool use_md5 ); 196 197 198 /*-------------------------------------------------------------------------- 199 * KU64Index_v3 200 */ 201 struct KU64Index_v3 202 { 203 BSTree tree; 204 rc_t rc; 205 }; 206 207 rc_t KU64IndexInsert_v3(KU64Index_v3* self, bool unique, uint64_t key, uint64_t key_size, int64_t id, uint64_t id_qty); 208 rc_t KU64IndexDelete_v3(KU64Index_v3* self, uint64_t key); 209 210 rc_t KU64IndexPersist_v3(KU64Index_v3* self, bool proj, struct KDirectory *dir, const char *path, bool use_md5); 211 212 213 214 /*-------------------------------------------------------------------------- 215 * KIndex 216 * represents an index 217 */ 218 219 /* Cmp 220 * Sort 221 */ 222 int KIndexCmp ( const void *item, struct BSTNode const *n ); 223 int KIndexSort ( struct BSTNode const *item, struct BSTNode const *n ); 224 225 226 #ifdef __cplusplus 227 } 228 #endif 229 230 #endif /* _h_index_priv_ */ 231