1 /****************************************************************************** 2 * Double-array trie implementation for compactly storing large dictionaries. 3 * This trie differs from most implmentations in that it stores all of the tails 4 * (compressed) in a single contiguous character array, separated by NUL-bytes, 5 * so given an index into that array, we can treat the array as a C string 6 * starting at that index. It also makes serialization dead simple. We 7 * implement a novel scheme for storing reversed strings (suffixes, etc.) A suffix 8 * is defined as the reversed UTF-8 suffix string prefixed by TRIE_SUFFIX_CHAR. 9 * Similarly, a prefix is defined as being prefixed by TRIE_PREFIX_CHAR. 10 * trie_search defines several methods for searching strings, tokenized strings, 11 * prefixes and suffixes. Note that the single characters TRIE_SUFFIX_CHAR 12 * and TRIE_PREFIX_CHAR are not allowed as keys (both are defined as control 13 * characters, so are unlikely to affect natural language applications). 14 * This trie implementation also has several *_from_index methods which allow 15 * for effective namespacing e.g. adding the keys "en|blvd" and "fr|blvd" 16 * and searching by language. For more information on double-array tries 17 * generally, see: http://linux.thai.net/~thep/datrie/datrie.html 18 ******************************************************************************/ 19 20 #ifndef TRIE_H 21 #define TRIE_H 22 23 24 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <stdint.h> 28 #include <string.h> 29 #include <stdbool.h> 30 31 #include "collections.h" 32 #include "file_utils.h" 33 #include "klib/kvec.h" 34 #include "log/log.h" 35 #include "string_utils.h" 36 37 #define TRIE_SIGNATURE 0xABABABAB 38 #define NULL_NODE_ID 0 39 #define FREE_LIST_ID 1 40 #define ROOT_NODE_ID 2 41 #define TRIE_POOL_BEGIN 3 42 #define DEFAULT_NODE_ARRAY_SIZE 32 43 44 #define TRIE_INDEX_ERROR 0 45 #define TRIE_MAX_INDEX 0x7fffffff 46 47 #define TRIE_PREFIX_CHAR "\x02" 48 #define TRIE_SUFFIX_CHAR "\x03" 49 50 // Using 256 characters can fit all UTF-8 encoded strings 51 #define NUM_CHARS 256 52 53 typedef struct trie_node { 54 int32_t base; 55 int32_t check; 56 } trie_node_t; 57 58 #define NULL_NODE (trie_node_t){0, 0} 59 60 typedef struct trie_data_node { 61 uint32_t tail; 62 uint32_t data; 63 } trie_data_node_t; 64 65 #define NULL_DATA_NODE (trie_data_node_t){0, 0}; 66 67 VECTOR_INIT(trie_node_array, trie_node_t) 68 VECTOR_INIT(trie_data_array, trie_data_node_t) 69 70 typedef struct trie { 71 trie_node_t null_node; 72 trie_node_array *nodes; 73 trie_data_array *data; 74 uchar_array *tail; 75 char *alphabet; 76 uint8_t alpha_map[NUM_CHARS]; 77 uint32_t alphabet_size; 78 uint32_t num_keys; 79 } trie_t; 80 81 trie_t *trie_new_alphabet(uint8_t *alphabet, uint32_t alphabet_size); 82 trie_t *trie_new(void); 83 84 uint32_t trie_get_char_index(trie_t *self, unsigned char c); 85 uint32_t trie_get_transition_index(trie_t *self, trie_node_t node, unsigned char c); 86 trie_node_t trie_get_transition(trie_t *self, trie_node_t node, unsigned char c); 87 bool trie_node_is_free(trie_node_t node); 88 89 90 trie_node_t trie_get_node(trie_t *self, uint32_t index); 91 void trie_set_base(trie_t *self, uint32_t index, int32_t base); 92 void trie_set_check(trie_t *self, uint32_t index, int32_t check); 93 trie_node_t trie_get_root(trie_t *self); 94 trie_node_t trie_get_free_list(trie_t *self); 95 96 trie_data_node_t trie_get_data_node(trie_t *self, trie_node_t node); 97 bool trie_set_data_node(trie_t *self, uint32_t index, trie_data_node_t data_node); 98 99 bool trie_get_data_at_index(trie_t *self, uint32_t index, uint32_t *data); 100 bool trie_get_data(trie_t *self, char *key, uint32_t *data); 101 bool trie_set_data_at_index(trie_t *self, uint32_t index, uint32_t data); 102 bool trie_set_data(trie_t *self, char *key, uint32_t data); 103 104 bool trie_tail_match(trie_t *self, char *str, uint32_t tail_index); 105 106 uint32_t trie_add_transition(trie_t *self, uint32_t node_id, unsigned char c); 107 108 void trie_make_room_for(trie_t *self, uint32_t next_id); 109 110 void trie_add_tail(trie_t *self, unsigned char *tail); 111 void trie_set_tail(trie_t *self, unsigned char *tail, uint32_t tail_pos); 112 int32_t trie_separate_tail(trie_t *self, uint32_t from_index, unsigned char *tail, uint32_t data); 113 void trie_tail_merge(trie_t *self, uint32_t old_node_id, unsigned char *suffix, uint32_t data); 114 115 bool trie_add_at_index(trie_t *self, uint32_t node_id, char *key, size_t len, uint32_t data); 116 bool trie_add(trie_t *self, char *key, uint32_t data); 117 bool trie_add_len(trie_t *self, char *key, size_t len, uint32_t data); 118 bool trie_add_suffix(trie_t *self, char *key, uint32_t data); 119 bool trie_add_suffix_at_index(trie_t *self, char *key, uint32_t start_node_id, uint32_t data); 120 bool trie_add_prefix(trie_t *self, char *key, uint32_t data); 121 bool trie_add_prefix_at_index(trie_t *self, char *key, uint32_t start_node_id, uint32_t data); 122 123 uint32_t trie_get_from_index(trie_t *self, char *word, size_t len, uint32_t i); 124 uint32_t trie_get_len(trie_t *self, char *word, size_t len); 125 uint32_t trie_get(trie_t *self, char *word); 126 127 uint32_t trie_num_keys(trie_t *self); 128 129 typedef struct trie_prefix_result { 130 uint32_t node_id; 131 size_t tail_pos; 132 } trie_prefix_result_t; 133 134 #define ROOT_PREFIX_RESULT (trie_prefix_result_t) {ROOT_NODE_ID, 0} 135 #define NULL_PREFIX_RESULT (trie_prefix_result_t) {NULL_NODE_ID, 0} 136 137 trie_prefix_result_t trie_get_prefix(trie_t *self, char *key); 138 trie_prefix_result_t trie_get_prefix_len(trie_t *self, char *key, size_t len); 139 trie_prefix_result_t trie_get_prefix_from_index(trie_t *self, char *key, size_t len, uint32_t start_index, size_t tail_pos); 140 141 void trie_print(trie_t *self); 142 143 bool trie_write(trie_t *self, FILE *file); 144 bool trie_save(trie_t *self, char *path); 145 146 trie_t *trie_read(FILE *file); 147 trie_t *trie_load(char *path); 148 149 void trie_destroy(trie_t *self); 150 151 152 153 #endif 154