1 /******************************************************************************
2 * Double-array trie implementation for compactly storing large dictionaries.
3 * This trie differs from most implmentations in that it stores all of the tails
4 * (compressed) in a single contiguous character array, separated by NUL-bytes,
5 * so given an index into that array, we can treat the array as a C string
6 * starting at that index. It also makes serialization dead simple. We
7 * implement a novel scheme for storing reversed strings (suffixes, etc.) A suffix
8 * is defined as the reversed UTF-8 suffix string prefixed by TRIE_SUFFIX_CHAR.
9 * Similarly, a prefix is defined as being prefixed by TRIE_PREFIX_CHAR.
10 * trie_search defines several methods for searching strings, tokenized strings,
11 * prefixes and suffixes. Note that the single characters TRIE_SUFFIX_CHAR
12 * and TRIE_PREFIX_CHAR are not allowed as keys (both are defined as control
13 * characters, so are unlikely to affect natural language applications).
14 * This trie implementation also has several *_from_index methods which allow
15 * for effective namespacing e.g. adding the keys "en|blvd" and "fr|blvd"
16 * and searching by language. For more information on double-array tries
17 * generally, see: http://linux.thai.net/~thep/datrie/datrie.html
18 ******************************************************************************/
19 
20 #ifndef TRIE_H
21 #define TRIE_H
22 
23 
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <stdint.h>
28 #include <string.h>
29 #include <stdbool.h>
30 
31 #include "collections.h"
32 #include "file_utils.h"
33 #include "klib/kvec.h"
34 #include "log/log.h"
35 #include "string_utils.h"
36 
37 #define TRIE_SIGNATURE 0xABABABAB
38 #define NULL_NODE_ID 0
39 #define FREE_LIST_ID 1
40 #define ROOT_NODE_ID 2
41 #define TRIE_POOL_BEGIN 3
42 #define DEFAULT_NODE_ARRAY_SIZE 32
43 
44 #define TRIE_INDEX_ERROR  0
45 #define TRIE_MAX_INDEX 0x7fffffff
46 
47 #define TRIE_PREFIX_CHAR "\x02"
48 #define TRIE_SUFFIX_CHAR "\x03"
49 
50 // Using 256 characters can fit all UTF-8 encoded strings
51 #define NUM_CHARS 256
52 
53 typedef struct trie_node {
54     int32_t base;
55     int32_t check;
56 } trie_node_t;
57 
58 #define NULL_NODE (trie_node_t){0, 0}
59 
60 typedef struct trie_data_node {
61     uint32_t tail;
62     uint32_t data;
63 } trie_data_node_t;
64 
65 #define NULL_DATA_NODE (trie_data_node_t){0, 0};
66 
67 VECTOR_INIT(trie_node_array, trie_node_t)
68 VECTOR_INIT(trie_data_array, trie_data_node_t)
69 
70 typedef struct trie {
71     trie_node_t null_node;
72     trie_node_array *nodes;
73     trie_data_array *data;
74     uchar_array *tail;
75     char *alphabet;
76     uint8_t alpha_map[NUM_CHARS];
77     uint32_t alphabet_size;
78     uint32_t num_keys;
79 } trie_t;
80 
81 trie_t *trie_new_alphabet(uint8_t *alphabet, uint32_t alphabet_size);
82 trie_t *trie_new(void);
83 
84 uint32_t trie_get_char_index(trie_t *self, unsigned char c);
85 uint32_t trie_get_transition_index(trie_t *self, trie_node_t node, unsigned char c);
86 trie_node_t trie_get_transition(trie_t *self, trie_node_t node, unsigned char c);
87 bool trie_node_is_free(trie_node_t node);
88 
89 
90 trie_node_t trie_get_node(trie_t *self, uint32_t index);
91 void trie_set_base(trie_t *self, uint32_t index, int32_t base);
92 void trie_set_check(trie_t *self, uint32_t index, int32_t check);
93 trie_node_t trie_get_root(trie_t *self);
94 trie_node_t trie_get_free_list(trie_t *self);
95 
96 trie_data_node_t trie_get_data_node(trie_t *self, trie_node_t node);
97 bool trie_set_data_node(trie_t *self, uint32_t index, trie_data_node_t data_node);
98 
99 bool trie_get_data_at_index(trie_t *self, uint32_t index,  uint32_t *data);
100 bool trie_get_data(trie_t *self, char *key, uint32_t *data);
101 bool trie_set_data_at_index(trie_t *self, uint32_t index, uint32_t data);
102 bool trie_set_data(trie_t *self, char *key, uint32_t data);
103 
104 bool trie_tail_match(trie_t *self, char *str, uint32_t tail_index);
105 
106 uint32_t trie_add_transition(trie_t *self, uint32_t node_id, unsigned char c);
107 
108 void trie_make_room_for(trie_t *self, uint32_t next_id);
109 
110 void trie_add_tail(trie_t *self, unsigned char *tail);
111 void trie_set_tail(trie_t *self, unsigned char *tail, uint32_t tail_pos);
112 int32_t trie_separate_tail(trie_t *self, uint32_t from_index, unsigned char *tail, uint32_t data);
113 void trie_tail_merge(trie_t *self, uint32_t old_node_id, unsigned char *suffix, uint32_t data);
114 
115 bool trie_add_at_index(trie_t *self, uint32_t node_id, char *key, size_t len, uint32_t data);
116 bool trie_add(trie_t *self, char *key, uint32_t data);
117 bool trie_add_len(trie_t *self, char *key, size_t len, uint32_t data);
118 bool trie_add_suffix(trie_t *self, char *key, uint32_t data);
119 bool trie_add_suffix_at_index(trie_t *self, char *key, uint32_t start_node_id, uint32_t data);
120 bool trie_add_prefix(trie_t *self, char *key, uint32_t data);
121 bool trie_add_prefix_at_index(trie_t *self, char *key, uint32_t start_node_id, uint32_t data);
122 
123 uint32_t trie_get_from_index(trie_t *self, char *word, size_t len, uint32_t i);
124 uint32_t trie_get_len(trie_t *self, char *word, size_t len);
125 uint32_t trie_get(trie_t *self, char *word);
126 
127 uint32_t trie_num_keys(trie_t *self);
128 
129 typedef struct trie_prefix_result {
130     uint32_t node_id;
131     size_t tail_pos;
132 } trie_prefix_result_t;
133 
134 #define ROOT_PREFIX_RESULT (trie_prefix_result_t) {ROOT_NODE_ID, 0}
135 #define NULL_PREFIX_RESULT (trie_prefix_result_t) {NULL_NODE_ID, 0}
136 
137 trie_prefix_result_t trie_get_prefix(trie_t *self, char *key);
138 trie_prefix_result_t trie_get_prefix_len(trie_t *self, char *key, size_t len);
139 trie_prefix_result_t trie_get_prefix_from_index(trie_t *self, char *key, size_t len, uint32_t start_index, size_t tail_pos);
140 
141 void trie_print(trie_t *self);
142 
143 bool trie_write(trie_t *self, FILE *file);
144 bool trie_save(trie_t *self, char *path);
145 
146 trie_t *trie_read(FILE *file);
147 trie_t *trie_load(char *path);
148 
149 void trie_destroy(trie_t *self);
150 
151 
152 
153 #endif
154