1 #include "jellyfish.h"
2 #include <string.h>
3 #include <stdio.h>
4 #include <wchar.h>
5
6 /*
7
8 Trie is a nested search tree, where each node's key is broken down into parts
9 and looking up a certain key means a sequence of lookups in small associative
10 arrays. They are usually used for strings, where a word "foo" would be
11 basically looked up as trie["f"]["o"]["o"].
12
13 In this case, the tries split the incoming integer into segments, omitting
14 upper zero ones, and they are looked up as follows:
15
16 I.e. for segments of 1 byte,
17
18 - for key = 0x11, the result is d->values[0x11]
19 - for key = 0x1122 - d->child_nodes[0x11]->values[0x22]
20 - for key = 0x112233 - d->child_nodes[0x11]->child_nodes[0x22]->values[0x33]
21
22 And so on.
23
24 Child nodes are created on demand, when a value should be stored in them.
25
26 If no value is stored in the trie for a certain key, the lookup returns 0.
27
28 */
29
30
31 #define TRIE_VALUES_PER_LEVEL 256
32 /* Each level takes one byte from dictionary key, hence max levels is: */
33 #define TRIE_MAX_LEVELS sizeof(size_t)
34
35 struct trie {
36 size_t* values;
37 struct trie** child_nodes;
38 };
39
40
trie_create(void)41 struct trie* trie_create(void)
42 {
43 return calloc(1, sizeof(struct trie));
44 }
45
46
trie_destroy(struct trie * d)47 void trie_destroy(struct trie* d)
48 {
49 size_t i;
50 if (!d) {
51 return;
52 }
53 free(d->values);
54 if (d->child_nodes) {
55 for (i = 0; i < TRIE_VALUES_PER_LEVEL; ++i) {
56 trie_destroy(d->child_nodes[i]);
57 }
58 }
59 free(d->child_nodes);
60 free(d);
61 }
62
63
trie_get(struct trie * d,size_t key)64 size_t trie_get(struct trie* d, size_t key)
65 {
66 size_t level_keys[TRIE_MAX_LEVELS];
67 size_t level_pos = 0;
68
69 size_t cur_remainder = key;
70 size_t cur_key;
71 while (1) {
72 level_keys[level_pos] = cur_remainder % TRIE_VALUES_PER_LEVEL;
73 cur_remainder /= TRIE_VALUES_PER_LEVEL;
74 if (!cur_remainder) {
75 break;
76 }
77 ++level_pos;
78 }
79
80 while (level_pos) {
81 cur_key = level_keys[level_pos];
82 if (!d->child_nodes || !d->child_nodes[cur_key]) {
83 return 0;
84 }
85 d = d->child_nodes[cur_key];
86 --level_pos;
87 }
88 if (!d->values) {
89 return 0;
90 }
91 return d->values[level_keys[0]];
92 }
93
94
trie_set(struct trie * d,size_t key,size_t val)95 int trie_set(struct trie* d, size_t key, size_t val)
96 {
97 size_t level_keys[TRIE_MAX_LEVELS];
98 size_t level_pos = 0;
99
100 size_t cur_remainder = key;
101 size_t cur_key;
102 while (1) {
103 level_keys[level_pos] = cur_remainder % TRIE_VALUES_PER_LEVEL;
104 cur_remainder /= TRIE_VALUES_PER_LEVEL;
105 if (!cur_remainder) {
106 break;
107 }
108 ++level_pos;
109 }
110
111 while (level_pos) {
112 cur_key = level_keys[level_pos];
113 if (!d->child_nodes) {
114 d->child_nodes = calloc(TRIE_VALUES_PER_LEVEL, sizeof(struct trie*));
115 if (!d->child_nodes) {
116 return 0;
117 }
118 }
119 if (!d->child_nodes[cur_key]) {
120 d->child_nodes[cur_key] = trie_create();
121 if (!d->child_nodes[cur_key]){
122 return 0;
123 }
124 }
125 d = d->child_nodes[cur_key];
126 --level_pos;
127 }
128
129 if (!d->values) {
130 d->values = calloc(TRIE_VALUES_PER_LEVEL, sizeof(size_t));
131 if (!d->values) {
132 return 0;
133 }
134 }
135 d->values[level_keys[0]] = val;
136 return 1;
137 }
138
139
damerau_levenshtein_distance(const JFISH_UNICODE * s1,const JFISH_UNICODE * s2,size_t len1,size_t len2)140 int damerau_levenshtein_distance(const JFISH_UNICODE *s1, const JFISH_UNICODE *s2, size_t len1, size_t len2)
141 {
142 size_t infinite = len1 + len2;
143 size_t cols = len2 + 2;
144
145 size_t i, j, i1, j1;
146 size_t db;
147 size_t d1, d2, d3, d4, result;
148 unsigned short cost;
149
150 size_t *dist = NULL;
151
152 struct trie* da = trie_create();
153 if (!da) {
154 return -1;
155 }
156
157 dist = safe_matrix_malloc((len1 + 2), cols, sizeof(size_t));
158 if (!dist) {
159 result = -1;
160 goto cleanup_da;
161 }
162
163 dist[0] = infinite;
164
165 for (i = 0; i <= len1; i++) {
166 dist[((i + 1) * cols) + 0] = infinite;
167 dist[((i + 1) * cols) + 1] = i;
168 }
169
170 for (i = 0; i <= len2; i++) {
171 dist[i + 1] = infinite; // 0*cols + row
172 dist[cols + i + 1] = i; // 1*cols + row
173 }
174
175 for (i = 1; i <= len1; i++) {
176 db = 0;
177 for (j = 1; j <= len2; j++) {
178 i1 = trie_get(da, s2[j-1]);
179 j1 = db;
180
181 if (s1[i - 1] == s2[j - 1]) {
182 cost = 0;
183 db = j;
184 } else {
185 cost = 1;
186 }
187
188 d1 = dist[(i * cols) + j] + cost;
189 d2 = dist[((i + 1) * cols) + j] + 1;
190 d3 = dist[(i * cols) + j + 1] + 1;
191 d4 = dist[(i1 * cols) + j1] + (i - i1 - 1) + 1 + (j - j1 - 1);
192
193 dist[((i+1)*cols) + j + 1] = MIN(MIN(d1, d2), MIN(d3, d4));
194 }
195
196 if (!trie_set(da, s1[i-1], i)) {
197 result = -1;
198 goto cleanup;
199 };
200 }
201
202 result = dist[((len1+1) * cols) + len2 + 1];
203
204
205 cleanup:
206 free(dist);
207
208 cleanup_da:
209 trie_destroy(da);
210
211 return result;
212 }
213