1 #include "jellyfish.h"
2 #include <string.h>
3 #include <stdio.h>
4 #include <wchar.h>
5 
6 /*
7 
8   Trie is a nested search tree, where each node's key is broken down into parts
9   and looking up a certain key means a sequence of lookups in small associative
10   arrays. They are usually used for strings, where a word "foo" would be
11   basically looked up as trie["f"]["o"]["o"].
12 
13   In this case, the tries split the incoming integer into segments, omitting
14   upper zero ones, and they are looked up as follows:
15 
16   I.e. for segments of 1 byte,
17 
18   - for key = 0x11, the result is d->values[0x11]
19   - for key = 0x1122  -  d->child_nodes[0x11]->values[0x22]
20   - for key = 0x112233  -  d->child_nodes[0x11]->child_nodes[0x22]->values[0x33]
21 
22   And so on.
23 
24   Child nodes are created on demand, when a value should be stored in them.
25 
26   If no value is stored in the trie for a certain key, the lookup returns 0.
27 
28 */
29 
30 
31 #define TRIE_VALUES_PER_LEVEL 256
32 /* Each level takes one byte from dictionary key, hence max levels is: */
33 #define TRIE_MAX_LEVELS sizeof(size_t)
34 
35 struct trie {
36     size_t* values;
37     struct trie** child_nodes;
38 };
39 
40 
trie_create(void)41 struct trie* trie_create(void)
42 {
43     return calloc(1, sizeof(struct trie));
44 }
45 
46 
trie_destroy(struct trie * d)47 void trie_destroy(struct trie* d)
48 {
49     size_t i;
50     if (!d) {
51         return;
52     }
53     free(d->values);
54     if (d->child_nodes) {
55         for (i = 0; i < TRIE_VALUES_PER_LEVEL; ++i) {
56             trie_destroy(d->child_nodes[i]);
57         }
58     }
59     free(d->child_nodes);
60     free(d);
61 }
62 
63 
trie_get(struct trie * d,size_t key)64 size_t trie_get(struct trie* d, size_t key)
65 {
66     size_t level_keys[TRIE_MAX_LEVELS];
67     size_t level_pos = 0;
68 
69     size_t cur_remainder = key;
70     size_t cur_key;
71     while (1) {
72         level_keys[level_pos] = cur_remainder % TRIE_VALUES_PER_LEVEL;
73         cur_remainder /= TRIE_VALUES_PER_LEVEL;
74         if (!cur_remainder) {
75             break;
76         }
77         ++level_pos;
78     }
79 
80     while (level_pos) {
81         cur_key = level_keys[level_pos];
82         if (!d->child_nodes || !d->child_nodes[cur_key]) {
83             return 0;
84         }
85         d = d->child_nodes[cur_key];
86         --level_pos;
87     }
88     if (!d->values) {
89         return 0;
90     }
91     return d->values[level_keys[0]];
92 }
93 
94 
trie_set(struct trie * d,size_t key,size_t val)95 int trie_set(struct trie* d, size_t key, size_t val)
96 {
97     size_t level_keys[TRIE_MAX_LEVELS];
98     size_t level_pos = 0;
99 
100     size_t cur_remainder = key;
101     size_t cur_key;
102     while (1) {
103         level_keys[level_pos] = cur_remainder % TRIE_VALUES_PER_LEVEL;
104         cur_remainder /= TRIE_VALUES_PER_LEVEL;
105         if (!cur_remainder) {
106             break;
107         }
108         ++level_pos;
109     }
110 
111     while (level_pos) {
112         cur_key = level_keys[level_pos];
113         if (!d->child_nodes) {
114             d->child_nodes = calloc(TRIE_VALUES_PER_LEVEL, sizeof(struct trie*));
115             if (!d->child_nodes) {
116                 return 0;
117             }
118         }
119         if (!d->child_nodes[cur_key]) {
120             d->child_nodes[cur_key] = trie_create();
121             if (!d->child_nodes[cur_key]){
122                 return 0;
123             }
124         }
125         d = d->child_nodes[cur_key];
126         --level_pos;
127     }
128 
129     if (!d->values) {
130         d->values = calloc(TRIE_VALUES_PER_LEVEL, sizeof(size_t));
131         if (!d->values) {
132             return 0;
133         }
134     }
135     d->values[level_keys[0]] = val;
136     return 1;
137 }
138 
139 
damerau_levenshtein_distance(const JFISH_UNICODE * s1,const JFISH_UNICODE * s2,size_t len1,size_t len2)140 int damerau_levenshtein_distance(const JFISH_UNICODE *s1, const JFISH_UNICODE *s2, size_t len1, size_t len2)
141 {
142     size_t infinite = len1 + len2;
143     size_t cols = len2 + 2;
144 
145     size_t i, j, i1, j1;
146     size_t db;
147     size_t d1, d2, d3, d4, result;
148     unsigned short cost;
149 
150     size_t *dist = NULL;
151 
152     struct trie* da = trie_create();
153     if (!da) {
154         return -1;
155     }
156 
157     dist = safe_matrix_malloc((len1 + 2), cols, sizeof(size_t));
158     if (!dist) {
159         result = -1;
160         goto cleanup_da;
161     }
162 
163     dist[0] = infinite;
164 
165     for (i = 0; i <= len1; i++) {
166         dist[((i + 1) * cols) + 0] = infinite;
167         dist[((i + 1) * cols) + 1] = i;
168     }
169 
170     for (i = 0; i <= len2; i++) {
171         dist[i + 1] = infinite;       // 0*cols + row
172         dist[cols + i + 1] = i;       // 1*cols + row
173     }
174 
175     for (i = 1; i <= len1; i++) {
176         db = 0;
177         for (j = 1; j <= len2; j++) {
178             i1 = trie_get(da, s2[j-1]);
179             j1 = db;
180 
181             if (s1[i - 1] == s2[j - 1]) {
182                 cost = 0;
183                 db = j;
184             } else {
185                 cost = 1;
186             }
187 
188             d1 = dist[(i * cols) + j] + cost;
189             d2 = dist[((i + 1) * cols) + j] + 1;
190             d3 = dist[(i * cols) + j + 1] + 1;
191             d4 = dist[(i1 * cols) + j1] + (i - i1 - 1) + 1 + (j - j1 - 1);
192 
193             dist[((i+1)*cols) + j + 1] = MIN(MIN(d1, d2), MIN(d3, d4));
194         }
195 
196         if (!trie_set(da, s1[i-1], i)) {
197             result = -1;
198             goto cleanup;
199         };
200     }
201 
202     result = dist[((len1+1) * cols) + len2 + 1];
203 
204 
205  cleanup:
206     free(dist);
207 
208  cleanup_da:
209     trie_destroy(da);
210 
211     return result;
212 }
213