1 #include <math.h>
2 #include "transliterate.h"
3 #include "file_utils.h"
4 
5 #include "log/log.h"
6 #include "strndup.h"
7 
// Magic number associated with the transliteration table.
// NOTE(review): presumably the signature checked/written when the table is
// (de)serialized -- it is not referenced in this part of the file, confirm.
#define TRANSLITERATION_TABLE_SIGNATURE 0xAAAAAAAA

// String names for Unicode normalization forms and mark stripping.
// NOTE(review): presumably the names of built-in transform steps -- they are
// not referenced in this part of the file, confirm against their users.
#define NFD "NFD"
#define NFC "NFC"
#define NFKC "NFKC"
#define NFKD "NFKD"
#define STRIP_MARK "STRIP_MARK"

// Module-level transliteration table, initially NULL. The functions below
// only read it and treat NULL as "no table available".
static transliteration_table_t *trans_table = NULL;
17 
get_transliteration_table(void)18 transliteration_table_t *get_transliteration_table(void) {
19     return trans_table;
20 }
21 
transliterator_new(char * name,uint8_t internal,uint32_t steps_index,size_t steps_length)22 transliterator_t *transliterator_new(char *name, uint8_t internal, uint32_t steps_index, size_t steps_length) {
23     transliterator_t *trans = malloc(sizeof(transliterator_t));
24 
25     if (trans == NULL) {
26         return NULL;
27     }
28 
29     trans->name = name;
30     trans->internal = internal;
31     trans->steps_index = steps_index;
32     trans->steps_length = steps_length;
33 
34     return trans;
35 }
36 
// Release a transliterator together with the name string it holds.
// Passing NULL is allowed and does nothing.
void transliterator_destroy(transliterator_t *self) {
    if (self != NULL) {
        // free(NULL) is a no-op, so the name needs no separate check
        free(self->name);
        free(self);
    }
}
44 
45 
get_transliterator(char * name)46 transliterator_t *get_transliterator(char *name) {
47     if (trans_table == NULL) {
48         return NULL;
49     }
50 
51     khiter_t k;
52     k = kh_get(str_transliterator, trans_table->transliterators, name);
53     return (k != kh_end(trans_table->transliterators)) ? kh_value(trans_table->transliterators, k) : NULL;
54 }
55 
56 
// Progress of a match while scanning characters against the rule trie.
typedef enum {
    // Initial value (see TRANSLITERATION_DEFAULT_STATE); also what
    // state_from_char_result() leaves in place when nothing matched
    TRANS_STATE_BEGIN,
    // Set by state_from_char_result() whenever the trie lookup produced a result
    TRANS_STATE_PARTIAL_MATCH,
    // Set by set_match_if_any() when the key can be completed from the current node
    TRANS_STATE_MATCH
} transliteration_state_type_t;
62 
// Snapshot of the matcher after consuming one character.
typedef struct {
    // Trie position (node id + tail position) reached so far
    trie_prefix_result_t result;
    // Match progress, see transliteration_state_type_t
    transliteration_state_type_t state;
    // Index into the string at which the current phrase started
    ssize_t phrase_start;
    // Accumulated length of the phrase (sum of the char lengths consumed)
    size_t phrase_len;
    // Length of the character that produced this state
    size_t char_len;
    // When set, the scanning loops move on to the next character
    uint8_t advance_index:1;
    // When set, the scanning loops adopt this state as the previous state
    uint8_t advance_state:1;
    // Currently inside a character set that has not been closed yet
    uint8_t in_set:1;
    // This state was reached through an empty transition
    uint8_t empty_transition:1;
    // The matched character or set is followed by a repeat marker
    uint8_t repeat:1;
    // NOTE(review): not read or written in the visible part of the file
    uint8_t word_boundary:1;
} transliteration_state_t;
76 
77 
// Starting state: null trie position, TRANS_STATE_BEGIN, empty phrase, and
// both advance flags set. Fields not listed are zero-initialized.
#define TRANSLITERATION_DEFAULT_STATE (transliteration_state_t){ \
    .result = NULL_PREFIX_RESULT,                               \
    .state = TRANS_STATE_BEGIN,                                 \
    .phrase_start = 0,                                          \
    .phrase_len = 0,                                            \
    .char_len = 0,                                              \
    .advance_index = 1,                                         \
    .advance_state = 1                                          \
}
79 
80 
get_replacement(trie_t * trie,trie_prefix_result_t result,char * str,size_t start_index)81 static transliteration_replacement_t *get_replacement(trie_t *trie, trie_prefix_result_t result, char *str, size_t start_index) {
82     uint32_t node_id = result.node_id;
83     if (node_id == NULL_NODE_ID) return NULL;
84 
85     uint32_t replacement_index = 0;
86 
87     if (!trie_get_data_at_index(trie, node_id, &replacement_index)) {
88         return NULL;
89     }
90 
91     if (replacement_index < trans_table->replacements->n) {
92         log_debug("Got data node\n");
93         return trans_table->replacements->a[replacement_index];
94     }
95 
96     return NULL;
97 
98 }
99 
// Classification of what next_prefix_or_set() found for a character.
typedef enum {
    // Nothing matched (the type carried by NULL_CHAR_SET_RESULT)
    NO_CHAR_RESULT,
    // The character matched directly and no repeat marker follows it
    SINGLE_CHAR_ONLY,
    // The character matched directly and a repeat marker follows it
    SINGLE_CHAR_REPEAT,
    // Matched inside a set whose end marker does not follow yet
    OPEN_CHAR_SET,
    // Matched inside a set that ends here, with no repeat marker after it
    CLOSED_CHAR_SET,
    // Matched inside a set that ends here and is followed by a repeat marker
    CHAR_SET_REPEAT,
    // No set starts here, but an empty transition is available
    SINGLE_EMPTY_TRANSITION,
    // The set was entered through its empty transition and ends here
    CHAR_SET_EMPTY_TRANSITION
} char_set_type;
110 
111 
// Result of next_prefix_or_set(): the trie position reached plus how it was reached.
typedef struct char_set_result {
    // Trie position after the lookup
    trie_prefix_result_t result;
    // Kind of match, see char_set_type
    char_set_type type;
} char_set_result_t;
116 
// "Nothing matched" value for next_prefix_or_set().
// Deliberately has no trailing semicolon so that it is a plain expression:
// usable as an argument or operand, and safe in an unbraced if/else.
#define NULL_CHAR_SET_RESULT (char_set_result_t){NULL_PREFIX_RESULT, NO_CHAR_RESULT}
118 
next_prefix_or_set(trie_t * trie,char * str,size_t len,trie_prefix_result_t last_result,bool in_set,bool check_set_only)119 static char_set_result_t next_prefix_or_set(trie_t *trie, char *str, size_t len, trie_prefix_result_t last_result, bool in_set, bool check_set_only) {
120     trie_prefix_result_t result = trie_get_prefix_from_index(trie, str, len, last_result.node_id, last_result.tail_pos);
121 
122     bool has_empty_transition = false;
123 
124 
125     if (!check_set_only && result.node_id != NULL_NODE_ID) {
126         last_result = result;
127         result = trie_get_prefix_from_index(trie, REPEAT_CHAR, REPEAT_CHAR_LEN, last_result.node_id, last_result.tail_pos);
128         if (result.node_id == NULL_NODE_ID) {
129             return (char_set_result_t){last_result, SINGLE_CHAR_ONLY};
130         } else {
131             log_debug("Got single char repeat\n");
132             return (char_set_result_t){last_result, SINGLE_CHAR_REPEAT};
133         }
134     } else if (!in_set) {
135         result = trie_get_prefix_from_index(trie, BEGIN_SET_CHAR, BEGIN_SET_CHAR_LEN, last_result.node_id, last_result.tail_pos);
136 
137         if (result.node_id == NULL_NODE_ID) {
138             result = trie_get_prefix_from_index(trie, EMPTY_TRANSITION_CHAR, EMPTY_TRANSITION_CHAR_LEN, last_result.node_id, last_result.tail_pos);
139             if (result.node_id == NULL_NODE_ID) {
140                 return NULL_CHAR_SET_RESULT;
141             } else {
142                 log_debug("empty result node_id=%d\n", result.node_id);
143                 return (char_set_result_t){result, SINGLE_EMPTY_TRANSITION};
144             }
145         }
146 
147         log_debug("Got begin set, node_id = %d\n", result.node_id);
148 
149         last_result = result;
150 
151         result = trie_get_prefix_from_index(trie, str, len, last_result.node_id, last_result.tail_pos);
152 
153         log_debug("Set node_id = %d, len=%zu\n", result.node_id, len);
154 
155         if (result.node_id == NULL_NODE_ID) {
156             result = trie_get_prefix_from_index(trie, EMPTY_TRANSITION_CHAR, EMPTY_TRANSITION_CHAR_LEN, last_result.node_id, last_result.tail_pos);
157             if (result.node_id == NULL_NODE_ID) {
158                 return NULL_CHAR_SET_RESULT;
159             }
160             log_debug("Got empty transition char\n");
161             has_empty_transition = true;
162         }
163 
164         in_set = true;
165         last_result = result;
166     }
167 
168     if (in_set) {
169         // In the set but can potentially have more than one unicode character
170         result = trie_get_prefix_from_index(trie, END_SET_CHAR, END_SET_CHAR_LEN, last_result.node_id, last_result.tail_pos);
171         if (result.node_id == NULL_NODE_ID && !has_empty_transition) {
172             log_debug("No end set\n");
173             return (char_set_result_t){last_result, OPEN_CHAR_SET};
174         } else if (result.node_id == NULL_NODE_ID && has_empty_transition) {
175             log_debug("has_empty_transition\n");
176             return NULL_CHAR_SET_RESULT;
177         }
178 
179         last_result = result;
180         result = trie_get_prefix_from_index(trie, REPEAT_CHAR, REPEAT_CHAR_LEN, last_result.node_id, last_result.tail_pos);
181 
182         if (result.node_id == NULL_NODE_ID && !has_empty_transition) {
183             log_debug("Got closed set\n");
184             return (char_set_result_t){last_result, CLOSED_CHAR_SET};
185         // Shouldn't repeat the empty transition, so ignore repeats
186         } else if (has_empty_transition) {
187             log_debug("Char set empty transition\n");
188             return (char_set_result_t){result, CHAR_SET_EMPTY_TRANSITION};
189         } else {
190             log_debug("Char set repeated\n");
191             return (char_set_result_t){result, CHAR_SET_REPEAT};
192         }
193     }
194     return NULL_CHAR_SET_RESULT;
195 
196 }
197 
// Build the matcher state that follows prev_state, given the outcome of a
// trie lookup for one character.
//
//   char_result  outcome of next_prefix_or_set()
//   index, len   position and length of the character in the string
//   prev_state   state before this character
//   is_context   true while matching pre/post context; the phrase bounds
//                are then left at their default values
//
// The returned state stays TRANS_STATE_BEGIN when nothing matched and is
// TRANS_STATE_PARTIAL_MATCH otherwise.
static transliteration_state_t state_from_char_result(char_set_result_t char_result, size_t index, size_t len, transliteration_state_t prev_state, bool is_context) {
    char_set_type type = char_result.type;

    transliteration_state_t next = TRANSLITERATION_DEFAULT_STATE;
    next.result = char_result.result;
    next.char_len = len;

    // A plain character keeps us inside a set that was already open
    next.in_set = (type == OPEN_CHAR_SET || (prev_state.in_set && type == SINGLE_CHAR_ONLY));
    next.repeat = (type == SINGLE_CHAR_REPEAT || type == CHAR_SET_REPEAT);
    next.empty_transition = (type == SINGLE_EMPTY_TRANSITION || type == CHAR_SET_EMPTY_TRANSITION);

    if (type == NO_CHAR_RESULT) {
        return next;
    }

    log_debug("in state_from_char_result, char_result.type = %d\n", char_result.type);
    next.state = TRANS_STATE_PARTIAL_MATCH;

    if (is_context) {
        return next;
    }

    // A phrase starts at the first character matched from the begin state
    // and grows by the length of every character after that
    if (prev_state.state == TRANS_STATE_BEGIN) {
        next.phrase_start = index;
    } else {
        next.phrase_start = prev_state.phrase_start;
    }
    next.phrase_len = prev_state.phrase_len + len;

    return next;
}
226 
// Consume the character of length len at str + index, starting from
// prev_state, and return the resulting state. Phrase bounds are tracked
// (this is the non-context variant).
static transliteration_state_t state_transition(trie_t *trie, char *str, size_t index, size_t len, transliteration_state_t prev_state) {
    log_debug("str = %s, index = %zu, char_len=%zu\n", str, index, len);
    log_debug("prev_state.result.node_id=%d, prev_state.in_set=%d\n", prev_state.result.node_id, prev_state.in_set);

    char *current_char = str + index;
    char_set_result_t lookup = next_prefix_or_set(trie, current_char, len, prev_state.result, prev_state.in_set, false);

    log_debug("char_result.type = %d\n", lookup.type);

    const bool is_context = false;
    return state_from_char_result(lookup, index, len, prev_state, is_context);
}
239 
240 
// Context variant of the state transition: consume the character of length
// len at str + index starting from prev_state, but leave the phrase bounds
// of the resulting state untouched.
static transliteration_state_t state_transition_context(trie_t *trie, char *str, size_t index, size_t len, transliteration_state_t prev_state) {
    log_debug("str = %s, index = %zu, char_len=%zu\n", str, index, len);
    log_debug("prev_state.result.node_id=%d, prev_state.in_set=%d\n", prev_state.result.node_id, prev_state.in_set);

    char *current_char = str + index;
    char_set_result_t lookup = next_prefix_or_set(trie, current_char, len, prev_state.result, prev_state.in_set, false);

    log_debug("char_result.type = %d\n", lookup.type);

    const bool is_context = true;
    return state_from_char_result(lookup, index, len, prev_state, is_context);
}
253 
254 
255 
// If state is a partial match whose key can be completed at the current
// trie position, record it in *match_state as a full match (result, state
// and phrase bounds). Otherwise *match_state is left unchanged.
static inline void set_match_if_any(trie_t *trie, transliteration_state_t state, transliteration_state_t *match_state) {
    if (state.state == TRANS_STATE_PARTIAL_MATCH) {
        // Complete string: look up the one-byte empty string, i.e. its terminator
        trie_prefix_result_t terminal = trie_get_prefix_from_index(trie, "", 1, state.result.node_id, state.result.tail_pos);

        if (terminal.node_id == NULL_NODE_ID) {
            return;
        }

        match_state->phrase_len = state.phrase_len;
        match_state->phrase_start = state.phrase_start;
        match_state->state = TRANS_STATE_MATCH;
        match_state->result = terminal;
    }
}
270 
271 
// Match the pre-context of a rule by walking backwards through str from the
// start of the phrase recorded in original_state.
//
// Characters are decoded right-to-left and fed through the context state
// transition. Scanning stops at the first full match, when a character
// fails to match outside of a repeat, when a repeat ends on a character
// that matches, or when the beginning of the string is reached.
//
// Returns the last state reached: the matched state on success, otherwise
// whatever state the scan ended in (original_state if nothing was scanned).
static transliteration_state_t check_pre_context(trie_t *trie, char *str, transliteration_state_t original_state) {
    size_t start_index = original_state.phrase_start;
    int32_t ch = 0;
    size_t idx = start_index;
    ssize_t char_len = 0;

    bool in_repeat = false;

    transliteration_state_t prev_state = original_state;
    transliteration_state_t state = original_state;

    // Save the end of the repeated state the first time through.
    // Initialized so that it never holds indeterminate values, matching
    // what replace_groups does with its own repeat_state_end.
    transliteration_state_t repeat_state_end = TRANSLITERATION_DEFAULT_STATE;

    transliteration_state_t match_state = TRANSLITERATION_DEFAULT_STATE;

    log_debug("start_index=%zu, str=%s\n", start_index, str);

    while (idx > 0) {
        // Decode the character that ends just before idx
        char_len = utf8proc_iterate_reversed((uint8_t *)str, idx, &ch);

        if (char_len <= 0) {
            break;
        }

        if (!utf8proc_codepoint_valid(ch)) {
            idx -= char_len;
            continue;
        }

        log_debug("In pre-context, got char %d, \"%.*s\"\n", ch, (int)char_len, str + idx - char_len);

        state = state_transition_context(trie, str, idx - char_len, char_len, prev_state);
        set_match_if_any(trie, state, &match_state);

        if (match_state.state == TRANS_STATE_MATCH) {
            log_debug("pre-context TRANS_STATE_MATCH\n");
            state = match_state;
            break;
        } else if (state.state == TRANS_STATE_BEGIN && !in_repeat) {
            log_debug("pre-context TRANS_STATE_BEGIN and not in repeat\n");
            // Report the last partial match rather than the failed state
            if (prev_state.state == TRANS_STATE_PARTIAL_MATCH) {
                state = prev_state;
            }
            break;
        } else if (state.repeat) {
            log_debug("pre-context in repeat\n");
            in_repeat = true;
            repeat_state_end = state;
            // Keep matching from the state before the repeat
            state.advance_state = false;
        } else if (state.empty_transition) {
            log_debug("pre-context empty_transition\n");
            // An empty transition consumes no input
            state.advance_index = false;
            if (in_repeat) {
                log_debug("empty_transition in repeat\n");
                prev_state = repeat_state_end;
                state.advance_state = false;
                in_repeat = false;
            }
        // If we're repeating e.g. "[abcd]+e", when we hit the "e" or another character, stop repeating and try from the end of the block
        } else if (state.state == TRANS_STATE_BEGIN && in_repeat) {
            log_debug("pre-context stop repeat\n");
            prev_state = repeat_state_end;
            in_repeat = false;
            state.advance_index = false;
            state.advance_state = false;
        } else if (in_repeat) {
            log_debug("end repeat\n");
            log_debug("state.state==%d, state.result.node_id=%d, repeat_state_end.result.node_id=%d\n", state.state, state.result.node_id, repeat_state_end.result.node_id);
            in_repeat = false;
            break;
        }

        if (state.advance_index) {
            idx -= char_len;
        }

        if (state.advance_state) {
            prev_state = state;
        }

    }

    return state;
}
357 
// Match the post-context of a rule by walking forwards through str from the
// end of the phrase recorded in original_state.
//
// Characters are decoded left-to-right and fed through the context state
// transition. Scanning stops at the first full match, when a character
// fails to match outside of a repeat, when a repeat ends on a character
// that matches, or when the end of the string is reached.
//
// Returns the last state reached: the matched state on success, otherwise
// whatever state the scan ended in (original_state if nothing was scanned).
static transliteration_state_t check_post_context(trie_t *trie, char *str, transliteration_state_t original_state) {
    size_t index = original_state.phrase_start + original_state.phrase_len;
    size_t str_len = strlen(str);

    // Nothing follows the phrase. Checking this first also keeps the length
    // computation below from wrapping around.
    if (index >= str_len) {
        return original_state;
    }

    uint8_t *ptr = (uint8_t *)str + index;
    size_t len = str_len - index;
    int32_t ch = 0;
    size_t idx = 0;
    ssize_t char_len = 0;

    bool in_repeat = false;

    transliteration_state_t prev_state = original_state;

    transliteration_state_t state = original_state;

    // Save the end of the repeated state the first time through.
    // Initialized so that it never holds indeterminate values.
    transliteration_state_t repeat_state_end = TRANSLITERATION_DEFAULT_STATE;

    transliteration_state_t match_state = TRANSLITERATION_DEFAULT_STATE;

    while (idx < len) {
        // ptr has already moved idx bytes forward, so only len - idx bytes remain
        char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);

        if (char_len <= 0) {
            break;
        }

        if (!utf8proc_codepoint_valid(ch)) {
            idx += char_len;
            ptr += char_len;
            continue;
        }

        log_debug("In post-context, got char \"%.*s\"\n", (int)char_len, str + index + idx);

        state = state_transition_context(trie, str, index + idx, char_len, prev_state);
        set_match_if_any(trie, state, &match_state);

        if (match_state.state == TRANS_STATE_MATCH) {
            log_debug("post-context TRANS_STATE_MATCH\n");
            state = match_state;
            break;
        } else if (state.state == TRANS_STATE_BEGIN && !in_repeat) {
            log_debug("post-context TRANS_STATE_BEGIN and not in repeat\n");
            break;
        } else if (state.repeat) {
            log_debug("post-context in repeat\n");
            in_repeat = true;
            repeat_state_end = state;
            // Keep matching from the state before the repeat
            state.advance_state = false;
        } else if (state.empty_transition) {
            log_debug("post-context empty_transition\n");
            // An empty transition consumes no input
            state.advance_index = false;
            if (in_repeat) {
                log_debug("empty_transition in repeat\n");
                prev_state = repeat_state_end;
                state.advance_state = false;
                in_repeat = false;
            }
        // If we're repeating e.g. "[abcd]+e", when we hit the "e" or another character, stop repeating and try from the end of the block
        } else if (state.state == TRANS_STATE_BEGIN && in_repeat) {
            log_debug("post-context stop repeat\n");
            prev_state = repeat_state_end;
            in_repeat = false;
            state.advance_index = false;
            state.advance_state = false;
        } else if (in_repeat) {
            log_debug("end repeat\n");
            in_repeat = false;
            break;
        }

        if (state.advance_index) {
            idx += char_len;
            ptr += char_len;
        }

        if (state.advance_state) {
            prev_state = state;
        }

    }

    return state;
}
442 
context_match(trie_t * trie,char * str,transliteration_state_t original_state)443 static trie_prefix_result_t context_match(trie_t *trie, char *str, transliteration_state_t original_state) {
444     trie_prefix_result_t prev_result = original_state.result;
445     transliteration_state_t state = TRANSLITERATION_DEFAULT_STATE;
446     transliteration_state_t prev_state = original_state;
447     trie_prefix_result_t result = trie_get_prefix_from_index(trie, PRE_CONTEXT_CHAR, PRE_CONTEXT_CHAR_LEN, prev_result.node_id, prev_result.tail_pos);
448 
449     log_debug("phrase_start=%zd, phrase_len=%zu\n", original_state.phrase_start, original_state.phrase_len);
450 
451     if (result.node_id != NULL_NODE_ID) {
452         prev_state.result = result;
453         log_debug("Have pre_context\n");
454         state = check_pre_context(trie, str, prev_state);
455 
456         if (state.state == TRANS_STATE_MATCH && state.result.node_id != prev_state.result.node_id) {
457             return state.result;
458         }
459 
460         if (state.state == TRANS_STATE_PARTIAL_MATCH && state.result.node_id != prev_state.result.node_id) {
461             log_debug("Pre-context partial match\n");
462         }
463 
464         prev_result = state.result;
465         prev_state = state;
466     }
467 
468     result = trie_get_prefix_from_index(trie, POST_CONTEXT_CHAR, POST_CONTEXT_CHAR_LEN, prev_result.node_id, prev_result.tail_pos);
469     if (result.node_id != NULL_NODE_ID) {
470         prev_state.result = result;
471         log_debug("Have post_context\n");
472         state = check_post_context(trie, str, prev_state);
473         if (state.state == TRANS_STATE_MATCH && state.result.node_id != prev_state.result.node_id) {
474             return state.result;
475         }
476     }
477 
478     log_debug("Failed to match context\n");
479     return NULL_PREFIX_RESULT;
480 }
481 
// Expand group references in a replacement template.
//
//   trie            rule trie used to re-walk the matched phrase
//   str             the string being transliterated
//   replacement     template; GROUP_INDICATOR_CODEPOINT followed by a decimal
//                   number N is replaced with the N-th captured group (1-based)
//   groups          capture descriptors (start and length, counted in characters)
//   original_state  state of the match; supplies phrase_start and phrase_len
//
// The first pass walks the matched phrase and collects the text of every
// group. The second pass copies the template, substituting the references.
// A reference to a group that was not captured expands to nothing, and an
// indicator that is not followed by a positive number is dropped while the
// characters after it are copied as-is.
//
// Returns the expanded string as produced by char_array_to_string()
// (presumably heap-allocated and owned by the caller -- confirm), or NULL if
// there are no groups, an argument is NULL, or an allocation fails.
static char *replace_groups(trie_t *trie, char *str, char *replacement, group_capture_array *groups, transliteration_state_t original_state) {
    if (str == NULL || replacement == NULL || groups == NULL) {
        return NULL;
    }

    size_t idx = 0;

    int32_t ch = 0;
    ssize_t char_len = 0;
    uint8_t *ptr = (uint8_t *)str + original_state.phrase_start;

    log_debug("str=%s\n", (char *)ptr);

    size_t len = original_state.phrase_len;

    log_debug("phrase_start = %zd, phrase_len = %zu\n", original_state.phrase_start, original_state.phrase_len);

    size_t num_groups = groups->n;

    log_debug("num_groups = %zu\n", num_groups);

    if (num_groups == 0) {
        return NULL;
    }

    cstring_array *group_strings = cstring_array_new_size(num_groups);
    if (group_strings == NULL) {
        return NULL;
    }

    log_debug("Created arrays\n");

    transliteration_state_t state = original_state;
    transliteration_state_t prev_state = original_state;

    transliteration_state_t repeat_state_end = TRANSLITERATION_DEFAULT_STATE;

    size_t group_num = 0;
    group_capture_t group = groups->a[group_num];

    log_debug("group = {%zu, %zu}\n", group.start, group.len);

    bool in_group = false;
    bool in_repeat = false;

    size_t group_start = 0;
    size_t group_len = 0;

    log_debug("group now {%zu, %zu}\n", group_start, group_len);

    size_t num_chars = 0;

    // Pass 1: walk the phrase and collect the text of each group
    while (idx < len) {
        // ptr has already moved idx bytes forward, so only len - idx bytes remain
        char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);

        if (char_len <= 0) {
            break;
        }

        log_debug("Got char '%.*s' at idx=%zu, len=%zd\n", (int)char_len, ptr, idx, char_len);

        if (!(utf8proc_codepoint_valid(ch))) {
            log_warn("Invalid codepoint: %d\n", ch);
            // Step over the character; without this the loop never terminates
            ptr += char_len;
            idx += char_len;
            continue;
        }

        // NOTE(review): idx is relative to phrase_start here, while
        // state_transition() reads from str + index. Left unchanged; confirm
        // whether phrase_start + idx was intended.
        state = state_transition(trie, str, idx, char_len, prev_state);

        if (state.state == TRANS_STATE_BEGIN && !in_repeat) {
            log_debug("Normal char: %.*s\n", (int)char_len, ptr);
            prev_state = original_state;
        } else if (state.repeat) {
            log_debug("state.repeat\n");
            in_repeat = true;
            repeat_state_end = state;
            state.advance_state = false;
        } else if (state.empty_transition) {
            log_debug("state.empty_transition\n");
            state.advance_index = false;
            num_chars++;
        } else if (state.state == TRANS_STATE_BEGIN && in_repeat && state.result.node_id == repeat_state_end.result.node_id) {
            log_debug("end of repeat\n");
            prev_state = repeat_state_end;
            state.advance_index = false;
            state.advance_state = false;
        } else if (in_repeat) {
            log_debug("in repeat\n");
            in_repeat = false;
            state.advance_index = false;
            state.advance_state = false;
        }

        if (state.advance_index) {
            if (num_chars == group.start) {
                log_debug("Starting group\n");
                in_group = true;
                group_start = idx;
                log_debug("group_start = %zu\n", group_start);
            }

            if (in_group) {
                log_debug("In group\n");
                group_len += char_len;

                log_debug("group_len=%zu\n", group_len);
                log_debug("group.start + group.len = %zu\n", group.start + group.len);
                if (num_chars == group.start + group.len - 1) {
                    in_group = false;
                    log_debug("adding group str %.*s\n", (int)group_len, str + original_state.phrase_start + group_start);
                    cstring_array_add_string_len(group_strings, str + original_state.phrase_start + group_start, group_len);
                    if (group_num < num_groups - 1) {
                        group_num++;
                        log_debug("group_num=%zu\n", group_num);
                        group = groups->a[group_num];
                        group_len = 0;
                    }
                    state = TRANSLITERATION_DEFAULT_STATE;
                }
            }

        }

        if (state.advance_index) {
            ptr += char_len;
            idx += char_len;
            num_chars++;
            log_debug("num_chars = %zu\n", num_chars);
        }

        if (state.advance_state) {
            prev_state = state;
        }

    }

    // Pass 2: copy the template, substituting the group references
    log_debug("Doing replacements\n");

    size_t replacement_len = strlen(replacement);

    log_debug("replacement = %s, len = %zu\n", replacement, replacement_len);

    char_array *ret = char_array_new_size(replacement_len);
    if (ret == NULL) {
        cstring_array_destroy(group_strings);
        return NULL;
    }

    uint8_t *replacement_ptr = (uint8_t *)replacement;
    bool in_group_ref = false;

    idx = 0;

    while (idx < replacement_len) {
        char_len = utf8proc_iterate(replacement_ptr, (ssize_t)(replacement_len - idx), &ch);

        if (char_len <= 0) {
            // Invalid UTF-8 in the template: treat the byte as an opaque
            // one-byte character so that it is copied through unchanged
            char_len = 1;
            ch = -1;
        }

        if (ch == GROUP_INDICATOR_CODEPOINT) {
            log_debug("start group ref\n");
            in_group_ref = true;
            idx += char_len;
            replacement_ptr += char_len;
            continue;
        }

        if (in_group_ref) {
            log_debug("in group ref\n");
            in_group_ref = false;

            // Parse the decimal group number by hand. Once the value exceeds
            // num_groups it can no longer name a group, so accumulation stops
            // there (which also rules out overflow) while the remaining
            // digits are still consumed.
            size_t group_ref = 0;
            size_t num_digits = 0;
            while (idx + num_digits < replacement_len) {
                uint8_t c = replacement_ptr[num_digits];
                if (c < '0' || c > '9') {
                    break;
                }
                if (group_ref <= num_groups) {
                    group_ref = group_ref * 10 + (size_t)(c - '0');
                }
                num_digits++;
            }

            log_debug("Got group_ref=%d\n", (int)group_ref);

            if (group_ref > 0) {
                char *group_str = NULL;
                if (group_ref <= num_groups) {
                    group_str = cstring_array_get_string(group_strings, (uint32_t)(group_ref - 1));
                }
                if (group_str != NULL) {
                    log_debug("Got group=%s\n", group_str);
                    char_array_cat(ret, group_str);
                }
                log_debug("Did cat\n");

                // Skip the digits of the reference
                idx += num_digits;
                replacement_ptr += num_digits;
            }
            // With no positive number after the indicator nothing is consumed
            // here; the following characters are copied by the branch below
        } else {
            log_debug("ptr=%.*s\n", (int)char_len, replacement_ptr);
            char_array_cat_len(ret, (char *)replacement_ptr, char_len);
            idx += char_len;
            replacement_ptr += char_len;
        }
    }

    cstring_array_destroy(group_strings);
    return char_array_to_string(ret);
}
668 
transliterate(char * trans_name,char * str,size_t len)669 char *transliterate(char *trans_name, char *str, size_t len) {
670     if (trans_name == NULL || str == NULL) return NULL;
671 
672     transliteration_table_t *trans_table = get_transliteration_table();
673 
674     if (trans_table == NULL) {
675         log_error("transliteration table is NULL. Call libpostal_setup() or transliteration_module_setup()\n");
676         return NULL;
677     }
678 
679     trie_t *trie = trans_table->trie;
680 
681     if (trie == NULL) {
682         log_warn("transliteration table not initialized\n");
683         return NULL;
684     }
685 
686     log_debug("len = %zu\n", len);
687 
688     str = strndup(str, len);
689 
690     bool allocated_trans_name = false;
691 
692     if (!string_is_lower(trans_name)) {
693         trans_name = strdup(trans_name);
694 
695         // Transliterator names are ASCII strings, so this is fine
696         string_lower(trans_name);
697         allocated_trans_name = true;
698     }
699 
700     log_debug("lower = %s\n", trans_name);
701 
702     transliterator_t *transliterator = get_transliterator(trans_name);
703     if (transliterator == NULL) {
704         log_warn("transliterator \"%s\" does not exist\n", trans_name);
705         if (allocated_trans_name) free(trans_name);
706         free(str);
707         return NULL;
708     }
709 
710     log_debug("got transliterator\n");
711 
712     trie_prefix_result_t result = trie_get_prefix(trie, trans_name);
713 
714     log_debug("result = {%d, %zu}\n", result.node_id, result.tail_pos);
715 
716     uint32_t trans_node_id = result.node_id;
717 
718     if (allocated_trans_name) free(trans_name);
719 
720     result = trie_get_prefix_from_index(trans_table->trie, NAMESPACE_SEPARATOR_CHAR, NAMESPACE_SEPARATOR_CHAR_LEN, result.node_id, result.tail_pos);
721 
722     trans_node_id = result.node_id;
723 
724     trie_prefix_result_t trans_result = result;
725 
726     log_debug("trans_node_id = %d\n", trans_node_id);
727 
728     transliteration_step_t *step;
729     char *step_name;
730 
731     char_array *new_str = NULL;
732 
733     for (uint32_t i = transliterator->steps_index; i < transliterator->steps_index + transliterator->steps_length; i++) {
734         step = trans_table->steps->a[i];
735         step_name = step->name;
736         if (step->type == STEP_RULESET && trans_node_id == NULL_NODE_ID) {
737             log_warn("transliterator \"%s\" does not exist in trie\n", trans_name);
738             free(str);
739             return NULL;
740         }
741 
742         if (step->type == STEP_RULESET) {
743             log_debug("ruleset\n");
744             result = trie_get_prefix_from_index(trie, step_name, strlen(step_name), trans_result.node_id, trans_result.tail_pos);
745             uint32_t step_node_id = result.node_id;
746 
747             if (step_node_id == NULL_NODE_ID) {
748                 log_warn("transliterator step \"%s\" does not exist\n", step_name);
749                 free(str);
750                 return NULL;
751             }
752 
753             result = trie_get_prefix_from_index(trie, NAMESPACE_SEPARATOR_CHAR, NAMESPACE_SEPARATOR_CHAR_LEN, result.node_id, result.tail_pos);
754             step_node_id = result.node_id;
755 
756             log_debug("step_node_id = %d\n", step_node_id);
757 
758             trie_prefix_result_t step_result = result;
759             trie_prefix_result_t context_result = NULL_PREFIX_RESULT;
760 
761             new_str = char_array_new_size(len);
762 
763             transliteration_state_t state = TRANSLITERATION_DEFAULT_STATE;
764 
765             transliteration_state_t start_state = TRANSLITERATION_DEFAULT_STATE;
766             start_state.result = step_result;
767 
768             transliteration_state_t prev_state = start_state;
769             transliteration_state_t prev2_state = start_state;
770 
771             transliteration_state_t repeat_state_end = start_state;
772 
773             bool in_repeat = false;
774 
775             int32_t ch = 0;
776             ssize_t char_len = 0;
777             uint8_t *ptr = (uint8_t *)str;
778             size_t idx = 0;
779 
780             char *original_str = str;
781             char_array *revisit = NULL;
782 
783             transliteration_replacement_t *replacement = NULL;
784 
785             transliteration_state_t match_state = TRANSLITERATION_DEFAULT_STATE;
786 
787             while (idx < len) {
788                 log_debug("idx=%zu, ptr=%s\n", idx, ptr);
789                 char_len = utf8proc_iterate(ptr, len, &ch);
790                 if (char_len == UTF8PROC_ERROR_INVALIDUTF8) {
791                     log_warn("invalid UTF-8\n");
792                     char_len = 1;
793                     ch = (int32_t)*ptr;
794                 } else if (char_len <= 0) {
795                     log_warn("char_len=%zd at idx=%zu\n", char_len, idx);
796                     free(trans_name);
797                     free(str);
798                     return NULL;
799                 }
800 
801                 if (!(utf8proc_codepoint_valid(ch))) {
802                     log_warn("Invalid codepoint: %d\n", ch);
803                     idx += char_len;
804                     ptr += char_len;
805                     continue;
806                 }
807 
808                 if (ch == 0) break;
809 
810                 log_debug("Got char '%.*s' at idx=%zu, prev_state.state=%d\n", (int)char_len, str + idx, idx, prev_state.state);
811 
812                 state = state_transition(trie, str, idx, char_len, prev_state);
813                 set_match_if_any(trie, state, &match_state);
814 
815                 replacement = NULL;
816 
817                 if ((state.state == TRANS_STATE_BEGIN && prev_state.state == TRANS_STATE_PARTIAL_MATCH) ||
818                     (state.state == TRANS_STATE_PARTIAL_MATCH && idx + char_len == len)) {
819 
820                     log_debug("end of partial or last char, prev start=%zd, prev len=%zu\n", prev_state.phrase_start, prev_state.phrase_len);
821 
822                     bool context_no_match = false;
823 
824                     bool is_last_char = idx + char_len == len;
825 
826                     transliteration_state_t match_candidate_state = state.state == TRANS_STATE_PARTIAL_MATCH ? state : prev_state;
827                     if (state.state == TRANS_STATE_PARTIAL_MATCH) {
828                         log_debug("state.state == TRANS_STATE_PARTIAL_MATCH\n");
829                     }
830 
831                     context_result = context_match(trie, str, match_candidate_state);
832 
833                     if (context_result.node_id != NULL_NODE_ID) {
834                         log_debug("Context match\n");
835                         match_state = match_candidate_state;
836                         match_state.state = TRANS_STATE_MATCH;
837                         replacement = get_replacement(trie, context_result, str, match_state.phrase_start);
838                     } else {
839                         if (match_state.state == TRANS_STATE_MATCH) {
840                             log_debug("Context no match and previous match\n");
841                             replacement = get_replacement(trie, match_state.result, str, match_state.phrase_start);
842                             if (state.state != TRANS_STATE_PARTIAL_MATCH) {
843                                 state.advance_index = false;
844                             }
845                         } else {
846                             log_debug("Checking for no-context match\n");
847                             set_match_if_any(trie, match_candidate_state, &match_state);
848                             if (match_state.state != TRANS_STATE_MATCH && !match_candidate_state.in_set) {
849                                 log_debug("Trying set for match candidate\n");
850 
851                                 transliteration_state_t match_prev_state = !is_last_char ? prev2_state : prev_state;
852 
853                                 log_debug("idx = %zu, match_candidate_state.char_len = %zu\n", idx, match_candidate_state.char_len);
854 
855                                 char_set_result_t char_result = next_prefix_or_set(trie, str + idx, match_candidate_state.char_len, match_prev_state.result, false, true);
856                                 log_debug("char_result.type = %d\n", char_result.type);
857                                 bool is_context = false;
858 
859                                 match_candidate_state = state_from_char_result(char_result, idx, match_candidate_state.char_len, match_prev_state, is_context);
860                                 if (match_candidate_state.state == TRANS_STATE_PARTIAL_MATCH) {
861                                     log_debug("Got partial match for set check\n");
862                                     set_match_if_any(trie, match_candidate_state, &match_state);
863                                     if (match_state.state != TRANS_STATE_MATCH && !match_candidate_state.empty_transition) {
864                                         log_debug("match_state.state != TRANS_STATE_MATCH && !match_candidate_state.empty_transition\n");
865                                         prev_state = match_candidate_state;
866                                     }
867                                 }
868                             }
869 
870                             if (match_state.state == TRANS_STATE_MATCH) {
871                                 log_debug("Match no context\n");
872                                 replacement = get_replacement(trie, match_state.result, str, match_state.phrase_start);
873                             } else {
874 
875                                 log_debug("Tried context for %s at char '%.*s', no match\n", str, (int)char_len, ptr);
876                                 context_no_match = true;
877                             }
878                         }
879 
880                     }
881 
882                     if (replacement != NULL) {
883                         char *replacement_string = cstring_array_get_string(trans_table->replacement_strings, replacement->string_index);
884                         char *revisit_string = NULL;
885                         if (replacement->revisit_index != 0) {
886                             log_debug("revisit_index = %d\n", replacement->revisit_index);
887                             revisit_string = cstring_array_get_string(trans_table->revisit_strings, replacement->revisit_index);
888                         }
889 
890                         bool free_revisit = false;
891                         bool free_replacement = false;
892 
893                         if (replacement->groups != NULL) {
894                             log_debug("Did groups, str=%s\n", str);
895                             replacement_string = replace_groups(trie, str, replacement_string, replacement->groups, match_state);
896                             free_replacement = (replacement_string != NULL);
897                             if (revisit_string != NULL) {
898                                 log_debug("===Doing revisit\n");
899                                 revisit_string = replace_groups(trie, str, revisit_string, replacement->groups, match_state);
900                                 free_revisit = (revisit_string != NULL);
901                             }
902                         }
903 
904                         if (revisit_string != NULL) {
905                             log_debug("revisit_string not null, %s\n", revisit_string);
906                             size_t revisit_size = strlen(revisit_string) + len - idx;
907                             if (revisit == NULL) {
908                                 revisit = char_array_new_size(revisit_size + 1);
909                             } else {
910                                 log_debug("revisit not null\n");
911                                 char_array_clear(revisit);
912                             }
913 
914                             char_array_cat(revisit, revisit_string);
915                             char_array_cat_len(revisit, str + idx, len - idx);
916 
917                             idx = 0;
918                             len = revisit_size;
919                             str = char_array_get_string(revisit);
920                             ptr = (uint8_t *)str;
921                             log_debug("Switching to revisit=%s, size=%zu\n", str, revisit_size);
922                         }
923 
924                         char_array_cat(new_str, replacement_string);
925                         log_debug("Replacement = %s, revisit = %s\n", replacement_string, revisit_string);
926 
927                         if (free_replacement) {
928                             free(replacement_string);
929                         }
930                         if (free_revisit) {
931                             free(revisit_string);
932                         }
933 
934                         match_state = TRANSLITERATION_DEFAULT_STATE;
935                     }
936 
937 
938                     if (context_no_match && !prev_state.empty_transition && prev_state.phrase_len > 0) {
939                         log_debug("Previous phrase stays as is %.*s\n", (int)prev_state.phrase_len, str+prev_state.phrase_start);
940                         char_array_cat_len(new_str, str + prev_state.phrase_start, prev_state.phrase_len);
941                         state = start_state;
942                     }
943 
944                     if (state.state == TRANS_STATE_BEGIN && !prev_state.empty_transition) {
945                         log_debug("TRANS_STATE_BEGIN && !prev_state.empty_transition\n");
946                         state.advance_index = false;
947                     } else if (prev_state.empty_transition) {
948                         log_debug("No replacement for %.*s\n", (int)char_len, ptr);
949                         char_array_cat_len(new_str, str + idx, char_len);
950                     }
951 
952                     state.advance_state = false;
953                     prev_state = start_state;
954                 } else if (state.state == TRANS_STATE_BEGIN && !in_repeat) {
955                     log_debug("No replacement for %.*s\n", (int)char_len, ptr);
956                     char_array_cat_len(new_str, str + idx, char_len);
957                     prev_state = start_state;
958                     state.advance_state = false;
959                 } else if (state.repeat) {
960                     log_debug("state.repeat\n");
961                     in_repeat = true;
962                     repeat_state_end = state;
963                     state.advance_state = false;
964                 } else if (state.empty_transition) {
965                     log_debug("state.empty_transition\n");
966                     state.advance_index = false;
967                 } else if (state.state == TRANS_STATE_BEGIN && in_repeat && state.result.node_id == repeat_state_end.result.node_id) {
968                     prev_state = repeat_state_end;
969                     state.advance_index = false;
970                     state.advance_state = false;
971                 } else if (in_repeat) {
972                     in_repeat = false;
973                     state.advance_index = false;
974                     state.advance_state = false;
975                 }
976 
977                 log_debug("state.phrase_start = %zd, state.phrase_len=%zu\n", state.phrase_start, state.phrase_len);
978                 if (state.advance_index) {
979                     ptr += char_len;
980                     idx += char_len;
981                 }
982 
983                 if (state.advance_state) {
984                     prev2_state = prev_state;
985                     prev_state = state;
986                 }
987 
988             }
989 
990             if (revisit != NULL) {
991                 char_array_destroy(revisit);
992             }
993 
994             log_debug("original_str=%s\n", original_str);
995 
996             free(original_str);
997 
998             str = char_array_to_string(new_str);
999 
1000             log_debug("new_str = %s\n", str);
1001 
1002         } else if (step->type == STEP_UNICODE_NORMALIZATION) {
1003             log_debug("unicode normalization\n");
1004             int utf8proc_options = UTF8PROC_OPTIONS_BASE;
1005             if (string_equals(step->name, NFD)) {
1006                 utf8proc_options = UTF8PROC_OPTIONS_NFD;
1007             } else if (string_equals(step->name, NFC)) {
1008                 utf8proc_options = UTF8PROC_OPTIONS_NFC;
1009             } else if (string_equals(step->name, NFKD)) {
1010                 utf8proc_options = UTF8PROC_OPTIONS_NFKD;
1011             } else if (string_equals(step->name, NFKC)) {
1012                 utf8proc_options = UTF8PROC_OPTIONS_NFKC;
1013             } else if (string_equals(step->name, STRIP_MARK)) {
1014                 utf8proc_options = UTF8PROC_OPTIONS_STRIP_ACCENTS;
1015             }
1016 
1017             uint8_t *utf8proc_normalized = NULL;
1018             utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options);
1019             if (utf8proc_normalized != NULL) {
1020                 char *old_str = str;
1021                 str = (char *)utf8proc_normalized;
1022                 log_debug("utf8proc_normalized=%s\n", utf8proc_normalized);
1023                 len = strlen(str);
1024                 free(old_str);
1025             }
1026             log_debug("Got unicode normalization step, new str=%s, len=%lu\n", str, strlen(str));
1027         } else if (step->type == STEP_TRANSFORM) {
1028             // Recursive call here shouldn't hurt too much, happens in only a few languages and only 2-3 calls deep
1029             log_debug("Got STEP_TYPE_TRANSFORM, step=%s\n", step_name);
1030             char *old_str = str;
1031             str = transliterate(step_name, str, strlen(str));
1032             log_debug("Transform result = %s\n", str);
1033             log_debug("str = %s\n", str);
1034             len = strlen(str);
1035             free(old_str);
1036         }
1037 
1038     }
1039 
1040     return str;
1041 
1042 }
1043 
transliteration_table_destroy(void)1044 void transliteration_table_destroy(void) {
1045     transliteration_table_t *trans_table = get_transliteration_table();
1046     if (trans_table == NULL) return;
1047     if (trans_table->trie) {
1048         trie_destroy(trans_table->trie);
1049     }
1050 
1051     if (trans_table->transliterators) {
1052         transliterator_t *trans;
1053         kh_foreach_value(trans_table->transliterators, trans, {
1054             transliterator_destroy(trans);
1055         })
1056 
1057         kh_destroy(str_transliterator, trans_table->transliterators);
1058     }
1059 
1060     if (trans_table->script_languages) {
1061         kh_destroy(script_language_index, trans_table->script_languages);
1062     }
1063 
1064     if (trans_table->transliterator_names) {
1065         cstring_array_destroy(trans_table->transliterator_names);
1066     }
1067 
1068     if (trans_table->steps) {
1069         step_array_destroy(trans_table->steps);
1070     }
1071 
1072     if (trans_table->replacements) {
1073         transliteration_replacement_array_destroy(trans_table->replacements);
1074     }
1075 
1076     if (trans_table->replacement_strings) {
1077         cstring_array_destroy(trans_table->replacement_strings);
1078     }
1079 
1080     if (trans_table->revisit_strings) {
1081         cstring_array_destroy(trans_table->revisit_strings);
1082     }
1083 
1084     free(trans_table);
1085 }
1086 
1087 
transliteration_table_init(void)1088 transliteration_table_t *transliteration_table_init(void) {
1089     transliteration_table_t *trans_table = get_transliteration_table();
1090 
1091     if (trans_table == NULL) {
1092         trans_table = calloc(1, sizeof(transliteration_table_t));
1093 
1094         trans_table->trie = trie_new();
1095         if (trans_table->trie == NULL) {
1096             goto exit_trans_table_created;
1097         }
1098 
1099         trans_table->transliterators = kh_init(str_transliterator);
1100         if (trans_table->transliterators == NULL) {
1101             goto exit_trans_table_created;
1102         }
1103 
1104         trans_table->script_languages = kh_init(script_language_index);
1105         if (trans_table->script_languages == NULL) {
1106             goto exit_trans_table_created;
1107         }
1108 
1109         trans_table->transliterator_names = cstring_array_new();
1110         if (trans_table->transliterator_names == NULL) {
1111             goto exit_trans_table_created;
1112         }
1113 
1114         trans_table->steps = step_array_new();
1115         if (trans_table->steps == NULL) {
1116             goto exit_trans_table_created;
1117         }
1118 
1119         trans_table->replacements = transliteration_replacement_array_new();
1120         if (trans_table->replacements == NULL) {
1121             goto exit_trans_table_created;
1122         }
1123 
1124         trans_table->replacement_strings = cstring_array_new();
1125         if (trans_table->replacement_strings == NULL) {
1126             goto exit_trans_table_created;
1127         }
1128 
1129         trans_table->revisit_strings = cstring_array_new();
1130         if (trans_table->revisit_strings == NULL) {
1131             goto exit_trans_table_created;
1132         }
1133 
1134     }
1135 
1136     return trans_table;
1137 
1138 exit_trans_table_created:
1139    transliteration_table_destroy();
1140    exit(1);
1141 }
1142 
transliteration_table_new(void)1143 transliteration_table_t *transliteration_table_new(void) {
1144     transliteration_table_t *trans_table = transliteration_table_init();
1145     if (trans_table != NULL) {
1146         cstring_array_add_string(trans_table->replacement_strings, "");
1147         cstring_array_add_string(trans_table->revisit_strings, "");
1148     }
1149     return trans_table;
1150 }
1151 
transliteration_step_new(char * name,step_type_t type)1152 transliteration_step_t *transliteration_step_new(char *name, step_type_t type) {
1153     transliteration_step_t *self = malloc(sizeof(transliteration_step_t));
1154 
1155     if (self == NULL) {
1156         return NULL;
1157     }
1158 
1159     self->name = strdup(name);
1160     if (self->name == NULL) {
1161         transliteration_step_destroy(self);
1162     }
1163 
1164     self->type = type;
1165     return self;
1166 }
1167 
1168 
/*
 * Free a step and the name string it owns. Passing NULL is allowed and does
 * nothing.
 */
void transliteration_step_destroy(transliteration_step_t *self) {
    if (self != NULL) {
        // free(NULL) is a no-op, so a step without a name needs no special case
        free(self->name);
        free(self);
    }
}
1180 
1181 
transliteration_replacement_new(uint32_t string_index,uint32_t revisit_index,group_capture_array * groups)1182 transliteration_replacement_t *transliteration_replacement_new(uint32_t string_index, uint32_t revisit_index, group_capture_array *groups) {
1183     transliteration_replacement_t *replacement = malloc(sizeof(transliteration_replacement_t));
1184 
1185     if (replacement == NULL) {
1186         return NULL;
1187     }
1188 
1189     replacement->num_groups = groups == NULL ? 0 : groups->n;
1190     replacement->groups = groups;
1191 
1192     replacement->string_index = string_index;
1193     replacement->revisit_index = revisit_index;
1194     return replacement;
1195 
1196 }
1197 
/*
 * Free a replacement record and, when present, the capture-group array it
 * holds. Passing NULL is allowed and does nothing.
 */
void transliteration_replacement_destroy(transliteration_replacement_t *self) {
    if (self != NULL) {
        if (self->groups != NULL) {
            group_capture_array_destroy(self->groups);
        }

        free(self);
    }
}
1207 
/*
 * Register a transliterator in the table's name -> transliterator hash.
 *
 * trans: transliterator to store. trans->name is passed to kh_put() as the
 *        key and the pointer itself becomes the value, so the table refers
 *        to the caller's object rather than a copy.
 *
 * Returns false when no table is loaded, when `trans` or its name is NULL,
 * or when kh_put() reports failure (ret < 0); true otherwise.
 */
bool transliteration_table_add_transliterator(transliterator_t *trans) {
    // A NULL transliterator or name used to be dereferenced / hashed blindly
    if (trans_table == NULL || trans == NULL || trans->name == NULL) {
        return false;
    }

    int ret;
    khiter_t k = kh_put(str_transliterator, trans_table->transliterators, trans->name, &ret);
    if (ret < 0) {
        return false;
    }

    kh_value(trans_table->transliterators, k) = trans;
    return true;
}
1220 
/*
 * Map a (script, language) key to a transliterator index range in the
 * table's script_languages hash.
 *
 * Returns true once the value is stored; false when no table is loaded or
 * kh_put() reports failure (negative status).
 */
bool transliteration_table_add_script_language(script_language_t script_language, transliterator_index_t index) {
    if (trans_table != NULL) {
        int status;
        khiter_t slot = kh_put(script_language_index, trans_table->script_languages, script_language, &status);

        if (status >= 0) {
            kh_value(trans_table->script_languages, slot) = index;
            return true;
        }
    }

    return false;
}
1233 
get_transliterator_index_for_script_language(script_t script,char * language)1234 transliterator_index_t get_transliterator_index_for_script_language(script_t script, char *language) {
1235     if (trans_table == NULL || language == NULL || strlen(language) >= MAX_LANGUAGE_LEN) {
1236         return NULL_TRANSLITERATOR_INDEX;
1237     }
1238 
1239     script_language_t script_lang;
1240     script_lang.script = script;
1241     strcpy(script_lang.language, language);
1242 
1243     khiter_t k;
1244     k = kh_get(script_language_index, trans_table->script_languages, script_lang);
1245     return (k != kh_end(trans_table->script_languages)) ? kh_value(trans_table->script_languages, k) : NULL_TRANSLITERATOR_INDEX;
1246 }
1247 
1248 
/*
 * Repeatedly replace every phrase that trie_search() finds in the string
 * with the matching entry of `replacements`, until a round finds nothing.
 *
 * trie:         trie handed to trie_search().
 * replacements: string array indexed by each phrase's `data` field.
 * input:        NUL-terminated string to process; never modified or freed.
 *
 * Returns `input` itself when the first search finds no phrases; otherwise a
 * newly allocated string (obtained from char_array_to_string()) that the
 * caller must free. Intermediate strings from earlier rounds are freed here.
 *
 * The loop only ends when trie_search() returns NULL, so a replacement whose
 * text itself matches the trie will keep the loop running.
 *
 * NOTE(review): the `phrases` array returned by trie_search() is never
 * released in this function; if the caller owns it, it should be destroyed
 * once per round -- confirm against trie_search()'s contract.
 */
char *transliterator_replace_strings(trie_t *trie, cstring_array *replacements, char *input) {
    char *current = input;
    bool is_original = true;

    // We may go through several rounds of replacements
    while (1) {
        phrase_array *phrases = trie_search(trie, current);
        if (!phrases) {
            break;
        }

        // Phrase offsets refer to the string that was just searched, so all
        // slicing below is done on `current` with its own length
        size_t len = strlen(current);
        char_array *str = char_array_new_size(len);
        size_t start = 0;

        for (size_t i = 0; i < phrases->n; i++) {
            phrase_t phrase = phrases->a[i];

            // Untouched text between the previous phrase and this one
            char_array_append_len(str, current + start, phrase.start - start);
            char_array_append(str, cstring_array_get_string(replacements, phrase.data));
            start = phrase.start + phrase.len;
        }

        // Remainder after the last phrase (not from the phrase's own start,
        // which would re-emit the text that was just replaced)
        char_array_append_len(str, current + start, len - start);
        char_array_terminate(str);

        if (!is_original) {
            free(current);
        }

        // Destroys the char array itself, but not the string it holds
        current = char_array_to_string(str);
        is_original = false;
    }

    return current;
}
1290 
transliterator_read(FILE * f)1291 transliterator_t *transliterator_read(FILE *f) {
1292     uint64_t trans_name_len;
1293 
1294     if (!file_read_uint64(f, &trans_name_len)) {
1295         return NULL;
1296     }
1297 
1298     char *name = malloc(trans_name_len);
1299     if (name == NULL) {
1300         return NULL;
1301     }
1302 
1303     if (!file_read_chars(f, name, trans_name_len)) {
1304         return NULL;
1305     }
1306 
1307     bool internal;
1308     if (!file_read_uint8(f, (uint8_t *)&internal)) {
1309         return NULL;
1310     }
1311 
1312     uint32_t steps_index;
1313 
1314     if (!file_read_uint32(f, &steps_index)) {
1315         return NULL;
1316     }
1317 
1318 
1319     uint32_t steps_length;
1320 
1321     if (!file_read_uint32(f, &steps_length)) {
1322         return NULL;
1323     }
1324 
1325     transliterator_t *trans =  transliterator_new(name, internal, steps_index, steps_length);
1326     return trans;
1327 }
1328 
/*
 * Write one transliterator record to `f`: uint64 name length (counting the
 * terminating NUL), the name bytes, the uint8 `internal` flag, the uint32
 * steps index and the steps length narrowed to uint32.
 *
 * Returns true when every field was written; stops at the first failing
 * write and returns false.
 */
bool transliterator_write(transliterator_t *trans, FILE *f) {
    // Length on disk counts the terminating NUL
    size_t name_bytes = strlen(trans->name) + 1;

    return file_write_uint64(f, (uint64_t)name_bytes) &&
           file_write_chars(f, trans->name, name_bytes) &&
           file_write_uint8(f, trans->internal) &&
           file_write_uint32(f, trans->steps_index) &&
           file_write_uint32(f, (uint32_t)trans->steps_length);
}
1350 
transliteration_step_read(FILE * f)1351 transliteration_step_t *transliteration_step_read(FILE *f) {
1352     uint64_t step_name_len;
1353 
1354     log_debug("reading step\n");;
1355 
1356     transliteration_step_t *step = malloc(sizeof(transliteration_step_t));
1357     if (step == NULL) {
1358         return NULL;
1359     }
1360 
1361     if (!file_read_uint32(f, &step->type)) {
1362         goto exit_step_destroy;
1363     }
1364     if (!file_read_uint64(f, &step_name_len)) {
1365         goto exit_step_destroy;
1366     }
1367 
1368     char *name = malloc(step_name_len);
1369     if (name == NULL) {
1370         goto exit_step_destroy;
1371     }
1372 
1373     if (!file_read_chars(f, name, step_name_len)) {
1374         free(name);
1375         goto exit_step_destroy;
1376     }
1377     step->name = name;
1378 
1379     return step;
1380 
1381 exit_step_destroy:
1382     free(step);
1383     return NULL;
1384 }
1385 
/*
 * Write one transliteration step to `f`: uint32 step type, uint64 name
 * length, then the name bytes.
 *
 * Returns true when all three writes succeed, false at the first failure.
 */
bool transliteration_step_write(transliteration_step_t *step, FILE *f) {
    if (!file_write_uint32(f, step->type)) {
        return false;
    }

    // Include the NUL byte
    size_t name_bytes = strlen(step->name) + 1;

    bool wrote_length = file_write_uint64(f, (uint64_t)name_bytes);
    return wrote_length && file_write_chars(f, step->name, name_bytes);
}
1401 
/*
 * Read a capture group (two uint64 values: start, then length) from `f`
 * into `group`.
 *
 * Returns true when both values were read. On failure returns false; if only
 * the first read succeeded, group->start has already been overwritten.
 */
bool group_capture_read(FILE *f, group_capture_t *group) {
    uint64_t raw_start;
    uint64_t raw_len;

    if (file_read_uint64(f, &raw_start)) {
        group->start = (size_t)raw_start;

        if (file_read_uint64(f, &raw_len)) {
            group->len = (size_t)raw_len;
            return true;
        }
    }

    return false;
}
1418 
/*
 * Write a capture group to `f` as two uint64 values: start, then length.
 *
 * Returns true when both writes succeed; the length is not written if the
 * start fails.
 */
bool group_capture_write(group_capture_t group, FILE *f) {
    bool ok = file_write_uint64(f, (uint64_t)group.start);

    if (ok) {
        ok = file_write_uint64(f, (uint64_t)group.len);
    }

    return ok;
}
1427 
transliteration_replacement_read(FILE * f)1428 transliteration_replacement_t *transliteration_replacement_read(FILE *f) {
1429     uint32_t string_index;
1430 
1431     if (!file_read_uint32(f, &string_index)) {
1432         return NULL;
1433     }
1434 
1435     uint32_t revisit_index;
1436 
1437     if (!file_read_uint32(f, &revisit_index)) {
1438         return NULL;
1439     }
1440 
1441     uint64_t num_groups;
1442 
1443     if (!file_read_uint64(f, &num_groups)) {
1444         return NULL;
1445     }
1446 
1447     group_capture_array *groups = NULL;
1448 
1449     if (num_groups > 0) {
1450         groups = group_capture_array_new_size((size_t)num_groups);
1451         group_capture_t group;
1452         for (size_t i = 0; i < (size_t)num_groups; i++) {
1453             if (!group_capture_read(f, &group)) {
1454                 group_capture_array_destroy(groups);
1455                 return NULL;
1456             }
1457             group_capture_array_push(groups, group);
1458         }
1459 
1460     }
1461 
1462 
1463     return transliteration_replacement_new(string_index, revisit_index, groups);
1464 }
1465 
/*
 * Write one replacement record to `f`: uint32 string index, uint32 revisit
 * index, uint64 group count, then each capture group in array order.
 *
 * Returns true when everything was written, false at the first failing
 * write.
 */
bool transliteration_replacement_write(transliteration_replacement_t *replacement, FILE *f) {
    bool header_written = file_write_uint32(f, replacement->string_index) &&
                          file_write_uint32(f, replacement->revisit_index) &&
                          file_write_uint64(f, replacement->num_groups);
    if (!header_written) {
        return false;
    }

    size_t i = 0;
    while (i < replacement->num_groups) {
        // groups is only dereferenced when num_groups is non-zero
        group_capture_t current = replacement->groups->a[i];
        if (!group_capture_write(current, f)) {
            return false;
        }
        i++;
    }

    return true;
}
1491 
transliteration_table_read(FILE * f)1492 bool transliteration_table_read(FILE *f) {
1493     if (f == NULL) {
1494         return false;
1495     }
1496 
1497     uint32_t signature;
1498 
1499     log_debug("Reading signature\n");
1500 
1501     if (!file_read_uint32(f, &signature) || signature != TRANSLITERATION_TABLE_SIGNATURE) {
1502         return false;
1503     }
1504 
1505     trans_table = transliteration_table_init();
1506 
1507     log_debug("Table initialized\n");
1508 
1509     uint64_t num_transliterators = 0;
1510 
1511     if (!file_read_uint64(f, &num_transliterators)) {
1512         goto exit_trans_table_load_error;
1513     }
1514 
1515 
1516     log_debug("num_transliterators = %zu\n", (size_t)num_transliterators);
1517 
1518     size_t i;
1519 
1520     transliterator_t *trans;
1521 
1522     for (i = 0; i < (size_t)num_transliterators; i++) {
1523         trans = transliterator_read(f);
1524         if (trans == NULL) {
1525             log_error("trans was NULL\n");
1526             goto exit_trans_table_load_error;
1527         } else {
1528             log_debug("read trans with name: %s\n", trans->name);
1529         }
1530         if (!transliteration_table_add_transliterator(trans)) {
1531             goto exit_trans_table_load_error;
1532         }
1533     }
1534 
1535     log_debug("Read transliterators\n");
1536 
1537     uint64_t num_script_languages;
1538     if (!file_read_uint64(f, &num_script_languages)) {
1539         goto exit_trans_table_load_error;
1540     }
1541 
1542     log_debug("num_script_languages = %zu\n", (size_t)num_script_languages);
1543 
1544     script_language_t script_language;
1545     transliterator_index_t index;
1546 
1547     uint64_t language_len = 0;
1548     char language[MAX_LANGUAGE_LEN] = "";
1549 
1550     uint64_t transliterator_index = 0;
1551     uint64_t index_num_transliterators = 0;
1552 
1553     for (i = 0; i < num_script_languages; i++) {
1554         if (!file_read_uint32(f, (uint32_t *)&script_language.script)) {
1555             goto exit_trans_table_load_error;
1556         }
1557 
1558         if (!file_read_uint64(f, &language_len) || language_len >= MAX_LANGUAGE_LEN) {
1559             goto exit_trans_table_load_error;
1560         }
1561 
1562         if (language_len == 0) {
1563             script_language.language[0] = '\0';
1564         } else if (!file_read_chars(f, (char *)language, (size_t)language_len)) {
1565             goto exit_trans_table_load_error;
1566         } else {
1567             strcpy(script_language.language, language);
1568         }
1569 
1570         if (!file_read_uint64(f, &transliterator_index)) {
1571             goto exit_trans_table_load_error;
1572         }
1573 
1574         index.transliterator_index = (size_t)transliterator_index;
1575 
1576         if (!file_read_uint64(f, &index_num_transliterators)) {
1577             goto exit_trans_table_load_error;
1578         }
1579 
1580         index.num_transliterators = (size_t)index_num_transliterators;
1581 
1582         log_debug("Adding script language key={%d, %s}, value={%zu, %zu}\n", script_language.script, script_language.language, index.transliterator_index, index.num_transliterators);
1583 
1584         transliteration_table_add_script_language(script_language, index);
1585     }
1586 
1587     uint64_t trans_table_num_strings;
1588 
1589     if (!file_read_uint64(f, &trans_table_num_strings)) {
1590         goto exit_trans_table_load_error;
1591     }
1592 
1593     log_debug("trans_table_num_strings=%zu\n", (size_t)trans_table_num_strings);
1594 
1595     uint64_t trans_name_str_len;
1596 
1597     if (!file_read_uint64(f, &trans_name_str_len)) {
1598         goto exit_trans_table_load_error;
1599     }
1600 
1601     log_debug("Creating char_array with size=%zu\n", (size_t)trans_name_str_len);
1602 
1603     char_array *array = char_array_new_size((size_t)trans_name_str_len);
1604 
1605     if (!file_read_chars(f, array->a, (size_t)trans_name_str_len)) {
1606         goto exit_trans_table_load_error;
1607     }
1608 
1609     array->n = trans_name_str_len;
1610 
1611     cstring_array_destroy(trans_table->transliterator_names);
1612     log_debug("Destroyed current cstring_array\n");
1613 
1614     log_debug("char_array len=%zu\n", array->n);
1615 
1616     trans_table->transliterator_names = cstring_array_from_char_array(array);
1617     log_debug("Set trans_table->transliterator_names\n");
1618 
1619     if (cstring_array_num_strings(trans_table->transliterator_names) != trans_table_num_strings) {
1620         goto exit_trans_table_load_error;
1621     }
1622 
1623     uint64_t num_steps;
1624 
1625     if (!file_read_uint64(f, &num_steps)) {
1626         goto exit_trans_table_load_error;
1627     }
1628 
1629     log_debug("num_steps = %zu\n", (size_t)num_steps);
1630 
1631     if (!step_array_resize(trans_table->steps, (size_t)num_steps)) {
1632         goto exit_trans_table_load_error;
1633     }
1634 
1635     log_debug("resized\n");
1636 
1637     transliteration_step_t *step;
1638 
1639     for (i = 0; i < num_steps; i++) {
1640         step = transliteration_step_read(f);
1641         if (step == NULL) {
1642             goto exit_trans_table_load_error;
1643         }
1644         log_debug("Read step with name %s and type %d\n", step->name, step->type);
1645         step_array_push(trans_table->steps, step);
1646     }
1647 
1648     log_debug("Done with steps\n");
1649 
1650     transliteration_replacement_t *replacement;
1651 
1652     uint64_t num_replacements;
1653 
1654     if (!file_read_uint64(f, &num_replacements)) {
1655         goto exit_trans_table_load_error;
1656     }
1657 
1658     log_debug("num_replacements = %zu\n", (size_t)num_replacements);
1659 
1660     if (!transliteration_replacement_array_resize(trans_table->replacements, (size_t)num_replacements)) {
1661         goto exit_trans_table_load_error;
1662     }
1663 
1664     log_debug("resized\n");
1665 
1666     for (i = 0; i < num_replacements; i++) {
1667         replacement = transliteration_replacement_read(f);
1668         if (replacement == NULL) {
1669             goto exit_trans_table_load_error;
1670         }
1671         transliteration_replacement_array_push(trans_table->replacements, replacement);
1672     }
1673 
1674     log_debug("Done with replacements\n");
1675 
1676     uint64_t num_replacement_tokens;
1677 
1678     if (!file_read_uint64(f, &num_replacement_tokens)) {
1679         goto exit_trans_table_load_error;
1680     }
1681 
1682     log_debug("num_replacement_tokens = %zu\n", (size_t)num_replacement_tokens);
1683 
1684     if (!uint32_array_resize(trans_table->replacement_strings->indices, (size_t)num_replacement_tokens)) {
1685         goto exit_trans_table_load_error;
1686     }
1687 
1688     log_debug("resized\n");
1689 
1690     uint32_t token_index;
1691 
1692     for (i = 0; i < num_replacement_tokens; i++) {
1693         if (!file_read_uint32(f, &token_index)) {
1694             goto exit_trans_table_load_error;
1695         }
1696         uint32_array_push(trans_table->replacement_strings->indices, token_index);
1697     }
1698 
1699     log_debug("Done with replacement token indices\n");
1700 
1701     uint64_t replacement_strings_len;
1702 
1703     if (!file_read_uint64(f, &replacement_strings_len)) {
1704         goto exit_trans_table_load_error;
1705     }
1706 
1707     log_debug("replacement_strings_len = %zu\n", (size_t)replacement_strings_len);
1708 
1709     if (!char_array_resize(trans_table->replacement_strings->str, (size_t)replacement_strings_len)) {
1710         goto exit_trans_table_load_error;
1711     }
1712 
1713     log_debug("resized\n");
1714 
1715     if (!file_read_chars(f, trans_table->replacement_strings->str->a, (size_t)replacement_strings_len)) {
1716         goto exit_trans_table_load_error;
1717     }
1718 
1719     log_debug("Read replacement_strings\n");
1720 
1721     trans_table->replacement_strings->str->n = replacement_strings_len;
1722 
1723     uint64_t num_revisit_tokens;
1724 
1725     if (!file_read_uint64(f, &num_revisit_tokens)) {
1726         goto exit_trans_table_load_error;
1727     }
1728 
1729     log_debug("num_revisit_tokens = %zu\n", (size_t)num_revisit_tokens);
1730 
1731     if (!uint32_array_resize(trans_table->revisit_strings->indices, (size_t)num_revisit_tokens)) {
1732         goto exit_trans_table_load_error;
1733     }
1734 
1735     log_debug("resized\n");
1736 
1737     for (i = 0; i < num_revisit_tokens; i++) {
1738         if (!file_read_uint32(f, &token_index)) {
1739             goto exit_trans_table_load_error;
1740         }
1741         uint32_array_push(trans_table->revisit_strings->indices, token_index);
1742     }
1743 
1744     log_debug("Done with revisit token indices\n");
1745 
1746     uint64_t revisit_strings_len = 0;
1747 
1748     if (!file_read_uint64(f, &revisit_strings_len)) {
1749         goto exit_trans_table_load_error;
1750     }
1751 
1752     log_debug("revisit_strings_len = %zu\n", (size_t)revisit_strings_len);
1753 
1754     if (!char_array_resize(trans_table->revisit_strings->str, (size_t)revisit_strings_len)) {
1755         goto exit_trans_table_load_error;
1756     }
1757 
1758     log_debug("resized\n");
1759 
1760     if (!file_read_chars(f, trans_table->revisit_strings->str->a, (size_t)revisit_strings_len)) {
1761         goto exit_trans_table_load_error;
1762     }
1763 
1764     log_debug("Read revisit_strings\n");
1765 
1766     trans_table->revisit_strings->str->n = revisit_strings_len;
1767 
1768     // Free the default trie
1769     trie_destroy(trans_table->trie);
1770 
1771     trans_table->trie = trie_read(f);
1772     log_debug("Read trie\n");
1773     if (trans_table->trie == NULL) {
1774         goto exit_trans_table_load_error;
1775     }
1776 
1777     return true;
1778 
1779 exit_trans_table_load_error:
1780     transliteration_table_destroy();
1781     return false;
1782 }
1783 
transliteration_table_write(FILE * f)1784 bool transliteration_table_write(FILE *f) {
1785     if (f == NULL) {
1786         return false;
1787     }
1788 
1789     transliterator_t *trans;
1790 
1791     if (!file_write_uint32(f, TRANSLITERATION_TABLE_SIGNATURE)) {
1792         return false;
1793     }
1794 
1795     size_t num_transliterators = kh_size(trans_table->transliterators);
1796 
1797     if (!file_write_uint64(f, (uint64_t)num_transliterators)) {
1798         return false;
1799     }
1800 
1801     kh_foreach_value(trans_table->transliterators, trans, {
1802         if (!transliterator_write(trans, f)) {
1803             return false;
1804         }
1805     })
1806 
1807     size_t i;
1808 
1809     size_t num_script_languages = kh_size(trans_table->script_languages);
1810 
1811     if (!file_write_uint64(f, (uint64_t)num_script_languages)) {
1812         return false;
1813     }
1814 
1815     script_language_t script_language;
1816     transliterator_index_t index;
1817 
1818     kh_foreach(trans_table->script_languages, script_language, index, {
1819         if (!file_write_uint32(f, (uint32_t)script_language.script)) {
1820             return false;
1821         }
1822 
1823         size_t language_len = strlen(script_language.language);
1824 
1825         if (!file_write_uint64(f, (uint64_t)language_len)) {
1826             return false;
1827         }
1828 
1829         if (language_len > 0 && !file_write_chars(f, script_language.language, language_len)) {
1830             return false;
1831         }
1832 
1833         if (!file_write_uint64(f, (uint64_t)index.transliterator_index)) {
1834             return false;
1835         }
1836 
1837         if (!file_write_uint64(f, (uint64_t)index.num_transliterators)) {
1838             return false;
1839         }
1840     })
1841 
1842     size_t num_trans_names = trans_table->transliterator_names->indices->n;
1843 
1844     if (!file_write_uint64(f, (uint64_t)num_trans_names)) {
1845         return false;
1846     }
1847 
1848     size_t trans_names_str_len = trans_table->transliterator_names->str->n;
1849 
1850     if (!file_write_uint64(f, (uint64_t)trans_names_str_len)) {
1851         return false;
1852     }
1853 
1854     if (!file_write_chars(f, trans_table->transliterator_names->str->a, trans_names_str_len)) {
1855         return false;
1856     }
1857 
1858     transliteration_step_t *step;
1859 
1860 
1861     size_t num_steps = trans_table->steps->n;
1862 
1863     if (!file_write_uint64(f, num_steps)) {
1864         return false;
1865     }
1866 
1867     for (i = 0; i < num_steps; i++) {
1868         step = trans_table->steps->a[i];
1869         if (!transliteration_step_write(step, f)) {
1870             return false;
1871         }
1872     }
1873 
1874     size_t num_replacements = trans_table->replacements->n;
1875 
1876     if (!file_write_uint64(f, num_replacements)) {
1877         return false;
1878     }
1879 
1880     transliteration_replacement_t *replacement;
1881 
1882     for (i = 0; i < trans_table->replacements->n; i++) {
1883         replacement = trans_table->replacements->a[i];
1884         if (!transliteration_replacement_write(replacement, f)) {
1885             return false;
1886         }
1887     }
1888 
1889     size_t replacement_tokens_len = trans_table->replacement_strings->indices->n;
1890 
1891     if (!file_write_uint64(f, replacement_tokens_len)) {
1892         return false;
1893     }
1894 
1895     for (i = 0; i < replacement_tokens_len; i++) {
1896         if (!file_write_uint32(f, trans_table->replacement_strings->indices->a[i])) {
1897             return false;
1898         }
1899     }
1900 
1901     size_t replacement_strings_len = trans_table->replacement_strings->str->n;
1902 
1903     if (!file_write_uint64(f, replacement_strings_len)) {
1904         return false;
1905     }
1906 
1907     if (!file_write_chars(f, trans_table->replacement_strings->str->a, replacement_strings_len)) {
1908         return false;
1909     }
1910 
1911     size_t revisit_tokens_len = trans_table->revisit_strings->indices->n;
1912 
1913     log_debug("revisit_tokens_len=%zu\n", revisit_tokens_len);
1914 
1915     if (!file_write_uint64(f, revisit_tokens_len)) {
1916         return false;
1917     }
1918 
1919     for (i = 0; i < revisit_tokens_len; i++) {
1920         if (!file_write_uint32(f, trans_table->revisit_strings->indices->a[i])) {
1921             return false;
1922         }
1923     }
1924 
1925     size_t revisit_strings_len = trans_table->revisit_strings->str->n;
1926 
1927     if (!file_write_uint64(f, revisit_strings_len)) {
1928         return false;
1929     }
1930 
1931     if (!file_write_chars(f, trans_table->revisit_strings->str->a, revisit_strings_len)) {
1932         return false;
1933     }
1934 
1935     if (!trie_write(trans_table->trie, f)) {
1936         return false;
1937     }
1938 
1939     return true;
1940 
1941 }
1942 
transliteration_table_load(char * filename)1943 bool transliteration_table_load(char *filename) {
1944     if (filename == NULL || trans_table != NULL) {
1945         return false;
1946     }
1947 
1948     FILE *f;
1949 
1950     if ((f = fopen(filename, "rb")) != NULL) {
1951         bool ret = transliteration_table_read(f);
1952         fclose(f);
1953         return ret;
1954     } else {
1955         return false;
1956     }
1957 }
1958 
1959 
transliteration_table_save(char * filename)1960 bool transliteration_table_save(char *filename) {
1961     if (trans_table == NULL || filename == NULL) {
1962         return false;
1963     }
1964 
1965     FILE *f;
1966 
1967     if ((f = fopen(filename, "wb")) != NULL) {
1968         bool ret = transliteration_table_write(f);
1969         fclose(f);
1970         return ret;
1971     } else {
1972         return false;
1973     }
1974 
1975 }
1976 
transliteration_module_init(void)1977 bool transliteration_module_init(void) {
1978     trans_table = transliteration_table_new();
1979     return trans_table != NULL;
1980 }
1981 
transliteration_module_setup(char * filename)1982 bool transliteration_module_setup(char *filename) {
1983     if (trans_table == NULL) {
1984         return transliteration_table_load(filename == NULL ? DEFAULT_TRANSLITERATION_PATH : filename);
1985     }
1986 
1987     return true;
1988 }
1989 
1990 
transliteration_module_teardown(void)1991 void transliteration_module_teardown(void) {
1992     transliteration_table_destroy();
1993     trans_table = NULL;
1994 }
1995 
1996