#include <math.h>
#include <stdlib.h>

#include "transliterate.h"
#include "file_utils.h"

#include "log/log.h"
#include "strndup.h"
7
8 #define TRANSLITERATION_TABLE_SIGNATURE 0xAAAAAAAA
9
10 #define NFD "NFD"
11 #define NFC "NFC"
12 #define NFKC "NFKC"
13 #define NFKD "NFKD"
14 #define STRIP_MARK "STRIP_MARK"
15
16 static transliteration_table_t *trans_table = NULL;
17
get_transliteration_table(void)18 transliteration_table_t *get_transliteration_table(void) {
19 return trans_table;
20 }
21
transliterator_new(char * name,uint8_t internal,uint32_t steps_index,size_t steps_length)22 transliterator_t *transliterator_new(char *name, uint8_t internal, uint32_t steps_index, size_t steps_length) {
23 transliterator_t *trans = malloc(sizeof(transliterator_t));
24
25 if (trans == NULL) {
26 return NULL;
27 }
28
29 trans->name = name;
30 trans->internal = internal;
31 trans->steps_index = steps_index;
32 trans->steps_length = steps_length;
33
34 return trans;
35 }
36
/*
 * Releases a transliterator record together with the name string it holds.
 * Passing NULL is allowed and does nothing.
 */
void transliterator_destroy(transliterator_t *self) {
    if (self != NULL) {
        // free(NULL) is a no-op, so an unset name needs no separate guard
        free(self->name);
        free(self);
    }
}
44
45
get_transliterator(char * name)46 transliterator_t *get_transliterator(char *name) {
47 if (trans_table == NULL) {
48 return NULL;
49 }
50
51 khiter_t k;
52 k = kh_get(str_transliterator, trans_table->transliterators, name);
53 return (k != kh_end(trans_table->transliterators)) ? kh_value(trans_table->transliterators, k) : NULL;
54 }
55
56
/* Coarse progress of a rule lookup, stored in transliteration_state_t.state. */
typedef enum {
    // Initial value (see TRANSLITERATION_DEFAULT_STATE); also what a state keeps when the character lookup found nothing
    TRANS_STATE_BEGIN,
    // The character lookup produced a result, so the trie walk can continue
    TRANS_STATE_PARTIAL_MATCH,
    // Set by set_match_if_any() when the string terminator can be followed from the current node
    TRANS_STATE_MATCH
} transliteration_state_type_t;
62
/* Snapshot of the rule matcher after looking at one character. */
typedef struct {
    // Trie position produced by the most recent lookup
    trie_prefix_result_t result;
    // Progress of the lookup (begin / partial match / match)
    transliteration_state_type_t state;
    // Byte offset in the input where the candidate phrase begins
    ssize_t phrase_start;
    // Accumulated byte length of the candidate phrase
    size_t phrase_len;
    // Byte length of the character that produced this state
    size_t char_len;
    // The scanning loops step to the next character only when this is set
    uint8_t advance_index:1;
    // The scanning loops store this state as their previous state only when this is set
    uint8_t advance_state:1;
    // Lookup ended inside a character set that has not been closed yet
    uint8_t in_set:1;
    // Lookup followed an empty-transition marker instead of consuming the character
    uint8_t empty_transition:1;
    // Lookup hit a repeated single character or a repeated character set
    uint8_t repeat:1;
    // Only initialised to 0 by TRANSLITERATION_DEFAULT_STATE in the code shown here; no other use is visible
    uint8_t word_boundary:1;
} transliteration_state_t;
76
77
// Fresh state in field order: null trie result, TRANS_STATE_BEGIN, zero phrase_start/phrase_len/char_len,
// advance_index and advance_state set, and in_set/empty_transition/repeat/word_boundary cleared.
#define TRANSLITERATION_DEFAULT_STATE (transliteration_state_t){NULL_PREFIX_RESULT, TRANS_STATE_BEGIN, 0, 0, 0, 1, 1, 0, 0, 0, 0}
79
80
get_replacement(trie_t * trie,trie_prefix_result_t result,char * str,size_t start_index)81 static transliteration_replacement_t *get_replacement(trie_t *trie, trie_prefix_result_t result, char *str, size_t start_index) {
82 uint32_t node_id = result.node_id;
83 if (node_id == NULL_NODE_ID) return NULL;
84
85 uint32_t replacement_index = 0;
86
87 if (!trie_get_data_at_index(trie, node_id, &replacement_index)) {
88 return NULL;
89 }
90
91 if (replacement_index < trans_table->replacements->n) {
92 log_debug("Got data node\n");
93 return trans_table->replacements->a[replacement_index];
94 }
95
96 return NULL;
97
98 }
99
/* Classification of what next_prefix_or_set() found for one character. */
typedef enum {
    // Nothing could be followed from the current trie position
    NO_CHAR_RESULT,
    // The character was followed directly and no repeat marker comes after it
    SINGLE_CHAR_ONLY,
    // The character was followed directly and a repeat marker comes after it
    SINGLE_CHAR_REPEAT,
    // Position is inside a character set and no end-of-set marker follows
    OPEN_CHAR_SET,
    // The end-of-set marker follows and no repeat marker comes after it
    CLOSED_CHAR_SET,
    // The end-of-set marker follows and a repeat marker comes after it
    CHAR_SET_REPEAT,
    // Outside a set, the empty-transition marker was followed instead of the character
    SINGLE_EMPTY_TRANSITION,
    // Inside a set, the empty-transition marker was followed and the set is closed
    CHAR_SET_EMPTY_TRANSITION
} char_set_type;
110
111
/* Return value of next_prefix_or_set(): a trie position plus how it was reached. */
typedef struct char_set_result {
    // Trie position to continue from
    trie_prefix_result_t result;
    // Kind of transition that produced `result`
    char_set_type type;
} char_set_result_t;
116
// "Nothing matched" value for next_prefix_or_set(): null trie result with type NO_CHAR_RESULT.
// Deliberately has no trailing semicolon so it can be used inside expressions as well as in
// `return NULL_CHAR_SET_RESULT;` statements.
#define NULL_CHAR_SET_RESULT (char_set_result_t){NULL_PREFIX_RESULT, NO_CHAR_RESULT}
118
/*
 * Tries to follow one input character from a trie position, either directly
 * or through the character-set / empty-transition markers stored in the trie.
 *
 * trie            trie to walk
 * str, len        bytes of the character to follow
 * last_result     trie position to start from
 * in_set          true when the caller considers last_result to be inside an open character set
 * check_set_only  true to ignore a direct match of the character and go straight to the marker checks
 *
 * Returns the trie position to continue from together with a char_set_type
 * describing the transition, or NULL_CHAR_SET_RESULT when nothing applies.
 */
static char_set_result_t next_prefix_or_set(trie_t *trie, char *str, size_t len, trie_prefix_result_t last_result, bool in_set, bool check_set_only) {
    // Direct lookup of the character from the starting position
    trie_prefix_result_t result = trie_get_prefix_from_index(trie, str, len, last_result.node_id, last_result.tail_pos);

    bool has_empty_transition = false;


    if (!check_set_only && result.node_id != NULL_NODE_ID) {
        // Direct hit: peek for a repeat marker right after the character
        last_result = result;
        result = trie_get_prefix_from_index(trie, REPEAT_CHAR, REPEAT_CHAR_LEN, last_result.node_id, last_result.tail_pos);
        if (result.node_id == NULL_NODE_ID) {
            return (char_set_result_t){last_result, SINGLE_CHAR_ONLY};
        } else {
            log_debug("Got single char repeat\n");
            // NOTE(review): this returns the node before the repeat marker, whereas the
            // CHAR_SET_REPEAT case below returns the node after it — confirm which is intended
            return (char_set_result_t){last_result, SINGLE_CHAR_REPEAT};
        }
    } else if (!in_set) {
        // No usable direct hit and not in a set yet: see whether a set opens here
        result = trie_get_prefix_from_index(trie, BEGIN_SET_CHAR, BEGIN_SET_CHAR_LEN, last_result.node_id, last_result.tail_pos);

        if (result.node_id == NULL_NODE_ID) {
            // No set either: the only remaining option is an empty transition
            result = trie_get_prefix_from_index(trie, EMPTY_TRANSITION_CHAR, EMPTY_TRANSITION_CHAR_LEN, last_result.node_id, last_result.tail_pos);
            if (result.node_id == NULL_NODE_ID) {
                return NULL_CHAR_SET_RESULT;
            } else {
                log_debug("empty result node_id=%d\n", result.node_id);
                return (char_set_result_t){result, SINGLE_EMPTY_TRANSITION};
            }
        }

        log_debug("Got begin set, node_id = %d\n", result.node_id);

        last_result = result;

        // Look for the character as a member of the set that just opened
        result = trie_get_prefix_from_index(trie, str, len, last_result.node_id, last_result.tail_pos);

        log_debug("Set node_id = %d, len=%zu\n", result.node_id, len);

        if (result.node_id == NULL_NODE_ID) {
            // Character is not a member: accept an empty-transition member instead, if there is one
            result = trie_get_prefix_from_index(trie, EMPTY_TRANSITION_CHAR, EMPTY_TRANSITION_CHAR_LEN, last_result.node_id, last_result.tail_pos);
            if (result.node_id == NULL_NODE_ID) {
                return NULL_CHAR_SET_RESULT;
            }
            log_debug("Got empty transition char\n");
            has_empty_transition = true;
        }

        in_set = true;
        last_result = result;
    }

    if (in_set) {
        // In the set but can potentially have more than one unicode character
        result = trie_get_prefix_from_index(trie, END_SET_CHAR, END_SET_CHAR_LEN, last_result.node_id, last_result.tail_pos);
        if (result.node_id == NULL_NODE_ID && !has_empty_transition) {
            log_debug("No end set\n");
            return (char_set_result_t){last_result, OPEN_CHAR_SET};
        } else if (result.node_id == NULL_NODE_ID && has_empty_transition) {
            // An empty transition is only accepted when the set closes right after it
            log_debug("has_empty_transition\n");
            return NULL_CHAR_SET_RESULT;
        }

        // The set is closed; check whether the whole set is repeated
        last_result = result;
        result = trie_get_prefix_from_index(trie, REPEAT_CHAR, REPEAT_CHAR_LEN, last_result.node_id, last_result.tail_pos);

        if (result.node_id == NULL_NODE_ID && !has_empty_transition) {
            log_debug("Got closed set\n");
            return (char_set_result_t){last_result, CLOSED_CHAR_SET};
        // Shouldn't repeat the empty transition, so ignore repeats
        } else if (has_empty_transition) {
            log_debug("Char set empty transition\n");
            // NOTE(review): `result` is the repeat-marker lookup and is the null node when no
            // repeat marker follows the set — confirm that returning it (rather than last_result) is intended
            return (char_set_result_t){result, CHAR_SET_EMPTY_TRANSITION};
        } else {
            log_debug("Char set repeated\n");
            return (char_set_result_t){result, CHAR_SET_REPEAT};
        }
    }
    return NULL_CHAR_SET_RESULT;

}
197
/*
 * Converts the outcome of a character lookup into a matcher state.
 *
 * char_result  lookup outcome (trie position and transition kind)
 * index        byte offset of the character in the input
 * len          byte length of the character
 * prev_state   state the lookup started from
 * is_context   true when the character belongs to surrounding context; in that
 *              case the phrase offsets are left at their default values
 *
 * Returns a state that stays at TRANS_STATE_BEGIN when the lookup found
 * nothing, and is TRANS_STATE_PARTIAL_MATCH otherwise.
 */
static transliteration_state_t state_from_char_result(char_set_result_t char_result, size_t index, size_t len, transliteration_state_t prev_state, bool is_context) {
    const char_set_type kind = char_result.type;

    transliteration_state_t next = TRANSLITERATION_DEFAULT_STATE;

    next.result = char_result.result;
    next.char_len = len;

    // Still inside a set either because one is open, or because a plain character was matched while already in one
    next.in_set = (kind == OPEN_CHAR_SET || (prev_state.in_set && kind == SINGLE_CHAR_ONLY));
    next.repeat = (kind == SINGLE_CHAR_REPEAT || kind == CHAR_SET_REPEAT);
    next.empty_transition = (kind == SINGLE_EMPTY_TRANSITION || kind == CHAR_SET_EMPTY_TRANSITION);

    if (kind == NO_CHAR_RESULT) {
        return next;
    }

    log_debug("in state_from_char_result, char_result.type = %d\n", char_result.type);
    next.state = TRANS_STATE_PARTIAL_MATCH;

    if (is_context) {
        return next;
    }

    // A phrase starts at this character unless one was already in progress
    next.phrase_start = (prev_state.state == TRANS_STATE_BEGIN) ? (ssize_t)index : prev_state.phrase_start;
    next.phrase_len = prev_state.phrase_len + len;

    return next;
}
226
/*
 * Advances the matcher by the character at str[index .. index + len) and
 * returns the resulting state with phrase offsets tracked (non-context mode).
 */
static transliteration_state_t state_transition(trie_t *trie, char *str, size_t index, size_t len, transliteration_state_t prev_state) {
    char *current_char = str + index;

    log_debug("str = %s, index = %zu, char_len=%zu\n", str, index, len);

    log_debug("prev_state.result.node_id=%d, prev_state.in_set=%d\n", prev_state.result.node_id, prev_state.in_set);

    char_set_result_t lookup = next_prefix_or_set(trie, current_char, len, prev_state.result, prev_state.in_set, false);

    log_debug("char_result.type = %d\n", lookup.type);

    const bool is_context = false;
    return state_from_char_result(lookup, index, len, prev_state, is_context);
}
239
240
/*
 * Advances the matcher by the character at str[index .. index + len) while
 * checking surrounding context: identical to state_transition() except that
 * the resulting state is built in context mode, so phrase offsets are not
 * tracked.
 */
static transliteration_state_t state_transition_context(trie_t *trie, char *str, size_t index, size_t len, transliteration_state_t prev_state) {
    char *current_char = str + index;

    log_debug("str = %s, index = %zu, char_len=%zu\n", str, index, len);

    log_debug("prev_state.result.node_id=%d, prev_state.in_set=%d\n", prev_state.result.node_id, prev_state.in_set);

    char_set_result_t lookup = next_prefix_or_set(trie, current_char, len, prev_state.result, prev_state.in_set, false);

    log_debug("char_result.type = %d\n", lookup.type);

    const bool is_context = true;
    return state_from_char_result(lookup, index, len, prev_state, is_context);
}
253
254
255
/*
 * Promotes a partial match to a full match when the rule can end here.
 *
 * For a state in TRANS_STATE_PARTIAL_MATCH, the string terminator byte is
 * looked up from the state's trie position. If that succeeds, *match_state
 * receives the terminator's trie position, TRANS_STATE_MATCH and the phrase
 * offsets of `state`. In every other case *match_state is left untouched.
 */
static inline void set_match_if_any(trie_t *trie, transliteration_state_t state, transliteration_state_t *match_state) {
    if (state.state == TRANS_STATE_PARTIAL_MATCH) {
        // Complete string: the empty string with length 1 is just its terminating NUL byte
        trie_prefix_result_t terminal = trie_get_prefix_from_index(trie, "", 1, state.result.node_id, state.result.tail_pos);

        if (terminal.node_id == NULL_NODE_ID) {
            return;
        }

        match_state->phrase_len = state.phrase_len;
        match_state->phrase_start = state.phrase_start;
        match_state->state = TRANS_STATE_MATCH;
        match_state->result = terminal;
    }
}
270
271
/*
 * Matches the text that precedes a candidate phrase against a rule's
 * pre-context.
 *
 * Starting at original_state.phrase_start, the input is read backwards one
 * UTF-8 character at a time and each character is fed to
 * state_transition_context(), beginning from original_state.
 *
 * Returns the match state as soon as the terminator can be reached; otherwise
 * the state the scan stopped at (the previous partial state when the scan
 * dead-ends right after a partial match, or original_state when nothing
 * precedes the phrase).
 */
static transliteration_state_t check_pre_context(trie_t *trie, char *str, transliteration_state_t original_state) {
    size_t start_index = original_state.phrase_start;
    int32_t ch = 0;
    // Byte offset just past the next character to read (the scan moves towards 0)
    size_t idx = start_index;
    ssize_t char_len = 0;

    bool in_repeat = false;

    transliteration_state_t prev_state = original_state;
    transliteration_state_t state = original_state;

    // Save the end of the repeated state the first time through
    // (only read while in_repeat is true, which is set together with the assignment below)
    transliteration_state_t repeat_state_end;

    transliteration_state_t match_state = TRANSLITERATION_DEFAULT_STATE;

    log_debug("start_index=%zu, str=%s\n", start_index, str);

    while (idx > 0) {
        // Decode the character that ends at byte offset idx
        char_len = utf8proc_iterate_reversed((uint8_t *)str, idx, &ch);

        if (char_len <= 0) {
            break;
        }

        if (!utf8proc_codepoint_valid(ch)) {
            // Skip over invalid codepoints without touching the matcher state
            idx -= char_len;
            continue;
        }

        log_debug("In pre-context, got char %d, \"%.*s\"\n", ch, (int)char_len, str + idx - char_len);

        state = state_transition_context(trie, str, idx - char_len, char_len, prev_state);
        set_match_if_any(trie, state, &match_state);

        if (match_state.state == TRANS_STATE_MATCH) {
            log_debug("pre-context TRANS_STATE_MATCH\n");
            state = match_state;
            break;
        } else if (state.state == TRANS_STATE_BEGIN && !in_repeat) {
            log_debug("pre-context TRANS_STATE_BEGIN and not in repeat\n");
            // Dead end: fall back to the last partial state so the caller can continue from it
            if (prev_state.state == TRANS_STATE_PARTIAL_MATCH) {
                state = prev_state;
            }
            break;
        } else if (state.repeat) {
            log_debug("pre-context in repeat\n");
            in_repeat = true;
            repeat_state_end = state;
            // Keep prev_state where it is so the next character is tried against the same repeated element
            state.advance_state = false;
        } else if (state.empty_transition) {
            log_debug("pre-context empty_transition\n");
            // An empty transition consumes no input: retry the same character from the new state
            state.advance_index = false;
            if (in_repeat) {
                log_debug("empty_transition in repeat\n");
                prev_state = repeat_state_end;
                state.advance_state = false;
                in_repeat = false;
            }
        // If we're repeating e.g. "[abcd]+e", when we hit the "e" or another character, stop repeating and try from the end of the block
        } else if (state.state == TRANS_STATE_BEGIN && in_repeat) {
            log_debug("pre-context stop repeat\n");
            prev_state = repeat_state_end;
            in_repeat = false;
            state.advance_index = false;
            state.advance_state = false;
        } else if (in_repeat) {
            log_debug("end repeat\n");
            log_debug("state.state==%d, state.result.node_id=%d, repeat_state_end.result.node_id=%d\n", state.state, state.result.node_id, repeat_state_end.result.node_id);
            in_repeat = false;
            break;
        }

        if (state.advance_index) {
            idx -= char_len;
        }

        if (state.advance_state) {
            prev_state = state;
        }

    }

    return state;
}
357
/*
 * Matches the text that follows a candidate phrase against a rule's
 * post-context.
 *
 * Starting at the byte right after the phrase
 * (original_state.phrase_start + original_state.phrase_len), the input is read
 * forwards one UTF-8 character at a time and each character is fed to
 * state_transition_context(), beginning from original_state.
 *
 * str must be NUL-terminated.
 *
 * Returns the match state as soon as the terminator can be reached; otherwise
 * the state the scan stopped at (original_state when nothing follows the
 * phrase).
 */
static transliteration_state_t check_post_context(trie_t *trie, char *str, transliteration_state_t original_state) {
    size_t index = original_state.phrase_start + original_state.phrase_len;
    size_t total_len = strlen(str);

    // Nothing follows the phrase. Also guards the subtraction below against
    // wrapping around when the phrase end lies beyond the end of the string.
    if (index >= total_len) {
        return original_state;
    }

    uint8_t *ptr = (uint8_t *)str + index;
    size_t len = total_len - index;
    int32_t ch = 0;
    size_t idx = 0;
    ssize_t char_len = 0;

    bool in_repeat = false;

    transliteration_state_t prev_state = original_state;

    transliteration_state_t state = original_state;

    // Save the end of the repeated state the first time through
    transliteration_state_t repeat_state_end = TRANSLITERATION_DEFAULT_STATE;

    transliteration_state_t match_state = TRANSLITERATION_DEFAULT_STATE;

    while (idx < len) {
        // Only the bytes from ptr to the end of the string may be decoded
        char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);

        if (char_len <= 0) {
            break;
        }

        if (!utf8proc_codepoint_valid(ch)) {
            // Skip over invalid codepoints without touching the matcher state
            idx += char_len;
            ptr += char_len;
            continue;
        }

        log_debug("In post-context, got char \"%.*s\"\n", (int)char_len, str + index + idx);

        state = state_transition_context(trie, str, index + idx, char_len, prev_state);
        set_match_if_any(trie, state, &match_state);

        if (match_state.state == TRANS_STATE_MATCH) {
            log_debug("post-context TRANS_STATE_MATCH\n");
            state = match_state;
            break;
        } else if (state.state == TRANS_STATE_BEGIN && !in_repeat) {
            log_debug("post-context TRANS_STATE_BEGIN and not in repeat\n");
            break;
        } else if (state.repeat) {
            log_debug("post-context in repeat\n");
            in_repeat = true;
            repeat_state_end = state;
            // Keep prev_state where it is so the next character is tried against the same repeated element
            state.advance_state = false;
        } else if (state.empty_transition) {
            log_debug("post-context empty_transition\n");
            // An empty transition consumes no input: retry the same character from the new state
            state.advance_index = false;
            if (in_repeat) {
                log_debug("empty_transition in repeat\n");
                prev_state = repeat_state_end;
                state.advance_state = false;
                in_repeat = false;
            }
        // If we're repeating e.g. "[abcd]+e", when we hit the "e" or another character, stop repeating and try from the end of the block
        } else if (state.state == TRANS_STATE_BEGIN && in_repeat) {
            log_debug("post-context stop repeat\n");
            prev_state = repeat_state_end;
            in_repeat = false;
            state.advance_index = false;
            state.advance_state = false;
        } else if (in_repeat) {
            log_debug("end repeat\n");
            in_repeat = false;
            break;
        }

        if (state.advance_index) {
            idx += char_len;
            ptr += char_len;
        }

        if (state.advance_state) {
            prev_state = state;
        }

    }

    return state;
}
442
context_match(trie_t * trie,char * str,transliteration_state_t original_state)443 static trie_prefix_result_t context_match(trie_t *trie, char *str, transliteration_state_t original_state) {
444 trie_prefix_result_t prev_result = original_state.result;
445 transliteration_state_t state = TRANSLITERATION_DEFAULT_STATE;
446 transliteration_state_t prev_state = original_state;
447 trie_prefix_result_t result = trie_get_prefix_from_index(trie, PRE_CONTEXT_CHAR, PRE_CONTEXT_CHAR_LEN, prev_result.node_id, prev_result.tail_pos);
448
449 log_debug("phrase_start=%zd, phrase_len=%zu\n", original_state.phrase_start, original_state.phrase_len);
450
451 if (result.node_id != NULL_NODE_ID) {
452 prev_state.result = result;
453 log_debug("Have pre_context\n");
454 state = check_pre_context(trie, str, prev_state);
455
456 if (state.state == TRANS_STATE_MATCH && state.result.node_id != prev_state.result.node_id) {
457 return state.result;
458 }
459
460 if (state.state == TRANS_STATE_PARTIAL_MATCH && state.result.node_id != prev_state.result.node_id) {
461 log_debug("Pre-context partial match\n");
462 }
463
464 prev_result = state.result;
465 prev_state = state;
466 }
467
468 result = trie_get_prefix_from_index(trie, POST_CONTEXT_CHAR, POST_CONTEXT_CHAR_LEN, prev_result.node_id, prev_result.tail_pos);
469 if (result.node_id != NULL_NODE_ID) {
470 prev_state.result = result;
471 log_debug("Have post_context\n");
472 state = check_post_context(trie, str, prev_state);
473 if (state.state == TRANS_STATE_MATCH && state.result.node_id != prev_state.result.node_id) {
474 return state.result;
475 }
476 }
477
478 log_debug("Failed to match context\n");
479 return NULL_PREFIX_RESULT;
480 }
481
/*
 * Builds a replacement string in which numbered group references are replaced
 * by the text captured from the matched phrase.
 *
 * trie            trie holding the rules
 * str             full input string; the matched phrase is the phrase_len bytes
 *                 starting at byte offset phrase_start of original_state
 * replacement     NUL-terminated template; a GROUP_INDICATOR_CODEPOINT character
 *                 followed by a decimal number N refers to the N-th captured
 *                 group (1-based)
 * groups          capture descriptors; start/len are compared against a running
 *                 count of characters of the phrase
 * original_state  match state describing the phrase
 *
 * Returns a newly allocated string obtained from char_array_to_string() (the
 * caller in this file releases it with free()), or NULL when `groups` is
 * empty or a working buffer cannot be allocated.
 */
static char *replace_groups(trie_t *trie, char *str, char *replacement, group_capture_array *groups, transliteration_state_t original_state) {
    size_t idx = 0;

    int32_t ch = 0;
    ssize_t char_len = 0;
    uint8_t *ptr = (uint8_t *)str + original_state.phrase_start;

    log_debug("str=%s\n", (char *)ptr);

    size_t len = original_state.phrase_len;

    log_debug("phrase_start = %zd, phrase_len = %zu\n", original_state.phrase_start, original_state.phrase_len);

    size_t num_groups = groups->n;

    log_debug("num_groups = %zu\n", num_groups);

    if (num_groups == 0) {
        return NULL;
    }

    cstring_array *group_strings = cstring_array_new_size(num_groups);
    if (group_strings == NULL) {
        return NULL;
    }

    log_debug("Created arrays\n");

    transliteration_state_t state = original_state;
    transliteration_state_t prev_state = original_state;

    transliteration_state_t repeat_state_end = TRANSLITERATION_DEFAULT_STATE;

    size_t group_num = 0;
    group_capture_t group = groups->a[group_num];

    log_debug("group = {%zu, %zu}\n", group.start, group.len);

    bool in_group = false;
    bool in_repeat = false;

    size_t group_start = 0;
    size_t group_len = 0;

    log_debug("group now {%zu, %zu}\n", group_start, group_len);

    size_t num_chars = 0;

    // Pass 1: walk the matched phrase and copy out the text of every group
    while (idx < len) {
        // Only the bytes that remain inside the phrase may be decoded
        char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);

        if (char_len <= 0) {
            break;
        }

        log_debug("Got char '%.*s' at idx=%zu, len=%zd\n", (int)char_len, ptr, idx, char_len);

        if (!(utf8proc_codepoint_valid(ch))) {
            log_warn("Invalid codepoint: %d\n", ch);
            // Step over the bad character; without this the loop would never terminate
            ptr += char_len;
            idx += char_len;
            continue;
        }

        // idx is relative to the phrase, state_transition() expects an offset into str
        state = state_transition(trie, str, (size_t)original_state.phrase_start + idx, char_len, prev_state);

        if (state.state == TRANS_STATE_BEGIN && !in_repeat) {
            log_debug("Normal char: %.*s\n", (int)char_len, ptr);
            prev_state = original_state;
        } else if (state.repeat) {
            log_debug("state.repeat\n");
            in_repeat = true;
            repeat_state_end = state;
            state.advance_state = false;
        } else if (state.empty_transition) {
            log_debug("state.empty_transition\n");
            state.advance_index = false;
            num_chars++;
        } else if (state.state == TRANS_STATE_BEGIN && in_repeat && state.result.node_id == repeat_state_end.result.node_id) {
            log_debug("end of repeat\n");
            prev_state = repeat_state_end;
            state.advance_index = false;
            state.advance_state = false;
        } else if (in_repeat) {
            log_debug("in repeat\n");
            in_repeat = false;
            state.advance_index = false;
            state.advance_state = false;
        }

        if (state.advance_index) {
            if (num_chars == group.start) {
                log_debug("Starting group\n");
                in_group = true;
                group_start = idx;
                log_debug("group_start = %zu\n", group_start);
            }

            if (in_group) {
                log_debug("In group\n");
                group_len += char_len;

                log_debug("group_len=%zu\n", group_len);
                log_debug("group.start + group.len = %zu\n", group.start + group.len);
                if (num_chars == group.start + group.len - 1) {
                    // Last character of the group: store its text and move on to the next descriptor
                    in_group = false;
                    log_debug("adding group str %.*s\n", (int)group_len, str + original_state.phrase_start + group_start);
                    cstring_array_add_string_len(group_strings, str + original_state.phrase_start + group_start, group_len);
                    if (group_num < num_groups - 1) {
                        group_num++;
                        log_debug("group_num=%zu\n", group_num);
                        group = groups->a[group_num];
                        group_len = 0;
                    }
                    state = TRANSLITERATION_DEFAULT_STATE;
                }
            }

        }

        if (state.advance_index) {
            ptr += char_len;
            idx += char_len;
            num_chars++;
            log_debug("num_chars = %zu\n", num_chars);
        }

        if (state.advance_state) {
            prev_state = state;
        }

    }

    // Pass 2: copy the template, substituting group references
    bool in_group_ref = false;

    idx = 0;

    log_debug("Doing replacements\n");

    size_t replacement_len = strlen(replacement);

    log_debug("replacement = %s, len = %zu\n", replacement, replacement_len);

    char_array *ret = char_array_new_size(replacement_len);
    if (ret == NULL) {
        cstring_array_destroy(group_strings);
        return NULL;
    }

    uint8_t *replacement_ptr = (uint8_t *)replacement;

    while (idx < replacement_len) {
        char_len = utf8proc_iterate(replacement_ptr, (ssize_t)(replacement_len - idx), &ch);

        if (char_len <= 0) {
            // Malformed UTF-8 in the template: treat the byte as a one-byte character
            // so the offsets below can never move backwards
            char_len = 1;
            ch = (int32_t)*replacement_ptr;
        }

        if (ch == GROUP_INDICATOR_CODEPOINT) {
            log_debug("start group ref\n");
            in_group_ref = true;
            idx += char_len;
            replacement_ptr += char_len;
            continue;
        } else if (in_group_ref) {
            log_debug("in group ref\n");

            // Parse the group number and learn exactly how many bytes it occupies
            char *number_end = NULL;
            long group_ref = strtol((char *)replacement_ptr, &number_end, 10);
            size_t group_ref_len = (size_t)(number_end - (char *)replacement_ptr);

            log_debug("Got group_ref=%ld\n", group_ref);

            if (group_ref > 0 && group_ref_len > 0) {
                char *group_string = NULL;
                // Numbers beyond the group count cannot name a captured group
                if ((unsigned long)group_ref <= num_groups) {
                    group_string = cstring_array_get_string(group_strings, (uint32_t)(group_ref - 1));
                }
                log_debug("Got group=%s\n", group_string);
                if (group_string != NULL) {
                    char_array_cat(ret, group_string);
                }
                log_debug("Did cat\n");
                log_debug("group_ref_len=%zu\n", group_ref_len);
                idx += group_ref_len;
                replacement_ptr += group_ref_len;
            }
            // Without a positive number nothing is consumed here; the current
            // character is copied as ordinary text on the next iteration
            in_group_ref = false;
        } else {
            log_debug("ptr=%.*s\n", (int)char_len, replacement_ptr);
            char_array_cat_len(ret, (char *)replacement_ptr, char_len);
            idx += char_len;
            replacement_ptr += char_len;
        }
    }

    cstring_array_destroy(group_strings);
    return char_array_to_string(ret);
}
668
transliterate(char * trans_name,char * str,size_t len)669 char *transliterate(char *trans_name, char *str, size_t len) {
670 if (trans_name == NULL || str == NULL) return NULL;
671
672 transliteration_table_t *trans_table = get_transliteration_table();
673
674 if (trans_table == NULL) {
675 log_error("transliteration table is NULL. Call libpostal_setup() or transliteration_module_setup()\n");
676 return NULL;
677 }
678
679 trie_t *trie = trans_table->trie;
680
681 if (trie == NULL) {
682 log_warn("transliteration table not initialized\n");
683 return NULL;
684 }
685
686 log_debug("len = %zu\n", len);
687
688 str = strndup(str, len);
689
690 bool allocated_trans_name = false;
691
692 if (!string_is_lower(trans_name)) {
693 trans_name = strdup(trans_name);
694
695 // Transliterator names are ASCII strings, so this is fine
696 string_lower(trans_name);
697 allocated_trans_name = true;
698 }
699
700 log_debug("lower = %s\n", trans_name);
701
702 transliterator_t *transliterator = get_transliterator(trans_name);
703 if (transliterator == NULL) {
704 log_warn("transliterator \"%s\" does not exist\n", trans_name);
705 if (allocated_trans_name) free(trans_name);
706 free(str);
707 return NULL;
708 }
709
710 log_debug("got transliterator\n");
711
712 trie_prefix_result_t result = trie_get_prefix(trie, trans_name);
713
714 log_debug("result = {%d, %zu}\n", result.node_id, result.tail_pos);
715
716 uint32_t trans_node_id = result.node_id;
717
718 if (allocated_trans_name) free(trans_name);
719
720 result = trie_get_prefix_from_index(trans_table->trie, NAMESPACE_SEPARATOR_CHAR, NAMESPACE_SEPARATOR_CHAR_LEN, result.node_id, result.tail_pos);
721
722 trans_node_id = result.node_id;
723
724 trie_prefix_result_t trans_result = result;
725
726 log_debug("trans_node_id = %d\n", trans_node_id);
727
728 transliteration_step_t *step;
729 char *step_name;
730
731 char_array *new_str = NULL;
732
733 for (uint32_t i = transliterator->steps_index; i < transliterator->steps_index + transliterator->steps_length; i++) {
734 step = trans_table->steps->a[i];
735 step_name = step->name;
736 if (step->type == STEP_RULESET && trans_node_id == NULL_NODE_ID) {
737 log_warn("transliterator \"%s\" does not exist in trie\n", trans_name);
738 free(str);
739 return NULL;
740 }
741
742 if (step->type == STEP_RULESET) {
743 log_debug("ruleset\n");
744 result = trie_get_prefix_from_index(trie, step_name, strlen(step_name), trans_result.node_id, trans_result.tail_pos);
745 uint32_t step_node_id = result.node_id;
746
747 if (step_node_id == NULL_NODE_ID) {
748 log_warn("transliterator step \"%s\" does not exist\n", step_name);
749 free(str);
750 return NULL;
751 }
752
753 result = trie_get_prefix_from_index(trie, NAMESPACE_SEPARATOR_CHAR, NAMESPACE_SEPARATOR_CHAR_LEN, result.node_id, result.tail_pos);
754 step_node_id = result.node_id;
755
756 log_debug("step_node_id = %d\n", step_node_id);
757
758 trie_prefix_result_t step_result = result;
759 trie_prefix_result_t context_result = NULL_PREFIX_RESULT;
760
761 new_str = char_array_new_size(len);
762
763 transliteration_state_t state = TRANSLITERATION_DEFAULT_STATE;
764
765 transliteration_state_t start_state = TRANSLITERATION_DEFAULT_STATE;
766 start_state.result = step_result;
767
768 transliteration_state_t prev_state = start_state;
769 transliteration_state_t prev2_state = start_state;
770
771 transliteration_state_t repeat_state_end = start_state;
772
773 bool in_repeat = false;
774
775 int32_t ch = 0;
776 ssize_t char_len = 0;
777 uint8_t *ptr = (uint8_t *)str;
778 size_t idx = 0;
779
780 char *original_str = str;
781 char_array *revisit = NULL;
782
783 transliteration_replacement_t *replacement = NULL;
784
785 transliteration_state_t match_state = TRANSLITERATION_DEFAULT_STATE;
786
787 while (idx < len) {
788 log_debug("idx=%zu, ptr=%s\n", idx, ptr);
789 char_len = utf8proc_iterate(ptr, len, &ch);
790 if (char_len == UTF8PROC_ERROR_INVALIDUTF8) {
791 log_warn("invalid UTF-8\n");
792 char_len = 1;
793 ch = (int32_t)*ptr;
794 } else if (char_len <= 0) {
795 log_warn("char_len=%zd at idx=%zu\n", char_len, idx);
796 free(trans_name);
797 free(str);
798 return NULL;
799 }
800
801 if (!(utf8proc_codepoint_valid(ch))) {
802 log_warn("Invalid codepoint: %d\n", ch);
803 idx += char_len;
804 ptr += char_len;
805 continue;
806 }
807
808 if (ch == 0) break;
809
810 log_debug("Got char '%.*s' at idx=%zu, prev_state.state=%d\n", (int)char_len, str + idx, idx, prev_state.state);
811
812 state = state_transition(trie, str, idx, char_len, prev_state);
813 set_match_if_any(trie, state, &match_state);
814
815 replacement = NULL;
816
817 if ((state.state == TRANS_STATE_BEGIN && prev_state.state == TRANS_STATE_PARTIAL_MATCH) ||
818 (state.state == TRANS_STATE_PARTIAL_MATCH && idx + char_len == len)) {
819
820 log_debug("end of partial or last char, prev start=%zd, prev len=%zu\n", prev_state.phrase_start, prev_state.phrase_len);
821
822 bool context_no_match = false;
823
824 bool is_last_char = idx + char_len == len;
825
826 transliteration_state_t match_candidate_state = state.state == TRANS_STATE_PARTIAL_MATCH ? state : prev_state;
827 if (state.state == TRANS_STATE_PARTIAL_MATCH) {
828 log_debug("state.state == TRANS_STATE_PARTIAL_MATCH\n");
829 }
830
831 context_result = context_match(trie, str, match_candidate_state);
832
833 if (context_result.node_id != NULL_NODE_ID) {
834 log_debug("Context match\n");
835 match_state = match_candidate_state;
836 match_state.state = TRANS_STATE_MATCH;
837 replacement = get_replacement(trie, context_result, str, match_state.phrase_start);
838 } else {
839 if (match_state.state == TRANS_STATE_MATCH) {
840 log_debug("Context no match and previous match\n");
841 replacement = get_replacement(trie, match_state.result, str, match_state.phrase_start);
842 if (state.state != TRANS_STATE_PARTIAL_MATCH) {
843 state.advance_index = false;
844 }
845 } else {
846 log_debug("Checking for no-context match\n");
847 set_match_if_any(trie, match_candidate_state, &match_state);
848 if (match_state.state != TRANS_STATE_MATCH && !match_candidate_state.in_set) {
849 log_debug("Trying set for match candidate\n");
850
851 transliteration_state_t match_prev_state = !is_last_char ? prev2_state : prev_state;
852
853 log_debug("idx = %zu, match_candidate_state.char_len = %zu\n", idx, match_candidate_state.char_len);
854
855 char_set_result_t char_result = next_prefix_or_set(trie, str + idx, match_candidate_state.char_len, match_prev_state.result, false, true);
856 log_debug("char_result.type = %d\n", char_result.type);
857 bool is_context = false;
858
859 match_candidate_state = state_from_char_result(char_result, idx, match_candidate_state.char_len, match_prev_state, is_context);
860 if (match_candidate_state.state == TRANS_STATE_PARTIAL_MATCH) {
861 log_debug("Got partial match for set check\n");
862 set_match_if_any(trie, match_candidate_state, &match_state);
863 if (match_state.state != TRANS_STATE_MATCH && !match_candidate_state.empty_transition) {
864 log_debug("match_state.state != TRANS_STATE_MATCH && !match_candidate_state.empty_transition\n");
865 prev_state = match_candidate_state;
866 }
867 }
868 }
869
870 if (match_state.state == TRANS_STATE_MATCH) {
871 log_debug("Match no context\n");
872 replacement = get_replacement(trie, match_state.result, str, match_state.phrase_start);
873 } else {
874
875 log_debug("Tried context for %s at char '%.*s', no match\n", str, (int)char_len, ptr);
876 context_no_match = true;
877 }
878 }
879
880 }
881
882 if (replacement != NULL) {
883 char *replacement_string = cstring_array_get_string(trans_table->replacement_strings, replacement->string_index);
884 char *revisit_string = NULL;
885 if (replacement->revisit_index != 0) {
886 log_debug("revisit_index = %d\n", replacement->revisit_index);
887 revisit_string = cstring_array_get_string(trans_table->revisit_strings, replacement->revisit_index);
888 }
889
890 bool free_revisit = false;
891 bool free_replacement = false;
892
893 if (replacement->groups != NULL) {
894 log_debug("Did groups, str=%s\n", str);
895 replacement_string = replace_groups(trie, str, replacement_string, replacement->groups, match_state);
896 free_replacement = (replacement_string != NULL);
897 if (revisit_string != NULL) {
898 log_debug("===Doing revisit\n");
899 revisit_string = replace_groups(trie, str, revisit_string, replacement->groups, match_state);
900 free_revisit = (revisit_string != NULL);
901 }
902 }
903
904 if (revisit_string != NULL) {
905 log_debug("revisit_string not null, %s\n", revisit_string);
906 size_t revisit_size = strlen(revisit_string) + len - idx;
907 if (revisit == NULL) {
908 revisit = char_array_new_size(revisit_size + 1);
909 } else {
910 log_debug("revisit not null\n");
911 char_array_clear(revisit);
912 }
913
914 char_array_cat(revisit, revisit_string);
915 char_array_cat_len(revisit, str + idx, len - idx);
916
917 idx = 0;
918 len = revisit_size;
919 str = char_array_get_string(revisit);
920 ptr = (uint8_t *)str;
921 log_debug("Switching to revisit=%s, size=%zu\n", str, revisit_size);
922 }
923
924 char_array_cat(new_str, replacement_string);
925 log_debug("Replacement = %s, revisit = %s\n", replacement_string, revisit_string);
926
927 if (free_replacement) {
928 free(replacement_string);
929 }
930 if (free_revisit) {
931 free(revisit_string);
932 }
933
934 match_state = TRANSLITERATION_DEFAULT_STATE;
935 }
936
937
938 if (context_no_match && !prev_state.empty_transition && prev_state.phrase_len > 0) {
939 log_debug("Previous phrase stays as is %.*s\n", (int)prev_state.phrase_len, str+prev_state.phrase_start);
940 char_array_cat_len(new_str, str + prev_state.phrase_start, prev_state.phrase_len);
941 state = start_state;
942 }
943
944 if (state.state == TRANS_STATE_BEGIN && !prev_state.empty_transition) {
945 log_debug("TRANS_STATE_BEGIN && !prev_state.empty_transition\n");
946 state.advance_index = false;
947 } else if (prev_state.empty_transition) {
948 log_debug("No replacement for %.*s\n", (int)char_len, ptr);
949 char_array_cat_len(new_str, str + idx, char_len);
950 }
951
952 state.advance_state = false;
953 prev_state = start_state;
954 } else if (state.state == TRANS_STATE_BEGIN && !in_repeat) {
955 log_debug("No replacement for %.*s\n", (int)char_len, ptr);
956 char_array_cat_len(new_str, str + idx, char_len);
957 prev_state = start_state;
958 state.advance_state = false;
959 } else if (state.repeat) {
960 log_debug("state.repeat\n");
961 in_repeat = true;
962 repeat_state_end = state;
963 state.advance_state = false;
964 } else if (state.empty_transition) {
965 log_debug("state.empty_transition\n");
966 state.advance_index = false;
967 } else if (state.state == TRANS_STATE_BEGIN && in_repeat && state.result.node_id == repeat_state_end.result.node_id) {
968 prev_state = repeat_state_end;
969 state.advance_index = false;
970 state.advance_state = false;
971 } else if (in_repeat) {
972 in_repeat = false;
973 state.advance_index = false;
974 state.advance_state = false;
975 }
976
977 log_debug("state.phrase_start = %zd, state.phrase_len=%zu\n", state.phrase_start, state.phrase_len);
978 if (state.advance_index) {
979 ptr += char_len;
980 idx += char_len;
981 }
982
983 if (state.advance_state) {
984 prev2_state = prev_state;
985 prev_state = state;
986 }
987
988 }
989
990 if (revisit != NULL) {
991 char_array_destroy(revisit);
992 }
993
994 log_debug("original_str=%s\n", original_str);
995
996 free(original_str);
997
998 str = char_array_to_string(new_str);
999
1000 log_debug("new_str = %s\n", str);
1001
1002 } else if (step->type == STEP_UNICODE_NORMALIZATION) {
1003 log_debug("unicode normalization\n");
1004 int utf8proc_options = UTF8PROC_OPTIONS_BASE;
1005 if (string_equals(step->name, NFD)) {
1006 utf8proc_options = UTF8PROC_OPTIONS_NFD;
1007 } else if (string_equals(step->name, NFC)) {
1008 utf8proc_options = UTF8PROC_OPTIONS_NFC;
1009 } else if (string_equals(step->name, NFKD)) {
1010 utf8proc_options = UTF8PROC_OPTIONS_NFKD;
1011 } else if (string_equals(step->name, NFKC)) {
1012 utf8proc_options = UTF8PROC_OPTIONS_NFKC;
1013 } else if (string_equals(step->name, STRIP_MARK)) {
1014 utf8proc_options = UTF8PROC_OPTIONS_STRIP_ACCENTS;
1015 }
1016
1017 uint8_t *utf8proc_normalized = NULL;
1018 utf8proc_map((uint8_t *)str, 0, &utf8proc_normalized, utf8proc_options);
1019 if (utf8proc_normalized != NULL) {
1020 char *old_str = str;
1021 str = (char *)utf8proc_normalized;
1022 log_debug("utf8proc_normalized=%s\n", utf8proc_normalized);
1023 len = strlen(str);
1024 free(old_str);
1025 }
1026 log_debug("Got unicode normalization step, new str=%s, len=%lu\n", str, strlen(str));
1027 } else if (step->type == STEP_TRANSFORM) {
1028 // Recursive call here shouldn't hurt too much, happens in only a few languages and only 2-3 calls deep
1029 log_debug("Got STEP_TYPE_TRANSFORM, step=%s\n", step_name);
1030 char *old_str = str;
1031 str = transliterate(step_name, str, strlen(str));
1032 log_debug("Transform result = %s\n", str);
1033 log_debug("str = %s\n", str);
1034 len = strlen(str);
1035 free(old_str);
1036 }
1037
1038 }
1039
1040 return str;
1041
1042 }
1043
transliteration_table_destroy(void)1044 void transliteration_table_destroy(void) {
1045 transliteration_table_t *trans_table = get_transliteration_table();
1046 if (trans_table == NULL) return;
1047 if (trans_table->trie) {
1048 trie_destroy(trans_table->trie);
1049 }
1050
1051 if (trans_table->transliterators) {
1052 transliterator_t *trans;
1053 kh_foreach_value(trans_table->transliterators, trans, {
1054 transliterator_destroy(trans);
1055 })
1056
1057 kh_destroy(str_transliterator, trans_table->transliterators);
1058 }
1059
1060 if (trans_table->script_languages) {
1061 kh_destroy(script_language_index, trans_table->script_languages);
1062 }
1063
1064 if (trans_table->transliterator_names) {
1065 cstring_array_destroy(trans_table->transliterator_names);
1066 }
1067
1068 if (trans_table->steps) {
1069 step_array_destroy(trans_table->steps);
1070 }
1071
1072 if (trans_table->replacements) {
1073 transliteration_replacement_array_destroy(trans_table->replacements);
1074 }
1075
1076 if (trans_table->replacement_strings) {
1077 cstring_array_destroy(trans_table->replacement_strings);
1078 }
1079
1080 if (trans_table->revisit_strings) {
1081 cstring_array_destroy(trans_table->revisit_strings);
1082 }
1083
1084 free(trans_table);
1085 }
1086
1087
transliteration_table_init(void)1088 transliteration_table_t *transliteration_table_init(void) {
1089 transliteration_table_t *trans_table = get_transliteration_table();
1090
1091 if (trans_table == NULL) {
1092 trans_table = calloc(1, sizeof(transliteration_table_t));
1093
1094 trans_table->trie = trie_new();
1095 if (trans_table->trie == NULL) {
1096 goto exit_trans_table_created;
1097 }
1098
1099 trans_table->transliterators = kh_init(str_transliterator);
1100 if (trans_table->transliterators == NULL) {
1101 goto exit_trans_table_created;
1102 }
1103
1104 trans_table->script_languages = kh_init(script_language_index);
1105 if (trans_table->script_languages == NULL) {
1106 goto exit_trans_table_created;
1107 }
1108
1109 trans_table->transliterator_names = cstring_array_new();
1110 if (trans_table->transliterator_names == NULL) {
1111 goto exit_trans_table_created;
1112 }
1113
1114 trans_table->steps = step_array_new();
1115 if (trans_table->steps == NULL) {
1116 goto exit_trans_table_created;
1117 }
1118
1119 trans_table->replacements = transliteration_replacement_array_new();
1120 if (trans_table->replacements == NULL) {
1121 goto exit_trans_table_created;
1122 }
1123
1124 trans_table->replacement_strings = cstring_array_new();
1125 if (trans_table->replacement_strings == NULL) {
1126 goto exit_trans_table_created;
1127 }
1128
1129 trans_table->revisit_strings = cstring_array_new();
1130 if (trans_table->revisit_strings == NULL) {
1131 goto exit_trans_table_created;
1132 }
1133
1134 }
1135
1136 return trans_table;
1137
1138 exit_trans_table_created:
1139 transliteration_table_destroy();
1140 exit(1);
1141 }
1142
transliteration_table_new(void)1143 transliteration_table_t *transliteration_table_new(void) {
1144 transliteration_table_t *trans_table = transliteration_table_init();
1145 if (trans_table != NULL) {
1146 cstring_array_add_string(trans_table->replacement_strings, "");
1147 cstring_array_add_string(trans_table->revisit_strings, "");
1148 }
1149 return trans_table;
1150 }
1151
transliteration_step_new(char * name,step_type_t type)1152 transliteration_step_t *transliteration_step_new(char *name, step_type_t type) {
1153 transliteration_step_t *self = malloc(sizeof(transliteration_step_t));
1154
1155 if (self == NULL) {
1156 return NULL;
1157 }
1158
1159 self->name = strdup(name);
1160 if (self->name == NULL) {
1161 transliteration_step_destroy(self);
1162 }
1163
1164 self->type = type;
1165 return self;
1166 }
1167
1168
/*
 * Frees a transliteration step together with the name string it owns.
 * Passing NULL is allowed and does nothing.
 */
void transliteration_step_destroy(transliteration_step_t *self) {
    if (self != NULL) {
        // free(NULL) is a no-op, so an unset name needs no separate check.
        free(self->name);
        free(self);
    }
}
1180
1181
transliteration_replacement_new(uint32_t string_index,uint32_t revisit_index,group_capture_array * groups)1182 transliteration_replacement_t *transliteration_replacement_new(uint32_t string_index, uint32_t revisit_index, group_capture_array *groups) {
1183 transliteration_replacement_t *replacement = malloc(sizeof(transliteration_replacement_t));
1184
1185 if (replacement == NULL) {
1186 return NULL;
1187 }
1188
1189 replacement->num_groups = groups == NULL ? 0 : groups->n;
1190 replacement->groups = groups;
1191
1192 replacement->string_index = string_index;
1193 replacement->revisit_index = revisit_index;
1194 return replacement;
1195
1196 }
1197
/*
 * Frees a replacement record and, if present, its capture-group array.
 * Passing NULL is allowed and does nothing.
 */
void transliteration_replacement_destroy(transliteration_replacement_t *self) {
    if (self != NULL) {
        if (self->groups != NULL) {
            group_capture_array_destroy(self->groups);
        }
        free(self);
    }
}
1207
/*
 * Registers `trans` in the module-level table's name -> transliterator hash,
 * keyed by trans->name (the key pointer is the transliterator's own string).
 *
 * Returns false when no table exists or when kh_put reports an error
 * (negative status); true once the value has been stored.
 */
bool transliteration_table_add_transliterator(transliterator_t *trans) {
    if (trans_table == NULL) {
        return false;
    }

    int put_status;
    khiter_t slot = kh_put(str_transliterator, trans_table->transliterators, trans->name, &put_status);

    if (put_status >= 0) {
        kh_value(trans_table->transliterators, slot) = trans;
        return true;
    }

    return false;
}
1220
/*
 * Stores `index` under the {script, language} key in the module-level
 * table's script/language hash.
 *
 * Returns false when no table exists or when kh_put reports an error
 * (negative status); true once the value has been stored.
 */
bool transliteration_table_add_script_language(script_language_t script_language, transliterator_index_t index) {
    if (trans_table == NULL) {
        return false;
    }

    int put_status;
    khiter_t slot = kh_put(script_language_index, trans_table->script_languages, script_language, &put_status);

    if (put_status >= 0) {
        kh_value(trans_table->script_languages, slot) = index;
        return true;
    }

    return false;
}
1233
get_transliterator_index_for_script_language(script_t script,char * language)1234 transliterator_index_t get_transliterator_index_for_script_language(script_t script, char *language) {
1235 if (trans_table == NULL || language == NULL || strlen(language) >= MAX_LANGUAGE_LEN) {
1236 return NULL_TRANSLITERATOR_INDEX;
1237 }
1238
1239 script_language_t script_lang;
1240 script_lang.script = script;
1241 strcpy(script_lang.language, language);
1242
1243 khiter_t k;
1244 k = kh_get(script_language_index, trans_table->script_languages, script_lang);
1245 return (k != kh_end(trans_table->script_languages)) ? kh_value(trans_table->script_languages, k) : NULL_TRANSLITERATOR_INDEX;
1246 }
1247
1248
/*
 * Repeatedly replaces every phrase that trie_search() finds in the string
 * with the replacement string stored at index phrase.data in `replacements`,
 * until a search finds nothing.
 *
 * trie:         phrase trie to search with.
 * replacements: string array indexed by phrase.data.
 * input:        NUL-terminated string; never modified or freed here.
 *
 * Returns `input` itself when nothing matched, otherwise a newly allocated
 * string (intermediate rounds are freed here).
 *
 * Fixes relative to the earlier version:
 *   - text between phrases is copied from the string that was searched
 *     (`current`), not always from the original `input`;
 *   - the tail is copied from the end of the last phrase, so the matched
 *     text is no longer appended again after its replacement;
 *   - the working length is refreshed after every round.
 *
 * NOTE(review): the phrase array returned by trie_search() is never released
 * here - confirm ownership and release it if this function owns it.
 * NOTE(review): assumes phrases are in ascending order and non-overlapping,
 * and that replacements do not themselves keep matching forever - confirm.
 */
char *transliterator_replace_strings(trie_t *trie, cstring_array *replacements, char *input) {
    char *current = input;
    bool is_original = true;

    size_t len = strlen(input);

    // We may go through several rounds of replacements
    while (1) {
        phrase_array *phrases = trie_search(trie, current);
        if (!phrases) {
            break;
        }

        char_array *str = char_array_new_size(len);
        size_t start = 0;

        for (size_t i = 0; i < phrases->n; i++) {
            phrase_t phrase = phrases->a[i];

            // Untouched text between the previous phrase and this one.
            char_array_append_len(str, current + start, phrase.start - start);
            char_array_append(str, cstring_array_get_string(replacements, phrase.data));

            start = phrase.start + phrase.len;
        }

        // Whatever follows the last replaced phrase.
        char_array_append_len(str, current + start, len - start);
        char_array_terminate(str);

        if (!is_original) {
            free(current);
        }

        // Destroys the char array itself, but not the string it holds
        current = char_array_to_string(str);
        is_original = false;

        if (current == NULL) {
            break;
        }

        len = strlen(current);
    }

    return current;
}
1290
transliterator_read(FILE * f)1291 transliterator_t *transliterator_read(FILE *f) {
1292 uint64_t trans_name_len;
1293
1294 if (!file_read_uint64(f, &trans_name_len)) {
1295 return NULL;
1296 }
1297
1298 char *name = malloc(trans_name_len);
1299 if (name == NULL) {
1300 return NULL;
1301 }
1302
1303 if (!file_read_chars(f, name, trans_name_len)) {
1304 return NULL;
1305 }
1306
1307 bool internal;
1308 if (!file_read_uint8(f, (uint8_t *)&internal)) {
1309 return NULL;
1310 }
1311
1312 uint32_t steps_index;
1313
1314 if (!file_read_uint32(f, &steps_index)) {
1315 return NULL;
1316 }
1317
1318
1319 uint32_t steps_length;
1320
1321 if (!file_read_uint32(f, &steps_length)) {
1322 return NULL;
1323 }
1324
1325 transliterator_t *trans = transliterator_new(name, internal, steps_index, steps_length);
1326 return trans;
1327 }
1328
/*
 * Writes one transliterator record to `f`: uint64 name length (including the
 * NUL byte), the name bytes, uint8 internal flag, uint32 steps_index and
 * uint32 steps_length, in that order.
 *
 * Stops at the first failed write. Returns true only if every field was
 * written.
 */
bool transliterator_write(transliterator_t *trans, FILE *f) {
    // Include the NUL byte
    size_t name_bytes = strlen(trans->name) + 1;

    return file_write_uint64(f, (uint64_t)name_bytes) &&
           file_write_chars(f, trans->name, name_bytes) &&
           file_write_uint8(f, trans->internal) &&
           file_write_uint32(f, trans->steps_index) &&
           file_write_uint32(f, (uint32_t)trans->steps_length);
}
1350
transliteration_step_read(FILE * f)1351 transliteration_step_t *transliteration_step_read(FILE *f) {
1352 uint64_t step_name_len;
1353
1354 log_debug("reading step\n");;
1355
1356 transliteration_step_t *step = malloc(sizeof(transliteration_step_t));
1357 if (step == NULL) {
1358 return NULL;
1359 }
1360
1361 if (!file_read_uint32(f, &step->type)) {
1362 goto exit_step_destroy;
1363 }
1364 if (!file_read_uint64(f, &step_name_len)) {
1365 goto exit_step_destroy;
1366 }
1367
1368 char *name = malloc(step_name_len);
1369 if (name == NULL) {
1370 goto exit_step_destroy;
1371 }
1372
1373 if (!file_read_chars(f, name, step_name_len)) {
1374 free(name);
1375 goto exit_step_destroy;
1376 }
1377 step->name = name;
1378
1379 return step;
1380
1381 exit_step_destroy:
1382 free(step);
1383 return NULL;
1384 }
1385
/*
 * Writes one step record to `f`: uint32 type, uint64 name length, then the
 * name bytes. Returns true only if all three writes succeed.
 */
bool transliteration_step_write(transliteration_step_t *step, FILE *f) {
    if (!file_write_uint32(f, step->type)) {
        return false;
    }

    // Include the NUL byte
    size_t name_bytes = strlen(step->name) + 1;

    if (!file_write_uint64(f, (uint64_t)name_bytes)) {
        return false;
    }

    return file_write_chars(f, step->name, name_bytes);
}
1401
/*
 * Reads a capture group (uint64 start, uint64 len) from `f` into `group`.
 *
 * Returns false on a short read. Note that group->start has already been
 * overwritten if only the second read fails.
 */
bool group_capture_read(FILE *f, group_capture_t *group) {
    uint64_t raw_start;
    uint64_t raw_len;

    if (file_read_uint64(f, &raw_start)) {
        group->start = (size_t)raw_start;

        if (file_read_uint64(f, &raw_len)) {
            group->len = (size_t)raw_len;
            return true;
        }
    }

    return false;
}
1418
/*
 * Writes a capture group to `f` as two uint64 values: start, then len.
 * Returns true only if both writes succeed; len is not written if start
 * fails.
 */
bool group_capture_write(group_capture_t group, FILE *f) {
    if (!file_write_uint64(f, (uint64_t)group.start)) {
        return false;
    }

    if (!file_write_uint64(f, (uint64_t)group.len)) {
        return false;
    }

    return true;
}
1427
transliteration_replacement_read(FILE * f)1428 transliteration_replacement_t *transliteration_replacement_read(FILE *f) {
1429 uint32_t string_index;
1430
1431 if (!file_read_uint32(f, &string_index)) {
1432 return NULL;
1433 }
1434
1435 uint32_t revisit_index;
1436
1437 if (!file_read_uint32(f, &revisit_index)) {
1438 return NULL;
1439 }
1440
1441 uint64_t num_groups;
1442
1443 if (!file_read_uint64(f, &num_groups)) {
1444 return NULL;
1445 }
1446
1447 group_capture_array *groups = NULL;
1448
1449 if (num_groups > 0) {
1450 groups = group_capture_array_new_size((size_t)num_groups);
1451 group_capture_t group;
1452 for (size_t i = 0; i < (size_t)num_groups; i++) {
1453 if (!group_capture_read(f, &group)) {
1454 group_capture_array_destroy(groups);
1455 return NULL;
1456 }
1457 group_capture_array_push(groups, group);
1458 }
1459
1460 }
1461
1462
1463 return transliteration_replacement_new(string_index, revisit_index, groups);
1464 }
1465
/*
 * Writes one replacement record to `f`: uint32 string_index, uint32
 * revisit_index, uint64 group count, followed by each capture group.
 *
 * Stops at the first failed write and returns false; returns true when the
 * whole record has been written.
 */
bool transliteration_replacement_write(transliteration_replacement_t *replacement, FILE *f) {
    bool header_ok = file_write_uint32(f, replacement->string_index) &&
                     file_write_uint32(f, replacement->revisit_index) &&
                     file_write_uint64(f, replacement->num_groups);

    if (!header_ok) {
        return false;
    }

    size_t g = 0;
    while (g < replacement->num_groups) {
        if (!group_capture_write(replacement->groups->a[g], f)) {
            return false;
        }
        g++;
    }

    return true;
}
1491
transliteration_table_read(FILE * f)1492 bool transliteration_table_read(FILE *f) {
1493 if (f == NULL) {
1494 return false;
1495 }
1496
1497 uint32_t signature;
1498
1499 log_debug("Reading signature\n");
1500
1501 if (!file_read_uint32(f, &signature) || signature != TRANSLITERATION_TABLE_SIGNATURE) {
1502 return false;
1503 }
1504
1505 trans_table = transliteration_table_init();
1506
1507 log_debug("Table initialized\n");
1508
1509 uint64_t num_transliterators = 0;
1510
1511 if (!file_read_uint64(f, &num_transliterators)) {
1512 goto exit_trans_table_load_error;
1513 }
1514
1515
1516 log_debug("num_transliterators = %zu\n", (size_t)num_transliterators);
1517
1518 size_t i;
1519
1520 transliterator_t *trans;
1521
1522 for (i = 0; i < (size_t)num_transliterators; i++) {
1523 trans = transliterator_read(f);
1524 if (trans == NULL) {
1525 log_error("trans was NULL\n");
1526 goto exit_trans_table_load_error;
1527 } else {
1528 log_debug("read trans with name: %s\n", trans->name);
1529 }
1530 if (!transliteration_table_add_transliterator(trans)) {
1531 goto exit_trans_table_load_error;
1532 }
1533 }
1534
1535 log_debug("Read transliterators\n");
1536
1537 uint64_t num_script_languages;
1538 if (!file_read_uint64(f, &num_script_languages)) {
1539 goto exit_trans_table_load_error;
1540 }
1541
1542 log_debug("num_script_languages = %zu\n", (size_t)num_script_languages);
1543
1544 script_language_t script_language;
1545 transliterator_index_t index;
1546
1547 uint64_t language_len = 0;
1548 char language[MAX_LANGUAGE_LEN] = "";
1549
1550 uint64_t transliterator_index = 0;
1551 uint64_t index_num_transliterators = 0;
1552
1553 for (i = 0; i < num_script_languages; i++) {
1554 if (!file_read_uint32(f, (uint32_t *)&script_language.script)) {
1555 goto exit_trans_table_load_error;
1556 }
1557
1558 if (!file_read_uint64(f, &language_len) || language_len >= MAX_LANGUAGE_LEN) {
1559 goto exit_trans_table_load_error;
1560 }
1561
1562 if (language_len == 0) {
1563 script_language.language[0] = '\0';
1564 } else if (!file_read_chars(f, (char *)language, (size_t)language_len)) {
1565 goto exit_trans_table_load_error;
1566 } else {
1567 strcpy(script_language.language, language);
1568 }
1569
1570 if (!file_read_uint64(f, &transliterator_index)) {
1571 goto exit_trans_table_load_error;
1572 }
1573
1574 index.transliterator_index = (size_t)transliterator_index;
1575
1576 if (!file_read_uint64(f, &index_num_transliterators)) {
1577 goto exit_trans_table_load_error;
1578 }
1579
1580 index.num_transliterators = (size_t)index_num_transliterators;
1581
1582 log_debug("Adding script language key={%d, %s}, value={%zu, %zu}\n", script_language.script, script_language.language, index.transliterator_index, index.num_transliterators);
1583
1584 transliteration_table_add_script_language(script_language, index);
1585 }
1586
1587 uint64_t trans_table_num_strings;
1588
1589 if (!file_read_uint64(f, &trans_table_num_strings)) {
1590 goto exit_trans_table_load_error;
1591 }
1592
1593 log_debug("trans_table_num_strings=%zu\n", (size_t)trans_table_num_strings);
1594
1595 uint64_t trans_name_str_len;
1596
1597 if (!file_read_uint64(f, &trans_name_str_len)) {
1598 goto exit_trans_table_load_error;
1599 }
1600
1601 log_debug("Creating char_array with size=%zu\n", (size_t)trans_name_str_len);
1602
1603 char_array *array = char_array_new_size((size_t)trans_name_str_len);
1604
1605 if (!file_read_chars(f, array->a, (size_t)trans_name_str_len)) {
1606 goto exit_trans_table_load_error;
1607 }
1608
1609 array->n = trans_name_str_len;
1610
1611 cstring_array_destroy(trans_table->transliterator_names);
1612 log_debug("Destroyed current cstring_array\n");
1613
1614 log_debug("char_array len=%zu\n", array->n);
1615
1616 trans_table->transliterator_names = cstring_array_from_char_array(array);
1617 log_debug("Set trans_table->transliterator_names\n");
1618
1619 if (cstring_array_num_strings(trans_table->transliterator_names) != trans_table_num_strings) {
1620 goto exit_trans_table_load_error;
1621 }
1622
1623 uint64_t num_steps;
1624
1625 if (!file_read_uint64(f, &num_steps)) {
1626 goto exit_trans_table_load_error;
1627 }
1628
1629 log_debug("num_steps = %zu\n", (size_t)num_steps);
1630
1631 if (!step_array_resize(trans_table->steps, (size_t)num_steps)) {
1632 goto exit_trans_table_load_error;
1633 }
1634
1635 log_debug("resized\n");
1636
1637 transliteration_step_t *step;
1638
1639 for (i = 0; i < num_steps; i++) {
1640 step = transliteration_step_read(f);
1641 if (step == NULL) {
1642 goto exit_trans_table_load_error;
1643 }
1644 log_debug("Read step with name %s and type %d\n", step->name, step->type);
1645 step_array_push(trans_table->steps, step);
1646 }
1647
1648 log_debug("Done with steps\n");
1649
1650 transliteration_replacement_t *replacement;
1651
1652 uint64_t num_replacements;
1653
1654 if (!file_read_uint64(f, &num_replacements)) {
1655 goto exit_trans_table_load_error;
1656 }
1657
1658 log_debug("num_replacements = %zu\n", (size_t)num_replacements);
1659
1660 if (!transliteration_replacement_array_resize(trans_table->replacements, (size_t)num_replacements)) {
1661 goto exit_trans_table_load_error;
1662 }
1663
1664 log_debug("resized\n");
1665
1666 for (i = 0; i < num_replacements; i++) {
1667 replacement = transliteration_replacement_read(f);
1668 if (replacement == NULL) {
1669 goto exit_trans_table_load_error;
1670 }
1671 transliteration_replacement_array_push(trans_table->replacements, replacement);
1672 }
1673
1674 log_debug("Done with replacements\n");
1675
1676 uint64_t num_replacement_tokens;
1677
1678 if (!file_read_uint64(f, &num_replacement_tokens)) {
1679 goto exit_trans_table_load_error;
1680 }
1681
1682 log_debug("num_replacement_tokens = %zu\n", (size_t)num_replacement_tokens);
1683
1684 if (!uint32_array_resize(trans_table->replacement_strings->indices, (size_t)num_replacement_tokens)) {
1685 goto exit_trans_table_load_error;
1686 }
1687
1688 log_debug("resized\n");
1689
1690 uint32_t token_index;
1691
1692 for (i = 0; i < num_replacement_tokens; i++) {
1693 if (!file_read_uint32(f, &token_index)) {
1694 goto exit_trans_table_load_error;
1695 }
1696 uint32_array_push(trans_table->replacement_strings->indices, token_index);
1697 }
1698
1699 log_debug("Done with replacement token indices\n");
1700
1701 uint64_t replacement_strings_len;
1702
1703 if (!file_read_uint64(f, &replacement_strings_len)) {
1704 goto exit_trans_table_load_error;
1705 }
1706
1707 log_debug("replacement_strings_len = %zu\n", (size_t)replacement_strings_len);
1708
1709 if (!char_array_resize(trans_table->replacement_strings->str, (size_t)replacement_strings_len)) {
1710 goto exit_trans_table_load_error;
1711 }
1712
1713 log_debug("resized\n");
1714
1715 if (!file_read_chars(f, trans_table->replacement_strings->str->a, (size_t)replacement_strings_len)) {
1716 goto exit_trans_table_load_error;
1717 }
1718
1719 log_debug("Read replacement_strings\n");
1720
1721 trans_table->replacement_strings->str->n = replacement_strings_len;
1722
1723 uint64_t num_revisit_tokens;
1724
1725 if (!file_read_uint64(f, &num_revisit_tokens)) {
1726 goto exit_trans_table_load_error;
1727 }
1728
1729 log_debug("num_revisit_tokens = %zu\n", (size_t)num_revisit_tokens);
1730
1731 if (!uint32_array_resize(trans_table->revisit_strings->indices, (size_t)num_revisit_tokens)) {
1732 goto exit_trans_table_load_error;
1733 }
1734
1735 log_debug("resized\n");
1736
1737 for (i = 0; i < num_revisit_tokens; i++) {
1738 if (!file_read_uint32(f, &token_index)) {
1739 goto exit_trans_table_load_error;
1740 }
1741 uint32_array_push(trans_table->revisit_strings->indices, token_index);
1742 }
1743
1744 log_debug("Done with revisit token indices\n");
1745
1746 uint64_t revisit_strings_len = 0;
1747
1748 if (!file_read_uint64(f, &revisit_strings_len)) {
1749 goto exit_trans_table_load_error;
1750 }
1751
1752 log_debug("revisit_strings_len = %zu\n", (size_t)revisit_strings_len);
1753
1754 if (!char_array_resize(trans_table->revisit_strings->str, (size_t)revisit_strings_len)) {
1755 goto exit_trans_table_load_error;
1756 }
1757
1758 log_debug("resized\n");
1759
1760 if (!file_read_chars(f, trans_table->revisit_strings->str->a, (size_t)revisit_strings_len)) {
1761 goto exit_trans_table_load_error;
1762 }
1763
1764 log_debug("Read revisit_strings\n");
1765
1766 trans_table->revisit_strings->str->n = revisit_strings_len;
1767
1768 // Free the default trie
1769 trie_destroy(trans_table->trie);
1770
1771 trans_table->trie = trie_read(f);
1772 log_debug("Read trie\n");
1773 if (trans_table->trie == NULL) {
1774 goto exit_trans_table_load_error;
1775 }
1776
1777 return true;
1778
1779 exit_trans_table_load_error:
1780 transliteration_table_destroy();
1781 return false;
1782 }
1783
transliteration_table_write(FILE * f)1784 bool transliteration_table_write(FILE *f) {
1785 if (f == NULL) {
1786 return false;
1787 }
1788
1789 transliterator_t *trans;
1790
1791 if (!file_write_uint32(f, TRANSLITERATION_TABLE_SIGNATURE)) {
1792 return false;
1793 }
1794
1795 size_t num_transliterators = kh_size(trans_table->transliterators);
1796
1797 if (!file_write_uint64(f, (uint64_t)num_transliterators)) {
1798 return false;
1799 }
1800
1801 kh_foreach_value(trans_table->transliterators, trans, {
1802 if (!transliterator_write(trans, f)) {
1803 return false;
1804 }
1805 })
1806
1807 size_t i;
1808
1809 size_t num_script_languages = kh_size(trans_table->script_languages);
1810
1811 if (!file_write_uint64(f, (uint64_t)num_script_languages)) {
1812 return false;
1813 }
1814
1815 script_language_t script_language;
1816 transliterator_index_t index;
1817
1818 kh_foreach(trans_table->script_languages, script_language, index, {
1819 if (!file_write_uint32(f, (uint32_t)script_language.script)) {
1820 return false;
1821 }
1822
1823 size_t language_len = strlen(script_language.language);
1824
1825 if (!file_write_uint64(f, (uint64_t)language_len)) {
1826 return false;
1827 }
1828
1829 if (language_len > 0 && !file_write_chars(f, script_language.language, language_len)) {
1830 return false;
1831 }
1832
1833 if (!file_write_uint64(f, (uint64_t)index.transliterator_index)) {
1834 return false;
1835 }
1836
1837 if (!file_write_uint64(f, (uint64_t)index.num_transliterators)) {
1838 return false;
1839 }
1840 })
1841
1842 size_t num_trans_names = trans_table->transliterator_names->indices->n;
1843
1844 if (!file_write_uint64(f, (uint64_t)num_trans_names)) {
1845 return false;
1846 }
1847
1848 size_t trans_names_str_len = trans_table->transliterator_names->str->n;
1849
1850 if (!file_write_uint64(f, (uint64_t)trans_names_str_len)) {
1851 return false;
1852 }
1853
1854 if (!file_write_chars(f, trans_table->transliterator_names->str->a, trans_names_str_len)) {
1855 return false;
1856 }
1857
1858 transliteration_step_t *step;
1859
1860
1861 size_t num_steps = trans_table->steps->n;
1862
1863 if (!file_write_uint64(f, num_steps)) {
1864 return false;
1865 }
1866
1867 for (i = 0; i < num_steps; i++) {
1868 step = trans_table->steps->a[i];
1869 if (!transliteration_step_write(step, f)) {
1870 return false;
1871 }
1872 }
1873
1874 size_t num_replacements = trans_table->replacements->n;
1875
1876 if (!file_write_uint64(f, num_replacements)) {
1877 return false;
1878 }
1879
1880 transliteration_replacement_t *replacement;
1881
1882 for (i = 0; i < trans_table->replacements->n; i++) {
1883 replacement = trans_table->replacements->a[i];
1884 if (!transliteration_replacement_write(replacement, f)) {
1885 return false;
1886 }
1887 }
1888
1889 size_t replacement_tokens_len = trans_table->replacement_strings->indices->n;
1890
1891 if (!file_write_uint64(f, replacement_tokens_len)) {
1892 return false;
1893 }
1894
1895 for (i = 0; i < replacement_tokens_len; i++) {
1896 if (!file_write_uint32(f, trans_table->replacement_strings->indices->a[i])) {
1897 return false;
1898 }
1899 }
1900
1901 size_t replacement_strings_len = trans_table->replacement_strings->str->n;
1902
1903 if (!file_write_uint64(f, replacement_strings_len)) {
1904 return false;
1905 }
1906
1907 if (!file_write_chars(f, trans_table->replacement_strings->str->a, replacement_strings_len)) {
1908 return false;
1909 }
1910
1911 size_t revisit_tokens_len = trans_table->revisit_strings->indices->n;
1912
1913 log_debug("revisit_tokens_len=%zu\n", revisit_tokens_len);
1914
1915 if (!file_write_uint64(f, revisit_tokens_len)) {
1916 return false;
1917 }
1918
1919 for (i = 0; i < revisit_tokens_len; i++) {
1920 if (!file_write_uint32(f, trans_table->revisit_strings->indices->a[i])) {
1921 return false;
1922 }
1923 }
1924
1925 size_t revisit_strings_len = trans_table->revisit_strings->str->n;
1926
1927 if (!file_write_uint64(f, revisit_strings_len)) {
1928 return false;
1929 }
1930
1931 if (!file_write_chars(f, trans_table->revisit_strings->str->a, revisit_strings_len)) {
1932 return false;
1933 }
1934
1935 if (!trie_write(trans_table->trie, f)) {
1936 return false;
1937 }
1938
1939 return true;
1940
1941 }
1942
transliteration_table_load(char * filename)1943 bool transliteration_table_load(char *filename) {
1944 if (filename == NULL || trans_table != NULL) {
1945 return false;
1946 }
1947
1948 FILE *f;
1949
1950 if ((f = fopen(filename, "rb")) != NULL) {
1951 bool ret = transliteration_table_read(f);
1952 fclose(f);
1953 return ret;
1954 } else {
1955 return false;
1956 }
1957 }
1958
1959
transliteration_table_save(char * filename)1960 bool transliteration_table_save(char *filename) {
1961 if (trans_table == NULL || filename == NULL) {
1962 return false;
1963 }
1964
1965 FILE *f;
1966
1967 if ((f = fopen(filename, "wb")) != NULL) {
1968 bool ret = transliteration_table_write(f);
1969 fclose(f);
1970 return ret;
1971 } else {
1972 return false;
1973 }
1974
1975 }
1976
transliteration_module_init(void)1977 bool transliteration_module_init(void) {
1978 trans_table = transliteration_table_new();
1979 return trans_table != NULL;
1980 }
1981
transliteration_module_setup(char * filename)1982 bool transliteration_module_setup(char *filename) {
1983 if (trans_table == NULL) {
1984 return transliteration_table_load(filename == NULL ? DEFAULT_TRANSLITERATION_PATH : filename);
1985 }
1986
1987 return true;
1988 }
1989
1990
transliteration_module_teardown(void)1991 void transliteration_module_teardown(void) {
1992 transliteration_table_destroy();
1993 trans_table = NULL;
1994 }
1995
1996