1 #ifndef TRANSLITERATE_H
2 #define TRANSLITERATE_H
3 
4 #include <stdlib.h>
5 #include <stdint.h>
6 #include <stdbool.h>
7 
8 #include "collections.h"
9 #include "constants.h"
10 #include "klib/khash.h"
11 #include "string_utils.h"
12 #include "trie.h"
13 #include "trie_search.h"
14 #include "unicode_scripts.h"
15 
// String identifiers defined by this header
// (presumably names of built-in transliterators accepted by transliterate() — confirm in the .c file).
#define LATIN_ASCII         "latin-ascii"
#define LATIN_ASCII_SIMPLE  "latin-ascii-simple"
#define HTML_ESCAPE         "html-escape"

// File name of the transliteration data and the default path to it.
// The path is formed by placing LIBPOSTAL_TRANSLITERATION_DIR, PATH_SEPARATOR and the
// file name next to each other (adjacent string literals concatenate, assuming the
// first two are string-literal macros defined elsewhere).
#define TRANSLITERATION_DATA_FILE "transliteration.dat"
#define DEFAULT_TRANSLITERATION_PATH \
    LIBPOSTAL_TRANSLITERATION_DIR    \
    PATH_SEPARATOR                   \
    TRANSLITERATION_DATA_FILE

// Upper bound associated with transliterator names
// (NOTE(review): whether this counts the NUL terminator is not visible here).
#define MAX_TRANS_NAME_LEN 100
24 
// Kind of a single transliteration step. The numeric values are the
// default enumerator values (0, 1, 2), spelled out explicitly.
typedef enum {
    STEP_RULESET = 0,
    STEP_TRANSFORM = 1,
    STEP_UNICODE_NORMALIZATION = 2
} step_type_t;

// One step of a transliterator: its kind plus a name string.
struct transliteration_step {
    step_type_t type;   // which kind of step this is
    char *name;         // step name (ownership is decided by the implementation file)
};
typedef struct transliteration_step transliteration_step_t;

// Constructor / destructor pair for transliteration_step_t (defined in the implementation file).
transliteration_step_t *transliteration_step_new(char *name, step_type_t type);
void transliteration_step_destroy(transliteration_step_t *self);
38 
39 VECTOR_INIT_FREE_DATA(step_array, transliteration_step_t *, transliteration_step_destroy)
40 
// Description of one named transliterator.
struct transliterator {
    char *name;             // transliterator name
    uint8_t internal;       // flag byte (presumably non-zero for internal-only transliterators — confirm)
    uint32_t steps_index;   // start index of its steps (presumably into transliteration_table_t.steps — confirm)
    size_t steps_length;    // number of steps
};
typedef struct transliterator transliterator_t;
47 
// Limit associated with capture groups
// (NOTE(review): the exact quantity being bounded is not visible in this header).
#define MAX_GROUP_LEN 5

// A captured group expressed as a start offset and a length.
struct group_capture {
    size_t start;   // offset where the group begins
    size_t len;     // length of the group
};
typedef struct group_capture group_capture_t;
54 
55 VECTOR_INIT(group_capture_array, group_capture_t)
56 
57 typedef struct transliteration_replacement {
58     uint32_t string_index;
59     uint32_t revisit_index;
60     size_t num_groups;
61     group_capture_array *groups;
62 } transliteration_replacement_t;
63 
64 transliteration_replacement_t *transliteration_replacement_new(
65     uint32_t string_index,
66     uint32_t revisit_index,
67     group_capture_array *groups
68 );
69 
70 void transliteration_replacement_destroy(transliteration_replacement_t *self);
71 
72 VECTOR_INIT_FREE_DATA(transliteration_replacement_array, transliteration_replacement_t *, transliteration_replacement_destroy)
73 
74 KHASH_MAP_INIT_STR(str_transliterator, transliterator_t *)
75 
// Hash and equality callbacks for keys that have a `script` and a `language` member.
//
// The hash deliberately accepts a NULL language (it contributes 0), so equality has to
// accept it too: passing NULL to strcmp is undefined behavior. Two keys with the same
// script compare equal when both languages are NULL or when both are non-NULL and
// strcmp-equal; a NULL language never equals a non-NULL one.
// Both macros evaluate their arguments more than once; pass side-effect-free expressions.
#define kh_script_lang_hash(key)  ((khint_t)(key).script ^ (((key).language == NULL) ? 0 : kh_str_hash_func((key).language)))
#define kh_script_lang_equal(a, b)  (((a).script == (b).script) &&          \
    (((a).language == NULL || (b).language == NULL)                         \
        ? ((a).language == (b).language)                                    \
        : (strcmp((a).language, (b).language) == 0)))
78 
// A contiguous range of transliterators: first index plus count.
struct transliterator_index {
    size_t transliterator_index;    // index of the first transliterator in the range
    size_t num_transliterators;     // number of transliterators in the range
};
typedef struct transliterator_index transliterator_index_t;

// Empty range: both members zero.
#define NULL_TRANSLITERATOR_INDEX (transliterator_index_t) {.transliterator_index = 0, .num_transliterators = 0}
85 
86 KHASH_INIT(script_language_index, script_language_t, transliterator_index_t, 1, kh_script_lang_hash, kh_script_lang_equal)
87 
88 typedef struct transliteration_table {
89     khash_t(str_transliterator) *transliterators;
90 
91     khash_t(script_language_index) *script_languages;
92     cstring_array *transliterator_names;
93 
94     step_array *steps;
95     trie_t *trie;
96 
97     transliteration_replacement_array *replacements;
98     cstring_array *replacement_strings;
99     cstring_array *revisit_strings;
100 } transliteration_table_t;
101 
// Control characters are special: each marker below is defined three ways —
// as a one-byte string literal (*_CHAR), as its numeric value (*_CODEPOINT),
// and as the byte length of the literal (*_CHAR_LEN).
//
// The lengths are computed with sizeof on the literal (minus the NUL terminator)
// so they are parenthesized compile-time constants of type size_t and do not
// require a strlen call at every use.
//
// NOTE(review): "\x86" and "\x87" are single raw bytes, not the UTF-8 encoding of
// U+0086/U+0087; presumably this matches how the data file is generated — confirm
// before changing either the bytes or the codepoints.
#define WORD_BOUNDARY_CHAR "\x01"
#define WORD_BOUNDARY_CODEPOINT 1
#define WORD_BOUNDARY_CHAR_LEN (sizeof(WORD_BOUNDARY_CHAR) - 1)
#define PRE_CONTEXT_CHAR "\x86"
#define PRE_CONTEXT_CODEPOINT 134
#define PRE_CONTEXT_CHAR_LEN (sizeof(PRE_CONTEXT_CHAR) - 1)
#define POST_CONTEXT_CHAR "\x87"
#define POST_CONTEXT_CODEPOINT 135
#define POST_CONTEXT_CHAR_LEN (sizeof(POST_CONTEXT_CHAR) - 1)
#define EMPTY_TRANSITION_CHAR "\x04"
#define EMPTY_TRANSITION_CODEPOINT 4
#define EMPTY_TRANSITION_CHAR_LEN (sizeof(EMPTY_TRANSITION_CHAR) - 1)
#define REPEAT_CHAR "\x05"
#define REPEAT_CODEPOINT 5
#define REPEAT_CHAR_LEN (sizeof(REPEAT_CHAR) - 1)
#define GROUP_INDICATOR_CHAR "\x1d"
#define GROUP_INDICATOR_CODEPOINT 29
#define GROUP_INDICATOR_CHAR_LEN (sizeof(GROUP_INDICATOR_CHAR) - 1)
#define BEGIN_SET_CHAR "\x0f"
#define BEGIN_SET_CODEPOINT 15
#define BEGIN_SET_CHAR_LEN (sizeof(BEGIN_SET_CHAR) - 1)
#define END_SET_CHAR "\x0e"
#define END_SET_CODEPOINT 14
#define END_SET_CHAR_LEN (sizeof(END_SET_CHAR) - 1)
127 
128 
// ASCII codepoints of the punctuation characters named by each macro
// (presumably the metacharacters of the rule syntax — confirm in the parser).
#define DOLLAR_CODEPOINT    0x24    // '$'

#define LPAREN_CODEPOINT    0x28    // '('
#define RPAREN_CODEPOINT    0x29    // ')'

#define STAR_CODEPOINT      0x2A    // '*'
#define PLUS_CODEPOINT      0x2B    // '+'

#define LSQUARE_CODEPOINT   0x5B    // '['
#define BACKSLASH_CODEPOINT 0x5C    // '\\'
#define RSQUARE_CODEPOINT   0x5D    // ']'

#define LCURLY_CODEPOINT    0x7B    // '{'
#define RCURLY_CODEPOINT    0x7D    // '}'
143 
144 
145 // Primary API
146 transliteration_table_t *get_transliteration_table(void);
147 
148 transliterator_t *transliterator_new(char *name, uint8_t internal, uint32_t steps_index, size_t steps_length);
149 void transliterator_destroy(transliterator_t *self);
150 
151 bool transliteration_table_add_transliterator(transliterator_t *trans);
152 
153 transliterator_t *get_transliterator(char *name);
154 char *transliterate(char *trans_name, char *str, size_t len);
155 
156 bool transliteration_table_add_script_language(script_language_t script_language, transliterator_index_t index);
157 transliterator_index_t get_transliterator_index_for_script_language(script_t script, char *language);
158 
// Runs a block of code once per transliterator registered for (script, language).
//
// The macro fetches the transliteration table, looks up the transliterator_index_t for
// the given script/language, and for every index in
// [transliterator_index, transliterator_index + num_transliterators) assigns the
// corresponding entry of the table's transliterator_names to `transliterator_var`
// before executing the code. Iteration stops early if a name lookup yields NULL.
// `break` / `continue` inside the code apply to this loop.
//
// The code is accepted as variadic arguments so that bodies containing top-level
// commas (e.g. `int a, b;`) are passed through intact. Helper locals use a
// foreach_trans_ prefix rather than identifiers containing "__", which are reserved.
//
// NOTE: the expansion intentionally keeps its trailing semicolon, because existing
// call sites may invoke the macro without one; as a consequence it must be wrapped in
// braces when used as the body of an if/else.
#define foreach_transliterator(script, language, transliterator_var, ...) do {                                                      \
        transliteration_table_t *foreach_trans_table_ = get_transliteration_table();                                                \
        transliterator_index_t foreach_trans_range_ = get_transliterator_index_for_script_language(script, language);               \
        size_t foreach_trans_end_ = foreach_trans_range_.transliterator_index + foreach_trans_range_.num_transliterators;           \
        for (size_t foreach_trans_i_ = foreach_trans_range_.transliterator_index;                                                   \
             foreach_trans_i_ < foreach_trans_end_;                                                                                 \
             foreach_trans_i_++) {                                                                                                  \
            transliterator_var = cstring_array_get_string(foreach_trans_table_->transliterator_names, (uint32_t)foreach_trans_i_);  \
            if (transliterator_var == NULL) break;                                                                                  \
            __VA_ARGS__;                                                                                                            \
        }                                                                                                                           \
    } while (0);
168 
// Persistence: write the transliteration table to an open stream, or save it to the named file.
// The bool result presumably reports success — confirm in the implementation.
bool transliteration_table_write(FILE *file);
bool transliteration_table_save(char *filename);

// Module setup/teardown
// NOTE(review): the difference between init (no arguments) and setup (takes a filename),
// and what a NULL filename means, are defined in the implementation file.
bool transliteration_module_init(void);
bool transliteration_module_setup(char *filename);
void transliteration_module_teardown(void);
176 
177 #endif
178