1 #ifndef TRANSLITERATE_H 2 #define TRANSLITERATE_H 3 4 #include <stdlib.h> 5 #include <stdint.h> 6 #include <stdbool.h> 7 8 #include "collections.h" 9 #include "constants.h" 10 #include "klib/khash.h" 11 #include "string_utils.h" 12 #include "trie.h" 13 #include "trie_search.h" 14 #include "unicode_scripts.h" 15 16 #define LATIN_ASCII "latin-ascii" 17 #define LATIN_ASCII_SIMPLE "latin-ascii-simple" 18 #define HTML_ESCAPE "html-escape" 19 20 #define TRANSLITERATION_DATA_FILE "transliteration.dat" 21 #define DEFAULT_TRANSLITERATION_PATH LIBPOSTAL_TRANSLITERATION_DIR PATH_SEPARATOR TRANSLITERATION_DATA_FILE 22 23 #define MAX_TRANS_NAME_LEN 100 24 25 typedef enum { 26 STEP_RULESET, 27 STEP_TRANSFORM, 28 STEP_UNICODE_NORMALIZATION 29 } step_type_t; 30 31 typedef struct transliteration_step { 32 step_type_t type; 33 char *name; 34 } transliteration_step_t; 35 36 transliteration_step_t *transliteration_step_new(char *name, step_type_t type); 37 void transliteration_step_destroy(transliteration_step_t *self); 38 39 VECTOR_INIT_FREE_DATA(step_array, transliteration_step_t *, transliteration_step_destroy) 40 41 typedef struct transliterator { 42 char *name; 43 uint8_t internal; 44 uint32_t steps_index; 45 size_t steps_length; 46 } transliterator_t; 47 48 #define MAX_GROUP_LEN 5 49 50 typedef struct group_capture { 51 size_t start; 52 size_t len; 53 } group_capture_t; 54 55 VECTOR_INIT(group_capture_array, group_capture_t) 56 57 typedef struct transliteration_replacement { 58 uint32_t string_index; 59 uint32_t revisit_index; 60 size_t num_groups; 61 group_capture_array *groups; 62 } transliteration_replacement_t; 63 64 transliteration_replacement_t *transliteration_replacement_new( 65 uint32_t string_index, 66 uint32_t revisit_index, 67 group_capture_array *groups 68 ); 69 70 void transliteration_replacement_destroy(transliteration_replacement_t *self); 71 72 VECTOR_INIT_FREE_DATA(transliteration_replacement_array, transliteration_replacement_t *, transliteration_replacement_destroy) 73 74 KHASH_MAP_INIT_STR(str_transliterator, transliterator_t *) 75 76 #define kh_script_lang_hash(key) ((khint_t)(key).script ^ (((key).language == NULL) ? 0 : kh_str_hash_func((key).language))) 77 #define kh_script_lang_equal(a, b) (((a).script == (b).script) && strcmp((a).language, (b).language) == 0) 78 79 typedef struct transliterator_index { 80 size_t transliterator_index; 81 size_t num_transliterators; 82 } transliterator_index_t; 83 84 #define NULL_TRANSLITERATOR_INDEX (transliterator_index_t) {0, 0} 85 86 KHASH_INIT(script_language_index, script_language_t, transliterator_index_t, 1, kh_script_lang_hash, kh_script_lang_equal) 87 88 typedef struct transliteration_table { 89 khash_t(str_transliterator) *transliterators; 90 91 khash_t(script_language_index) *script_languages; 92 cstring_array *transliterator_names; 93 94 step_array *steps; 95 trie_t *trie; 96 97 transliteration_replacement_array *replacements; 98 cstring_array *replacement_strings; 99 cstring_array *revisit_strings; 100 } transliteration_table_t; 101 102 // Control characters are special 103 #define WORD_BOUNDARY_CHAR "\x01" 104 #define WORD_BOUNDARY_CODEPOINT 1 105 #define WORD_BOUNDARY_CHAR_LEN strlen(WORD_BOUNDARY_CHAR) 106 #define PRE_CONTEXT_CHAR "\x86" 107 #define PRE_CONTEXT_CODEPOINT 134 108 #define PRE_CONTEXT_CHAR_LEN strlen(PRE_CONTEXT_CHAR) 109 #define POST_CONTEXT_CHAR "\x87" 110 #define POST_CONTEXT_CODEPOINT 135 111 #define POST_CONTEXT_CHAR_LEN strlen(POST_CONTEXT_CHAR) 112 #define EMPTY_TRANSITION_CHAR "\x04" 113 #define EMPTY_TRANSITION_CODEPOINT 4 114 #define EMPTY_TRANSITION_CHAR_LEN strlen(EMPTY_TRANSITION_CHAR) 115 #define REPEAT_CHAR "\x05" 116 #define REPEAT_CODEPOINT 5 117 #define REPEAT_CHAR_LEN strlen(REPEAT_CHAR) 118 #define GROUP_INDICATOR_CHAR "\x1d" 119 #define GROUP_INDICATOR_CODEPOINT 29 120 #define GROUP_INDICATOR_CHAR_LEN strlen(GROUP_INDICATOR_CHAR) 121 #define BEGIN_SET_CHAR "\x0f" 122 #define BEGIN_SET_CODEPOINT 15 123 #define BEGIN_SET_CHAR_LEN strlen(BEGIN_SET_CHAR) 124 #define END_SET_CHAR "\x0e" 125 #define END_SET_CODEPOINT 14 126 #define END_SET_CHAR_LEN strlen(END_SET_CHAR) 127 128 129 #define DOLLAR_CODEPOINT 36 130 131 #define LPAREN_CODEPOINT 40 132 #define RPAREN_CODEPOINT 41 133 134 #define STAR_CODEPOINT 42 135 #define PLUS_CODEPOINT 43 136 137 #define LSQUARE_CODEPOINT 91 138 #define BACKSLASH_CODEPOINT 92 139 #define RSQUARE_CODEPOINT 93 140 141 #define LCURLY_CODEPOINT 123 142 #define RCURLY_CODEPOINT 125 143 144 145 // Primary API 146 transliteration_table_t *get_transliteration_table(void); 147 148 transliterator_t *transliterator_new(char *name, uint8_t internal, uint32_t steps_index, size_t steps_length); 149 void transliterator_destroy(transliterator_t *self); 150 151 bool transliteration_table_add_transliterator(transliterator_t *trans); 152 153 transliterator_t *get_transliterator(char *name); 154 char *transliterate(char *trans_name, char *str, size_t len); 155 156 bool transliteration_table_add_script_language(script_language_t script_language, transliterator_index_t index); 157 transliterator_index_t get_transliterator_index_for_script_language(script_t script, char *language); 158 159 #define foreach_transliterator(script, language, transliterator_var, code) do { \ 160 transliteration_table_t *__trans_table = get_transliteration_table(); \ 161 transliterator_index_t __index = get_transliterator_index_for_script_language(script, language); \ 162 for (size_t __i = __index.transliterator_index; __i < __index.transliterator_index + __index.num_transliterators; __i++) { \ 163 transliterator_var = cstring_array_get_string(__trans_table->transliterator_names, (uint32_t)__i); \ 164 if (transliterator_var == NULL) break; \ 165 code; \ 166 } \ 167 } while (0); 168 169 bool transliteration_table_write(FILE *file); 170 bool transliteration_table_save(char *filename); 171 172 // Module setup/teardown 173 bool transliteration_module_init(void); 174 bool transliteration_module_setup(char *filename); 175 void transliteration_module_teardown(void); 176 177 #endif 178