1 #include <stdlib.h>
2 
3 #include "expand.h"
4 
5 #include "log/log.h"
6 
7 #include "address_dictionary.h"
8 #include "collections.h"
9 #include "constants.h"
10 #include "language_classifier.h"
11 #include "numex.h"
12 #include "normalize.h"
13 #include "scanner.h"
14 #include "string_utils.h"
15 #include "token_types.h"
16 #include "transliterate.h"
17 
18 
19 #define DEFAULT_KEY_LEN 32
20 
21 #define EXCESSIVE_PERMUTATIONS 100
22 
/*
Map the per-token flags from the public options struct onto the
NORMALIZE_TOKEN_* bitmask consumed by normalize_token().
*/
inline uint64_t get_normalize_token_options(libpostal_normalize_options_t options) {
    uint64_t token_opts = 0;

    if (options.delete_final_periods) {
        token_opts |= NORMALIZE_TOKEN_DELETE_FINAL_PERIOD;
    }
    if (options.delete_acronym_periods) {
        token_opts |= NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS;
    }
    if (options.drop_english_possessives) {
        token_opts |= NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES;
    }
    if (options.delete_apostrophes) {
        token_opts |= NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE;
    }

    return token_opts;
}
33 
/*
Map the whole-string flags from the public options struct onto the
NORMALIZE_STRING_* bitmask consumed by the string normalizers.
*/
inline uint64_t get_normalize_string_options(libpostal_normalize_options_t options) {
    uint64_t string_opts = 0;

    if (options.transliterate) {
        string_opts |= NORMALIZE_STRING_TRANSLITERATE;
    }
    if (options.latin_ascii) {
        string_opts |= NORMALIZE_STRING_LATIN_ASCII;
    }
    if (options.decompose) {
        string_opts |= NORMALIZE_STRING_DECOMPOSE;
    }
    if (options.strip_accents) {
        string_opts |= NORMALIZE_STRING_STRIP_ACCENTS;
    }
    if (options.lowercase) {
        string_opts |= NORMALIZE_STRING_LOWERCASE;
    }
    if (options.trim_string) {
        string_opts |= NORMALIZE_STRING_TRIM;
    }
    if (options.expand_numex) {
        string_opts |= NORMALIZE_STRING_REPLACE_NUMEX;
    }

    return string_opts;
}
46 
/*
Append the normalized form(s) of a single token to strings.

Whitespace tokens become a single " ". Non-hyphenated tokens (and bare
HYPHEN tokens) are normalized once. Hyphenated word/numeric tokens have
leading/trailing hyphens stripped, then are normalized as-is plus once
per hyphen-replacement / hyphen-deletion variant requested in options.
Numeric tokens additionally get an alpha/numeric-split variant unless
the token is a valid ordinal in one of the candidate languages.

Fixes relative to the previous version:
 - Flag bits are cleared with `&= ~mask` instead of `^= mask`. The old
   combined XOR of (REPLACE_HYPHENS | REPLACE_NUMERIC_HYPHENS) toggled
   both bits even when only one had been set, so the other bit was
   turned ON and leaked into subsequent normalize_token calls.
 - When leading hyphens are stripped, token.len is now reduced along
   with the advance of token.offset; previously only the offset moved,
   so the token would read past its original end.
*/
void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) {

    uint64_t normalize_token_options = get_normalize_token_options(options);

    if (token.type != WHITESPACE) {

        bool contains_hyphen = string_contains_hyphen_len(str + token.offset, token.len);

        if (!contains_hyphen || token.type == HYPHEN) {
            log_debug("str = %s, token = {%zu, %zu, %u}\n", str, token.offset, token.len, token.type);
            normalize_token(strings, str, token, normalize_token_options);
        } else if (is_word_token(token.type)) {

            // Strip leading hyphens: advance the start AND shrink the length
            size_t prefix_hyphen_len = string_hyphen_prefix_len(str + token.offset, token.len);
            if (prefix_hyphen_len > 0) {
                token.offset += prefix_hyphen_len;
                token.len -= prefix_hyphen_len;
            }

            // Strip trailing hyphens
            size_t suffix_hyphen_len = string_hyphen_suffix_len(str + token.offset, token.len);
            if (suffix_hyphen_len > 0) {
                token.len -= suffix_hyphen_len;
            }

            normalize_token(strings, str, token, normalize_token_options);

            if (options.replace_word_hyphens) {
                normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS;
                normalize_token(strings, str, token, normalize_token_options);
                normalize_token_options &= ~NORMALIZE_TOKEN_REPLACE_HYPHENS;
            }

            if (options.delete_word_hyphens) {
                normalize_token_options |= NORMALIZE_TOKEN_DELETE_HYPHENS;
                normalize_token(strings, str, token, normalize_token_options);
                normalize_token_options &= ~NORMALIZE_TOKEN_DELETE_HYPHENS;
            }

        } else if (is_numeric_token(token.type)) {

            normalize_token(strings, str, token, normalize_token_options);

            if (options.replace_word_hyphens || options.replace_numeric_hyphens) {
                if (options.replace_word_hyphens) {
                    normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS;
                }

                if (options.replace_numeric_hyphens) {
                    normalize_token_options |= NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS;
                }

                normalize_token(strings, str, token, normalize_token_options);
                // Clear both bits; XOR here would SET whichever was not enabled
                normalize_token_options &= ~(NORMALIZE_TOKEN_REPLACE_HYPHENS | NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS);
            }

            if (options.delete_numeric_hyphens) {
                normalize_token_options |= NORMALIZE_TOKEN_DELETE_HYPHENS;
                normalize_token(strings, str, token, normalize_token_options);
                normalize_token_options &= ~NORMALIZE_TOKEN_DELETE_HYPHENS;
            }
        }

        if (is_numeric_token(token.type) && options.split_alpha_from_numeric) {
            bool split_alpha_from_numeric = true;

            // Don't split ordinals like "10th" in any candidate language
            for (size_t i = 0; i < options.num_languages; i++) {
                char *lang = options.languages[i];
                if (valid_ordinal_suffix_len(str, token, NULL_TOKEN, lang) > 1) {
                    split_alpha_from_numeric = false;
                    break;
                }
            }

            if (split_alpha_from_numeric) {
                normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC;
                normalize_token(strings, str, token, normalize_token_options);
                normalize_token_options &= ~NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC;
            }
        }
    } else {
        cstring_array_add_string(strings, " ");
    }
}
129 
/*
Add str to strings; when options.roman_numerals is set, also add a
variant with Latin numeric expressions (Roman numerals) replaced,
if the replacement produced anything.
*/
void add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options) {
    cstring_array_add_string(strings, str);

    if (!options.roman_numerals) {
        return;
    }

    char *numex_replaced = replace_numeric_expressions(str, LATIN_LANGUAGE_CODE);
    if (numex_replaced != NULL) {
        cstring_array_add_string(strings, numex_replaced);
        free(numex_replaced);
    }
}
143 
valid_affix_expansions(phrase_t phrase,libpostal_normalize_options_t options)144 address_expansion_array *valid_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options) {
145     uint32_t expansion_index = phrase.data;
146     address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index);
147     if (value != NULL && value->components & options.address_components) {
148         return value->expansions;
149     }
150 
151     return NULL;
152 }
153 
/*
Append an affix expansion to key: the expansion's canonical string
(normalized with the string-level options) when one exists, otherwise
the raw matched phrase bytes from str.
*/
inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options) {
    if (expansion.canonical_index == NULL_CANONICAL_INDEX) {
        // No canonical form recorded: copy the phrase verbatim
        char_array_cat_len(key, str + token.offset + phrase.start, phrase.len);
        return;
    }

    char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
    uint64_t string_options = get_normalize_string_options(options);
    char *normalized = normalize_string_latin(canonical, strlen(canonical), string_options);

    // Prefer the normalized form; fall back to the raw canonical string
    char_array_cat(key, normalized != NULL ? normalized : canonical);
    free(normalized);  // free(NULL) is a no-op
}
169 
170 
/*
Expand a token containing a dictionary-recognized prefix and/or suffix
(optionally separated by a period when with_period is true), adding
every combination of canonical affix expansions — with and without a
separating space where the expansion is separable — to the tree.

key is a scratch buffer: key->n is rewound to saved offsets so the
shared leading bytes of each candidate string are reused.

Returns true when at least one expansion was added; false when neither
affix yields expansions valid for options.address_components (callers
then fall back to normal token handling).

Fix: prefix_expansion/suffix_expansion are now zero-initialized.
Previously suffix_expansion.separable was read (in the add_space
computation of the prefix+suffix branch) before any assignment on the
first loop pass — an uninitialized read, i.e. undefined behavior.
*/
bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options, bool with_period) {
    cstring_array *strings = tree->strings;

    // One extra byte (the period itself) sits between affix and root
    size_t skip_period = with_period ? 1 : 0;

    bool have_suffix = suffix.len > 0 && suffix.len < token.len;
    bool have_prefix = prefix.len > 0 && prefix.len + with_period < token.len;

    if (!have_suffix && !have_prefix) {
        return false;
    }

    address_expansion_array *prefix_expansions = NULL;
    address_expansion_array *suffix_expansions = NULL;

    // Zero-initialized: see function comment (uninitialized-read fix)
    address_expansion_t prefix_expansion = {0};
    address_expansion_t suffix_expansion = {0};

    char *expansion;

    size_t num_strings = 0;
    char *root_word = NULL;
    size_t root_len;
    token_t root_token;
    cstring_array *root_strings = NULL;
    int add_space = 0;
    int spaces = 0;

    size_t prefix_start, prefix_end, root_end, suffix_start;

    if (have_prefix) {
        prefix_expansions = valid_affix_expansions(prefix, options);
        if (prefix_expansions == NULL) have_prefix = false;
    }

    if (have_suffix) {
        suffix_expansions = valid_affix_expansions(suffix, options);
        if (suffix_expansions == NULL) have_suffix = false;
    }

    if (!have_suffix && !have_prefix) {
        return false;
    }

    char_array *key = char_array_new_size(token.len);

    if (have_prefix && have_suffix) {
        for (size_t i = 0; i < prefix_expansions->n; i++) {
            prefix_expansion = prefix_expansions->a[i];
            char_array_clear(key);

            cat_affix_expansion(key, str, prefix_expansion, token, prefix, options);
            prefix_start = key->n - 1;

            add_space = (int)prefix_expansion.separable || with_period;
            // NOTE(review): on the first pass suffix_expansion is the zero
            // value (not separable); on later passes it holds the last
            // suffix examined in the inner loop. Presumably intended to
            // consider suffix separability — confirm intended semantics.
            if (prefix.len + skip_period + suffix.len < token.len && !prefix_expansion.separable) {
                add_space = suffix_expansion.separable || with_period;
            }

            for (spaces = skip_period; spaces <= add_space; spaces++) {
                key->n = prefix_start;
                if (spaces) {
                    char_array_cat(key, " ");
                }

                prefix_end = key->n;

                if (prefix.len + skip_period + suffix.len < token.len) {
                    // A root word sits between prefix and suffix:
                    // trim any hyphens bordering it
                    root_len = token.len - suffix.len - prefix.len - skip_period;
                    size_t root_start = token.offset + prefix.len + skip_period;
                    size_t prefix_hyphen_len = string_hyphen_prefix_len(str + root_start, root_len);
                    root_start += prefix_hyphen_len;
                    root_len -= prefix_hyphen_len;
                    size_t suffix_hyphen_len = string_hyphen_suffix_len(str + root_start, root_len);
                    root_len -= suffix_hyphen_len;
                    root_token = (token_t){root_start, root_len, token.type};
                    root_strings = cstring_array_new_size(root_len);
                    add_normalized_strings_token(root_strings, str, root_token, options);
                    num_strings = cstring_array_num_strings(root_strings);

                    for (size_t j = 0; j < num_strings; j++) {
                        key->n = prefix_end;
                        root_word = cstring_array_get_string(root_strings, j);
                        char_array_cat(key, root_word);
                        root_end = key->n - 1;

                        for (size_t k = 0; k < suffix_expansions->n; k++) {
                            key->n = root_end;
                            suffix_expansion = suffix_expansions->a[k];

                            int add_suffix_space = suffix_expansion.separable;

                            suffix_start = key->n;
                            for (int suffix_spaces = skip_period; suffix_spaces <= add_suffix_space; suffix_spaces++) {
                                key->n = suffix_start;
                                if (suffix_spaces) {
                                    char_array_cat(key, " ");
                                }

                                cat_affix_expansion(key, str, suffix_expansion, token, suffix, options);

                                expansion = char_array_get_string(key);
                                cstring_array_add_string(strings, expansion);

                            }


                        }
                    }

                    cstring_array_destroy(root_strings);
                    root_strings = NULL;

                } else {
                    // Token is exactly prefix (+ period) + suffix: no root
                    for (size_t j = 0; j < suffix_expansions->n; j++) {
                        key->n = prefix_end - skip_period;
                        suffix_expansion = suffix_expansions->a[j];

                        cat_affix_expansion(key, str, suffix_expansion, token, suffix, options);

                        expansion = char_array_get_string(key);
                        cstring_array_add_string(tree->strings, expansion);
                    }
                }
            }

        }
    } else if (have_suffix) {
        // Suffix only: the root is everything before the suffix
        log_debug("suffix.start=%" PRId32 "\n", suffix.start);
        root_len = suffix.start;
        root_token = (token_t){token.offset, root_len, token.type};
        log_debug("root_len=%zu\n", root_len);
        log_debug("root_token = {%zu, %zu, %u}\n", root_token.offset, root_token.len, root_token.type);

        root_strings = cstring_array_new_size(root_len + 1);
        add_normalized_strings_token(root_strings, str, root_token, options);
        num_strings = cstring_array_num_strings(root_strings);

        log_debug("num_strings = %zu\n", num_strings);

        for (size_t j = 0; j < num_strings; j++) {
            char_array_clear(key);
            root_word = cstring_array_get_string(root_strings, j);
            log_debug("root_word=%s\n", root_word);
            char_array_cat(key, root_word);
            root_end = key->n - 1;

            for (size_t k = 0; k < suffix_expansions->n; k++) {
                key->n = root_end;
                suffix_expansion = suffix_expansions->a[k];

                add_space = (suffix_expansion.separable || with_period) && suffix.len < token.len;
                suffix_start = key->n;

                for (int spaces = skip_period; spaces <= add_space; spaces++) {
                    key->n = suffix_start;
                    if (spaces) {
                        char_array_cat(key, " ");
                    }

                    cat_affix_expansion(key, str, suffix_expansion, token, suffix, options);

                    expansion = char_array_get_string(key);
                    cstring_array_add_string(tree->strings, expansion);
                }
            }
        }
    } else if (have_prefix) {
        if (prefix.len + skip_period <= token.len) {
            // Root is everything after the prefix; trim bordering hyphens
            root_len = token.len - prefix.len - skip_period;
            size_t root_start = token.offset + prefix.len + skip_period;
            size_t prefix_hyphen_len = string_hyphen_prefix_len(str + root_start, root_len);
            root_start += prefix_hyphen_len;
            root_len -= prefix_hyphen_len;
            size_t suffix_hyphen_len = string_hyphen_suffix_len(str + root_start, root_len);
            root_len -= suffix_hyphen_len;
            root_token = (token_t){root_start, root_len, token.type};
            root_strings = cstring_array_new_size(root_len);
            add_normalized_strings_token(root_strings, str, root_token, options);
            num_strings = cstring_array_num_strings(root_strings);

        } else {
            // Prefix covers the whole token: emit normalized forms as-is
            root_strings = cstring_array_new_size(token.len);
            add_normalized_strings_token(root_strings, str, token, options);
            num_strings = cstring_array_num_strings(root_strings);

            for (size_t k = 0; k < num_strings; k++) {
                root_word = cstring_array_get_string(root_strings, k);
                cstring_array_add_string(tree->strings, root_word);
            }

            char_array_destroy(key);
            cstring_array_destroy(root_strings);
            return false;

        }

        for (size_t j = 0; j < prefix_expansions->n; j++) {
            char_array_clear(key);
            prefix_expansion = prefix_expansions->a[j];

            cat_affix_expansion(key, str, prefix_expansion, token, prefix, options);
            prefix_end = key->n - 1;

            add_space = (prefix_expansion.separable || with_period) && prefix.len + skip_period < token.len;
            for (int spaces = skip_period; spaces <= add_space; spaces++) {
                key->n = prefix_end;
                if (spaces) {
                    char_array_cat(key, " ");
                }
                size_t prefix_space_len = key->n - spaces;
                for (size_t k = 0; k < num_strings; k++) {
                    key->n = prefix_space_len;
                    root_word = cstring_array_get_string(root_strings, k);
                    char_array_cat(key, root_word);

                    expansion = char_array_get_string(key);
                    cstring_array_add_string(tree->strings, expansion);
                }

            }
        }
    }

    char_array_destroy(key);

    if (root_strings != NULL) {
        cstring_array_destroy(root_strings);
    }

    return true;

}
404 
/*
Try dictionary suffix and prefix matches on a token (no interior
period) and add affix expansions when either side matches.
Returns false when neither affix matched or no expansion was added.
*/
inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) {
    char *word = str + token.offset;

    phrase_t suffix = search_address_dictionaries_suffix(word, token.len, lang);
    phrase_t prefix = search_address_dictionaries_prefix(word, token.len, lang);

    if (suffix.len == 0 && prefix.len == 0) {
        return false;
    }

    return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, false);
}
416 
/*
Handle tokens where a period separates a dictionary prefix and/or
suffix (e.g. "St.Pierre"). Only tokens with exactly one interior
period — optionally followed by a final period — are considered;
everything else returns false without expanding.
*/
inline bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) {
    ssize_t first_period_index = string_next_period_len(str + token.offset, token.len);
    if (first_period_index <= 0) {
        return false;
    }

    ssize_t next_period_index = string_next_period_len(str + token.offset + first_period_index + 1, token.len - first_period_index - 1);
    // Bail out when there is a second interior period (one that is not final)
    if (!(next_period_index < 0 || next_period_index == token.len - 1)) {
        return false;
    }

    phrase_t prefix = search_address_dictionaries_substring(str + token.offset, first_period_index, lang);

    phrase_t suffix = search_address_dictionaries_substring(str + token.offset + first_period_index + 1, token.len - first_period_index - 1, lang);
    if (suffix.len > 0) {
        // Re-anchor the suffix match relative to the whole token
        suffix.start = first_period_index + 1;
    }

    if (suffix.len == 0 && prefix.len == 0) {
        return false;
    }

    return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, true);
}
442 
/*
If the token contains a period, attempt period-affix expansion in each
candidate language, stopping at the first success. When no expansion
was added, fall back to adding the raw token bytes to the tree.
Returns true when period-affix expansions were added.
*/
bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options) {
    bool have_period_affixes = false;

    if (string_contains_period_len(str + token.offset, token.len)) {
        for (size_t i = 0; i < options.num_languages; i++) {
            if (expand_affixes_period(tree, str, options.languages[i], token, options)) {
                have_period_affixes = true;
                break;
            }
        }
    }

    if (!have_period_affixes) {
        string_tree_add_string_len(tree, str + token.offset, token.len);
    }

    return have_period_affixes;
}
461 
462 
gazetteer_ignorable_components(uint16_t dictionary_id)463 static inline uint32_t gazetteer_ignorable_components(uint16_t dictionary_id) {
464     switch (dictionary_id) {
465         case DICTIONARY_ACADEMIC_DEGREE:
466             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
467         case DICTIONARY_BUILDING_TYPE:
468             return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_UNIT;
469         case DICTIONARY_COMPANY_TYPE:
470             return LIBPOSTAL_ADDRESS_NAME;
471         case DICTIONARY_DIRECTIONAL:
472             return LIBPOSTAL_ADDRESS_STREET;
473         case DICTIONARY_ELISION:
474             return LIBPOSTAL_ADDRESS_ANY;
475         case DICTIONARY_ENTRANCE:
476             return LIBPOSTAL_ADDRESS_ENTRANCE;
477         case DICTIONARY_HOUSE_NUMBER:
478             return LIBPOSTAL_ADDRESS_HOUSE_NUMBER;
479         case DICTIONARY_LEVEL_NUMBERED:
480             return LIBPOSTAL_ADDRESS_LEVEL;
481         case DICTIONARY_LEVEL_STANDALONE:
482             return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY);
483         case DICTIONARY_LEVEL_MEZZANINE:
484             return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL| LIBPOSTAL_ADDRESS_ANY);
485         case DICTIONARY_LEVEL_BASEMENT:
486             return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY);
487         case DICTIONARY_LEVEL_SUB_BASEMENT:
488             return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY);
489         case DICTIONARY_NUMBER:
490             return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STREET;
491         case DICTIONARY_NO_NUMBER:
492             return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY);
493         case DICTIONARY_PERSONAL_TITLE:
494             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
495         case DICTIONARY_PLACE_NAME:
496             return LIBPOSTAL_ADDRESS_NAME;
497         case DICTIONARY_POST_OFFICE:
498             return LIBPOSTAL_ADDRESS_PO_BOX;
499         case DICTIONARY_POSTAL_CODE:
500             return LIBPOSTAL_ADDRESS_POSTAL_CODE;
501         case DICTIONARY_QUALIFIER:
502             return LIBPOSTAL_ADDRESS_TOPONYM;
503         case DICTIONARY_STAIRCASE:
504             return LIBPOSTAL_ADDRESS_STAIRCASE;
505         case DICTIONARY_STOPWORD:
506             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM;
507         case DICTIONARY_STREET_TYPE:
508             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
509         case DICTIONARY_UNIT_NUMBERED:
510             return LIBPOSTAL_ADDRESS_UNIT;
511         case DICTIONARY_UNIT_STANDALONE:
512             return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY);
513         case DICTIONARY_UNIT_DIRECTION:
514             return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY);
515         default:
516             return LIBPOSTAL_ADDRESS_NONE;
517     }
518 }
519 
520 
gazetteer_valid_components(uint16_t dictionary_id)521 static inline uint32_t gazetteer_valid_components(uint16_t dictionary_id) {
522     switch (dictionary_id) {
523         case DICTIONARY_DIRECTIONAL:
524             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE;
525         case DICTIONARY_STOPWORD:
526             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM;
527         case DICTIONARY_STREET_NAME:
528             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
529         case DICTIONARY_STREET_TYPE:
530             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
531         case DICTIONARY_SYNONYM:
532             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM;
533         default:
534             return LIBPOSTAL_ADDRESS_NONE;
535     }
536 }
537 
gazetteer_edge_ignorable_components(uint16_t dictionary_id)538 static inline uint32_t gazetteer_edge_ignorable_components(uint16_t dictionary_id) {
539     switch (dictionary_id) {
540         // Pre/post directionals can be removed if there are non-phrase tokens
541         case DICTIONARY_DIRECTIONAL:
542             return LIBPOSTAL_ADDRESS_STREET;
543         case DICTIONARY_COMPANY_TYPE:
544             return LIBPOSTAL_ADDRESS_NAME;
545         case DICTIONARY_PLACE_NAME:
546             return LIBPOSTAL_ADDRESS_NAME;
547         default:
548             return LIBPOSTAL_ADDRESS_NONE;
549     }
550 }
551 
gazetteer_specifier_components(uint16_t dictionary_id)552 static inline uint32_t gazetteer_specifier_components(uint16_t dictionary_id) {
553     switch (dictionary_id) {
554         case DICTIONARY_LEVEL_STANDALONE:
555             return LIBPOSTAL_ADDRESS_LEVEL;
556         case DICTIONARY_LEVEL_MEZZANINE:
557             return LIBPOSTAL_ADDRESS_LEVEL;
558         case DICTIONARY_LEVEL_BASEMENT:
559             return LIBPOSTAL_ADDRESS_LEVEL;
560         case DICTIONARY_LEVEL_SUB_BASEMENT:
561             return LIBPOSTAL_ADDRESS_LEVEL;
562         case DICTIONARY_UNIT_STANDALONE:
563             return LIBPOSTAL_ADDRESS_UNIT;
564         default:
565             return LIBPOSTAL_ADDRESS_NONE;
566     }
567 }
568 
569 
gazetteer_possible_root_components(uint16_t dictionary_id)570 static inline uint32_t gazetteer_possible_root_components(uint16_t dictionary_id) {
571     switch (dictionary_id) {
572         case DICTIONARY_ACADEMIC_DEGREE:
573             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
574         case DICTIONARY_DIRECTIONAL:
575             return LIBPOSTAL_ADDRESS_STREET;
576         case DICTIONARY_PERSONAL_TITLE:
577             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
578         case DICTIONARY_NUMBER:
579             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
580         case DICTIONARY_PLACE_NAME:
581             return LIBPOSTAL_ADDRESS_STREET;
582         case DICTIONARY_QUALIFIER:
583             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
584         case DICTIONARY_STREET_NAME:
585             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
586         case DICTIONARY_SYNONYM:
587             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
588         case DICTIONARY_TOPONYM:
589             return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
590         default:
591             return LIBPOSTAL_ADDRESS_NONE;
592     }
593 }
594 
595 static const uint16_t NUMERIC_ADDRESS_COMPONENTS = (LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_POSTAL_CODE | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STREET);
596 
// Categories of gazetteer dictionary matches. Selects which
// per-dictionary component-mask helper is consulted in
// address_expansion_matches_type_for_components().
typedef enum {
    GAZETTEER_MATCH_IGNORABLE,        // checked via gazetteer_ignorable_components()
    GAZETTEER_MATCH_EDGE_IGNORABLE,   // checked via gazetteer_edge_ignorable_components()
    GAZETTEER_MATCH_POSSIBLE_ROOT,    // checked via gazetteer_possible_root_components()
    GAZETTEER_MATCH_SPECIFIER,        // checked via gazetteer_specifier_components()
    GAZETTEER_MATCH_VALID_COMPONENTS  // checked via gazetteer_valid_components()
} gazetteer_match_type_t;
604 
605 
/*
Return true when any dictionary attached to this expansion has a
component mask (for the given match type) that intersects
address_components.
*/
static inline bool address_expansion_matches_type_for_components(address_expansion_t expansion, uint32_t address_components, gazetteer_match_type_t match_type) {
    for (uint32_t i = 0; i < expansion.num_dictionaries; i++) {
        uint16_t dict_id = expansion.dictionary_ids[i];
        uint32_t mask = 0;

        switch (match_type) {
            case GAZETTEER_MATCH_IGNORABLE:
                mask = gazetteer_ignorable_components(dict_id);
                break;
            case GAZETTEER_MATCH_EDGE_IGNORABLE:
                mask = gazetteer_edge_ignorable_components(dict_id);
                break;
            case GAZETTEER_MATCH_POSSIBLE_ROOT:
                mask = gazetteer_possible_root_components(dict_id);
                break;
            case GAZETTEER_MATCH_SPECIFIER:
                mask = gazetteer_specifier_components(dict_id);
                break;
            case GAZETTEER_MATCH_VALID_COMPONENTS:
                mask = gazetteer_valid_components(dict_id);
                break;
            default:
                // Unknown match type: mask stays 0, dictionary never matches
                break;
        }

        if ((mask & address_components) != 0) {
            return true;
        }
    }

    return false;
}
635 
// True when some dictionary of this expansion marks it ignorable for the given components.
bool address_expansion_is_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_IGNORABLE);
}
639 
// True when some dictionary of this expansion marks it edge-ignorable for the given components.
bool address_expansion_is_edge_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_EDGE_IGNORABLE);
}
643 
// True when some dictionary of this expansion marks it a possible root word for the given components.
bool address_expansion_is_possible_root_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_POSSIBLE_ROOT);
}
647 
// True when some dictionary of this expansion marks it a standalone specifier for the given components.
bool address_expansion_is_specifier_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_SPECIFIER);
}
651 
// True when some dictionary of this expansion marks it valid content for the given components.
bool address_expansion_is_valid_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_VALID_COMPONENTS);
}
655 
656 
address_phrase_matches_type_for_components(phrase_t phrase,uint32_t address_components,gazetteer_match_type_t match_type)657 bool address_phrase_matches_type_for_components(phrase_t phrase, uint32_t address_components, gazetteer_match_type_t match_type) {
658     uint32_t expansion_index = phrase.data;
659     address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index);
660 
661     if (value == NULL) return false;
662 
663     address_expansion_array *expansions = value->expansions;
664     if (expansions == NULL) return false;
665 
666     for (size_t i = 0; i < expansions->n; i++) {
667         address_expansion_t expansion = expansions->a[i];
668 
669         if (address_expansion_matches_type_for_components(expansion, address_components, match_type)) {
670             return true;
671         }
672     }
673     return false;
674 }
675 
// True if any expansion of the phrase is ignorable for the given address components.
inline bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_IGNORABLE);
}
679 
// True if any expansion of the phrase is edge-ignorable (pre/post-directional style)
// for the given address components.
inline bool address_phrase_is_edge_ignorable_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_EDGE_IGNORABLE);
}
683 
684 
// True if any expansion of the phrase can be a possible root for the given address components.
inline bool address_phrase_is_possible_root_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_POSSIBLE_ROOT);
}
688 
// True if any expansion of the phrase is a specifier for the given address components.
inline bool address_phrase_is_specifier_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_SPECIFIER);
}
692 
// True if any expansion of the phrase has components overlapping the given address components.
inline bool address_phrase_is_valid_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_VALID_COMPONENTS);
}
696 
697 
address_phrase_contains_unambiguous_expansion(phrase_t phrase)698 bool address_phrase_contains_unambiguous_expansion(phrase_t phrase) {
699     address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data);
700     if (value == NULL) return false;
701 
702     address_expansion_array *expansions = value->expansions;
703     if (expansions == NULL) return false;
704 
705     address_expansion_t *expansions_array = expansions->a;
706 
707     for (size_t i = 0; i < expansions->n; i++) {
708         address_expansion_t expansion = expansions_array[i];
709         if (!address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION)) {
710             return true;
711         }
712     }
713     return false;
714 }
715 
add_string_alternatives_phrase_option(char * str,libpostal_normalize_options_t options,expansion_phrase_option_t phrase_option)716 string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) {
717     char_array *key = NULL;
718 
719     log_debug("input=%s\n", str);
720     token_array *token_array = tokenize_keep_whitespace(str);
721 
722     if (token_array == NULL) {
723         return NULL;
724     }
725 
726     size_t len = strlen(str);
727 
728     token_t *tokens = token_array->a;
729     size_t num_tokens = token_array->n;
730 
731     log_debug("tokenized, num tokens=%zu\n", num_tokens);
732 
733     bool last_was_punctuation = false;
734 
735     phrase_language_array *phrases = NULL;
736     phrase_array *lang_phrases = NULL;
737 
738     for (size_t i = 0; i < options.num_languages; i++)  {
739         char *lang = options.languages[i];
740         log_debug("lang=%s\n", lang);
741 
742         lang_phrases = search_address_dictionaries_tokens(str, token_array, lang);
743 
744         if (lang_phrases == NULL) {
745             log_debug("lang_phrases NULL\n");
746             continue;
747         }
748 
749         log_debug("lang_phrases->n = %zu\n", lang_phrases->n);
750 
751         phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);
752 
753         for (size_t j = 0; j < lang_phrases->n; j++) {
754             phrase_t p = lang_phrases->a[j];
755             log_debug("lang=%s, (%d, %d)\n", lang, p.start, p.len);
756             phrase_language_array_push(phrases, (phrase_language_t){lang, p});
757         }
758 
759         phrase_array_destroy(lang_phrases);
760     }
761 
762 
763     lang_phrases = search_address_dictionaries_tokens(str, token_array, ALL_LANGUAGES);
764     if (lang_phrases != NULL) {
765         phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);
766 
767         for (size_t j = 0; j < lang_phrases->n; j++) {
768             phrase_t p = lang_phrases->a[j];
769             phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p});
770         }
771         phrase_array_destroy(lang_phrases);
772 
773     }
774 
775     string_tree_t *tree = string_tree_new_size(len);
776 
777     bool last_added_was_whitespace = false;
778 
779     uint64_t normalize_string_options = get_normalize_string_options(options);
780 
781     if (phrases != NULL) {
782         log_debug("phrases not NULL, n=%zu\n", phrases->n);
783         ks_introsort(phrase_language_array, phrases->n, phrases->a);
784 
785         phrase_language_t phrase_lang;
786 
787         size_t start = 0;
788         size_t end = 0;
789 
790         phrase_t phrase = NULL_PHRASE;
791         phrase_t prev_phrase = NULL_PHRASE;
792 
793         key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN);
794 
795         log_debug("phrase_option = %d\n", phrase_option);
796 
797         bool delete_phrases = phrase_option == DELETE_PHRASES;
798         bool expand_phrases = phrase_option == EXPAND_PHRASES;
799 
800         size_t num_phrases = phrases->n;
801 
802         bool have_non_phrase_tokens = false;
803         bool have_non_phrase_word_tokens = false;
804         bool have_canonical_phrases = false;
805         bool have_ambiguous = false;
806         bool have_possible_root = false;
807         bool have_strictly_ignorable = false;
808         bool have_strictly_ignorable_abbreviation = false;
809 
810         size_t prev_phrase_end = 0;
811 
812         if (delete_phrases) {
813             for (size_t i = 0; i < num_phrases; i++) {
814                 phrase_lang = phrases->a[i];
815                 phrase = phrase_lang.phrase;
816 
817                 log_debug("phrase.start = %zu, prev_phrase_end = %zu\n", phrase.start, prev_phrase_end);
818 
819                 token_t inter_token;
820                 if (phrase.start > prev_phrase_end) {
821                     for (size_t j = prev_phrase_end; j < phrase.start; j++) {
822                         inter_token = tokens[j];
823                         if (!is_punctuation(inter_token.type) && !is_whitespace(inter_token.type)) {
824                             log_debug("have_non_phrase_tokens\n");
825                             have_non_phrase_tokens = true;
826                             have_non_phrase_word_tokens = have_non_phrase_word_tokens || is_word_token(inter_token.type);
827                             break;
828                         }
829                     }
830                 }
831 
832                 if (i == num_phrases - 1 && phrase.start + phrase.len < num_tokens) {
833                     for (size_t j = phrase.start + phrase.len; j < num_tokens; j++) {
834                         inter_token = tokens[j];
835                         if (!is_punctuation(inter_token.type) && !is_whitespace(inter_token.type)) {
836                             have_non_phrase_tokens = true;
837                             have_non_phrase_word_tokens = have_non_phrase_word_tokens || is_word_token(inter_token.type);
838                             break;
839                         }
840                     }
841                 }
842 
843                 bool phrase_is_ambiguous = address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION);
844                 bool phrase_is_strictly_ignorable = address_phrase_is_ignorable_for_components(phrase, options.address_components) && !phrase_is_ambiguous;
845                 bool phrase_is_canonical = address_phrase_has_canonical_interpretation(phrase);
846 
847                 have_non_phrase_tokens = have_non_phrase_tokens || (!phrase_is_strictly_ignorable && !phrase_is_ambiguous);
848                 log_debug("have_non_phrase_word_tokens = %d, phrase_is_strictly_ignorable = %d, phrase_is_ambiguous = %d\n", have_non_phrase_word_tokens, phrase_is_strictly_ignorable, phrase_is_ambiguous);
849                 if (!have_non_phrase_word_tokens && !phrase_is_strictly_ignorable && !phrase_is_ambiguous) {
850                     for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
851                         token_t pt = tokens[j];
852                         if (is_word_token(pt.type)) {
853                             log_debug("have_non_phrase_word_tokens\n");
854                             have_non_phrase_word_tokens = true;
855                             break;
856                         }
857                     }
858                 }
859 
860 
861                 have_strictly_ignorable = have_strictly_ignorable || phrase_is_strictly_ignorable;
862                 have_strictly_ignorable_abbreviation = have_strictly_ignorable_abbreviation || (phrase_is_strictly_ignorable && !phrase_is_canonical);
863                 if (have_strictly_ignorable_abbreviation) {
864                     log_debug("have_strictly_ignorable=%zu, phrase_is_canonical=%zu\n", have_strictly_ignorable, phrase_is_canonical);
865                 }
866 
867                 have_possible_root = have_possible_root | address_phrase_is_possible_root_for_components(phrase, options.address_components);
868 
869                 have_canonical_phrases = have_canonical_phrases || (phrase_is_canonical && !phrase_is_ambiguous);
870                 have_ambiguous = have_ambiguous || phrase_is_ambiguous;
871 
872                 prev_phrase_end = phrase.start + phrase.len;
873             }
874 
875 
876             log_debug("have_non_phrase_tokens = %d\n", have_non_phrase_tokens);
877             log_debug("have_canonical_phrases = %d\n", have_canonical_phrases);
878             log_debug("have_ambiguous = %d\n", have_ambiguous);
879             log_debug("have_strictly_ignorable = %d\n", have_strictly_ignorable);
880             log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation);
881         }
882 
883         bool skipped_last_edge_phrase = false;
884 
885         for (size_t i = 0; i < phrases->n; i++) {
886             phrase_lang = phrases->a[i];
887 
888             phrase = phrase_lang.phrase;
889 
890             log_debug("phrase.start=%d, phrase.len=%d, lang=%s, prev_phrase.start=%d, prev_phrase.len=%d\n", phrase.start, phrase.len, phrase_lang.language, prev_phrase.start, prev_phrase.len);
891 
892             if ((phrase.start > prev_phrase.start && phrase.start < prev_phrase.start + prev_phrase.len) || (phrase.start == prev_phrase.start && i > 0 && phrase.len < prev_phrase.len)) {
893                 log_debug("continuing\n");
894                 continue;
895             }
896 
897             char_array_clear(key);
898 
899             char_array_cat(key, phrase_lang.language);
900             char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);
901 
902             size_t namespace_len = key->n;
903 
904             end = phrase.start;
905 
906             log_debug("start=%zu, end=%zu\n", start, end);
907             for (size_t j = start; j < end; j++) {
908                 log_debug("Adding token %zu\n", j);
909                 token_t token = tokens[j];
910                 if (is_punctuation(token.type)) {
911                     last_was_punctuation = true;
912                     continue;
913                 }
914 
915                 if (token.type != WHITESPACE) {
916                     if ((phrase.start > 0 && last_was_punctuation) || (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) ) {
917                         log_debug("Adding space\n");
918                         string_tree_add_string(tree, " ");
919                         string_tree_finalize_token(tree);
920                     }
921                     log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
922 
923                     bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options);
924                     string_tree_finalize_token(tree);
925                     last_added_was_whitespace = false;
926                 } else if (!delete_phrases && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 ) {
927                     log_debug("Adding pre-phrase whitespace\n");
928                     last_added_was_whitespace = true;
929                     string_tree_add_string(tree, " ");
930                     string_tree_finalize_token(tree);
931                 } else {
932                     continue;
933                 }
934 
935                 last_was_punctuation = false;
936             }
937 
938             size_t added_expansions = 0;
939             token_t token;
940 
941             uint32_t expansion_index = phrase.data;
942             address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index);
943 
944             bool expansion_valid_components = (value->components & options.address_components) || address_phrase_is_valid_for_components(phrase, options.address_components);
945 
946             bool is_numeric_component = (value->components & options.address_components & NUMERIC_ADDRESS_COMPONENTS);
947 
948             if (expansion_valid_components) {
949                 key->n = namespace_len;
950                 for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
951                     token = tokens[j];
952                     if (token.type != WHITESPACE) {
953                         char_array_cat_len(key, str + token.offset, token.len);
954                         last_added_was_whitespace = false;
955                     } else if (!last_added_was_whitespace) {
956                         char_array_cat(key, " ");
957                         last_added_was_whitespace = true;
958                     }
959                 }
960 
961                 char *key_str = char_array_get_string(key);
962                 log_debug("key_str=%s\n", key_str);
963                 address_expansion_array *expansions = value->expansions;
964 
965                 if (expansions != NULL) {
966                     bool current_phrase_have_ambiguous = delete_phrases && address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION);
967                     bool added_pre_phrase_space = false;
968                     bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(phrase, options.address_components);
969                     bool current_phrase_have_edge_ignorable = false;
970 
971                     bool current_phrase_have_specifier = delete_phrases && address_phrase_is_specifier_for_components(phrase, options.address_components);
972                     bool current_phrase_have_canonical = delete_phrases && address_phrase_has_canonical_interpretation(phrase);
973                     bool current_phrase_have_possible_root = delete_phrases && address_phrase_is_possible_root_for_components(phrase, options.address_components);
974 
975                     bool current_phrase_have_valid = address_phrase_is_valid_for_components(phrase, options.address_components);
976 
977                     log_debug("current_phrase_have_specifier = %d\n", current_phrase_have_specifier);
978 
979                     bool current_phrase_have_unambiguous = delete_phrases && address_phrase_contains_unambiguous_expansion(phrase);
980 
981                     /*
982                     Edge phrase handling. This is primarily for handling pre-directionals/post-directionals
983                     in English and other languages.
984                     */
985                     bool skip_edge_phrase = false;
986                     bool other_phrase_is_ignorable = false;
987 
988                     if (delete_phrases) {
989                         phrase_language_t other_phrase_lang;
990                         phrase_t other_phrase;
991 
992                         log_debug("i = %zu, phrase.start = %u\n", i, phrase.start);
993                         if (i == 0 && phrase.start == 0 && phrase.start + phrase.len < num_tokens) {
994                             current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(phrase, options.address_components);
995                             // Delete "E" in "E 125th St"
996                             if (current_phrase_have_edge_ignorable) {
997                                 log_debug("edge-ignorable phrase [%u, %u]\n", phrase.start, phrase.start + phrase.len);
998                                 skip_edge_phrase = true;
999                             }
1000 
1001                             if (!skip_edge_phrase || !have_non_phrase_tokens) {
1002                                 for (size_t other_i = i + 1; other_i < phrases->n; other_i++) {
1003                                     other_phrase_lang = phrases->a[other_i];
1004                                     other_phrase = other_phrase_lang.phrase;
1005                                     log_debug("phrase.start + phrase.len = %u\n", phrase.start + phrase.len);
1006                                     log_debug("other_phrase.start = %u, other_phrase.len = %u, lang=%s\n", other_phrase.start, other_phrase.len, other_phrase_lang.language);
1007                                     if (other_phrase.start >= phrase.start + phrase.len && string_equals(other_phrase_lang.language, phrase_lang.language)) {
1008                                         if (other_phrase.start + other_phrase.len == num_tokens) {
1009                                             skip_edge_phrase = false;
1010                                             if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) {
1011                                                 // don't delete the "E" in "E St"
1012                                                 log_debug("initial phrase is edge ignorable out of two phrases. Checking next phrase is ignorable.\n");
1013 
1014                                                 skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components)));
1015                                                 log_debug("skip_edge_phrase = %d\n", skip_edge_phrase);
1016                                             } else {
1017                                                 log_debug("initial phrase is not edge-ignorable out of two phrases. Checking next phrase is edge ignorable.\n");
1018                                                 // delete "Avenue" in "Avenue E"
1019                                                 other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase));
1020                                                 skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));
1021 
1022                                             }
1023                                         } else {
1024                                             // If we encounter an ignorable phrase like St and we're _not_ the end of the string e.g. "E St SE", the first token is probably a legit token instead of a pre-directional
1025                                             skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !((address_phrase_has_canonical_interpretation(other_phrase) || address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components)) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components)));
1026                                             log_debug("phrase is possible root. skip_edge_phrase = %d\n", skip_edge_phrase);
1027                                         }
1028                                         break;
1029                                     }
1030                                 }
1031                             }
1032                         } else if (phrases->n > 1 && i == phrases->n - 1 && phrase.start + phrase.len == num_tokens && phrase.start > 0) {
1033                             current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(phrase, options.address_components);
1034                             if (current_phrase_have_edge_ignorable) {
1035                                 log_debug("edge-ignorable phrase [%u, %u]\n", phrase.start, phrase.start + phrase.len);
1036                                 skip_edge_phrase = true;
1037                             }
1038 
1039                             log_debug("have_non_phrase_tokens = %d\n", have_non_phrase_tokens);
1040                             if (!skip_edge_phrase || !have_non_phrase_tokens) {
1041                                 for (ssize_t other_j = i - 1; other_j >= 0; other_j--) {
1042                                     other_phrase_lang = phrases->a[other_j];
1043                                     other_phrase = other_phrase_lang.phrase;
1044                                     log_debug("phrase.start + phrase.len = %u\n", phrase.start + phrase.len);
1045                                     log_debug("other_phrase.start = %u, other_phrase.len = %u, lang=%s\n", other_phrase.start, other_phrase.len, other_phrase_lang.language);
1046                                     if (other_phrase.start + other_phrase.len <= phrase.start && string_equals(other_phrase_lang.language, phrase_lang.language)) {
1047                                         if (other_phrase.start == 0) {
1048                                             //other_phrase_invalid = address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !address_phrase_has_canonical_interpretation(other_phrase) && !address_phrase_is_possible_root_for_components(other_phrase, options.address_components);
1049                                             skip_edge_phrase = false;
1050                                             if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) {
1051                                                 // don't delete the "E" in "Avenue E"
1052                                                 log_debug("final phrase is edge ignorable out of two phrases. Checking previous phrase is ignorable.\n");
1053 
1054                                                 skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))) && string_tree_num_tokens(tree) > 0;
1055                                             } else {
1056                                                 log_debug("final phrase is not edge-ignorable out of two phrases. Checking previous phrase is edge ignorable.\n");
1057                                                 // delete "St" in "E St"
1058                                                 other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase));
1059                                                 skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));
1060 
1061                                                 //skip_edge_phrase = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components);
1062                                             }
1063                                         }
1064                                         break;
1065                                     }
1066                                 }
1067                             }
1068                         }
1069                     }
1070 
1071                     if (phrase.start == prev_phrase.start && phrase.len == prev_phrase.len && skipped_last_edge_phrase) {
1072                         skip_edge_phrase = true;
1073                     }
1074 
1075                     for (size_t j = 0; j < expansions->n; j++) {
1076                         if (skip_edge_phrase) {
1077                             skipped_last_edge_phrase = true;
1078                             log_debug("skip edge phrase\n");
1079                             continue;
1080                         } else {
1081                             skipped_last_edge_phrase = false;
1082                         }
1083 
1084                         address_expansion_t expansion = expansions->a[j];
1085 
1086                         bool current_phrase_ignorable = false;
1087                         bool current_phrase_expandable = expand_phrases && expansion.canonical_index != NULL_CANONICAL_INDEX;
1088 
1089                         bool is_ambiguous = address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION);
1090                         bool is_valid_for_components = address_expansion_is_valid_for_components(expansion, options.address_components);
1091 
1092                         if (delete_phrases) {
1093                             bool is_ignorable = address_expansion_is_ignorable_for_components(expansion, options.address_components);
1094                             bool is_canonical = expansion.canonical_index == NULL_CANONICAL_INDEX;
1095 
1096                             log_debug("is_ignorable = %d, is_canonical = %d, is_ambiguous = %d, current_phrase_have_ambiguous = %d, current_phrase_have_unambiguous = %d, have_strictly_ignorable = %d, current_phrase_have_ignorable=%d, current_phrase_have_possible_root=%d\n", is_ignorable, is_canonical, is_ambiguous, current_phrase_have_ambiguous, current_phrase_have_unambiguous, have_strictly_ignorable, current_phrase_have_ignorable, current_phrase_have_possible_root);
1097 
1098                             current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous;
1099 
1100                             if (!is_canonical) {
1101                                 char *canon = address_dictionary_get_canonical(expansion.canonical_index);
1102                                 log_debug("canonical = %s\n", canon);
1103                             }
1104 
1105                             // Edge phrase calculations from above
1106                             if (current_phrase_have_edge_ignorable || other_phrase_is_ignorable) {
1107                                 log_debug("current_phrase_have_edge_ignorable\n");
1108                                 log_debug("skip_edge_phrase = %d\n", skip_edge_phrase);
1109                                 current_phrase_ignorable = skip_edge_phrase;
1110                             // Don't delete "PH" in "PH 1" for unit expansions
1111                             } else if (is_ignorable && current_phrase_have_specifier) {
1112                                 log_debug("current_phrase_have_specifier\n");
1113                                 current_phrase_ignorable = false;
1114                             // Delete "Avenue" in "5th Avenue"
1115                             } else if (is_ignorable && is_canonical && !current_phrase_have_ambiguous) {
1116                                 log_debug("is_ignorable && is_canonical && !current_phrase_have_ambiguous\n");
1117                                 current_phrase_ignorable = have_non_phrase_tokens || (have_possible_root && !current_phrase_have_possible_root) || string_tree_num_tokens(tree) > 0;
1118                                 log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable);
1119                             // Delete "Ave" in "5th Ave" or "Pl" in "Park Pl S"
1120                             } else if (is_ignorable && !is_canonical && !is_ambiguous && !current_phrase_have_ambiguous) {
1121                                 log_debug("is_ignorable && !is_canonical && !current_phrase_have_ambiguous\n");
1122                                 current_phrase_ignorable = have_non_phrase_tokens || (have_possible_root && !current_phrase_have_possible_root) || string_tree_num_tokens(tree) > 0;
1123                                 log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable);
1124                             } else if (current_phrase_have_ambiguous && (have_non_phrase_word_tokens || is_numeric_component || have_canonical_phrases || have_possible_root)) {
1125                                 log_debug("current_phrase_have_ambiguous && have_non_phrase_tokens = %d, have_canonical_phrases = %d, have_possible_root = %d, have_non_phrase_word_tokens = %d, is_numeric_component = %d, have_non_phrase_tokens = %d\n", have_non_phrase_tokens, have_canonical_phrases, have_possible_root, have_non_phrase_word_tokens, is_numeric_component, have_non_phrase_tokens);
1126                                 current_phrase_ignorable = (is_ignorable && !(have_possible_root && !current_phrase_have_possible_root)) || (current_phrase_have_ambiguous && (have_non_phrase_word_tokens || (is_numeric_component && have_non_phrase_tokens)) && current_phrase_have_ignorable && current_phrase_have_unambiguous);
1127                                 log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable);
1128                             } else if (!is_valid_for_components && !is_ambiguous) {
1129                                 log_debug("!is_valid_for_components\n");
1130                                 current_phrase_ignorable = current_phrase_have_ignorable || current_phrase_have_valid;
1131                                 log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable);
1132                             } else {
1133                                 log_debug("none of the above\n");
1134                             }
1135 
1136                             if (!current_phrase_ignorable && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 && !added_pre_phrase_space) {
1137                                 log_debug("Adding space\n");
1138                                 string_tree_add_string(tree, " ");
1139                                 string_tree_finalize_token(tree);
1140                                 last_added_was_whitespace = true;
1141                                 added_pre_phrase_space = true;
1142                             }
1143 
1144                         }
1145 
1146                         if (current_phrase_ignorable) {
1147                             continue;
1148                         }
1149 
1150                         if (delete_phrases) {
1151                             current_phrase_expandable = !current_phrase_ignorable;
1152                         } else {
1153                             current_phrase_expandable = (expansion.address_components & options.address_components) || is_valid_for_components;
1154                         }
1155 
1156                         log_debug("current_phrase_expandable = %d\n", current_phrase_expandable);
1157 
1158                         log_debug("expansion.canonical_index = %d\n", expansion.canonical_index);
1159 
1160                         if (expansion.canonical_index != NULL_CANONICAL_INDEX && current_phrase_expandable) {
1161                             log_debug("expansion.canonical_index != NULL_CANONICAL_INDEX, delete_phrases = %d, phrase_option = %d\n", delete_phrases, phrase_option);
1162                             char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
1163                             char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options);
1164 
1165                             canonical = canonical_normalized != NULL ? canonical_normalized : canonical;
1166 
1167                             if (phrase.start + phrase.len < num_tokens - 1) {
1168                                 token_t next_token = tokens[phrase.start + phrase.len];
1169                                 if (!is_numeric_token(next_token.type)) {
1170                                     log_debug("non-canonical phrase, adding canonical string: %s\n", canonical);
1171                                     string_tree_add_string(tree, canonical);
1172                                     last_added_was_whitespace = false;
1173                                 } else {
1174                                     log_debug("adding canonical with cstring_array methods: %s\n", canonical);
1175                                     uint32_t start_index = cstring_array_start_token(tree->strings);
1176                                     cstring_array_append_string(tree->strings, canonical);
1177                                     cstring_array_append_string(tree->strings, " ");
1178                                     last_added_was_whitespace = true;
1179                                     cstring_array_terminate(tree->strings);
1180                                 }
1181                             } else {
1182                                 log_debug("adding canonical: %s\n", canonical);
1183                                 string_tree_add_string(tree, canonical);
1184                                 last_added_was_whitespace = false;
1185                             }
1186 
1187                             if (canonical_normalized != NULL) {
1188                                 free(canonical_normalized);
1189                             }
1190                         } else if (expansion.canonical_index == NULL_CANONICAL_INDEX || !current_phrase_expandable) {
1191                             log_debug("canonical phrase, adding canonical string\n");
1192 
1193                             uint32_t start_index = cstring_array_start_token(tree->strings);
1194                             for (size_t k = phrase.start; k < phrase.start + phrase.len; k++) {
1195                                 token = tokens[k];
1196                                 if (token.type != WHITESPACE) {
1197                                     cstring_array_append_string_len(tree->strings, str + token.offset, token.len);
1198                                     last_added_was_whitespace = false;
1199                                 } else {
1200                                     log_debug("space\n");
1201                                     cstring_array_append_string(tree->strings, " ");
1202                                     last_added_was_whitespace = true;
1203                                 }
1204                             }
1205                             cstring_array_terminate(tree->strings);
1206                         } else {
1207                             continue;
1208                         }
1209 
1210                         added_expansions++;
1211                     }
1212 
1213                 }
1214             }
1215 
1216             log_debug("expansion_valid_components == %d\n", expansion_valid_components);
1217 
1218             if (added_expansions == 0 && (!delete_phrases || !expansion_valid_components)) {
1219                 if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
1220                     log_debug("Adding space\n");
1221                     string_tree_add_string(tree, " ");
1222                     string_tree_finalize_token(tree);
1223                     last_added_was_whitespace = true;
1224                 }
1225 
1226                 uint32_t start_index = cstring_array_start_token(tree->strings);
1227 
1228                 for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
1229                     token = tokens[j];
1230 
1231                     if (token.type != WHITESPACE) {
1232                         log_debug("Adding canonical token, %.*s\n", (int)token.len, str + token.offset);
1233                         cstring_array_append_string_len(tree->strings, str + token.offset, token.len);
1234                         last_added_was_whitespace = false;
1235                     } else if (!last_added_was_whitespace) {
1236                         log_debug("Adding space\n");
1237                         cstring_array_append_string(tree->strings, " ");
1238                         last_added_was_whitespace = true;
1239                     }
1240 
1241                 }
1242 
1243                 cstring_array_terminate(tree->strings);
1244 
1245             }
1246 
1247             if (!delete_phrases || !expansion_valid_components || added_expansions > 0) {
1248                 log_debug("i=%zu\n", i);
1249                 bool end_of_phrase = false;
1250                 if (i < phrases->n - 1) {
1251                     phrase_t next_phrase = phrases->a[i + 1].phrase;
1252                     end_of_phrase = (next_phrase.start != phrase.start || next_phrase.len != phrase.len);
1253                 } else {
1254                     end_of_phrase = true;
1255                 }
1256 
1257                 log_debug("end_of_phrase=%d\n", end_of_phrase);
1258                 if (end_of_phrase) {
1259                     log_debug("finalize at i=%zu\n", i);
1260                     string_tree_finalize_token(tree);
1261                 }
1262             }
1263 
1264             start = phrase.start + phrase.len;
1265             prev_phrase = phrase;
1266 
1267         }
1268 
1269         char_array_destroy(key);
1270 
1271         end = (int)num_tokens;
1272 
1273         if (phrase.start + phrase.len > 0 && phrase.start + phrase.len <= end - 1 && !last_added_was_whitespace) {
1274             token_t next_token = tokens[phrase.start + phrase.len];
1275             if (next_token.type != WHITESPACE && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 && !is_ideographic(next_token.type)) {
1276                 log_debug("space after phrase\n");
1277                 string_tree_add_string(tree, " ");
1278                 last_added_was_whitespace = true;
1279                 string_tree_finalize_token(tree);
1280             }
1281         }
1282 
1283 
1284         for (size_t j = start; j < end; j++) {
1285             log_debug("On token %zu\n", j);
1286             token_t token = tokens[j];
1287             if (is_punctuation(token.type)) {
1288                 log_debug("last_was_punctuation\n");
1289                 last_was_punctuation = true;
1290                 continue;
1291             }
1292 
1293             if (token.type != WHITESPACE) {
1294                 if (j > 0 && last_was_punctuation && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
1295                     log_debug("Adding another space\n");
1296                     string_tree_add_string(tree, " ");
1297                     string_tree_finalize_token(tree);
1298                 }
1299                 log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
1300 
1301                 bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options);
1302                 last_added_was_whitespace = false;
1303             } else if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
1304                 log_debug("Adding space IV\n");
1305                 string_tree_add_string(tree, " ");
1306                 last_added_was_whitespace = true;
1307             } else {
1308                 log_debug("Skipping token %zu\n", j);
1309                 continue;
1310             }
1311 
1312             last_was_punctuation = false;
1313             string_tree_finalize_token(tree);
1314 
1315         }
1316 
1317     } else {
1318         log_debug("phrases NULL\n");
1319         for (size_t j = 0; j < num_tokens; j++) {
1320             log_debug("On token %zu\n", j);
1321             token_t token = tokens[j];
1322             if (is_punctuation(token.type)) {
1323                 log_debug("punctuation, skipping\n");
1324                 last_was_punctuation = true;
1325                 continue;
1326             }
1327 
1328             if (token.type != WHITESPACE) {
1329                 if (last_was_punctuation && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
1330                     log_debug("Adding space V\n");
1331                     string_tree_add_string(tree, " ");
1332                     string_tree_finalize_token(tree);
1333                 }
1334 
1335                 bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options);
1336                 last_added_was_whitespace = false;
1337             } else if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
1338                 log_debug("Adding space VI\n");
1339                 string_tree_add_string(tree, " ");
1340                 last_added_was_whitespace = true;
1341             } else {
1342                 continue;
1343             }
1344 
1345             last_was_punctuation = false;
1346             string_tree_finalize_token(tree);
1347         }
1348     }
1349 
1350     if (phrases != NULL) {
1351         phrase_language_array_destroy(phrases);
1352     }
1353 
1354     token_array_destroy(token_array);
1355 
1356     return tree;
1357 }
1358 
normalize_ordinal_suffixes(string_tree_t * tree,char * str,char * lang,token_t token,size_t i,token_t prev_token,libpostal_normalize_options_t options)1359 inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) {
1360     size_t len_ordinal_suffix = valid_ordinal_suffix_len(str, token, prev_token, lang);
1361 
1362     if (len_ordinal_suffix > 0) {
1363         cstring_array *strings = tree->strings;
1364         // Add the original form first. When this function returns true,
1365         // add_normalized_strings_token won't be called a second time.
1366         add_normalized_strings_token(strings, str, token, options);
1367         token_t normalized_token = token;
1368         normalized_token.len = token.len - len_ordinal_suffix;
1369         add_normalized_strings_token(strings, str, normalized_token, options);
1370         return true;
1371     }
1372 
1373     return false;
1374 }
1375 
inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) {
    cstring_array *strings = tree->strings;

    token_t prev_token = (token_t){0, 0, 0};

    for (size_t i = 0; i < tokens->n; i++) {
        token_t token = tokens->a[i];

        // Special tokens (URLs, emails, etc.) pass through verbatim.
        if (is_special_token(token.type)) {
            string_tree_add_string_len(tree, str + token.offset, token.len);
            string_tree_finalize_token(tree);
            continue;
        }

        // Try each language in turn: affix expansion first, then ordinal
        // suffix normalization. The first language that handles the token
        // wins; both helpers add their own strings when they return true.
        bool handled = false;
        for (size_t j = 0; j < options.num_languages && !handled; j++) {
            char *lang = options.languages[j];
            handled = expand_affixes(tree, str, lang, token, options) ||
                      normalize_ordinal_suffixes(tree, str, lang, token, i, prev_token, options);
        }

        // No language-specific expansion applied: add the plain
        // normalized form of the token.
        if (!handled) {
            add_normalized_strings_token(strings, str, token, options);
        }

        string_tree_finalize_token(tree);
        prev_token = token;
    }

}
1414 
1415 
/**
 * Expand a single normalized string into its alternative forms, appending
 * every expansion not already seen to the output array.
 *
 * strings: output array of expansions (appended to).
 * unique_strings: dedupe set shared across calls. Keys added here are
 *     heap-allocated copies; ownership transfers to the set and the caller
 *     frees them when destroying it.
 * str: NUL-terminated input (one normalization of the address).
 *
 * If the number of permutations at either the tokenization level or the
 * alternatives level reaches EXCESSIVE_PERMUTATIONS, falls back to adding
 * the tokenized string itself rather than enumerating every combination.
 */
void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) {
    size_t len = strlen(str);
    token_array *tokens = tokenize_keep_whitespace(str);
    string_tree_t *token_tree = string_tree_new_size(len);

    add_normalized_strings_tokenized(token_tree, str, tokens, options);

    string_tree_iterator_t *tokenized_iter = string_tree_iterator_new(token_tree);

    string_tree_iterator_t *iter;

    char_array *temp_string = char_array_new_size(len);

    char *token;

    bool excessive_perms_outer = tokenized_iter->remaining >= EXCESSIVE_PERMUTATIONS;

    // Reserve space for the expected number of new strings up front;
    // only worthwhile when we will actually enumerate the permutations.
    if (!excessive_perms_outer) {
        kh_resize(str_set, unique_strings, kh_size(unique_strings) + tokenized_iter->remaining);
    }

    log_debug("tokenized_iter->remaining=%d\n", tokenized_iter->remaining);

    for (; !string_tree_iterator_done(tokenized_iter); string_tree_iterator_next(tokenized_iter)) {
        char_array_clear(temp_string);

        string_tree_iterator_foreach_token(tokenized_iter, token, {
            if (token == NULL) {
                continue;
            }
            char_array_append(temp_string, token);
        })
        char_array_terminate(temp_string);

        char *tokenized_str = char_array_get_string(temp_string);

        string_tree_t *alternatives;

        int ret;
        log_debug("Adding alternatives for single normalization\n");
        alternatives = add_string_alternatives_phrase_option(tokenized_str, options, phrase_option);

        // Check for NULL before touching the tree: the debug log below
        // dereferences it via string_tree_num_strings.
        if (alternatives == NULL) {
            log_debug("alternatives = NULL\n");
            continue;
        }

        log_debug("num strings = %" PRIu32 "\n", string_tree_num_strings(alternatives));

        iter = string_tree_iterator_new(alternatives);
        log_debug("iter->num_tokens=%d\n", iter->num_tokens);
        log_debug("iter->remaining=%d\n", iter->remaining);

        bool excessive_perms_inner = iter->remaining >= EXCESSIVE_PERMUTATIONS;

        if (!excessive_perms_inner && !excessive_perms_outer) {
            for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) {
                char_array_clear(temp_string);
                string_tree_iterator_foreach_token(iter, token, {
                    if (token == NULL) {
                        log_debug("token=NULL\n");
                    } else {
                        log_debug("token=%s\n", token);
                        char_array_append(temp_string, token);
                    }
                })
                char_array_terminate(temp_string);

                token = char_array_get_string(temp_string);

                size_t token_len = strlen(token);

                if (token_len == 0) continue;

                size_t left_spaces = string_left_spaces_len(token, token_len);
                size_t right_spaces = string_right_spaces_len(token, token_len);

                // Skip strings that are entirely whitespace
                if (left_spaces + right_spaces == token_len) {
                    continue;
                }

                char *dupe_token = strndup(token + left_spaces, token_len - left_spaces - right_spaces);

                log_debug("full string=%s\n", token);
                khiter_t k = kh_get(str_set, unique_strings, dupe_token);

                if (k == kh_end(unique_strings)) {
                    // New expansion: postprocess it into the output and
                    // transfer ownership of dupe_token to the set.
                    log_debug("doing postprocessing\n");
                    add_postprocessed_string(strings, dupe_token, options);
                    k = kh_put(str_set, unique_strings, dupe_token, &ret);
                } else {
                    // Already emitted: discard the duplicate copy.
                    free(dupe_token);
                }

                log_debug("iter->remaining = %d\n", iter->remaining);

            }
        } else {
            // Too many permutations: add the tokenized string as-is.
            cstring_array_add_string(strings, tokenized_str);
        }

        string_tree_iterator_destroy(iter);
        string_tree_destroy(alternatives);

        if (excessive_perms_outer) {
            break;
        }
    }

    string_tree_iterator_destroy(tokenized_iter);
    string_tree_destroy(token_tree);

    token_array_destroy(tokens);

    char_array_destroy(temp_string);
}
1536 
1537 
1538 
/**
 * Shared implementation behind expand_address (EXPAND_PHRASES) and
 * expand_address_root (DELETE_PHRASES).
 *
 * Normalizes the input (possibly into multiple alternatives when several
 * languages apply), expands each normalization, dedupes the results, and
 * returns a cstring_array of expansions. *n receives the count.
 * Caller owns the returned array.
 */
cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) {
    // Expansion always considers every address component.
    options.address_components |= LIBPOSTAL_ADDRESS_ANY;

    uint64_t normalize_string_options = get_normalize_string_options(options);

    size_t len = strlen(input);

    language_classifier_response_t *lang_response = NULL;

    // No languages supplied by the caller: classify the input and use the
    // detected languages for the remainder of the expansion. The response
    // owns the language strings, so it is kept alive until the end.
    if (options.num_languages == 0) {
         lang_response = classify_languages(input);
         if (lang_response != NULL) {
            options.num_languages = lang_response->num_languages;
            options.languages = lang_response->languages;
         }
    }

    // NOTE(review): tree is not checked for NULL before use below —
    // presumably normalize_string_languages cannot fail here; confirm.
    string_tree_t *tree = normalize_string_languages(input, normalize_string_options, options.num_languages, options.languages);

    cstring_array *strings = cstring_array_new_size(len * 2);
    char_array *temp_string = char_array_new_size(len);

    // Dedupe set; keys are heap-allocated strings freed in the loop below.
    khash_t(str_set) *unique_strings = kh_init(str_set);

    char *token;

    log_debug("string_tree_num_tokens(tree) = %d\n", string_tree_num_tokens(tree));

    if (string_tree_num_strings(tree) == 1) {
        // Single normalization: expand it directly.
        char *normalized = string_tree_get_alternative(tree, 0, 0);
        expand_alternative_phrase_option(strings, unique_strings, normalized, options, phrase_option);

    } else {
        log_debug("Adding alternatives for multiple normalizations\n");
        string_tree_iterator_t *iter = string_tree_iterator_new(tree);

        // Enumerate each combination of per-segment normalizations,
        // joining segments with single spaces, and expand each result.
        for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) {
            char *segment;
            char_array_clear(temp_string);
            bool is_first = true;

            string_tree_iterator_foreach_token(iter, segment, {
                if (!is_first) {
                    char_array_append(temp_string, " ");
                }
                char_array_append(temp_string, segment);
                is_first = false;
            })
            char_array_terminate(temp_string);
            token = char_array_get_string(temp_string);
            log_debug("current permutation = %s\n", token);
            expand_alternative_phrase_option(strings, unique_strings, token, options, phrase_option);
        }

        string_tree_iterator_destroy(iter);
    }

    // Free the dedupe set's keys (ownership was transferred to the set by
    // expand_alternative_phrase_option) before destroying the set itself.
    char *key_str = NULL;
    for (size_t i = kh_begin(unique_strings); i != kh_end(unique_strings); ++i) {
        if (!kh_exist(unique_strings, i)) continue;
        key_str = (char *)kh_key(unique_strings, i);
        free(key_str);
    }

    kh_destroy(str_set, unique_strings);

    if (lang_response != NULL) {
        language_classifier_response_destroy(lang_response);
    }

    char_array_destroy(temp_string);
    string_tree_destroy(tree);

    *n = cstring_array_num_strings(strings);

    return strings;

}
1617 
// Expand an address into its normalized alternative forms, expanding
// dictionary phrases (abbreviations, etc.). *n receives the number of
// expansions; caller owns the returned array (see expansion_array_destroy).
cstring_array *expand_address(char *input, libpostal_normalize_options_t options, size_t *n) {
    return expand_address_phrase_option(input, options, n, EXPAND_PHRASES);
}
1621 
// Expand an address into root forms, deleting ignorable dictionary phrases
// rather than expanding them. *n receives the number of expansions;
// caller owns the returned array (see expansion_array_destroy).
cstring_array *expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) {
    return expand_address_phrase_option(input, options, n, DELETE_PHRASES);
}
1625 
1626 
/**
 * Free an array of n expansion strings along with the array itself.
 *
 * Safe to call with expansions == NULL (no-op); previously a NULL array
 * would have been indexed in the loop, which is undefined behavior.
 */
void expansion_array_destroy(char **expansions, size_t n) {
    if (expansions == NULL) {
        return;
    }
    for (size_t i = 0; i < n; i++) {
        free(expansions[i]);
    }
    free(expansions);
}
1633 
1634