1 #include <stdlib.h>
2
3 #include "expand.h"
4
5 #include "log/log.h"
6
7 #include "address_dictionary.h"
8 #include "collections.h"
9 #include "constants.h"
10 #include "language_classifier.h"
11 #include "numex.h"
12 #include "normalize.h"
13 #include "scanner.h"
14 #include "string_utils.h"
15 #include "token_types.h"
16 #include "transliterate.h"
17
18
// Initial capacity for the scratch char_array used to build expansion keys.
#define DEFAULT_KEY_LEN 32

// Threshold for an excessive number of candidate permutations.
// NOTE(review): not referenced in this portion of the file — presumably used
// later to cap combinatorial expansion; confirm against the rest of the file.
#define EXCESSIVE_PERMUTATIONS 100
22
get_normalize_token_options(libpostal_normalize_options_t options)23 inline uint64_t get_normalize_token_options(libpostal_normalize_options_t options) {
24 uint64_t normalize_token_options = 0;
25
26 normalize_token_options |= options.delete_final_periods ? NORMALIZE_TOKEN_DELETE_FINAL_PERIOD : 0;
27 normalize_token_options |= options.delete_acronym_periods ? NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS : 0;
28 normalize_token_options |= options.drop_english_possessives ? NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES : 0;
29 normalize_token_options |= options.delete_apostrophes ? NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE : 0;
30
31 return normalize_token_options;
32 }
33
get_normalize_string_options(libpostal_normalize_options_t options)34 inline uint64_t get_normalize_string_options(libpostal_normalize_options_t options) {
35 uint64_t normalize_string_options = 0;
36 normalize_string_options |= options.transliterate ? NORMALIZE_STRING_TRANSLITERATE : 0;
37 normalize_string_options |= options.latin_ascii ? NORMALIZE_STRING_LATIN_ASCII : 0;
38 normalize_string_options |= options.decompose ? NORMALIZE_STRING_DECOMPOSE : 0;
39 normalize_string_options |= options.strip_accents ? NORMALIZE_STRING_STRIP_ACCENTS : 0;
40 normalize_string_options |= options.lowercase ? NORMALIZE_STRING_LOWERCASE : 0;
41 normalize_string_options |= options.trim_string ? NORMALIZE_STRING_TRIM : 0;
42 normalize_string_options |= options.expand_numex ? NORMALIZE_STRING_REPLACE_NUMEX : 0;
43
44 return normalize_string_options;
45 }
46
/* Append the normalized variant(s) of a single token to strings.
 *
 * Whitespace tokens become a single " ". Tokens without internal hyphens are
 * normalized once. Hyphenated word/numeric tokens may produce several
 * variants (hyphen kept, replaced with a space, or deleted) depending on the
 * options flags, and numeric tokens may additionally have alpha characters
 * split from the digits unless a language-valid ordinal suffix is present.
 *
 * strings: output array, one entry appended per variant
 * str:     the full input string; token offsets index into it
 * token:   the token to normalize (taken by value; trimmed locally)
 * options: public normalize options, converted to token flags here
 */
void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) {

    uint64_t normalize_token_options = get_normalize_token_options(options);

    if (token.type != WHITESPACE ) {

        bool contains_hyphen = string_contains_hyphen_len(str + token.offset, token.len);

        if (!contains_hyphen || token.type == HYPHEN) {
            log_debug("str = %s, token = {%zu, %zu, %u}\n", str, token.offset, token.len, token.type);
            normalize_token(strings, str, token, normalize_token_options);
        } else if (is_word_token(token.type)) {

            // Trim leading hyphens. BUGFIX: the offset was advanced without
            // shrinking the length, so the token extended prefix_hyphen_len
            // bytes past its original end (the prefix-affix code at the
            // bottom of this file adjusts both; this now matches it).
            size_t prefix_hyphen_len = string_hyphen_prefix_len(str + token.offset, token.len);
            if (prefix_hyphen_len > 0) {
                token.offset += prefix_hyphen_len;
                token.len -= prefix_hyphen_len;
            }

            // Trim trailing hyphens
            size_t suffix_hyphen_len = string_hyphen_suffix_len(str + token.offset, token.len);
            if (suffix_hyphen_len > 0) {
                token.len -= suffix_hyphen_len;
            }

            // Base variant with internal hyphens preserved
            normalize_token(strings, str, token, normalize_token_options);

            if (options.replace_word_hyphens) {
                // Variant with hyphens replaced by spaces
                normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS;
                normalize_token(strings, str, token, normalize_token_options);
                normalize_token_options ^= NORMALIZE_TOKEN_REPLACE_HYPHENS;
            }

            if (options.delete_word_hyphens) {
                // Variant with hyphens removed entirely
                normalize_token_options |= NORMALIZE_TOKEN_DELETE_HYPHENS;
                normalize_token(strings, str, token, normalize_token_options);
                normalize_token_options ^= NORMALIZE_TOKEN_DELETE_HYPHENS;
            }

        } else if (is_numeric_token(token.type)) {

            // Base variant
            normalize_token(strings, str, token, normalize_token_options);

            if (options.replace_word_hyphens || options.replace_numeric_hyphens) {
                if (options.replace_word_hyphens) {
                    normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS;
                }

                if (options.replace_numeric_hyphens) {
                    normalize_token_options |= NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS;
                }

                normalize_token(strings, str, token, normalize_token_options);
                normalize_token_options ^= NORMALIZE_TOKEN_REPLACE_HYPHENS | NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS;
            }

            if (options.delete_numeric_hyphens) {
                normalize_token_options |= NORMALIZE_TOKEN_DELETE_HYPHENS;
                normalize_token(strings, str, token, normalize_token_options);
                normalize_token_options ^= NORMALIZE_TOKEN_DELETE_HYPHENS;
            }
        }

        if (is_numeric_token(token.type) && options.split_alpha_from_numeric) {
            bool split_alpha_from_numeric = true;

            // Do not split e.g. ordinals like "2nd" where the alpha part is a
            // valid ordinal suffix in one of the requested languages.
            for (size_t i = 0; i < options.num_languages; i++) {
                char *lang = options.languages[i];
                if (valid_ordinal_suffix_len(str, token, NULL_TOKEN, lang) > 1) {
                    split_alpha_from_numeric = false;
                    break;
                }
            }

            if (split_alpha_from_numeric) {
                normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC;
                normalize_token(strings, str, token, normalize_token_options);
                normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC;
            }
        }
    } else {
        // Whitespace collapses to a single space
        cstring_array_add_string(strings, " ");
    }
}
129
/* Append str to strings, plus (when options.roman_numerals is set) a second
 * variant with Latin numeric expressions replaced, if one is produced. */
void add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options) {
    cstring_array_add_string(strings, str);

    if (!options.roman_numerals) {
        return;
    }

    char *numex_replaced = replace_numeric_expressions(str, LATIN_LANGUAGE_CODE);
    if (numex_replaced == NULL) {
        return;
    }

    cstring_array_add_string(strings, numex_replaced);
    free(numex_replaced);
}
143
valid_affix_expansions(phrase_t phrase,libpostal_normalize_options_t options)144 address_expansion_array *valid_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options) {
145 uint32_t expansion_index = phrase.data;
146 address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index);
147 if (value != NULL && value->components & options.address_components) {
148 return value->expansions;
149 }
150
151 return NULL;
152 }
153
/* Append the expansion's canonical form (normalized if possible) to key,
 * or the raw matched substring when no canonical form exists. */
inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options) {
    if (expansion.canonical_index == NULL_CANONICAL_INDEX) {
        // No canonical form: copy the matched span of the token verbatim
        char_array_cat_len(key, str + token.offset + phrase.start, phrase.len);
        return;
    }

    char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
    uint64_t normalize_string_options = get_normalize_string_options(options);
    char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options);

    if (canonical_normalized != NULL) {
        char_array_cat(key, canonical_normalized);
        free(canonical_normalized);
    } else {
        char_array_cat(key, canonical);
    }
}
169
170
/* Expand a token that matched a dictionary prefix and/or suffix phrase.
 *
 * Builds the cross-product of (prefix expansions) x (normalized root
 * variants) x (suffix expansions), with and without separating spaces where
 * an expansion is separable, and appends each result to tree->strings.
 *
 * with_period: true when the affixes were matched around a period inside the
 * token (e.g. "st.louis"); skip_period then skips that byte of the token.
 *
 * Returns true if expansions were added; false if neither affix is usable
 * (in the prefix-only/too-short case, the token's plain normalized forms are
 * still added before returning false).
 *
 * NOTE(review): the lang parameter is not used in this function.
 */
bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options, bool with_period) {
    cstring_array *strings = tree->strings;

    size_t skip_period = with_period ? 1 : 0;

    // An affix is only usable if it is a proper affix (shorter than the token)
    bool have_suffix = suffix.len > 0 && suffix.len < token.len;
    bool have_prefix = prefix.len > 0 && prefix.len + with_period < token.len;

    if (!have_suffix && !have_prefix) {
        return false;
    }

    address_expansion_array *prefix_expansions = NULL;
    address_expansion_array *suffix_expansions = NULL;

    address_expansion_t prefix_expansion;
    address_expansion_t suffix_expansion;

    char *expansion;

    size_t num_strings = 0;
    char *root_word = NULL;
    size_t root_len;
    token_t root_token;
    cstring_array *root_strings = NULL;
    int add_space = 0;
    int spaces = 0;

    size_t prefix_start, prefix_end, root_end, suffix_start;

    // Drop an affix whose expansions don't apply to the requested components
    if (have_prefix) {
        prefix_expansions = valid_affix_expansions(prefix, options);
        if (prefix_expansions == NULL) have_prefix = false;
    }

    if (have_suffix) {
        suffix_expansions = valid_affix_expansions(suffix, options);
        if (suffix_expansions == NULL) have_suffix = false;
    }

    if (!have_suffix && !have_prefix) {
        return false;
    }

    // Scratch buffer; key->n is rewound to checkpoints to reuse shared
    // prefixes of the string being built
    char_array *key = char_array_new_size(token.len);

    if (have_prefix && have_suffix) {
        for (size_t i = 0; i < prefix_expansions->n; i++) {
            prefix_expansion = prefix_expansions->a[i];
            char_array_clear(key);

            cat_affix_expansion(key, str, prefix_expansion, token, prefix, options);
            // key->n - 1 appears to index the NUL terminator written by
            // char_array_cat, so rewinding here overwrites it in place
            // — TODO confirm against char_array_cat's contract
            prefix_start = key->n - 1;

            add_space = (int)prefix_expansion.separable || with_period;
            // NOTE(review): suffix_expansion is read here before it is ever
            // assigned (it is first set inside the inner loop below), so on
            // the first pass this reads an uninitialized struct field — UB.
            // Confirm intended logic; likely should test prefix_expansion or
            // a suffix expansion fetched for this iteration.
            if (prefix.len + skip_period + suffix.len < token.len && !prefix_expansion.separable) {
                add_space = suffix_expansion.separable || with_period;
            }

            // Emit variants without and (if separable) with a joining space
            for (spaces = skip_period; spaces <= add_space; spaces++) {
                key->n = prefix_start;
                if (spaces) {
                    char_array_cat(key, " ");
                }

                prefix_end = key->n;

                if (prefix.len + skip_period + suffix.len < token.len) {
                    // There is a root segment between prefix and suffix:
                    // trim surrounding hyphens and normalize it
                    root_len = token.len - suffix.len - prefix.len - skip_period;
                    size_t root_start = token.offset + prefix.len + skip_period;
                    size_t prefix_hyphen_len = string_hyphen_prefix_len(str + root_start, root_len);
                    root_start += prefix_hyphen_len;
                    root_len -= prefix_hyphen_len;
                    size_t suffix_hyphen_len = string_hyphen_suffix_len(str + root_start, root_len);
                    root_len -= suffix_hyphen_len;
                    root_token = (token_t){root_start, root_len, token.type};
                    root_strings = cstring_array_new_size(root_len);
                    add_normalized_strings_token(root_strings, str, root_token, options);
                    num_strings = cstring_array_num_strings(root_strings);

                    for (size_t j = 0; j < num_strings; j++) {
                        key->n = prefix_end;
                        root_word = cstring_array_get_string(root_strings, j);
                        char_array_cat(key, root_word);
                        root_end = key->n - 1;

                        for (size_t k = 0; k < suffix_expansions->n; k++) {
                            key->n = root_end;
                            suffix_expansion = suffix_expansions->a[k];

                            int add_suffix_space = suffix_expansion.separable;

                            suffix_start = key->n;
                            for (int suffix_spaces = skip_period; suffix_spaces <= add_suffix_space; suffix_spaces++) {
                                key->n = suffix_start;
                                if (suffix_spaces) {
                                    char_array_cat(key, " ");
                                }

                                cat_affix_expansion(key, str, suffix_expansion, token, suffix, options);

                                expansion = char_array_get_string(key);
                                cstring_array_add_string(strings, expansion);

                            }


                        }
                    }

                    cstring_array_destroy(root_strings);
                    root_strings = NULL;

                } else {
                    // Prefix and suffix are adjacent (no root segment)
                    for (size_t j = 0; j < suffix_expansions->n; j++) {
                        key->n = prefix_end - skip_period;
                        suffix_expansion = suffix_expansions->a[j];

                        cat_affix_expansion(key, str, suffix_expansion, token, suffix, options);

                        expansion = char_array_get_string(key);
                        cstring_array_add_string(tree->strings, expansion);
                    }
                }
            }

        }
    } else if (have_suffix) {
        // Suffix only: root is everything before suffix.start
        log_debug("suffix.start=%" PRId32 "\n", suffix.start);
        root_len = suffix.start;
        root_token = (token_t){token.offset, root_len, token.type};
        log_debug("root_len=%zu\n", root_len);
        log_debug("root_token = {%zu, %zu, %u}\n", root_token.offset, root_token.len, root_token.type);

        root_strings = cstring_array_new_size(root_len + 1);
        add_normalized_strings_token(root_strings, str, root_token, options);
        num_strings = cstring_array_num_strings(root_strings);

        log_debug("num_strings = %zu\n", num_strings);

        for (size_t j = 0; j < num_strings; j++) {
            char_array_clear(key);
            root_word = cstring_array_get_string(root_strings, j);
            log_debug("root_word=%s\n", root_word);
            char_array_cat(key, root_word);
            root_end = key->n - 1;

            for (size_t k = 0; k < suffix_expansions->n; k++) {
                key->n = root_end;
                suffix_expansion = suffix_expansions->a[k];

                add_space = (suffix_expansion.separable || with_period) && suffix.len < token.len;
                suffix_start = key->n;

                for (int spaces = skip_period; spaces <= add_space; spaces++) {
                    key->n = suffix_start;
                    if (spaces) {
                        char_array_cat(key, " ");
                    }

                    cat_affix_expansion(key, str, suffix_expansion, token, suffix, options);

                    expansion = char_array_get_string(key);
                    cstring_array_add_string(tree->strings, expansion);
                }
            }
        }
    } else if (have_prefix) {
        if (prefix.len + skip_period <= token.len) {
            // Prefix only: root is everything after the prefix (and period),
            // with surrounding hyphens trimmed
            root_len = token.len - prefix.len - skip_period;
            size_t root_start = token.offset + prefix.len + skip_period;
            size_t prefix_hyphen_len = string_hyphen_prefix_len(str + root_start, root_len);
            root_start += prefix_hyphen_len;
            root_len -= prefix_hyphen_len;
            size_t suffix_hyphen_len = string_hyphen_suffix_len(str + root_start, root_len);
            root_len -= suffix_hyphen_len;
            root_token = (token_t){root_start, root_len, token.type};
            root_strings = cstring_array_new_size(root_len);
            add_normalized_strings_token(root_strings, str, root_token, options);
            num_strings = cstring_array_num_strings(root_strings);

        } else {
            // Prefix covers the whole token: just add the token's own
            // normalized forms and report no affix expansion
            root_strings = cstring_array_new_size(token.len);
            add_normalized_strings_token(root_strings, str, token, options);
            num_strings = cstring_array_num_strings(root_strings);

            for (size_t k = 0; k < num_strings; k++) {
                root_word = cstring_array_get_string(root_strings, k);
                cstring_array_add_string(tree->strings, root_word);
            }

            char_array_destroy(key);
            cstring_array_destroy(root_strings);
            return false;

        }

        for (size_t j = 0; j < prefix_expansions->n; j++) {
            char_array_clear(key);
            prefix_expansion = prefix_expansions->a[j];

            cat_affix_expansion(key, str, prefix_expansion, token, prefix, options);
            prefix_end = key->n - 1;

            add_space = (prefix_expansion.separable || with_period) && prefix.len + skip_period < token.len;
            for (int spaces = skip_period; spaces <= add_space; spaces++) {
                key->n = prefix_end;
                if (spaces) {
                    char_array_cat(key, " ");
                }
                size_t prefix_space_len = key->n - spaces;
                for (size_t k = 0; k < num_strings; k++) {
                    key->n = prefix_space_len;
                    root_word = cstring_array_get_string(root_strings, k);
                    char_array_cat(key, root_word);

                    expansion = char_array_get_string(key);
                    cstring_array_add_string(tree->strings, expansion);
                }

            }
        }
    }

    char_array_destroy(key);

    if (root_strings != NULL) {
        cstring_array_destroy(root_strings);
    }

    return true;

}
404
expand_affixes(string_tree_t * tree,char * str,char * lang,token_t token,libpostal_normalize_options_t options)405 inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) {
406 phrase_t suffix = search_address_dictionaries_suffix(str + token.offset, token.len, lang);
407
408 phrase_t prefix = search_address_dictionaries_prefix(str + token.offset, token.len, lang);
409
410 if ((suffix.len == 0 && prefix.len == 0)) return false;
411
412 bool with_period = false;
413
414 return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, with_period);
415 }
416
/* Try to expand a token containing an internal period (e.g. "st.louis")
 * by matching the substring before the first period as a prefix phrase and
 * the substring after it as a suffix phrase. Returns true if expansions
 * were added. */
inline bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) {
    ssize_t first_period_index = string_next_period_len(str + token.offset, token.len);
    if (first_period_index > 0) {
        // Look for a second period after the first one
        ssize_t next_period_index = string_next_period_len(str + token.offset + first_period_index + 1, token.len - first_period_index - 1);
        // Token contains only one period or one + a final period
        // NOTE(review): next_period_index is relative to the substring that
        // starts after the first period, yet it is compared to token.len - 1
        // (an absolute position); an index of token.len - first_period_index
        // - 2 would denote a final period. Also a signed/unsigned comparison
        // if token.len is size_t. Looks suspicious — confirm intent.
        if (next_period_index < 0 || next_period_index == token.len - 1) {
            // Part before the first period as a candidate prefix phrase
            phrase_t prefix = search_address_dictionaries_substring(str + token.offset, first_period_index, lang);

            // Part after the first period as a candidate suffix phrase
            phrase_t suffix = search_address_dictionaries_substring(str + token.offset + first_period_index + 1, token.len - first_period_index - 1, lang);
            if (suffix.len > 0) {
                // Rebase suffix.start to be relative to the whole token
                suffix.start = first_period_index + 1;
            }

            if (suffix.len == 0 && prefix.len == 0) return false;

            bool with_period = true;

            return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, with_period);
        } else {
            return false;
        }
    } else {
        return false;
    }
}
442
add_period_affixes_or_token(string_tree_t * tree,char * str,token_t token,libpostal_normalize_options_t options)443 bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options) {
444 bool have_period_affixes = false;
445 if (string_contains_period_len(str + token.offset, token.len)) {
446 for (size_t l = 0; l < options.num_languages; l++) {
447 char *lang = options.languages[l];
448 if (expand_affixes_period(tree, str, lang, token, options)) {
449 have_period_affixes = true;
450 break;
451 }
452 }
453 }
454
455 if (!have_period_affixes) {
456 string_tree_add_string_len(tree, str + token.offset, token.len);
457 }
458
459 return have_period_affixes;
460 }
461
462
gazetteer_ignorable_components(uint16_t dictionary_id)463 static inline uint32_t gazetteer_ignorable_components(uint16_t dictionary_id) {
464 switch (dictionary_id) {
465 case DICTIONARY_ACADEMIC_DEGREE:
466 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
467 case DICTIONARY_BUILDING_TYPE:
468 return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_UNIT;
469 case DICTIONARY_COMPANY_TYPE:
470 return LIBPOSTAL_ADDRESS_NAME;
471 case DICTIONARY_DIRECTIONAL:
472 return LIBPOSTAL_ADDRESS_STREET;
473 case DICTIONARY_ELISION:
474 return LIBPOSTAL_ADDRESS_ANY;
475 case DICTIONARY_ENTRANCE:
476 return LIBPOSTAL_ADDRESS_ENTRANCE;
477 case DICTIONARY_HOUSE_NUMBER:
478 return LIBPOSTAL_ADDRESS_HOUSE_NUMBER;
479 case DICTIONARY_LEVEL_NUMBERED:
480 return LIBPOSTAL_ADDRESS_LEVEL;
481 case DICTIONARY_LEVEL_STANDALONE:
482 return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY);
483 case DICTIONARY_LEVEL_MEZZANINE:
484 return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL| LIBPOSTAL_ADDRESS_ANY);
485 case DICTIONARY_LEVEL_BASEMENT:
486 return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY);
487 case DICTIONARY_LEVEL_SUB_BASEMENT:
488 return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY);
489 case DICTIONARY_NUMBER:
490 return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STREET;
491 case DICTIONARY_NO_NUMBER:
492 return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY);
493 case DICTIONARY_PERSONAL_TITLE:
494 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
495 case DICTIONARY_PLACE_NAME:
496 return LIBPOSTAL_ADDRESS_NAME;
497 case DICTIONARY_POST_OFFICE:
498 return LIBPOSTAL_ADDRESS_PO_BOX;
499 case DICTIONARY_POSTAL_CODE:
500 return LIBPOSTAL_ADDRESS_POSTAL_CODE;
501 case DICTIONARY_QUALIFIER:
502 return LIBPOSTAL_ADDRESS_TOPONYM;
503 case DICTIONARY_STAIRCASE:
504 return LIBPOSTAL_ADDRESS_STAIRCASE;
505 case DICTIONARY_STOPWORD:
506 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM;
507 case DICTIONARY_STREET_TYPE:
508 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
509 case DICTIONARY_UNIT_NUMBERED:
510 return LIBPOSTAL_ADDRESS_UNIT;
511 case DICTIONARY_UNIT_STANDALONE:
512 return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY);
513 case DICTIONARY_UNIT_DIRECTION:
514 return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY);
515 default:
516 return LIBPOSTAL_ADDRESS_NONE;
517 }
518 }
519
520
gazetteer_valid_components(uint16_t dictionary_id)521 static inline uint32_t gazetteer_valid_components(uint16_t dictionary_id) {
522 switch (dictionary_id) {
523 case DICTIONARY_DIRECTIONAL:
524 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE;
525 case DICTIONARY_STOPWORD:
526 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM;
527 case DICTIONARY_STREET_NAME:
528 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
529 case DICTIONARY_STREET_TYPE:
530 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
531 case DICTIONARY_SYNONYM:
532 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM;
533 default:
534 return LIBPOSTAL_ADDRESS_NONE;
535 }
536 }
537
gazetteer_edge_ignorable_components(uint16_t dictionary_id)538 static inline uint32_t gazetteer_edge_ignorable_components(uint16_t dictionary_id) {
539 switch (dictionary_id) {
540 // Pre/post directionals can be removed if there are non-phrase tokens
541 case DICTIONARY_DIRECTIONAL:
542 return LIBPOSTAL_ADDRESS_STREET;
543 case DICTIONARY_COMPANY_TYPE:
544 return LIBPOSTAL_ADDRESS_NAME;
545 case DICTIONARY_PLACE_NAME:
546 return LIBPOSTAL_ADDRESS_NAME;
547 default:
548 return LIBPOSTAL_ADDRESS_NONE;
549 }
550 }
551
gazetteer_specifier_components(uint16_t dictionary_id)552 static inline uint32_t gazetteer_specifier_components(uint16_t dictionary_id) {
553 switch (dictionary_id) {
554 case DICTIONARY_LEVEL_STANDALONE:
555 return LIBPOSTAL_ADDRESS_LEVEL;
556 case DICTIONARY_LEVEL_MEZZANINE:
557 return LIBPOSTAL_ADDRESS_LEVEL;
558 case DICTIONARY_LEVEL_BASEMENT:
559 return LIBPOSTAL_ADDRESS_LEVEL;
560 case DICTIONARY_LEVEL_SUB_BASEMENT:
561 return LIBPOSTAL_ADDRESS_LEVEL;
562 case DICTIONARY_UNIT_STANDALONE:
563 return LIBPOSTAL_ADDRESS_UNIT;
564 default:
565 return LIBPOSTAL_ADDRESS_NONE;
566 }
567 }
568
569
gazetteer_possible_root_components(uint16_t dictionary_id)570 static inline uint32_t gazetteer_possible_root_components(uint16_t dictionary_id) {
571 switch (dictionary_id) {
572 case DICTIONARY_ACADEMIC_DEGREE:
573 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
574 case DICTIONARY_DIRECTIONAL:
575 return LIBPOSTAL_ADDRESS_STREET;
576 case DICTIONARY_PERSONAL_TITLE:
577 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
578 case DICTIONARY_NUMBER:
579 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
580 case DICTIONARY_PLACE_NAME:
581 return LIBPOSTAL_ADDRESS_STREET;
582 case DICTIONARY_QUALIFIER:
583 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
584 case DICTIONARY_STREET_NAME:
585 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
586 case DICTIONARY_SYNONYM:
587 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
588 case DICTIONARY_TOPONYM:
589 return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
590 default:
591 return LIBPOSTAL_ADDRESS_NONE;
592 }
593 }
594
// Bitmask of address components whose phrases commonly carry numbers.
// NOTE(review): declared uint16_t — assumes all LIBPOSTAL_ADDRESS_* flags
// used here fit in 16 bits; confirm against the flag definitions.
static const uint16_t NUMERIC_ADDRESS_COMPONENTS = (LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_POSTAL_CODE | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STREET);
596
// Classes of gazetteer phrase matches; each selects one of the
// gazetteer_*_components mapping functions above.
typedef enum {
    GAZETTEER_MATCH_IGNORABLE,
    GAZETTEER_MATCH_EDGE_IGNORABLE,
    GAZETTEER_MATCH_POSSIBLE_ROOT,
    GAZETTEER_MATCH_SPECIFIER,
    GAZETTEER_MATCH_VALID_COMPONENTS
} gazetteer_match_type_t;
604
605
address_expansion_matches_type_for_components(address_expansion_t expansion,uint32_t address_components,gazetteer_match_type_t match_type)606 static inline bool address_expansion_matches_type_for_components(address_expansion_t expansion, uint32_t address_components, gazetteer_match_type_t match_type) {
607 for (uint32_t j = 0; j < expansion.num_dictionaries; j++) {
608 uint16_t dictionary_id = expansion.dictionary_ids[j];
609 uint32_t components = 0;
610 switch (match_type) {
611 case GAZETTEER_MATCH_IGNORABLE:
612 components = gazetteer_ignorable_components(dictionary_id);
613 break;
614 case GAZETTEER_MATCH_EDGE_IGNORABLE:
615 components = gazetteer_edge_ignorable_components(dictionary_id);
616 break;
617 case GAZETTEER_MATCH_POSSIBLE_ROOT:
618 components = gazetteer_possible_root_components(dictionary_id);
619 break;
620 case GAZETTEER_MATCH_SPECIFIER:
621 components = gazetteer_specifier_components(dictionary_id);
622 break;
623 case GAZETTEER_MATCH_VALID_COMPONENTS:
624 components = gazetteer_valid_components(dictionary_id);
625 break;
626 default:
627 break;
628 }
629 if (components & address_components) {
630 return true;
631 }
632 }
633 return false;
634 }
635
// Wrapper: match-type check with GAZETTEER_MATCH_IGNORABLE.
bool address_expansion_is_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_IGNORABLE);
}
639
// Wrapper: match-type check with GAZETTEER_MATCH_EDGE_IGNORABLE.
bool address_expansion_is_edge_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_EDGE_IGNORABLE);
}
643
// Wrapper: match-type check with GAZETTEER_MATCH_POSSIBLE_ROOT.
bool address_expansion_is_possible_root_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_POSSIBLE_ROOT);
}
647
// Wrapper: match-type check with GAZETTEER_MATCH_SPECIFIER.
bool address_expansion_is_specifier_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_SPECIFIER);
}
651
// Wrapper: match-type check with GAZETTEER_MATCH_VALID_COMPONENTS.
bool address_expansion_is_valid_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_VALID_COMPONENTS);
}
655
656
address_phrase_matches_type_for_components(phrase_t phrase,uint32_t address_components,gazetteer_match_type_t match_type)657 bool address_phrase_matches_type_for_components(phrase_t phrase, uint32_t address_components, gazetteer_match_type_t match_type) {
658 uint32_t expansion_index = phrase.data;
659 address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index);
660
661 if (value == NULL) return false;
662
663 address_expansion_array *expansions = value->expansions;
664 if (expansions == NULL) return false;
665
666 for (size_t i = 0; i < expansions->n; i++) {
667 address_expansion_t expansion = expansions->a[i];
668
669 if (address_expansion_matches_type_for_components(expansion, address_components, match_type)) {
670 return true;
671 }
672 }
673 return false;
674 }
675
// Wrapper: phrase-level check with GAZETTEER_MATCH_IGNORABLE.
inline bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_IGNORABLE);
}
679
// Wrapper: phrase-level check with GAZETTEER_MATCH_EDGE_IGNORABLE.
inline bool address_phrase_is_edge_ignorable_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_EDGE_IGNORABLE);
}
683
684
// Wrapper: phrase-level check with GAZETTEER_MATCH_POSSIBLE_ROOT.
inline bool address_phrase_is_possible_root_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_POSSIBLE_ROOT);
}
688
// Wrapper: phrase-level check with GAZETTEER_MATCH_SPECIFIER.
inline bool address_phrase_is_specifier_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_SPECIFIER);
}
692
// Wrapper: phrase-level check with GAZETTEER_MATCH_VALID_COMPONENTS.
inline bool address_phrase_is_valid_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_VALID_COMPONENTS);
}
696
697
address_phrase_contains_unambiguous_expansion(phrase_t phrase)698 bool address_phrase_contains_unambiguous_expansion(phrase_t phrase) {
699 address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data);
700 if (value == NULL) return false;
701
702 address_expansion_array *expansions = value->expansions;
703 if (expansions == NULL) return false;
704
705 address_expansion_t *expansions_array = expansions->a;
706
707 for (size_t i = 0; i < expansions->n; i++) {
708 address_expansion_t expansion = expansions_array[i];
709 if (!address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION)) {
710 return true;
711 }
712 }
713 return false;
714 }
715
add_string_alternatives_phrase_option(char * str,libpostal_normalize_options_t options,expansion_phrase_option_t phrase_option)716 string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) {
717 char_array *key = NULL;
718
719 log_debug("input=%s\n", str);
720 token_array *token_array = tokenize_keep_whitespace(str);
721
722 if (token_array == NULL) {
723 return NULL;
724 }
725
726 size_t len = strlen(str);
727
728 token_t *tokens = token_array->a;
729 size_t num_tokens = token_array->n;
730
731 log_debug("tokenized, num tokens=%zu\n", num_tokens);
732
733 bool last_was_punctuation = false;
734
735 phrase_language_array *phrases = NULL;
736 phrase_array *lang_phrases = NULL;
737
738 for (size_t i = 0; i < options.num_languages; i++) {
739 char *lang = options.languages[i];
740 log_debug("lang=%s\n", lang);
741
742 lang_phrases = search_address_dictionaries_tokens(str, token_array, lang);
743
744 if (lang_phrases == NULL) {
745 log_debug("lang_phrases NULL\n");
746 continue;
747 }
748
749 log_debug("lang_phrases->n = %zu\n", lang_phrases->n);
750
751 phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);
752
753 for (size_t j = 0; j < lang_phrases->n; j++) {
754 phrase_t p = lang_phrases->a[j];
755 log_debug("lang=%s, (%d, %d)\n", lang, p.start, p.len);
756 phrase_language_array_push(phrases, (phrase_language_t){lang, p});
757 }
758
759 phrase_array_destroy(lang_phrases);
760 }
761
762
763 lang_phrases = search_address_dictionaries_tokens(str, token_array, ALL_LANGUAGES);
764 if (lang_phrases != NULL) {
765 phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);
766
767 for (size_t j = 0; j < lang_phrases->n; j++) {
768 phrase_t p = lang_phrases->a[j];
769 phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p});
770 }
771 phrase_array_destroy(lang_phrases);
772
773 }
774
775 string_tree_t *tree = string_tree_new_size(len);
776
777 bool last_added_was_whitespace = false;
778
779 uint64_t normalize_string_options = get_normalize_string_options(options);
780
781 if (phrases != NULL) {
782 log_debug("phrases not NULL, n=%zu\n", phrases->n);
783 ks_introsort(phrase_language_array, phrases->n, phrases->a);
784
785 phrase_language_t phrase_lang;
786
787 size_t start = 0;
788 size_t end = 0;
789
790 phrase_t phrase = NULL_PHRASE;
791 phrase_t prev_phrase = NULL_PHRASE;
792
793 key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN);
794
795 log_debug("phrase_option = %d\n", phrase_option);
796
797 bool delete_phrases = phrase_option == DELETE_PHRASES;
798 bool expand_phrases = phrase_option == EXPAND_PHRASES;
799
800 size_t num_phrases = phrases->n;
801
802 bool have_non_phrase_tokens = false;
803 bool have_non_phrase_word_tokens = false;
804 bool have_canonical_phrases = false;
805 bool have_ambiguous = false;
806 bool have_possible_root = false;
807 bool have_strictly_ignorable = false;
808 bool have_strictly_ignorable_abbreviation = false;
809
810 size_t prev_phrase_end = 0;
811
812 if (delete_phrases) {
813 for (size_t i = 0; i < num_phrases; i++) {
814 phrase_lang = phrases->a[i];
815 phrase = phrase_lang.phrase;
816
817 log_debug("phrase.start = %zu, prev_phrase_end = %zu\n", phrase.start, prev_phrase_end);
818
819 token_t inter_token;
820 if (phrase.start > prev_phrase_end) {
821 for (size_t j = prev_phrase_end; j < phrase.start; j++) {
822 inter_token = tokens[j];
823 if (!is_punctuation(inter_token.type) && !is_whitespace(inter_token.type)) {
824 log_debug("have_non_phrase_tokens\n");
825 have_non_phrase_tokens = true;
826 have_non_phrase_word_tokens = have_non_phrase_word_tokens || is_word_token(inter_token.type);
827 break;
828 }
829 }
830 }
831
832 if (i == num_phrases - 1 && phrase.start + phrase.len < num_tokens) {
833 for (size_t j = phrase.start + phrase.len; j < num_tokens; j++) {
834 inter_token = tokens[j];
835 if (!is_punctuation(inter_token.type) && !is_whitespace(inter_token.type)) {
836 have_non_phrase_tokens = true;
837 have_non_phrase_word_tokens = have_non_phrase_word_tokens || is_word_token(inter_token.type);
838 break;
839 }
840 }
841 }
842
843 bool phrase_is_ambiguous = address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION);
844 bool phrase_is_strictly_ignorable = address_phrase_is_ignorable_for_components(phrase, options.address_components) && !phrase_is_ambiguous;
845 bool phrase_is_canonical = address_phrase_has_canonical_interpretation(phrase);
846
847 have_non_phrase_tokens = have_non_phrase_tokens || (!phrase_is_strictly_ignorable && !phrase_is_ambiguous);
848 log_debug("have_non_phrase_word_tokens = %d, phrase_is_strictly_ignorable = %d, phrase_is_ambiguous = %d\n", have_non_phrase_word_tokens, phrase_is_strictly_ignorable, phrase_is_ambiguous);
849 if (!have_non_phrase_word_tokens && !phrase_is_strictly_ignorable && !phrase_is_ambiguous) {
850 for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
851 token_t pt = tokens[j];
852 if (is_word_token(pt.type)) {
853 log_debug("have_non_phrase_word_tokens\n");
854 have_non_phrase_word_tokens = true;
855 break;
856 }
857 }
858 }
859
860
861 have_strictly_ignorable = have_strictly_ignorable || phrase_is_strictly_ignorable;
862 have_strictly_ignorable_abbreviation = have_strictly_ignorable_abbreviation || (phrase_is_strictly_ignorable && !phrase_is_canonical);
863 if (have_strictly_ignorable_abbreviation) {
864 log_debug("have_strictly_ignorable=%zu, phrase_is_canonical=%zu\n", have_strictly_ignorable, phrase_is_canonical);
865 }
866
867 have_possible_root = have_possible_root | address_phrase_is_possible_root_for_components(phrase, options.address_components);
868
869 have_canonical_phrases = have_canonical_phrases || (phrase_is_canonical && !phrase_is_ambiguous);
870 have_ambiguous = have_ambiguous || phrase_is_ambiguous;
871
872 prev_phrase_end = phrase.start + phrase.len;
873 }
874
875
876 log_debug("have_non_phrase_tokens = %d\n", have_non_phrase_tokens);
877 log_debug("have_canonical_phrases = %d\n", have_canonical_phrases);
878 log_debug("have_ambiguous = %d\n", have_ambiguous);
879 log_debug("have_strictly_ignorable = %d\n", have_strictly_ignorable);
880 log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation);
881 }
882
883 bool skipped_last_edge_phrase = false;
884
885 for (size_t i = 0; i < phrases->n; i++) {
886 phrase_lang = phrases->a[i];
887
888 phrase = phrase_lang.phrase;
889
890 log_debug("phrase.start=%d, phrase.len=%d, lang=%s, prev_phrase.start=%d, prev_phrase.len=%d\n", phrase.start, phrase.len, phrase_lang.language, prev_phrase.start, prev_phrase.len);
891
892 if ((phrase.start > prev_phrase.start && phrase.start < prev_phrase.start + prev_phrase.len) || (phrase.start == prev_phrase.start && i > 0 && phrase.len < prev_phrase.len)) {
893 log_debug("continuing\n");
894 continue;
895 }
896
897 char_array_clear(key);
898
899 char_array_cat(key, phrase_lang.language);
900 char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);
901
902 size_t namespace_len = key->n;
903
904 end = phrase.start;
905
906 log_debug("start=%zu, end=%zu\n", start, end);
907 for (size_t j = start; j < end; j++) {
908 log_debug("Adding token %zu\n", j);
909 token_t token = tokens[j];
910 if (is_punctuation(token.type)) {
911 last_was_punctuation = true;
912 continue;
913 }
914
915 if (token.type != WHITESPACE) {
916 if ((phrase.start > 0 && last_was_punctuation) || (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) ) {
917 log_debug("Adding space\n");
918 string_tree_add_string(tree, " ");
919 string_tree_finalize_token(tree);
920 }
921 log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
922
923 bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options);
924 string_tree_finalize_token(tree);
925 last_added_was_whitespace = false;
926 } else if (!delete_phrases && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 ) {
927 log_debug("Adding pre-phrase whitespace\n");
928 last_added_was_whitespace = true;
929 string_tree_add_string(tree, " ");
930 string_tree_finalize_token(tree);
931 } else {
932 continue;
933 }
934
935 last_was_punctuation = false;
936 }
937
938 size_t added_expansions = 0;
939 token_t token;
940
941 uint32_t expansion_index = phrase.data;
942 address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index);
943
944 bool expansion_valid_components = (value->components & options.address_components) || address_phrase_is_valid_for_components(phrase, options.address_components);
945
946 bool is_numeric_component = (value->components & options.address_components & NUMERIC_ADDRESS_COMPONENTS);
947
948 if (expansion_valid_components) {
949 key->n = namespace_len;
950 for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
951 token = tokens[j];
952 if (token.type != WHITESPACE) {
953 char_array_cat_len(key, str + token.offset, token.len);
954 last_added_was_whitespace = false;
955 } else if (!last_added_was_whitespace) {
956 char_array_cat(key, " ");
957 last_added_was_whitespace = true;
958 }
959 }
960
961 char *key_str = char_array_get_string(key);
962 log_debug("key_str=%s\n", key_str);
963 address_expansion_array *expansions = value->expansions;
964
965 if (expansions != NULL) {
966 bool current_phrase_have_ambiguous = delete_phrases && address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION);
967 bool added_pre_phrase_space = false;
968 bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(phrase, options.address_components);
969 bool current_phrase_have_edge_ignorable = false;
970
971 bool current_phrase_have_specifier = delete_phrases && address_phrase_is_specifier_for_components(phrase, options.address_components);
972 bool current_phrase_have_canonical = delete_phrases && address_phrase_has_canonical_interpretation(phrase);
973 bool current_phrase_have_possible_root = delete_phrases && address_phrase_is_possible_root_for_components(phrase, options.address_components);
974
975 bool current_phrase_have_valid = address_phrase_is_valid_for_components(phrase, options.address_components);
976
977 log_debug("current_phrase_have_specifier = %d\n", current_phrase_have_specifier);
978
979 bool current_phrase_have_unambiguous = delete_phrases && address_phrase_contains_unambiguous_expansion(phrase);
980
981 /*
982 Edge phrase handling. This is primarily for handling pre-directionals/post-directionals
983 in English and other languages.
984 */
985 bool skip_edge_phrase = false;
986 bool other_phrase_is_ignorable = false;
987
988 if (delete_phrases) {
989 phrase_language_t other_phrase_lang;
990 phrase_t other_phrase;
991
992 log_debug("i = %zu, phrase.start = %u\n", i, phrase.start);
993 if (i == 0 && phrase.start == 0 && phrase.start + phrase.len < num_tokens) {
994 current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(phrase, options.address_components);
995 // Delete "E" in "E 125th St"
996 if (current_phrase_have_edge_ignorable) {
997 log_debug("edge-ignorable phrase [%u, %u]\n", phrase.start, phrase.start + phrase.len);
998 skip_edge_phrase = true;
999 }
1000
1001 if (!skip_edge_phrase || !have_non_phrase_tokens) {
1002 for (size_t other_i = i + 1; other_i < phrases->n; other_i++) {
1003 other_phrase_lang = phrases->a[other_i];
1004 other_phrase = other_phrase_lang.phrase;
1005 log_debug("phrase.start + phrase.len = %u\n", phrase.start + phrase.len);
1006 log_debug("other_phrase.start = %u, other_phrase.len = %u, lang=%s\n", other_phrase.start, other_phrase.len, other_phrase_lang.language);
1007 if (other_phrase.start >= phrase.start + phrase.len && string_equals(other_phrase_lang.language, phrase_lang.language)) {
1008 if (other_phrase.start + other_phrase.len == num_tokens) {
1009 skip_edge_phrase = false;
1010 if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) {
1011 // don't delete the "E" in "E St"
1012 log_debug("initial phrase is edge ignorable out of two phrases. Checking next phrase is ignorable.\n");
1013
1014 skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components)));
1015 log_debug("skip_edge_phrase = %d\n", skip_edge_phrase);
1016 } else {
1017 log_debug("initial phrase is not edge-ignorable out of two phrases. Checking next phrase is edge ignorable.\n");
1018 // delete "Avenue" in "Avenue E"
1019 other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase));
1020 skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));
1021
1022 }
1023 } else {
1024 // If we encounter an ignorable phrase like St and we're _not_ the end of the string e.g. "E St SE", the first token is probably a legit token instead of a pre-directional
1025 skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !((address_phrase_has_canonical_interpretation(other_phrase) || address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components)) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components)));
1026 log_debug("phrase is possible root. skip_edge_phrase = %d\n", skip_edge_phrase);
1027 }
1028 break;
1029 }
1030 }
1031 }
1032 } else if (phrases->n > 1 && i == phrases->n - 1 && phrase.start + phrase.len == num_tokens && phrase.start > 0) {
1033 current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(phrase, options.address_components);
1034 if (current_phrase_have_edge_ignorable) {
1035 log_debug("edge-ignorable phrase [%u, %u]\n", phrase.start, phrase.start + phrase.len);
1036 skip_edge_phrase = true;
1037 }
1038
1039 log_debug("have_non_phrase_tokens = %d\n", have_non_phrase_tokens);
1040 if (!skip_edge_phrase || !have_non_phrase_tokens) {
1041 for (ssize_t other_j = i - 1; other_j >= 0; other_j--) {
1042 other_phrase_lang = phrases->a[other_j];
1043 other_phrase = other_phrase_lang.phrase;
1044 log_debug("phrase.start + phrase.len = %u\n", phrase.start + phrase.len);
1045 log_debug("other_phrase.start = %u, other_phrase.len = %u, lang=%s\n", other_phrase.start, other_phrase.len, other_phrase_lang.language);
1046 if (other_phrase.start + other_phrase.len <= phrase.start && string_equals(other_phrase_lang.language, phrase_lang.language)) {
1047 if (other_phrase.start == 0) {
1048 //other_phrase_invalid = address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !address_phrase_has_canonical_interpretation(other_phrase) && !address_phrase_is_possible_root_for_components(other_phrase, options.address_components);
1049 skip_edge_phrase = false;
1050 if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) {
1051 // don't delete the "E" in "Avenue E"
1052 log_debug("final phrase is edge ignorable out of two phrases. Checking previous phrase is ignorable.\n");
1053
1054 skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))) && string_tree_num_tokens(tree) > 0;
1055 } else {
1056 log_debug("final phrase is not edge-ignorable out of two phrases. Checking previous phrase is edge ignorable.\n");
1057 // delete "St" in "E St"
1058 other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase));
1059 skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));
1060
1061 //skip_edge_phrase = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components);
1062 }
1063 }
1064 break;
1065 }
1066 }
1067 }
1068 }
1069 }
1070
1071 if (phrase.start == prev_phrase.start && phrase.len == prev_phrase.len && skipped_last_edge_phrase) {
1072 skip_edge_phrase = true;
1073 }
1074
1075 for (size_t j = 0; j < expansions->n; j++) {
1076 if (skip_edge_phrase) {
1077 skipped_last_edge_phrase = true;
1078 log_debug("skip edge phrase\n");
1079 continue;
1080 } else {
1081 skipped_last_edge_phrase = false;
1082 }
1083
1084 address_expansion_t expansion = expansions->a[j];
1085
1086 bool current_phrase_ignorable = false;
1087 bool current_phrase_expandable = expand_phrases && expansion.canonical_index != NULL_CANONICAL_INDEX;
1088
1089 bool is_ambiguous = address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION);
1090 bool is_valid_for_components = address_expansion_is_valid_for_components(expansion, options.address_components);
1091
1092 if (delete_phrases) {
1093 bool is_ignorable = address_expansion_is_ignorable_for_components(expansion, options.address_components);
1094 bool is_canonical = expansion.canonical_index == NULL_CANONICAL_INDEX;
1095
1096 log_debug("is_ignorable = %d, is_canonical = %d, is_ambiguous = %d, current_phrase_have_ambiguous = %d, current_phrase_have_unambiguous = %d, have_strictly_ignorable = %d, current_phrase_have_ignorable=%d, current_phrase_have_possible_root=%d\n", is_ignorable, is_canonical, is_ambiguous, current_phrase_have_ambiguous, current_phrase_have_unambiguous, have_strictly_ignorable, current_phrase_have_ignorable, current_phrase_have_possible_root);
1097
1098 current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous;
1099
1100 if (!is_canonical) {
1101 char *canon = address_dictionary_get_canonical(expansion.canonical_index);
1102 log_debug("canonical = %s\n", canon);
1103 }
1104
1105 // Edge phrase calculations from above
1106 if (current_phrase_have_edge_ignorable || other_phrase_is_ignorable) {
1107 log_debug("current_phrase_have_edge_ignorable\n");
1108 log_debug("skip_edge_phrase = %d\n", skip_edge_phrase);
1109 current_phrase_ignorable = skip_edge_phrase;
1110 // Don't delete "PH" in "PH 1" for unit expansions
1111 } else if (is_ignorable && current_phrase_have_specifier) {
1112 log_debug("current_phrase_have_specifier\n");
1113 current_phrase_ignorable = false;
1114 // Delete "Avenue" in "5th Avenue"
1115 } else if (is_ignorable && is_canonical && !current_phrase_have_ambiguous) {
1116 log_debug("is_ignorable && is_canonical && !current_phrase_have_ambiguous\n");
1117 current_phrase_ignorable = have_non_phrase_tokens || (have_possible_root && !current_phrase_have_possible_root) || string_tree_num_tokens(tree) > 0;
1118 log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable);
1119 // Delete "Ave" in "5th Ave" or "Pl" in "Park Pl S"
1120 } else if (is_ignorable && !is_canonical && !is_ambiguous && !current_phrase_have_ambiguous) {
1121 log_debug("is_ignorable && !is_canonical && !current_phrase_have_ambiguous\n");
1122 current_phrase_ignorable = have_non_phrase_tokens || (have_possible_root && !current_phrase_have_possible_root) || string_tree_num_tokens(tree) > 0;
1123 log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable);
1124 } else if (current_phrase_have_ambiguous && (have_non_phrase_word_tokens || is_numeric_component || have_canonical_phrases || have_possible_root)) {
1125 log_debug("current_phrase_have_ambiguous && have_non_phrase_tokens = %d, have_canonical_phrases = %d, have_possible_root = %d, have_non_phrase_word_tokens = %d, is_numeric_component = %d, have_non_phrase_tokens = %d\n", have_non_phrase_tokens, have_canonical_phrases, have_possible_root, have_non_phrase_word_tokens, is_numeric_component, have_non_phrase_tokens);
1126 current_phrase_ignorable = (is_ignorable && !(have_possible_root && !current_phrase_have_possible_root)) || (current_phrase_have_ambiguous && (have_non_phrase_word_tokens || (is_numeric_component && have_non_phrase_tokens)) && current_phrase_have_ignorable && current_phrase_have_unambiguous);
1127 log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable);
1128 } else if (!is_valid_for_components && !is_ambiguous) {
1129 log_debug("!is_valid_for_components\n");
1130 current_phrase_ignorable = current_phrase_have_ignorable || current_phrase_have_valid;
1131 log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable);
1132 } else {
1133 log_debug("none of the above\n");
1134 }
1135
1136 if (!current_phrase_ignorable && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 && !added_pre_phrase_space) {
1137 log_debug("Adding space\n");
1138 string_tree_add_string(tree, " ");
1139 string_tree_finalize_token(tree);
1140 last_added_was_whitespace = true;
1141 added_pre_phrase_space = true;
1142 }
1143
1144 }
1145
1146 if (current_phrase_ignorable) {
1147 continue;
1148 }
1149
1150 if (delete_phrases) {
1151 current_phrase_expandable = !current_phrase_ignorable;
1152 } else {
1153 current_phrase_expandable = (expansion.address_components & options.address_components) || is_valid_for_components;
1154 }
1155
1156 log_debug("current_phrase_expandable = %d\n", current_phrase_expandable);
1157
1158 log_debug("expansion.canonical_index = %d\n", expansion.canonical_index);
1159
1160 if (expansion.canonical_index != NULL_CANONICAL_INDEX && current_phrase_expandable) {
1161 log_debug("expansion.canonical_index != NULL_CANONICAL_INDEX, delete_phrases = %d, phrase_option = %d\n", delete_phrases, phrase_option);
1162 char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
1163 char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options);
1164
1165 canonical = canonical_normalized != NULL ? canonical_normalized : canonical;
1166
1167 if (phrase.start + phrase.len < num_tokens - 1) {
1168 token_t next_token = tokens[phrase.start + phrase.len];
1169 if (!is_numeric_token(next_token.type)) {
1170 log_debug("non-canonical phrase, adding canonical string: %s\n", canonical);
1171 string_tree_add_string(tree, canonical);
1172 last_added_was_whitespace = false;
1173 } else {
1174 log_debug("adding canonical with cstring_array methods: %s\n", canonical);
1175 uint32_t start_index = cstring_array_start_token(tree->strings);
1176 cstring_array_append_string(tree->strings, canonical);
1177 cstring_array_append_string(tree->strings, " ");
1178 last_added_was_whitespace = true;
1179 cstring_array_terminate(tree->strings);
1180 }
1181 } else {
1182 log_debug("adding canonical: %s\n", canonical);
1183 string_tree_add_string(tree, canonical);
1184 last_added_was_whitespace = false;
1185 }
1186
1187 if (canonical_normalized != NULL) {
1188 free(canonical_normalized);
1189 }
1190 } else if (expansion.canonical_index == NULL_CANONICAL_INDEX || !current_phrase_expandable) {
1191 log_debug("canonical phrase, adding canonical string\n");
1192
1193 uint32_t start_index = cstring_array_start_token(tree->strings);
1194 for (size_t k = phrase.start; k < phrase.start + phrase.len; k++) {
1195 token = tokens[k];
1196 if (token.type != WHITESPACE) {
1197 cstring_array_append_string_len(tree->strings, str + token.offset, token.len);
1198 last_added_was_whitespace = false;
1199 } else {
1200 log_debug("space\n");
1201 cstring_array_append_string(tree->strings, " ");
1202 last_added_was_whitespace = true;
1203 }
1204 }
1205 cstring_array_terminate(tree->strings);
1206 } else {
1207 continue;
1208 }
1209
1210 added_expansions++;
1211 }
1212
1213 }
1214 }
1215
1216 log_debug("expansion_valid_components == %d\n", expansion_valid_components);
1217
1218 if (added_expansions == 0 && (!delete_phrases || !expansion_valid_components)) {
1219 if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
1220 log_debug("Adding space\n");
1221 string_tree_add_string(tree, " ");
1222 string_tree_finalize_token(tree);
1223 last_added_was_whitespace = true;
1224 }
1225
1226 uint32_t start_index = cstring_array_start_token(tree->strings);
1227
1228 for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
1229 token = tokens[j];
1230
1231 if (token.type != WHITESPACE) {
1232 log_debug("Adding canonical token, %.*s\n", (int)token.len, str + token.offset);
1233 cstring_array_append_string_len(tree->strings, str + token.offset, token.len);
1234 last_added_was_whitespace = false;
1235 } else if (!last_added_was_whitespace) {
1236 log_debug("Adding space\n");
1237 cstring_array_append_string(tree->strings, " ");
1238 last_added_was_whitespace = true;
1239 }
1240
1241 }
1242
1243 cstring_array_terminate(tree->strings);
1244
1245 }
1246
1247 if (!delete_phrases || !expansion_valid_components || added_expansions > 0) {
1248 log_debug("i=%zu\n", i);
1249 bool end_of_phrase = false;
1250 if (i < phrases->n - 1) {
1251 phrase_t next_phrase = phrases->a[i + 1].phrase;
1252 end_of_phrase = (next_phrase.start != phrase.start || next_phrase.len != phrase.len);
1253 } else {
1254 end_of_phrase = true;
1255 }
1256
1257 log_debug("end_of_phrase=%d\n", end_of_phrase);
1258 if (end_of_phrase) {
1259 log_debug("finalize at i=%zu\n", i);
1260 string_tree_finalize_token(tree);
1261 }
1262 }
1263
1264 start = phrase.start + phrase.len;
1265 prev_phrase = phrase;
1266
1267 }
1268
1269 char_array_destroy(key);
1270
1271 end = (int)num_tokens;
1272
1273 if (phrase.start + phrase.len > 0 && phrase.start + phrase.len <= end - 1 && !last_added_was_whitespace) {
1274 token_t next_token = tokens[phrase.start + phrase.len];
1275 if (next_token.type != WHITESPACE && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 && !is_ideographic(next_token.type)) {
1276 log_debug("space after phrase\n");
1277 string_tree_add_string(tree, " ");
1278 last_added_was_whitespace = true;
1279 string_tree_finalize_token(tree);
1280 }
1281 }
1282
1283
1284 for (size_t j = start; j < end; j++) {
1285 log_debug("On token %zu\n", j);
1286 token_t token = tokens[j];
1287 if (is_punctuation(token.type)) {
1288 log_debug("last_was_punctuation\n");
1289 last_was_punctuation = true;
1290 continue;
1291 }
1292
1293 if (token.type != WHITESPACE) {
1294 if (j > 0 && last_was_punctuation && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
1295 log_debug("Adding another space\n");
1296 string_tree_add_string(tree, " ");
1297 string_tree_finalize_token(tree);
1298 }
1299 log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);
1300
1301 bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options);
1302 last_added_was_whitespace = false;
1303 } else if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
1304 log_debug("Adding space IV\n");
1305 string_tree_add_string(tree, " ");
1306 last_added_was_whitespace = true;
1307 } else {
1308 log_debug("Skipping token %zu\n", j);
1309 continue;
1310 }
1311
1312 last_was_punctuation = false;
1313 string_tree_finalize_token(tree);
1314
1315 }
1316
1317 } else {
1318 log_debug("phrases NULL\n");
1319 for (size_t j = 0; j < num_tokens; j++) {
1320 log_debug("On token %zu\n", j);
1321 token_t token = tokens[j];
1322 if (is_punctuation(token.type)) {
1323 log_debug("punctuation, skipping\n");
1324 last_was_punctuation = true;
1325 continue;
1326 }
1327
1328 if (token.type != WHITESPACE) {
1329 if (last_was_punctuation && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
1330 log_debug("Adding space V\n");
1331 string_tree_add_string(tree, " ");
1332 string_tree_finalize_token(tree);
1333 }
1334
1335 bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options);
1336 last_added_was_whitespace = false;
1337 } else if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
1338 log_debug("Adding space VI\n");
1339 string_tree_add_string(tree, " ");
1340 last_added_was_whitespace = true;
1341 } else {
1342 continue;
1343 }
1344
1345 last_was_punctuation = false;
1346 string_tree_finalize_token(tree);
1347 }
1348 }
1349
1350 if (phrases != NULL) {
1351 phrase_language_array_destroy(phrases);
1352 }
1353
1354 token_array_destroy(token_array);
1355
1356 return tree;
1357 }
1358
normalize_ordinal_suffixes(string_tree_t * tree,char * str,char * lang,token_t token,size_t i,token_t prev_token,libpostal_normalize_options_t options)1359 inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) {
1360 size_t len_ordinal_suffix = valid_ordinal_suffix_len(str, token, prev_token, lang);
1361
1362 if (len_ordinal_suffix > 0) {
1363 cstring_array *strings = tree->strings;
1364 // Add the original form first. When this function returns true,
1365 // add_normalized_strings_token won't be called a second time.
1366 add_normalized_strings_token(strings, str, token, options);
1367 token_t normalized_token = token;
1368 normalized_token.len = token.len - len_ordinal_suffix;
1369 add_normalized_strings_token(strings, str, normalized_token, options);
1370 return true;
1371 }
1372
1373 return false;
1374 }
1375
add_normalized_strings_tokenized(string_tree_t * tree,char * str,token_array * tokens,libpostal_normalize_options_t options)1376 inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) {
1377 cstring_array *strings = tree->strings;
1378
1379 token_t prev_token = (token_t){0, 0, 0};
1380
1381 for (size_t i = 0; i < tokens->n; i++) {
1382 token_t token = tokens->a[i];
1383 bool have_phrase = false;
1384 bool have_ordinal = false;
1385
1386 if (is_special_token(token.type)) {
1387 string_tree_add_string_len(tree, str + token.offset, token.len);
1388 string_tree_finalize_token(tree);
1389 continue;
1390 }
1391
1392 for (size_t j = 0; j < options.num_languages; j++) {
1393 char *lang = options.languages[j];
1394 if (expand_affixes(tree, str, lang, token, options)) {
1395 have_phrase = true;
1396 break;
1397 }
1398
1399 if (normalize_ordinal_suffixes(tree, str, lang, token, i, prev_token, options)) {
1400 have_ordinal = true;
1401 break;
1402 }
1403 }
1404
1405 if (!have_phrase && !have_ordinal) {
1406 add_normalized_strings_token(strings, str, token, options);
1407 }
1408
1409 string_tree_finalize_token(tree);
1410 prev_token = token;
1411 }
1412
1413 }
1414
1415
/**
 * Generate all unique expansions of str and append them to strings.
 *
 * First enumerates token-level normalization permutations, then for each
 * permutation enumerates phrase-level alternatives. Results are de-duplicated
 * via unique_strings (which takes ownership of inserted keys). When either
 * iterator would produce >= EXCESSIVE_PERMUTATIONS combinations, falls back
 * to emitting the tokenized string as-is to bound the work.
 *
 * Fixes vs. previous version: the khash pre-resize is now done only when the
 * permutation count is reasonable (the unconditional resize defeated the
 * guard); tokenize_keep_whitespace and strndup failures are handled.
 */
void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) {
    size_t len = strlen(str);

    token_array *tokens = tokenize_keep_whitespace(str);
    if (tokens == NULL) {
        // Nothing to expand if tokenization fails.
        return;
    }

    string_tree_t *token_tree = string_tree_new_size(len);

    add_normalized_strings_tokenized(token_tree, str, tokens, options);

    string_tree_iterator_t *tokenized_iter = string_tree_iterator_new(token_tree);

    string_tree_iterator_t *iter;

    char_array *temp_string = char_array_new_size(len);

    char *token;

    bool excessive_perms_outer = tokenized_iter->remaining >= EXCESSIVE_PERMUTATIONS;

    // Only pre-size the dedupe set when we actually intend to enumerate all
    // permutations; with excessive permutations we bail out early below.
    if (!excessive_perms_outer) {
        kh_resize(str_set, unique_strings, kh_size(unique_strings) + tokenized_iter->remaining);
    }

    log_debug("tokenized_iter->remaining=%d\n", tokenized_iter->remaining);

    for (; !string_tree_iterator_done(tokenized_iter); string_tree_iterator_next(tokenized_iter)) {
        char_array_clear(temp_string);

        // Concatenate one token-level permutation into temp_string.
        string_tree_iterator_foreach_token(tokenized_iter, token, {
            if (token == NULL) {
                continue;
            }
            char_array_append(temp_string, token);
        })
        char_array_terminate(temp_string);

        char *tokenized_str = char_array_get_string(temp_string);

        int ret;
        log_debug("Adding alternatives for single normalization\n");
        string_tree_t *alternatives = add_string_alternatives_phrase_option(tokenized_str, options, phrase_option);

        if (alternatives == NULL) {
            log_debug("alternatives = NULL\n");
            continue;
        }

        log_debug("num strings = %" PRIu32 "\n", string_tree_num_strings(alternatives));

        iter = string_tree_iterator_new(alternatives);
        log_debug("iter->num_tokens=%d\n", iter->num_tokens);
        log_debug("iter->remaining=%d\n", iter->remaining);

        bool excessive_perms_inner = iter->remaining >= EXCESSIVE_PERMUTATIONS;

        if (!excessive_perms_inner && !excessive_perms_outer) {
            for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) {
                char_array_clear(temp_string);
                string_tree_iterator_foreach_token(iter, token, {
                    if (token == NULL) {
                        log_debug("token=NULL\n");
                    } else {
                        log_debug("token=%s\n", token);
                        char_array_append(temp_string, token);
                    }
                })
                char_array_terminate(temp_string);

                token = char_array_get_string(temp_string);

                size_t token_len = strlen(token);

                if (token_len == 0) continue;

                // Trim surrounding whitespace before de-duplicating.
                size_t left_spaces = string_left_spaces_len(token, token_len);
                size_t right_spaces = string_right_spaces_len(token, token_len);

                if (left_spaces + right_spaces == token_len) {
                    continue;
                }

                char *dupe_token = strndup(token + left_spaces, token_len - left_spaces - right_spaces);
                if (dupe_token == NULL) {
                    continue;
                }

                log_debug("full string=%s\n", token);
                khiter_t k = kh_get(str_set, unique_strings, dupe_token);

                if (k == kh_end(unique_strings)) {
                    log_debug("doing postprocessing\n");
                    add_postprocessed_string(strings, dupe_token, options);
                    // The set takes ownership of dupe_token as its key.
                    k = kh_put(str_set, unique_strings, dupe_token, &ret);
                } else {
                    free(dupe_token);
                }

                log_debug("iter->remaining = %d\n", iter->remaining);

            }
        } else {
            // Too many combinations: emit the tokenized permutation verbatim.
            cstring_array_add_string(strings, tokenized_str);
        }

        string_tree_iterator_destroy(iter);
        string_tree_destroy(alternatives);

        if (excessive_perms_outer) {
            break;
        }
    }

    string_tree_iterator_destroy(tokenized_iter);
    string_tree_destroy(token_tree);

    token_array_destroy(tokens);

    char_array_destroy(temp_string);
}
1536
1537
1538
/*
 * Core implementation shared by expand_address and expand_address_root.
 *
 * Normalizes `input` at the string level (transliteration, case, accents,
 * numex, etc. per `options`), then feeds every normalization permutation
 * through expand_alternative_phrase_option, which appends deduplicated
 * expansions to the returned cstring_array.
 *
 * input:         address string to expand (not modified)
 * options:       normalization options; address_components is widened to
 *                LIBPOSTAL_ADDRESS_ANY, and languages are auto-classified
 *                when none were supplied
 * n:             out-param, receives the number of expansions produced
 * phrase_option: EXPAND_PHRASES or DELETE_PHRASES (root expansion)
 *
 * Returns a newly allocated cstring_array; caller owns it.
 */
cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) {
    options.address_components |= LIBPOSTAL_ADDRESS_ANY;

    uint64_t normalize_string_options = get_normalize_string_options(options);

    size_t len = strlen(input);

    language_classifier_response_t *lang_response = NULL;

    // No languages given by the caller: classify the input and borrow the
    // classifier's language list for the duration of this call.
    if (options.num_languages == 0) {
        lang_response = classify_languages(input);
        if (lang_response != NULL) {
            options.num_languages = lang_response->num_languages;
            options.languages = lang_response->languages;
        }
    }

    string_tree_t *tree = normalize_string_languages(input, normalize_string_options, options.num_languages, options.languages);

    cstring_array *strings = cstring_array_new_size(len * 2);
    char_array *temp_string = char_array_new_size(len);

    // Set of strings already emitted; keys are heap-allocated and freed below.
    khash_t(str_set) *unique_strings = kh_init(str_set);

    char *token;

    log_debug("string_tree_num_tokens(tree) = %d\n", string_tree_num_tokens(tree));

    if (string_tree_num_strings(tree) == 1) {
        // Single normalization: expand it directly.
        char *normalized = string_tree_get_alternative(tree, 0, 0);
        expand_alternative_phrase_option(strings, unique_strings, normalized, options, phrase_option);

    } else {
        // Multiple normalizations: join each permutation's segments with
        // spaces into temp_string, then expand the joined string.
        log_debug("Adding alternatives for multiple normalizations\n");
        string_tree_iterator_t *iter = string_tree_iterator_new(tree);

        for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) {
            char *segment;
            char_array_clear(temp_string);
            bool is_first = true;

            string_tree_iterator_foreach_token(iter, segment, {
                if (!is_first) {
                    char_array_append(temp_string, " ");
                }
                char_array_append(temp_string, segment);
                is_first = false;
            })
            char_array_terminate(temp_string);
            token = char_array_get_string(temp_string);
            log_debug("current permutation = %s\n", token);
            expand_alternative_phrase_option(strings, unique_strings, token, options, phrase_option);
        }

        string_tree_iterator_destroy(iter);
    }

    // The set owns its keys (strdup'd during expansion); free them before
    // destroying the set itself.
    char *key_str = NULL;
    for (size_t i = kh_begin(unique_strings); i != kh_end(unique_strings); ++i) {
        if (!kh_exist(unique_strings, i)) continue;
        key_str = (char *)kh_key(unique_strings, i);
        free(key_str);
    }

    kh_destroy(str_set, unique_strings);

    if (lang_response != NULL) {
        language_classifier_response_destroy(lang_response);
    }

    char_array_destroy(temp_string);
    string_tree_destroy(tree);

    *n = cstring_array_num_strings(strings);

    return strings;

}
1617
/*
 * Public entry point: expands an address into its normalized surface forms,
 * keeping dictionary phrases (EXPAND_PHRASES). *n receives the expansion
 * count; caller owns the returned cstring_array.
 */
cstring_array *expand_address(char *input, libpostal_normalize_options_t options, size_t *n) {
    return expand_address_phrase_option(input, options, n, EXPAND_PHRASES);
}
1621
/*
 * Public entry point: expands an address to its "root" forms by deleting
 * dictionary phrases (DELETE_PHRASES) rather than expanding them. *n receives
 * the expansion count; caller owns the returned cstring_array.
 */
cstring_array *expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) {
    return expand_address_phrase_option(input, options, n, DELETE_PHRASES);
}
1625
1626
/*
 * Frees an array of expansion strings as returned by expand_address /
 * expand_address_root (each element plus the array itself).
 *
 * expansions: array of heap-allocated strings; may be NULL, in which case
 *             this is a no-op regardless of n (defensive guard — without it
 *             a NULL array with nonzero n would dereference NULL).
 * n:          number of elements in the array.
 */
void expansion_array_destroy(char **expansions, size_t n) {
    if (expansions == NULL) return;
    for (size_t i = 0; i < n; i++) {
        // free(NULL) is a no-op, so NULL elements are tolerated.
        free(expansions[i]);
    }
    free(expansions);
}
1633
1634