1 #include "address_parser.h"
2 #include "address_dictionary.h"
3 #include "features.h"
4 #include "ngrams.h"
5 #include "scanner.h"
6 
7 #include "graph_builder.h"
8 
9 #include "klib/ksort.h"
10 #include "log/log.h"
11 
/*
File names joined onto the model directory by address_parser_save and
address_parser_load.
*/
/* Averaged perceptron model */
#define ADDRESS_PARSER_MODEL_FILENAME "address_parser.dat"
/* CRF model */
#define ADDRESS_PARSER_MODEL_FILENAME_CRF "address_parser_crf.dat"
/* Vocabulary trie */
#define ADDRESS_PARSER_VOCAB_FILENAME "address_parser_vocab.trie"
/* Phrase trie followed by the phrase type values */
#define ADDRESS_PARSER_PHRASE_FILENAME "address_parser_phrases.dat"
/* Postal code trie followed by the postal code context graph */
#define ADDRESS_PARSER_POSTAL_CODES_FILENAME "address_parser_postal_codes.dat"

/* NOTE(review): not referenced in this part of the file; presumably placeholder
   tokens for out-of-vocabulary words and numbers - confirm at the use sites. */
#define UNKNOWN_WORD "UNKNOWN"
#define UNKNOWN_NUMERIC "UNKNOWN_NUMERIC"

/* Value used for parser_options_t.rare_word_threshold in PARSER_DEFAULT_OPTIONS */
#define DEFAULT_RARE_WORD_THRESHOLD 50
22 
/* File-level parser instance: assigned by address_parser_load, returned by
   get_address_parser and modified by address_parser_print_features. */
static address_parser_t *parser = NULL;
24 
/*
Kind of phrase attached to an address_parser_phrase_t.
NULL, PREFIX and SUFFIX are the kinds that is_plain_word_phrase_type treats
as plain words.
*/
typedef enum {
    ADDRESS_PARSER_NULL_PHRASE,
    ADDRESS_PARSER_DICTIONARY_PHRASE,
    ADDRESS_PARSER_COMPONENT_PHRASE,
    ADDRESS_PARSER_PREFIX_PHRASE,
    ADDRESS_PARSER_SUFFIX_PHRASE
} address_parser_phrase_type_t;
32 
/* Options applied by address_parser_new(): the default rare word threshold,
   with feature printing turned off. */
static parser_options_t PARSER_DEFAULT_OPTIONS = {
    .rare_word_threshold = DEFAULT_RARE_WORD_THRESHOLD,
    .print_features = false
};
37 
address_parser_new_options(parser_options_t options)38 address_parser_t *address_parser_new_options(parser_options_t options) {
39     address_parser_t *parser = calloc(1, sizeof(address_parser_t));
40     parser->options = options;
41     return parser;
42 }
43 
address_parser_new(void)44 address_parser_t *address_parser_new(void) {
45     return address_parser_new_options(PARSER_DEFAULT_OPTIONS);
46 }
47 
get_address_parser(void)48 address_parser_t *get_address_parser(void) {
49     return parser;
50 }
51 
address_parser_print_features(bool print_features)52 bool address_parser_print_features(bool print_features) {
53     if (parser == NULL) return false;
54 
55     parser->options.print_features = print_features;
56     return true;
57 }
58 
/*
Write the parser's model, vocabulary, phrases and postal code data into
output_dir.

Four files are produced:
  - the model file (name chosen by self->model_type),
  - the vocabulary trie,
  - the phrases file: phrase trie, a uint64 count, then one uint32 per
    phrase type,
  - the postal codes file: postal code trie followed by the postal code
    context graph.

self:       parser to serialize.
output_dir: directory the files are written into.

Returns true only if every file was written and closed successfully.
Returns false if an argument is NULL, the model type is unknown, a required
component is missing, or any write fails. Files written before a failure are
left in place. All temporary resources (the path buffer and any open files)
are released on every return path.
*/
bool address_parser_save(address_parser_t *self, char *output_dir) {
    if (self == NULL || output_dir == NULL) return false;

    char *model_filename = NULL;
    if (self->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
        model_filename = ADDRESS_PARSER_MODEL_FILENAME;
    } else if (self->model_type == ADDRESS_PARSER_TYPE_CRF) {
        model_filename = ADDRESS_PARSER_MODEL_FILENAME_CRF;
    } else {
        return false;
    }

    char_array *path = char_array_new_size(strlen(output_dir));
    if (path == NULL) return false;

    /* Everything the cleanup label touches is declared before the first goto. */
    bool saved = false;
    FILE *phrases_file = NULL;
    FILE *postal_codes_file = NULL;
    char *file_path = NULL;
    size_t num_phrase_types = 0;
    int close_status = 0;

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, model_filename);
    file_path = char_array_get_string(path);

    if (self->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
        if (!averaged_perceptron_save(self->model.ap, file_path)) {
            log_info("Error in averaged_perceptron_save\n");
            goto exit_address_parser_save;
        }
    } else if (self->model_type == ADDRESS_PARSER_TYPE_CRF) {
        if (!crf_save(self->model.crf, file_path)) {
            log_info("Error in crf_save\n");
            goto exit_address_parser_save;
        }
    }

    char_array_clear(path);

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_VOCAB_FILENAME);
    file_path = char_array_get_string(path);

    if (!trie_save(self->vocab, file_path)) {
        goto exit_address_parser_save;
    }

    char_array_clear(path);

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_PHRASE_FILENAME);
    file_path = char_array_get_string(path);

    /* Binary mode: the contents are raw integers and trie data, and
       address_parser_load reads this file back with "rb". */
    phrases_file = fopen(file_path, "wb+");
    if (phrases_file == NULL || self->phrases == NULL) {
        goto exit_address_parser_save;
    }

    if (!trie_write(self->phrases, phrases_file)) {
        goto exit_address_parser_save;
    }

    if (self->phrase_types == NULL) {
        goto exit_address_parser_save;
    }

    num_phrase_types = self->phrase_types->n;
    if (!file_write_uint64(phrases_file, num_phrase_types)) {
        goto exit_address_parser_save;
    }

    for (size_t i = 0; i < num_phrase_types; i++) {
        address_parser_types_t phrase_type_value = self->phrase_types->a[i];
        if (!file_write_uint32(phrases_file, phrase_type_value.value)) {
            goto exit_address_parser_save;
        }
    }

    /* fclose flushes buffered output, so a failure here means lost data.
       Clear the handle first so the cleanup label never closes it twice. */
    close_status = fclose(phrases_file);
    phrases_file = NULL;
    if (close_status != 0) {
        goto exit_address_parser_save;
    }

    char_array_clear(path);

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_POSTAL_CODES_FILENAME);
    file_path = char_array_get_string(path);

    postal_codes_file = fopen(file_path, "wb+");
    if (postal_codes_file == NULL || self->postal_codes == NULL) {
        goto exit_address_parser_save;
    }

    if (!trie_write(self->postal_codes, postal_codes_file)) {
        goto exit_address_parser_save;
    }

    if (self->postal_code_contexts == NULL) {
        goto exit_address_parser_save;
    }

    if (!graph_write(self->postal_code_contexts, postal_codes_file)) {
        goto exit_address_parser_save;
    }

    close_status = fclose(postal_codes_file);
    postal_codes_file = NULL;
    if (close_status != 0) {
        goto exit_address_parser_save;
    }

    saved = true;

exit_address_parser_save:
    if (phrases_file != NULL) {
        fclose(phrases_file);
    }
    if (postal_codes_file != NULL) {
        fclose(postal_codes_file);
    }
    char_array_destroy(path);
    return saved;
}
159 
/* True when the parser's postal code context graph has an edge from
   postal_code_id to admin_id. */
static bool postal_code_context_exists(address_parser_t *self, uint32_t postal_code_id, uint32_t admin_id) {
    return graph_has_edge(self->postal_code_contexts, postal_code_id, admin_id);
}
165 
address_parser_load(char * dir)166 bool address_parser_load(char *dir) {
167     if (parser != NULL) return false;
168     if (dir == NULL) {
169         dir = LIBPOSTAL_ADDRESS_PARSER_DIR;
170     }
171 
172     char_array *path = char_array_new_size(strlen(dir));
173 
174     char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_MODEL_FILENAME);
175     char *model_path = char_array_get_string(path);
176 
177     if (file_exists(model_path)) {
178         averaged_perceptron_t *ap_model = averaged_perceptron_load(model_path);
179         if (ap_model != NULL) {
180             parser = address_parser_new();
181             parser->model_type = ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON;
182             parser->model.ap = ap_model;
183         } else {
184             char_array_destroy(path);
185             log_error("Averaged perceptron model could not be loaded\n");
186             return false;
187         }
188     } else {
189         model_path = NULL;
190     }
191 
192     if (model_path == NULL) {
193         char_array_clear(path);
194         char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_MODEL_FILENAME_CRF);
195         model_path = char_array_get_string(path);
196 
197         if (file_exists(model_path)) {
198             crf_t *crf_model = crf_load(model_path);
199             if (crf_model != NULL) {
200                 parser = address_parser_new();
201                 parser->model_type = ADDRESS_PARSER_TYPE_CRF;
202                 parser->model.crf = crf_model;
203             } else {
204                 char_array_destroy(path);
205                 log_error("Averaged perceptron model could not be loaded\n");
206                 return false;
207             }
208         } else {
209             model_path = NULL;
210         }
211     }
212 
213     if (parser == NULL) {
214         char_array_destroy(path);
215         log_error("Could not find parser model file of known type\n");
216         return false;
217     }
218 
219     char_array_clear(path);
220 
221     char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_VOCAB_FILENAME);
222 
223     char *vocab_path = char_array_get_string(path);
224 
225     trie_t *vocab = trie_load(vocab_path);
226 
227     if (vocab == NULL) {
228         goto exit_address_parser_created;
229     }
230 
231     parser->vocab = vocab;
232 
233     char_array_clear(path);
234 
235     char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_PHRASE_FILENAME);
236     char *phrases_path = char_array_get_string(path);
237 
238     FILE *phrases_file = fopen(phrases_path, "rb");
239     if (phrases_file == NULL) {
240         goto exit_address_parser_created;
241     }
242 
243     parser->phrases = trie_read(phrases_file);
244     if (parser->phrases == NULL) {
245         goto exit_address_parser_created;
246     }
247 
248     uint64_t num_phrase_types;
249 
250     if (!file_read_uint64(phrases_file, &num_phrase_types)) {
251         goto exit_address_parser_created;
252     }
253 
254     parser->phrase_types = address_parser_types_array_new_size(num_phrase_types);
255 
256     uint32_array *phrase_type_values = uint32_array_new_size(num_phrase_types);
257     if (!file_read_uint32_array(phrases_file, phrase_type_values->a, num_phrase_types)) {
258         uint32_array_destroy(phrase_type_values);
259         goto exit_address_parser_created;
260     }
261     phrase_type_values->n = num_phrase_types;
262 
263     for (size_t i = 0; i < phrase_type_values->n; i++) {
264         uint32_t phrase_type_value = phrase_type_values->a[i];
265         address_parser_types_t phrase_type = {.value = phrase_type_value};
266         address_parser_types_array_push(parser->phrase_types, phrase_type);
267     }
268 
269     uint32_array_destroy(phrase_type_values);
270 
271     fclose(phrases_file);
272 
273     char_array_clear(path);
274 
275     char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_POSTAL_CODES_FILENAME);
276 
277     char *postal_codes_path = char_array_get_string(path);
278 
279     FILE *postal_codes_file = fopen(postal_codes_path, "rb");
280     if (postal_codes_file == NULL) {
281         goto exit_address_parser_created;
282     }
283 
284     parser->postal_codes = trie_read(postal_codes_file);
285     if (parser->postal_codes == NULL) {
286         goto exit_address_parser_created;
287     }
288 
289     parser->postal_code_contexts = graph_read(postal_codes_file);
290 
291     if (parser->postal_code_contexts == NULL) {
292         goto exit_address_parser_created;
293     }
294 
295     fclose(postal_codes_file);
296 
297     parser->context = address_parser_context_new();
298     if (parser->context == NULL) {
299         goto exit_address_parser_created;
300     }
301 
302     char_array_destroy(path);
303     return true;
304 
305 exit_address_parser_created:
306     address_parser_destroy(parser);
307     char_array_destroy(path);
308     return false;
309 }
310 
/*
Free a parser together with every component it holds: the model matching
model_type, the parsing context, the vocabulary and phrase tries, the phrase
types array, and the postal code trie and context graph. Components that are
NULL are skipped; passing NULL for self is a no-op.
*/
void address_parser_destroy(address_parser_t *self) {
    if (self == NULL) return;

    bool is_averaged_perceptron = self->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON;
    bool is_crf = self->model_type == ADDRESS_PARSER_TYPE_CRF;

    if (is_averaged_perceptron && self->model.ap != NULL) {
        averaged_perceptron_destroy(self->model.ap);
    } else if (is_crf && self->model.crf != NULL) {
        crf_destroy(self->model.crf);
    }

    if (self->context != NULL) address_parser_context_destroy(self->context);

    if (self->vocab != NULL) trie_destroy(self->vocab);

    if (self->phrases != NULL) trie_destroy(self->phrases);

    if (self->phrase_types != NULL) address_parser_types_array_destroy(self->phrase_types);

    if (self->postal_codes != NULL) trie_destroy(self->postal_codes);

    if (self->postal_code_contexts != NULL) graph_destroy(self->postal_code_contexts);

    free(self);
}
346 
/*
Look up word in the parser's vocabulary trie and return the value stored for it.

The found/not-found result of trie_get_data is deliberately ignored: count
starts at 0 and is returned unchanged unless the lookup writes to it.
*/
static inline uint32_t word_vocab_frequency(address_parser_t *parser, char *word) {
    uint32_t count = 0;
    (void)trie_get_data(parser->vocab, word, &count);
    return count;
}
352 
/* Normalize one token of str with the parser's token options
   (ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS), passing array through to
   normalize_token. */
inline void address_parser_normalize_token(cstring_array *array, char *str, token_t token) {
    normalize_token(array,
                    str,
                    token,
                    ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS);
}
356 
/* Same as address_parser_normalize_token, but with the admin options
   (ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS). */
static inline void address_parser_normalize_phrase_token(cstring_array *array, char *str, token_t token) {
    normalize_token(array,
                    str,
                    token,
                    ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS);
}
360 
address_parser_normalize_string(char * str)361 inline char *address_parser_normalize_string(char *str) {
362     return normalize_string_latin(str, strlen(str), ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS);
363 }
364 
365 
/* Release one member of the context with its matching destructor, skipping
   members that are NULL. Only valid inside address_parser_context_destroy. */
#define ADDRESS_PARSER_CONTEXT_RELEASE(field, destroy_fn) \
    do {                                                  \
        if (self->field != NULL) {                        \
            destroy_fn(self->field);                      \
        }                                                 \
    } while (0)

/*
Free a parsing context and every buffer it holds. Members that are NULL are
skipped, so a partially constructed context can be passed in as long as its
unset members are NULL. Passing NULL for self is a no-op.
*/
void address_parser_context_destroy(address_parser_context_t *self) {
    if (self == NULL) return;

    /* Scratch buffers for phrase strings */
    ADDRESS_PARSER_CONTEXT_RELEASE(phrase, char_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(context_phrase, char_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(long_context_phrase, char_array_destroy);

    ADDRESS_PARSER_CONTEXT_RELEASE(component_phrase, char_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(context_component_phrase, char_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(long_context_component_phrase, char_array_destroy);

    ADDRESS_PARSER_CONTEXT_RELEASE(prefix_phrase, char_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(context_prefix_phrase, char_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(long_context_prefix_phrase, char_array_destroy);

    ADDRESS_PARSER_CONTEXT_RELEASE(suffix_phrase, char_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(context_suffix_phrase, char_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(long_context_suffix_phrase, char_array_destroy);

    /* N-gram and sub-token storage */
    ADDRESS_PARSER_CONTEXT_RELEASE(ngrams, cstring_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(sub_token, char_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(sub_tokens, token_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(separators, uint32_array_destroy);

    /* Normalized strings and their token spans */
    ADDRESS_PARSER_CONTEXT_RELEASE(normalized, cstring_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(normalized_tokens, token_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(normalized_admin, cstring_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(normalized_admin_tokens, token_array_destroy);

    /* Feature strings */
    ADDRESS_PARSER_CONTEXT_RELEASE(features, cstring_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(prev_tag_features, cstring_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(prev2_tag_features, cstring_array_destroy);

    ADDRESS_PARSER_CONTEXT_RELEASE(tokenized_str, tokenized_string_destroy);

    /* Phrase matches and per-token phrase memberships */
    ADDRESS_PARSER_CONTEXT_RELEASE(address_dictionary_phrases, phrase_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(address_phrase_memberships, int64_array_destroy);

    ADDRESS_PARSER_CONTEXT_RELEASE(component_phrases, phrase_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(component_phrase_memberships, int64_array_destroy);

    ADDRESS_PARSER_CONTEXT_RELEASE(postal_code_phrases, phrase_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(postal_code_phrase_memberships, int64_array_destroy);

    ADDRESS_PARSER_CONTEXT_RELEASE(prefix_phrases, phrase_array_destroy);
    ADDRESS_PARSER_CONTEXT_RELEASE(suffix_phrases, phrase_array_destroy);

    free(self);
}

#undef ADDRESS_PARSER_CONTEXT_RELEASE
499 
address_parser_context_new(void)500 address_parser_context_t *address_parser_context_new(void) {
501     address_parser_context_t *context = malloc(sizeof(address_parser_context_t));
502 
503     if (context == NULL) return NULL;
504 
505     context->language = NULL;
506     context->country = NULL;
507 
508     context->phrase = char_array_new();
509     if (context->phrase == NULL) {
510         goto exit_address_parser_context_allocated;
511     }
512 
513     context->context_phrase = char_array_new();
514     if (context->context_phrase == NULL) {
515         goto exit_address_parser_context_allocated;
516     }
517 
518     context->long_context_phrase = char_array_new();
519     if (context->long_context_phrase == NULL) {
520         goto exit_address_parser_context_allocated;
521     }
522 
523     context->component_phrase = char_array_new();
524     if (context->component_phrase == NULL) {
525         goto exit_address_parser_context_allocated;
526     }
527 
528     context->context_component_phrase = char_array_new();
529     if (context->context_component_phrase == NULL) {
530         goto exit_address_parser_context_allocated;
531     }
532 
533     context->long_context_component_phrase = char_array_new();
534     if (context->long_context_component_phrase == NULL) {
535         goto exit_address_parser_context_allocated;
536     }
537 
538     context->prefix_phrase = char_array_new();
539     if (context->prefix_phrase == NULL) {
540         goto exit_address_parser_context_allocated;
541     }
542 
543     context->context_prefix_phrase = char_array_new();
544     if (context->context_prefix_phrase == NULL) {
545         goto exit_address_parser_context_allocated;
546     }
547 
548     context->long_context_prefix_phrase = char_array_new();
549     if (context->long_context_prefix_phrase == NULL) {
550         goto exit_address_parser_context_allocated;
551     }
552 
553     context->suffix_phrase = char_array_new();
554     if (context->suffix_phrase == NULL) {
555         goto exit_address_parser_context_allocated;
556     }
557 
558     context->context_suffix_phrase = char_array_new();
559     if (context->context_suffix_phrase == NULL) {
560         goto exit_address_parser_context_allocated;
561     }
562 
563     context->long_context_suffix_phrase = char_array_new();
564     if (context->long_context_suffix_phrase == NULL) {
565         goto exit_address_parser_context_allocated;
566     }
567 
568     context->ngrams = cstring_array_new();
569     if (context->ngrams == NULL) {
570         goto exit_address_parser_context_allocated;
571     }
572 
573     context->sub_token = char_array_new();
574     if (context->sub_token == NULL) {
575         goto exit_address_parser_context_allocated;
576     }
577 
578     context->sub_tokens = token_array_new();
579     if (context->sub_tokens == NULL) {
580         goto exit_address_parser_context_allocated;
581     }
582 
583     context->separators = uint32_array_new();
584     if (context->separators == NULL) {
585         goto exit_address_parser_context_allocated;
586     }
587 
588     context->normalized = cstring_array_new();
589     if (context->normalized == NULL) {
590         goto exit_address_parser_context_allocated;
591     }
592 
593     context->normalized_tokens = token_array_new();
594     if (context->normalized_tokens == NULL) {
595         goto exit_address_parser_context_allocated;
596     }
597 
598     context->normalized_admin = cstring_array_new();
599     if (context->normalized_admin == NULL) {
600         goto exit_address_parser_context_allocated;
601     }
602 
603     context->normalized_admin_tokens = token_array_new();
604     if (context->normalized_admin_tokens == NULL) {
605         goto exit_address_parser_context_allocated;
606     }
607 
608     context->features = cstring_array_new();
609     if (context->features == NULL) {
610         goto exit_address_parser_context_allocated;
611     }
612 
613     context->prev_tag_features = cstring_array_new();
614     if (context->prev_tag_features == NULL) {
615         goto exit_address_parser_context_allocated;
616     }
617 
618     context->prev2_tag_features = cstring_array_new();
619     if (context->prev2_tag_features == NULL) {
620         goto exit_address_parser_context_allocated;
621     }
622 
623     context->tokenized_str = tokenized_string_new();
624     if (context->tokenized_str == NULL) {
625         goto exit_address_parser_context_allocated;
626     }
627 
628     context->address_dictionary_phrases = phrase_array_new();
629     if (context->address_dictionary_phrases == NULL) {
630         goto exit_address_parser_context_allocated;
631     }
632 
633     context->address_phrase_memberships = int64_array_new();
634     if (context->address_phrase_memberships == NULL) {
635         goto exit_address_parser_context_allocated;
636     }
637 
638     context->component_phrases = phrase_array_new();
639     if (context->component_phrases == NULL) {
640         goto exit_address_parser_context_allocated;
641     }
642 
643     context->component_phrase_memberships = int64_array_new();
644     if (context->component_phrase_memberships == NULL) {
645         goto exit_address_parser_context_allocated;
646     }
647 
648     context->postal_code_phrases = phrase_array_new();
649     if (context->postal_code_phrases == NULL) {
650         goto exit_address_parser_context_allocated;
651     }
652 
653     context->postal_code_phrase_memberships = int64_array_new();
654     if (context->postal_code_phrase_memberships == NULL) {
655         goto exit_address_parser_context_allocated;
656     }
657 
658     context->prefix_phrases = phrase_array_new();
659     if (context->prefix_phrases == NULL) {
660         goto exit_address_parser_context_allocated;
661     }
662 
663     context->suffix_phrases = phrase_array_new();
664     if (context->suffix_phrases == NULL) {
665         goto exit_address_parser_context_allocated;
666     }
667 
668     return context;
669 
670 exit_address_parser_context_allocated:
671     address_parser_context_destroy(context);
672     return NULL;
673 }
674 
/*
A component phrase is valid when at least one of its tokens is not purely
digits according to string_is_digit. An empty phrase (len 0) is never valid.

strings: token strings indexed by token position.
phrase:  span of tokens [start, start + len) to examine.
*/
bool is_valid_component_phrase(cstring_array *strings, phrase_t phrase) {
    for (uint32_t token_idx = phrase.start; token_idx < phrase.start + phrase.len; token_idx++) {
        char *token_str = cstring_array_get_string(strings, token_idx);
        size_t token_str_len = strlen(token_str);
        if (!string_is_digit(token_str, token_str_len)) {
            return true;
        }
    }

    return false;
}
686 
/*
Populate context for one tokenized input string.

For every token this stores two normalized forms (regular and "admin"
options) plus matching token spans, then records which tokens belong to
address dictionary phrases, dictionary prefixes/suffixes, component phrases
from parser->phrases and postal code phrases from parser->postal_codes.

context:       context whose buffers are cleared and refilled.
parser:        supplies the phrases and postal_codes tries.
tokenized_str: input string with its tokens and token strings.
language, country: stored in the context as-is (the pointers are not copied).
*/
void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country) {
    uint32_t token_index;
    char *word;
    phrase_t phrase;

    context->language = language;
    context->country = country;

    cstring_array *normalized = context->normalized;
    token_array *normalized_tokens = context->normalized_tokens;
    cstring_array_clear(normalized);
    token_array_clear(normalized_tokens);

    cstring_array *normalized_admin = context->normalized_admin;
    token_array *normalized_admin_tokens = context->normalized_admin_tokens;
    cstring_array_clear(normalized_admin);
    token_array_clear(normalized_admin_tokens);

    char *str = tokenized_str->str;
    token_array *tokens = tokenized_str->tokens;

    cstring_array_foreach(tokenized_str->strings, token_index, word, {
        token_t token = tokens->a[token_index];

        /* The normalized token starts where the backing buffer currently ends. */
        size_t token_offset = normalized->str->n;
        address_parser_normalize_token(normalized, str, token);
        size_t token_len;
        /* Length is the growth of the buffer minus one; the "- 1" presumably
           discounts a trailing NUL added per string - confirm against
           cstring_array. If nothing was appended the length is 0. */
        if (normalized->str->n > token_offset) {
           token_len = normalized->str->n - 1 - token_offset;
        } else {
            token_len = 0;
        }
        token_t normalized_token;
        normalized_token.offset = token_offset;
        normalized_token.len = token_len;
        normalized_token.type = token.type;
        token_array_push(normalized_tokens, normalized_token);

        /* Same bookkeeping for the admin-normalized form of the token. */
        size_t admin_token_offset = normalized_admin->str->n;
        address_parser_normalize_phrase_token(normalized_admin, str, token);
        size_t admin_token_len;
        if (normalized_admin->str->n > admin_token_offset) {
           admin_token_len = normalized_admin->str->n - 1 - admin_token_offset;
        } else {
            admin_token_len = 0;
        }
        token_t normalized_admin_token;
        normalized_admin_token.offset = admin_token_offset;
        normalized_admin_token.len = admin_token_len;
        normalized_admin_token.type = token.type;
        token_array_push(normalized_admin_tokens, normalized_admin_token);
    })

    /* Taken only after the loop, once all normalized strings have been appended. */
    char *normalized_str = normalized->str->a;
    char *normalized_str_admin = normalized_admin->str->a;

    /*
    Address dictionary phrases
    --------------------------
    Recognizing phrases that occur in libpostal's dictionaries.

    Note: if the dictionaries are updated to try to improve the parser,
    we'll need to retrain. This can be done without rebuilding the
    training data (a long-running process which can take up to a week),
    but will require running address_parser_train, the main training script.
    */

    phrase_array_clear(context->address_dictionary_phrases);
    int64_array_clear(context->address_phrase_memberships);

    phrase_array *address_dictionary_phrases = context->address_dictionary_phrases;
    int64_array *address_phrase_memberships = context->address_phrase_memberships;

    size_t num_tokens = tokens->n;

    bool have_address_phrases = search_address_dictionaries_tokens_with_phrases(normalized_str, normalized_tokens, NULL, &address_dictionary_phrases);
    token_phrase_memberships(address_dictionary_phrases, address_phrase_memberships, num_tokens);

    phrase_array_clear(context->prefix_phrases);
    phrase_array_clear(context->suffix_phrases);

    /* One prefix and one suffix lookup per token, on the un-normalized token
       text; both results are pushed unconditionally, so each array ends up
       with exactly num_tokens entries. */
    for (size_t i = 0; i < num_tokens; i++) {
        token_t token = tokens->a[i];
        char *word_pre_norm = tokenized_string_get_token(tokenized_str, i);

        phrase_t prefix_phrase = search_address_dictionaries_prefix(word_pre_norm, token.len, NULL);
        phrase_array_push(context->prefix_phrases, prefix_phrase);

        phrase_t suffix_phrase = search_address_dictionaries_suffix(word_pre_norm, token.len, NULL);
        phrase_array_push(context->suffix_phrases, suffix_phrase);
    }

    /*
    Component phrases
    -----------------
    Precomputed phrases for cities, states, countries, etc. from the training data

    Note: if the training data has lots of mislabeled examples (e.g. Brooklyn as city
    instead of a city_district), this may cause the parser to get confused. It will
    penalize itself for getting the wrong answer when really the underlying data
    is simply ambiguous. In the OSM training data a lot of work has been done to
    ensure that there's little or no systematic mislabeling. As such, other data
    sets shouldn't be added willy-nilly unless the labels are consistent.
    */

    phrase_array_clear(context->component_phrases);
    int64_array_clear(context->component_phrase_memberships);

    phrase_array *component_phrases = context->component_phrases;
    int64_array *component_phrase_memberships = context->component_phrase_memberships;

    bool have_component_phrases = trie_search_tokens_with_phrases(parser->phrases, normalized_str_admin, normalized_admin_tokens, &component_phrases);
    token_phrase_memberships(component_phrases, component_phrase_memberships, num_tokens);

    /* Phrases rejected by is_valid_component_phrase (all-digit tokens) have
       their tokens' memberships reset to NULL_PHRASE_MEMBERSHIP; the phrase
       itself stays in component_phrases. */
    for (size_t i = 0; i < component_phrases->n; i++) {
        phrase_t phrase = component_phrases->a[i];
        if (!is_valid_component_phrase(context->normalized_admin, phrase)) {
            for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
                component_phrase_memberships->a[j] = NULL_PHRASE_MEMBERSHIP;
            }
        }
    }

    /* Postal code phrases: searched in parser->postal_codes over the
       admin-normalized tokens. */
    phrase_array_clear(context->postal_code_phrases);
    int64_array_clear(context->postal_code_phrase_memberships);

    phrase_array *postal_code_phrases = context->postal_code_phrases;
    int64_array *postal_code_phrase_memberships = context->postal_code_phrase_memberships;

    bool have_postal_code_phrases = trie_search_tokens_with_phrases(parser->postal_codes, normalized_str_admin, normalized_admin_tokens, &postal_code_phrases);
    token_phrase_memberships(postal_code_phrases, postal_code_phrase_memberships, num_tokens);

}
820 
phrase_at_index(phrase_array * phrases,int64_array * phrase_memberships,uint32_t i)821 static inline phrase_t phrase_at_index(phrase_array *phrases, int64_array *phrase_memberships, uint32_t i) {
822     if (phrases == NULL || phrase_memberships == NULL || i > phrase_memberships->n - 1) {
823         return NULL_PHRASE;
824     }
825 
826     int64_t phrase_index = phrase_memberships->a[i];
827     if (phrase_index != NULL_PHRASE_MEMBERSHIP) {
828         phrase_t phrase = phrases->a[phrase_index];
829         return phrase;
830     }
831 
832     return NULL_PHRASE;
833 }
834 
/*
Copy the first prefix_phrase.len bytes of word into prefix_phrase_array
(which is cleared first) and return that buffer's string. The returned
pointer belongs to prefix_phrase_array. len is not used.
*/
char *phrase_prefix(char *word, size_t len, phrase_t prefix_phrase, char_array *prefix_phrase_array) {
    const size_t head_len = prefix_phrase.len;

    char_array_clear(prefix_phrase_array);
    char_array_add_len(prefix_phrase_array, word, head_len);

    return char_array_get_string(prefix_phrase_array);
}
842 
/*
Copy the last suffix_phrase.len bytes of word (whose length is len) into
suffix_phrase_array (which is cleared first) and return that buffer's string.
The returned pointer belongs to suffix_phrase_array.
*/
char *phrase_suffix(char *word, size_t len, phrase_t suffix_phrase, char_array *suffix_phrase_array) {
    const size_t tail_len = suffix_phrase.len;
    char *tail_start = word + (len - tail_len);

    char_array_clear(suffix_phrase_array);
    char_array_add_len(suffix_phrase_array, tail_start, tail_len);

    return char_array_get_string(suffix_phrase_array);
}
850 
/*
Decide whether a dictionary phrase match should be used.

phrase.data is the index of the phrase's expansions. The phrase is valid when
its components include at least one of the listed address component bits and
at least one of its expansions is not in DICTIONARY_TOPONYM. A missing
expansion value is logged and treated as invalid.
*/
bool is_valid_dictionary_phrase(phrase_t phrase) {
    uint32_t expansion_index = phrase.data;
    address_expansion_value_t *expansion_value = address_dictionary_get_expansions(expansion_index);

    if (expansion_value == NULL) {
        log_warn("expansion_value is NULL for index %u\n", expansion_index);
        return false;
    }

    const uint32_t accepted_components = LIBPOSTAL_ADDRESS_STREET
                                       | LIBPOSTAL_ADDRESS_HOUSE_NUMBER
                                       | LIBPOSTAL_ADDRESS_NAME
                                       | LIBPOSTAL_ADDRESS_CATEGORY
                                       | LIBPOSTAL_ADDRESS_NEAR
                                       | LIBPOSTAL_ADDRESS_UNIT
                                       | LIBPOSTAL_ADDRESS_LEVEL
                                       | LIBPOSTAL_ADDRESS_ENTRANCE
                                       | LIBPOSTAL_ADDRESS_STAIRCASE
                                       | LIBPOSTAL_ADDRESS_POSTAL_CODE
                                       | LIBPOSTAL_ADDRESS_PO_BOX;

    uint32_t address_phrase_types = expansion_value->components;
    if ((address_phrase_types & accepted_components) == 0) {
        return false;
    }

    size_t num_expansions = expansion_value->expansions->n;
    for (size_t idx = 0; idx < num_expansions; idx++) {
        address_expansion_t candidate = expansion_value->expansions->a[idx];
        if (!address_expansion_in_dictionary(candidate, DICTIONARY_TOPONYM)) {
            return true;
        }
    }

    return false;
}
872 
/*
A word or phrase found at some token position, as returned by
word_or_phrase_at_index.

The field order matters: word_or_phrase_at_index fills this struct with
positional compound literals (str, type, phrase).
*/
typedef struct address_parser_phrase {
    // Text of the phrase, or of the single word when no full phrase matched
    char *str;
    // Which kind of match produced this entry
    address_parser_phrase_type_t type;
    // The matched phrase; NULL_PHRASE when type is ADDRESS_PARSER_NULL_PHRASE
    phrase_t phrase;
} address_parser_phrase_t;
878 
/*
Returns true for the phrase types that stand for a single word rather than
a full phrase: no phrase at all, or a word carrying a known prefix or suffix.
*/
static inline bool is_plain_word_phrase_type(address_parser_phrase_type_t type) {
    switch (type) {
        case ADDRESS_PARSER_NULL_PHRASE:
        case ADDRESS_PARSER_SUFFIX_PHRASE:
        case ADDRESS_PARSER_PREFIX_PHRASE:
            return true;
        default:
            return false;
    }
}
882 
word_or_phrase_at_index(address_parser_t * parser,tokenized_string_t * tokenized,address_parser_context_t * context,uint32_t i,bool long_context)883 static address_parser_phrase_t word_or_phrase_at_index(address_parser_t *parser, tokenized_string_t *tokenized, address_parser_context_t *context, uint32_t i, bool long_context) {
884     phrase_t phrase;
885     address_parser_phrase_t response;
886     char *phrase_string = NULL;
887 
888     phrase = phrase_at_index(context->address_dictionary_phrases, context->address_phrase_memberships, i);
889 
890     phrase_t component_phrase = phrase_at_index(context->component_phrases, context->component_phrase_memberships, i);
891 
892     if (phrase.len > 0 && is_valid_dictionary_phrase(phrase) && component_phrase.len <= phrase.len) {
893         phrase_string = cstring_array_get_phrase(context->normalized, long_context ? context->long_context_phrase : context->context_phrase, phrase),
894 
895         response = (address_parser_phrase_t){
896             phrase_string,
897             ADDRESS_PARSER_DICTIONARY_PHRASE,
898             phrase
899         };
900         return response;
901     }
902 
903     phrase = component_phrase;
904 
905     if (phrase.len > 0) {
906         phrase_string = cstring_array_get_phrase(context->normalized_admin, long_context ? context->long_context_component_phrase : context->context_component_phrase, phrase);
907 
908         response = (address_parser_phrase_t){
909             phrase_string,
910             ADDRESS_PARSER_COMPONENT_PHRASE,
911             phrase
912         };
913         return response;
914     }
915 
916     phrase_t prefix_phrase = context->prefix_phrases->a[i];
917     phrase_t suffix_phrase = context->suffix_phrases->a[i];
918 
919     uint32_t expansion_index;
920     address_expansion_value_t *expansion_value;
921 
922     cstring_array *normalized = context->normalized;
923 
924     char *word = cstring_array_get_string(normalized, i);
925     token_t token = tokenized->tokens->a[i];
926 
927     // Suffixes like straße, etc.
928     if (suffix_phrase.len > 0) {
929         expansion_index = suffix_phrase.data;
930         expansion_value = address_dictionary_get_expansions(expansion_index);
931 
932         if (expansion_value->components & LIBPOSTAL_ADDRESS_STREET) {
933             response = (address_parser_phrase_t){
934                 word,
935                 ADDRESS_PARSER_SUFFIX_PHRASE,
936                 suffix_phrase
937             };
938             return response;
939         }
940     }
941 
942     // Prefixes like hinter, etc.
943     if (prefix_phrase.len > 0) {
944         expansion_index = prefix_phrase.data;
945         expansion_value = address_dictionary_get_expansions(expansion_index);
946 
947         // Don't include elisions like l', d', etc. which are in the LIBPOSTAL_ADDRESS_ANY category
948         if (expansion_value->components ^ LIBPOSTAL_ADDRESS_ANY) {
949             response = (address_parser_phrase_t){
950                 word,
951                 ADDRESS_PARSER_PREFIX_PHRASE,
952                 prefix_phrase
953             };
954             return response;
955         }
956     }
957 
958     response = (address_parser_phrase_t){
959         word,
960         ADDRESS_PARSER_NULL_PHRASE,
961         NULL_PHRASE
962     };
963     return response;
964 
965 }
966 
/*
Find the nearest token, starting at start and moving in direction, that
belongs to a phrase.

phrase_memberships: per-token phrase membership, NULL_PHRASE_MEMBERSHIP for
tokens outside any phrase
start: token index to begin at (it is itself a candidate)
direction: -1 to scan towards index 0, 1 to scan towards the end

Returns the index of the first token found with a membership, or -1 when
there is none, when phrase_memberships is NULL, when start is out of range
or when direction is neither -1 nor 1.
*/
static inline int64_t phrase_index(int64_array *phrase_memberships, size_t start, int8_t direction) {
    if (phrase_memberships == NULL) {
        return -1;
    }

    size_t n = phrase_memberships->n;

    // The backward scan used to read memberships[start] without any bounds
    // check. An out-of-range start now yields "not found" in both directions.
    if (start >= n) {
        return -1;
    }

    int64_t *memberships = phrase_memberships->a;

    if (direction == -1) {
        // Unsigned countdown: idx is decremented before use, so the loop
        // visits start, start - 1, ..., 0 without needing a signed index.
        for (size_t idx = start + 1; idx-- > 0; ) {
            if (memberships[idx] != NULL_PHRASE_MEMBERSHIP) {
                return (int64_t)idx;
            }
        }
    } else if (direction == 1) {
        for (size_t idx = start; idx < n; idx++) {
            if (memberships[idx] != NULL_PHRASE_MEMBERSHIP) {
                return (int64_t)idx;
            }
        }
    }

    return -1;
}
992 
993 
/*
Find the next numeric token at or after start that is not part of an
address dictionary phrase or a component phrase.

tokenized: the token sequence
context: supplies address_phrase_memberships and component_phrase_memberships
start: token index to begin at (it is itself a candidate)

Returns the index of the first token of type NUMERIC or IDEOGRAPHIC_NUMBER
outside any phrase, or -1 when there is none, when tokenized, its tokens or
context is NULL, or when start is out of range.

NOTE(review): the previous implementation tested "!= NUMERIC && != IDEOGRAPHIC_NUMBER"
and therefore returned the first NON-numeric token, contradicting the
function's name. Confirm no caller relied on that behaviour.
*/
static inline int64_t next_numeric_token_index(tokenized_string_t *tokenized, address_parser_context_t *context, size_t start) {
    if (tokenized == NULL || context == NULL) return -1;

    token_array *tokens = tokenized->tokens;

    // ">=" instead of "start > n - 1": n - 1 wraps around for an empty array
    if (tokens == NULL || start >= tokens->n) return -1;

    for (size_t i = start; i < tokens->n; i++) {
        // Tokens inside a phrase are never reported
        if (context->address_phrase_memberships->a[i] != NULL_PHRASE_MEMBERSHIP ||
            context->component_phrase_memberships->a[i] != NULL_PHRASE_MEMBERSHIP) {
            continue;
        }

        token_t token = tokens->a[i];
        if (token.type == NUMERIC || token.type == IDEOGRAPHIC_NUMBER) {
            return (int64_t)i;
        }
    }

    return -1;
}
1015 
1016 
/*
Add the features relating a phrase to one component type.

features: the feature array to append to
phrase_types: bit set of component types the phrase can have
component: the single component type being tested
phrase_type: feature name of that component type (e.g. "street")
phrase_string: the phrase text

When component is the only type in phrase_types, the phrase is unambiguous
and two features are added: the type alone and the type with the phrase.
When component is just one of several types, a single "phrase type+phrase"
feature is added. Nothing is added when the phrase does not have the type.
*/
static inline void add_phrase_features(cstring_array *features, uint32_t phrase_types, uint32_t component, char *phrase_type, char *phrase_string) {
    if (phrase_types == component) {
        // phrase_types is unsigned, so print it with %u like the other
        // uint32_t values logged in this file
        log_debug("phrase=%s, phrase_types=%u\n", phrase_string, phrase_types);
        feature_array_add(features, 2, "unambiguous phrase type", phrase_type);
        feature_array_add(features, 3, "unambiguous phrase type+phrase", phrase_type, phrase_string);
    } else if (phrase_types & component) {
        feature_array_add(features, 3, "phrase type+phrase", phrase_type, phrase_string);
    }
}
1026 
/*
Add character n-gram features for a word.

features: the feature array to append to
feature_prefix: optional leading feature component (e.g. "word"); when NULL
the features start directly with "ngrams"
ngrams: scratch array, cleared and refilled with the n-grams on every call
str: the word to take n-grams from
n: the n-gram size
prefix_len, suffix_len: lengths of a known prefix/suffix of str. They are
cut off before the n-grams are generated; a non-zero length also turns off
the corresponding flag passed to add_ngrams.

Each generated n-gram becomes one feature made of the optional prefix,
"ngrams", n as a decimal string and the n-gram itself.

Returns false without adding anything when an argument is NULL, when n is 0
or not smaller than the length of str, when prefix_len and suffix_len
together exceed the length of str, or when add_ngrams fails.
*/
static bool add_ngram_features(cstring_array *features, char *feature_prefix, cstring_array *ngrams, char *str, size_t n, size_t prefix_len, size_t suffix_len) {
    if (features == NULL || ngrams == NULL || str == NULL) return false;

    size_t len = strlen(str);

    // ">=" instead of "n > len - 1": len - 1 wraps around for an empty string
    if (n == 0 || n >= len) return false;

    // len - suffix_len - prefix_len below is unsigned, so make sure the
    // stripped affixes fit inside the string before subtracting.
    if (prefix_len > len || suffix_len > len - prefix_len) return false;

    char ngram_num_chars[INT64_MAX_STRING_SIZE];
    snprintf(ngram_num_chars, sizeof(ngram_num_chars), "%zu", n);

    bool known_prefix = prefix_len > 0;
    bool known_suffix = suffix_len > 0;

    cstring_array_clear(ngrams);
    if (!add_ngrams(ngrams, n, str + prefix_len, len - suffix_len - prefix_len, !known_prefix, !known_suffix)) {
        return false;
    }

    uint32_t idx;
    char *ngram;

    if (feature_prefix != NULL) {
        cstring_array_foreach(ngrams, idx, ngram, {
            feature_array_add(features, 4, feature_prefix, "ngrams", ngram_num_chars, ngram);
        })
    } else {
        cstring_array_foreach(ngrams, idx, ngram, {
            feature_array_add(features, 3, "ngrams", ngram_num_chars, ngram);
        })
    }

    return true;
}
1061 
1062 /*
1063 address_parser_features
1064 -----------------------
1065 
1066 This is a feature function similar to those found in MEMM and CRF models.
1067 
1068 Follows the signature of a tagger_feature_function so it can be called
1069 as a function pointer by the averaged perceptron or CRF model.
1070 
1071 Parameters:
1072 
1073 address_parser_t *self: a pointer to the address_parser struct, which contains
1074 word frequencies and perhaps other useful corpus-wide statistics.
1075 
1076 address_parser_context_t *context: The context struct containing:
1077 - phrase dictionary memberships for all the tokens
- country (if known)
1079 - language (if known)
1080 - features array
1081 
1082 tokenized_string_t *tokenized: the sequence of tokens for parsing
uint32_t idx: the current token index

The predicted tags at idx - 1 and idx - 2 are not parameters of this function.
Features that depend on them are written to context->prev_tag_features and
context->prev2_tag_features instead.
1086 
1087 */
1088 
address_parser_features(void * self,void * ctx,tokenized_string_t * tokenized,uint32_t idx)1089 bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t idx) {
1090     if (self == NULL || ctx == NULL) return false;
1091 
1092     address_parser_t *parser = (address_parser_t *)self;
1093     address_parser_context_t *context = (address_parser_context_t *)ctx;
1094 
1095     cstring_array *features = context->features;
1096     cstring_array *prev_tag_features = context->prev_tag_features;
1097     cstring_array *prev2_tag_features = context->prev2_tag_features;
1098     char *language = context->language;
1099     char *country = context->country;
1100 
1101     phrase_array *address_dictionary_phrases = context->address_dictionary_phrases;
1102     int64_array *address_phrase_memberships = context->address_phrase_memberships;
1103     phrase_array *component_phrases = context->component_phrases;
1104     int64_array *component_phrase_memberships = context->component_phrase_memberships;
1105     phrase_array *postal_code_phrases = context->postal_code_phrases;
1106     int64_array *postal_code_phrase_memberships = context->postal_code_phrase_memberships;
1107     cstring_array *normalized = context->normalized;
1108 
1109     uint32_array *separators = context->separators;
1110 
1111     cstring_array_clear(features);
1112     cstring_array_clear(prev_tag_features);
1113     cstring_array_clear(prev2_tag_features);
1114 
1115     token_array *tokens = tokenized->tokens;
1116 
1117     token_t token = tokens->a[idx];
1118 
1119     ssize_t last_index = (ssize_t)idx - 1;
1120     ssize_t next_index = (ssize_t)idx + 1;
1121 
1122     char *word_pre_norm = tokenized_string_get_token(tokenized, idx);
1123 
1124     char *word = cstring_array_get_string(normalized, idx);
1125     if (word == NULL) {
1126         log_error("got NULL word at %d\n", idx);
1127         return false;
1128     }
1129 
1130     size_t word_len = strlen(word);
1131 
1132     log_debug("word=%s\n", word);
1133 
1134     phrase_t phrase = NULL_PHRASE;
1135     phrase_t component_phrase = NULL_PHRASE;
1136 
1137     char *phrase_string = NULL;
1138     char *component_phrase_string = NULL;
1139 
1140     int64_t address_phrase_index = address_phrase_memberships->a[idx];
1141     int64_t component_phrase_index = component_phrase_memberships->a[idx];
1142 
1143     if (address_phrase_index != NULL_PHRASE_MEMBERSHIP) {
1144         phrase = address_dictionary_phrases->a[address_phrase_index];
1145     }
1146 
1147     if (component_phrase_index != NULL_PHRASE_MEMBERSHIP) {
1148         component_phrase = component_phrases->a[component_phrase_index];
1149     }
1150 
1151     char_array *phrase_tokens = context->phrase;
1152     char_array *component_phrase_tokens = context->component_phrase;
1153 
1154     uint32_t expansion_index;
1155     address_expansion_value_t *expansion_value;
1156 
1157     bool add_word_feature = true;
1158 
1159     size_t num_tokens = tokenized->tokens->n;
1160 
1161     // Address dictionary phrases
1162     if (phrase.len > 0 && phrase.len >= component_phrase.len) {
1163         log_debug("phrase\n");
1164 
1165         last_index = (ssize_t)phrase.start - 1;
1166         next_index = (ssize_t)phrase.start + phrase.len;
1167 
1168         if(is_valid_dictionary_phrase(phrase)) {
1169             uint32_t expansion_index = phrase.data;
1170             address_expansion_value_t *expansion_value = address_dictionary_get_expansions(expansion_index);
1171 
1172             if (expansion_value == NULL) {
1173                 log_warn("expansion_value is NULL for index %u\n", expansion_index);
1174                 return false;
1175             }
1176             uint32_t address_phrase_types = expansion_value->components;
1177 
1178             phrase_string = cstring_array_get_phrase(context->normalized, phrase_tokens, phrase);
1179 
1180             add_word_feature = false;
1181             log_debug("phrase_string=%s\n", phrase_string);
1182 
1183             add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_STREET, "street", phrase_string);
1184             add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_NAME, "name", phrase_string);
1185             add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_CATEGORY, "category", phrase_string);
1186             add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_UNIT, "unit", phrase_string);
1187             add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_PO_BOX, "po_box", phrase_string);
1188             add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_LEVEL, "level", phrase_string);
1189             add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_ENTRANCE, "entrance", phrase_string);
1190             add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_STAIRCASE, "staircase", phrase_string);
1191             add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_HOUSE_NUMBER, "house_number", phrase_string);
1192             add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_POSTAL_CODE, "postal_code", phrase_string);
1193         }
1194     }
1195 
1196 
1197     address_parser_types_t types;
1198 
1199     // Component phrases
1200     if (component_phrase.len > 0 && component_phrase.len >= phrase.len) {
1201         component_phrase = component_phrases->a[component_phrase_index];
1202 
1203         component_phrase_string = cstring_array_get_phrase(context->normalized_admin, component_phrase_tokens, component_phrase);
1204 
1205         uint32_t component_phrase_index = component_phrase.data;
1206         if (component_phrase_index > parser->phrase_types->n) {
1207             log_error("Invalid component_phrase_index: %u (parser->phrase_types->n=%zu)\n", component_phrase_index, parser->phrase_types->n);
1208             return false;
1209         }
1210 
1211         types = parser->phrase_types->a[component_phrase_index];
1212 
1213         uint32_t component_phrase_types = types.components;
1214         uint32_t most_common = types.most_common;
1215 
1216         if (last_index >= (ssize_t)component_phrase.start - 1) {
1217             last_index = (ssize_t)component_phrase.start - 1;
1218         }
1219 
1220         if (next_index < (ssize_t)component_phrase.start + component_phrase.len) {
1221             next_index = (ssize_t)component_phrase.start + component_phrase.len;
1222         }
1223 
1224         if (component_phrase_string != NULL && component_phrase_types > 0) {
1225             feature_array_add(features, 2, "phrase", component_phrase_string);
1226             add_word_feature = false;
1227         }
1228 
1229         if (component_phrase_types > 0) {
1230             add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_SUBURB, "suburb", component_phrase_string);
1231             add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY, "city", component_phrase_string);
1232             add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY_DISTRICT, "city_district", component_phrase_string);
1233             add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_ISLAND, "island", component_phrase_string);
1234             add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE_DISTRICT, "state_district", component_phrase_string);
1235             add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE, "state", component_phrase_string);
1236             add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY_REGION, "country_region", component_phrase_string);
1237             add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY, "country", component_phrase_string);
1238             add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_WORLD_REGION, "world_region", component_phrase_string);
1239         }
1240 
1241         if (component_phrase_types != most_common) {
1242             if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) {
1243                 feature_array_add(features, 2, "commonly city", component_phrase_string);
1244             } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY) {
1245                 feature_array_add(features, 2, "commonly country", component_phrase_string);
1246             } else if (most_common == ADDRESS_PARSER_BOUNDARY_SUBURB) {
1247                 feature_array_add(features, 2, "commonly suburb", component_phrase_string);
1248             } else if (most_common == ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT) {
1249                 feature_array_add(features, 2, "commonly city_district", component_phrase_string);
1250             } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE) {
1251                 feature_array_add(features, 2, "commonly state", component_phrase_string);
1252             } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION) {
1253                 feature_array_add(features, 2, "commonly country_region", component_phrase_string);
1254             } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT) {
1255                 feature_array_add(features, 2, "commonly state_district", component_phrase_string);
1256             } else if (most_common == ADDRESS_PARSER_BOUNDARY_ISLAND) {
1257                 feature_array_add(features, 2, "commonly island", component_phrase_string);
1258             }
1259         }
1260     }
1261 
1262     bool possible_postal_code = false;
1263     bool postal_code_have_admin = false;
1264     int64_t postal_code_phrase_index = postal_code_phrase_memberships->a[idx];
1265     phrase_t postal_code_phrase = NULL_PHRASE;
1266 
1267     if (postal_code_phrase_index != NULL_PHRASE_MEMBERSHIP) {
1268         postal_code_phrase = postal_code_phrases->a[postal_code_phrase_index];
1269 
1270         uint32_t postal_code_id = postal_code_phrase.data;
1271 
1272         possible_postal_code = true;
1273 
1274         if (last_index >= (ssize_t)postal_code_phrase.start - 1) {
1275             last_index = (ssize_t)postal_code_phrase.start - 1;
1276         }
1277 
1278         if (next_index < (ssize_t)postal_code_phrase.start + postal_code_phrase.len) {
1279             next_index = (ssize_t)postal_code_phrase.start + postal_code_phrase.len;
1280         }
1281 
1282         uint32_t admin_id;
1283         uint64_t postal_code_context;
1284 
1285         khiter_t k;
1286 
1287         if (last_index >= 0) {
1288             int64_t last_component_phrase_index = component_phrase_memberships->a[last_index];
1289             if (last_component_phrase_index != NULL_PHRASE_MEMBERSHIP) {
1290                 phrase_t last_component_phrase = component_phrases->a[last_component_phrase_index];
1291                 admin_id = last_component_phrase.data;
1292 
1293                 if (postal_code_context_exists(parser, postal_code_id, admin_id)) {
1294                     postal_code_have_admin = true;
1295                 }
1296             }
1297         }
1298 
1299         if (!postal_code_have_admin && next_index < num_tokens) {
1300             int64_t next_component_phrase_index = component_phrase_memberships->a[next_index];
1301             if (next_component_phrase_index != NULL_PHRASE_MEMBERSHIP) {
1302                 phrase_t next_component_phrase = component_phrases->a[next_component_phrase_index];
1303                 admin_id = next_component_phrase.data;
1304                 if (postal_code_context_exists(parser, postal_code_id, admin_id)) {
1305                     postal_code_have_admin = true;
1306                 }
1307             }
1308         }
1309 
1310     }
1311 
1312     if (possible_postal_code) {
1313         if (postal_code_have_admin) {
1314             feature_array_add(features, 1, "postcode have context");
1315             feature_array_add(features, 2, "postcode have context", word);
1316         } else {
1317             feature_array_add(features, 2, "postcode no context", word);
1318         }
1319     }
1320 
1321     uint32_t word_freq = word_vocab_frequency(parser, word);
1322 
1323     bool is_word = is_word_token(token.type);
1324 
1325     bool is_unknown_word = false;
1326     bool is_unknown = false;
1327 
1328     bool known_prefix = false;
1329     bool known_suffix = false;
1330 
1331     size_t prefix_len = 0;
1332     size_t suffix_len = 0;
1333 
1334     char *prefix = NULL;
1335     char *suffix = NULL;
1336 
1337     if (add_word_feature) {
1338         // Bias unit, acts as an intercept
1339         feature_array_add(features, 1, "bias");
1340 
1341         phrase_t prefix_phrase = context->prefix_phrases->a[idx];
1342         phrase_t suffix_phrase = context->suffix_phrases->a[idx];
1343 
1344         // Prefixes like hinter, etc.
1345         if (prefix_phrase.len > 0) {
1346             expansion_index = prefix_phrase.data;
1347             expansion_value = address_dictionary_get_expansions(expansion_index);
1348 
1349             // Don't include elisions like l', d', etc. which are in the LIBPOSTAL_ADDRESS_ANY category
1350             if (expansion_value->components ^ LIBPOSTAL_ADDRESS_ANY) {
1351                 known_prefix = true;
1352                 char_array_clear(phrase_tokens);
1353                 prefix_len = prefix_phrase.len;
1354                 char_array_add_len(phrase_tokens, word_pre_norm, prefix_len);
1355                 prefix = char_array_get_string(phrase_tokens);
1356                 log_debug("got prefix: %s\n", prefix);
1357                 feature_array_add(features, 2, "prefix", prefix);
1358             }
1359         }
1360 
1361         // Suffixes like straße, etc.
1362         if (suffix_phrase.len > 0) {
1363             expansion_index = suffix_phrase.data;
1364             expansion_value = address_dictionary_get_expansions(expansion_index);
1365 
1366             if (expansion_value->components & LIBPOSTAL_ADDRESS_STREET) {
1367                 known_suffix = true;
1368                 char_array_clear(context->suffix_phrase);
1369                 suffix_len = suffix_phrase.len;
1370                 size_t word_pre_norm_len = cstring_array_token_length(tokenized->strings, idx);
1371                 size_t suffix_offset = word_pre_norm_len - suffix_len;
1372                 char_array_add_len(context->suffix_phrase, word_pre_norm + suffix_offset, suffix_len);
1373                 suffix = char_array_get_string(context->suffix_phrase);
1374                 log_debug("got suffix: %s\n", suffix);
1375                 feature_array_add(features, 2, "suffix", suffix);
1376             }
1377         }
1378 
1379         bool is_hyphenated = false;
1380 
1381         // For rare words and unknown words (so unknown words can benefit from statistics of known but super common words)
1382         if (word_freq <= parser->options.rare_word_threshold && is_word) {
1383             log_debug("rare word: %s\n", word);
1384             bool ngrams_added = false;
1385             size_t hyphenated_word_offset = 0;
1386             bool first_sub_token = true;
1387             bool last_sub_token = true;
1388 
1389             ssize_t next_hyphen_index;
1390 
1391             token_array_clear(context->sub_tokens);
1392 
1393             do {
1394                 next_hyphen_index = string_next_hyphen_index(word + hyphenated_word_offset, word_len - hyphenated_word_offset);
1395                 char *sub_word = word;
1396                 size_t sub_word_len = word_len;
1397 
1398                 if (next_hyphen_index >= 0) {
1399                     is_hyphenated = true;
1400                     char_array_clear(context->sub_token);
1401                     char_array_add_len(context->sub_token, word + hyphenated_word_offset, next_hyphen_index);
1402                     token_array_push(context->sub_tokens, (token_t){hyphenated_word_offset, next_hyphen_index, token.type});
1403                     sub_word = char_array_get_string(context->sub_token);
1404                     sub_word_len = context->sub_token->n;
1405                     last_sub_token = false;
1406                 } else if (is_hyphenated) {
1407                     char_array_clear(context->sub_token);
1408                     char_array_add_len(context->sub_token, word + hyphenated_word_offset, word_len - hyphenated_word_offset);
1409                     sub_word = char_array_get_string(context->sub_token);
1410                     sub_word_len = context->sub_token->n;
1411                     last_sub_token = true;
1412                 }
1413 
1414                 bool add_prefix = first_sub_token && prefix_len < sub_word_len;
1415                 bool add_suffix = last_sub_token && suffix_len < sub_word_len;
1416 
1417                 uint32_t sub_word_freq = word_freq;
1418                 if (is_hyphenated) {
1419                     sub_word_freq = word_vocab_frequency(parser, sub_word);
1420                     if (sub_word_freq > 0) {
1421                         feature_array_add(features, 2, "sub_word", sub_word);
1422                     }
1423 
1424                 }
1425 
1426                 if (sub_word_freq <= parser->options.rare_word_threshold) {
1427                     // prefix/suffix features from 3-6 characters
1428                     for (size_t ng = 3; ng <= 6; ng++) {
1429                         ngrams_added = add_ngram_features(features, is_hyphenated ? "sub_word" : "word", context->ngrams, sub_word, ng, add_prefix ? prefix_len : 0, add_suffix ? suffix_len : 0);
1430                     }
1431                 }
1432 
1433                 hyphenated_word_offset += next_hyphen_index + 1;
1434                 first_sub_token = false;
1435 
1436                 log_debug("next_hyphen_index=%zd\n", next_hyphen_index);
1437             } while(next_hyphen_index >= 0);
1438 
1439         }
1440 
1441         if (word_freq > 0) {
1442             // The individual word
1443             feature_array_add(features, 2, "word", word);
1444         } else {
1445             log_debug("word not in vocab: %s\n", word);
1446 
1447             is_unknown = true;
1448             word = (token.type != NUMERIC && token.type != IDEOGRAPHIC_NUMBER) ? UNKNOWN_WORD : UNKNOWN_NUMERIC;
1449 
1450             if (is_word_token(token.type)) {
1451                 is_unknown_word = true;
1452             }
1453         }
1454 
1455         if (idx == 0 && !is_unknown_word) {
1456             feature_array_add(features, 2, "first word", word);
1457             //feature_array_add(features, 3, "first word+next word", word, next_word);
1458         }
1459 
1460     } else if (component_phrase_string != NULL) {
1461         word = component_phrase_string;
1462     } else if (phrase_string != NULL) {
1463         word = phrase_string;
1464     }
1465 
1466     if (last_index == idx - 1) {
1467         // Previous tag and current word
1468         feature_array_add(prev_tag_features, 2, "word", word);
1469 
1470         // Previous two tags and current word
1471         if (parser->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
1472             // In the CRF this is accounted for by the transition weights
1473             // so only need it for the averaged perceptron
1474             feature_array_add(prev_tag_features, 1, "trans");
1475 
1476             // Averaged perceptron uses two tags of history, CRF uses one
1477             feature_array_add(prev2_tag_features, 2, "word", word);
1478             feature_array_add(prev2_tag_features, 1, "trans");
1479         }
1480     }
1481 
1482     if (last_index >= 0) {
1483         address_parser_phrase_t prev_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, last_index, false);
1484         char *prev_word = prev_word_or_phrase.str;
1485 
1486         if (is_plain_word_phrase_type(prev_word_or_phrase.type)) {
1487             uint32_t prev_word_freq = word_vocab_frequency(parser, prev_word);
1488             token_t prev_token = tokenized->tokens->a[last_index];
1489             bool prev_token_numeric = is_numeric_token(prev_token.type);
1490             if (prev_word_freq == 0) {
1491                 prev_word = !prev_token_numeric ? UNKNOWN_WORD : UNKNOWN_NUMERIC;
1492             }
1493         }
1494 
1495         // Previous word
1496         feature_array_add(features, 2, "prev word", prev_word);
1497 
1498 
1499         if (last_index == idx - 1) {
1500             feature_array_add(prev_tag_features, 2, "prev word", prev_word);
1501         }
1502 
1503         // Previous word and current word
1504         feature_array_add(features, 3, "prev word+word", prev_word, word);
1505     }
1506 
1507     if (next_index < num_tokens) {
1508         address_parser_phrase_t next_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, next_index, false);
1509         char *next_word = next_word_or_phrase.str;
1510         size_t next_word_len = 1;
1511 
1512         if (is_plain_word_phrase_type(next_word_or_phrase.type)) {
1513             uint32_t next_word_freq = word_vocab_frequency(parser, next_word);
1514             token_t next_token = tokenized->tokens->a[next_index];
1515             bool next_token_numeric = is_numeric_token(next_token.type);
1516             if (next_word_freq == 0) {
1517                 next_word = !next_token_numeric ? UNKNOWN_WORD : UNKNOWN_NUMERIC;
1518             }
1519         } else {
1520             next_word_len = next_word_or_phrase.phrase.len;
1521         }
1522 
1523         // Next word e.g. if the current word is unknown and the next word is "street"
1524         feature_array_add(features, 2, "next word", next_word);
1525 
1526         // Current word and next word
1527         feature_array_add(features, 3, "word+next word", word, next_word);
1528 
1529         // Prev tag, current word and next word
1530         //feature_array_add(features, 4, "prev tag+word+next word", prev || "START", word, next_word);
1531 
1532         // Venue names ("house") are almost always at the beginning of the string
1533         // and often contain out-of-vocabulary words. Consider a case like "Barboncino 781 Franklin Ave".
        // The features available to classify "Barboncino" are going to be unknown word features (n-grams),
1535         // next word features (unknown word where next word=DDD is just as likely to be a street)
1536         // and no previous tags of history since it's the first word. If the parser predicts the
1537         // first token correctly, it's going to have an easier time getting the rest of the sequence
1538         // correct (unknown word + prev tag was "house" is probably still part of the venue, etc.) so
1539         // we're only really worried about that first token.  This group of features, called
1540         // "long-context features" finds the relative position of the next numeric token as well
1541         // as the next street-level phrase (words like "ave", "street", etc.) in the right context.
1542         // In an English or French address, if we know there's a number somewhere to our right,
1543         // and that a word like "Ave" appears to the right of the number, it's very likely that
1544         // the current unknown word is part of a venue name. Similarly, if a venue-word like "Pizzeria"
1545         // occurred prior to the number, that would also be strong evidence that we're in a venue name.
1546         // Conversely, if we're in a Spanish address and a word like "Calle" comes before the first number
1547         // to our right, it's also likely that we're in a venue name, but we'd need to note that the
1548         // phrase we saw was "Calle" and not an English thoroughfare type.
1549 
1550         if (idx == 0 && add_word_feature && is_unknown_word) {
1551             bool seen_number = false;
1552             bool seen_phrase = false;
1553             for (uint32_t right_idx = idx + 1; right_idx < num_tokens; right_idx++) {
1554                 token_t right_token = tokens->a[right_idx];
1555 
1556                 /* Check */
1557                 address_parser_phrase_t right_context_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, right_idx, true);
1558                 address_parser_phrase_type_t right_context_phrase_type = right_context_word_or_phrase.type;
1559                 if (right_context_phrase_type != ADDRESS_PARSER_NULL_PHRASE &&
1560                     right_context_phrase_type != ADDRESS_PARSER_DICTIONARY_PHRASE &&
1561                     right_context_phrase_type != ADDRESS_PARSER_SUFFIX_PHRASE &&
1562                     right_context_phrase_type != ADDRESS_PARSER_PREFIX_PHRASE) {
1563                     continue;
1564                 }
1565                 char *right_context_word = right_context_word_or_phrase.str;
1566                 phrase_t right_context_phrase = right_context_word_or_phrase.phrase;
1567 
1568                 phrase_t suffix_phrase = context->suffix_phrases->a[right_idx];
1569 
1570                 uint32_t right_context_expansion_index;
1571                 address_expansion_value_t *right_context_expansion_value;
1572 
1573                 uint32_t right_context_components = 0;
1574                 bool right_context_name = false;
1575                 bool right_context_street = false;
1576 
1577                 if (right_context_phrase.len > 0) {
1578                     right_context_expansion_index = right_context_phrase.data;
1579                     right_context_expansion_value = address_dictionary_get_expansions(right_context_expansion_index);
1580                     right_context_components = right_context_expansion_value->components;
1581 
1582                     char *right_affix_type = NULL;
1583                     char *right_context_affix = NULL;
1584 
1585                     char *relation_to_number = seen_number ? "after number" : "before number";
1586 
1587                     seen_phrase = true;
1588 
1589                     char *right_context_word_pre_norm;
1590 
1591                      if (right_context_phrase_type == ADDRESS_PARSER_SUFFIX_PHRASE) {
1592                         right_affix_type = "suffix";
1593                         right_context_word_pre_norm = tokenized_string_get_token(tokenized, right_idx);
1594                         right_context_affix = phrase_suffix(right_context_word, strlen(right_context_word_pre_norm), right_context_phrase, context->long_context_suffix_phrase);
1595                     } else if (right_context_word_or_phrase.type == ADDRESS_PARSER_PREFIX_PHRASE) {
1596                         right_affix_type = "prefix";
1597                         right_context_word_pre_norm = tokenized_string_get_token(tokenized, right_idx);
1598                         right_context_affix = phrase_prefix(right_context_word, strlen(right_context_word_pre_norm), right_context_phrase, context->long_context_suffix_phrase);
1599                     }
1600 
1601                     if (right_context_components & LIBPOSTAL_ADDRESS_STREET && !(right_context_components & LIBPOSTAL_ADDRESS_NAME)) {
1602                         feature_array_add(features, 2, "first word unknown+street phrase right", relation_to_number);
1603                         feature_array_add(features, 3, "first word unknown+street phrase right", relation_to_number, right_context_word);
1604                         if (right_context_affix != NULL && right_affix_type != NULL) {
1605                             feature_array_add(features, 4, "first word unknown+street affix right", relation_to_number, right_affix_type, right_context_affix);
1606                         }
1607                         break;
1608                     } else if (right_context_components & LIBPOSTAL_ADDRESS_NAME && !(right_context_components & LIBPOSTAL_ADDRESS_STREET)) {
1609                         feature_array_add(features, 2, "first word unknown+venue phrase right", relation_to_number);
1610                         feature_array_add(features, 3, "first word unknown+venue phrase right", relation_to_number, right_context_word);
1611                         if (right_context_affix != NULL && right_affix_type != NULL) {
1612                             feature_array_add(features, 4, "first word unknown+venue affix right", relation_to_number, right_affix_type, right_context_affix);
1613                         }
1614                     } else if (right_context_components & (LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET)) {
1615                         if (seen_number) {
1616                             feature_array_add(features, 1, "first word unknown+number+ambiguous phrase right");
1617                             feature_array_add(features, 2, "first word unknown+number+ambiguous phrase right", right_context_word);
1618                             if (right_context_affix != NULL && right_affix_type != NULL) {
1619                                 feature_array_add(features, 3, "first word unknown+number+ambiguous affix right", right_affix_type, right_context_affix);
1620                             }
1621                             break;
1622                         } else {
1623                             continue;
1624                         }
1625                     }
1626 
1627                     if (seen_number) break;
1628                 }
1629 
1630                 if (is_numeric_token(right_token.type)) {
1631                     seen_number = true;
1632                     char *relation_to_phrase = seen_phrase ? "after phrase" : "before phrase";
1633                     feature_array_add(features, 2, "first word unknown+number right", relation_to_phrase);
1634                     feature_array_add(features, 3, "first word unknown+number right", relation_to_phrase, right_context_word);
1635                     if (seen_phrase) break;
1636                 }
1637             }
1638         }
1639     }
1640 
1641     return true;
1642 
1643 }
1644 
/*
 * Run sequence prediction with whichever tagger matches the parser's model type.
 *
 * self:             parser whose model_type selects the tagger and whose
 *                   options.print_features flag is forwarded to it
 * context:          supplies the feature arrays handed to the tagger
 * token_labels:     passed through to the tagger
 * feature_function: passed through to the tagger
 * tokenized_str:    passed through to the tagger
 *
 * Returns the selected tagger's result, or false (after logging an error)
 * when the model type is neither averaged perceptron nor CRF.
 */
bool address_parser_predict(address_parser_t *self, address_parser_context_t *context, cstring_array *token_labels, tagger_feature_function feature_function, tokenized_string_t *tokenized_str) {
    if (self->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
        // The averaged perceptron additionally receives the prev2 tag features.
        return averaged_perceptron_tagger_predict(self->model.ap,
                                                  self,
                                                  context,
                                                  context->features,
                                                  context->prev_tag_features,
                                                  context->prev2_tag_features,
                                                  token_labels,
                                                  feature_function,
                                                  tokenized_str,
                                                  self->options.print_features);
    }

    if (self->model_type == ADDRESS_PARSER_TYPE_CRF) {
        return crf_tagger_predict(self->model.crf,
                                  self,
                                  context,
                                  context->features,
                                  context->prev_tag_features,
                                  token_labels,
                                  feature_function,
                                  tokenized_str,
                                  self->options.print_features);
    }

    log_error("Parser has unknown model type\n");
    return false;
}
1655 
address_parser_response_new(void)1656 libpostal_address_parser_response_t *address_parser_response_new(void) {
1657     libpostal_address_parser_response_t *response = malloc(sizeof(libpostal_address_parser_response_t));
1658     return response;
1659 }
1660 
address_parser_parse(char * address,char * language,char * country)1661 libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country) {
1662     if (address == NULL) return NULL;
1663 
1664     address_parser_t *parser = get_address_parser();
1665     if (parser == NULL || parser->context == NULL) {
1666         log_error("parser is not setup, call libpostal_setup_address_parser()\n");
1667         return NULL;
1668     }
1669 
1670     address_parser_context_t *context = parser->context;
1671 
1672     char *normalized = address_parser_normalize_string(address);
1673     bool is_normalized = normalized != NULL;
1674     if (!is_normalized) {
1675         normalized = address;
1676     }
1677 
1678     token_array *tokens = tokenize(normalized);
1679 
1680     tokenized_string_t *tokenized_str = tokenized_string_new_from_str_size(normalized, strlen(normalized), tokens->n);
1681 
1682     for (size_t i = 0; i < tokens->n; i++) {
1683         token_t token = tokens->a[i];
1684         if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
1685             uint32_array_pop(context->separators);
1686             uint32_array_push(context->separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
1687             continue;
1688         } else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
1689             continue;
1690         }
1691 
1692         tokenized_string_add_token(tokenized_str, (const char *)normalized, token.len, token.type, token.offset);
1693         uint32_array_push(context->separators, ADDRESS_SEPARATOR_NONE);
1694     }
1695 
1696     // This parser was trained without knowing language/country.
1697     // If at some point we build country-specific/language-specific
1698     // parsers, these parameters could be used to select a model.
1699     // The language parameter does technically control which dictionaries
1700     // are searched at the street level. It's possible with e.g. a phrase
1701     // like "de", which can be either the German country code or a stopword
1702     // in Spanish, that even in the case where it's being used as a country code,
1703     // it's possible that both the street-level and admin-level phrase features
1704     // may be working together as a kind of intercept. Depriving the model
1705     // of the street-level phrase features by passing in a known language
1706     // may change the decision threshold so explicitly ignore these
1707     // options until there's a use for them (country-specific or language-specific
1708     // parser models).
1709 
1710     language = NULL;
1711     country = NULL;
1712     address_parser_context_fill(context, parser, tokenized_str, language, country);
1713 
1714     libpostal_address_parser_response_t *response = NULL;
1715 
1716     // If the whole input string is a single known phrase at the SUBURB level or higher, bypass sequence prediction altogether
1717     phrase_t only_phrase = NULL_PHRASE;
1718     token_t token, prev_token;
1719     bool is_postal = false;
1720     if (context->component_phrases->n == 1) {
1721         only_phrase = context->component_phrases->a[0];
1722     } else if (context->postal_code_phrases->n == 1) {
1723         only_phrase = context->postal_code_phrases->a[0];
1724         is_postal = true;
1725     }
1726 
1727     if (only_phrase.start == 0 && only_phrase.len == tokenized_str->tokens->n && only_phrase.len > 0) {
1728         uint32_t most_common = 0;
1729 
1730         char *label = NULL;
1731 
1732         if (!is_postal) {
1733             uint32_t component_phrase_index = only_phrase.data;
1734             address_parser_types_t types = parser->phrase_types->a[component_phrase_index];
1735             most_common = types.most_common;
1736 
1737             if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) {
1738                 label = strdup(ADDRESS_PARSER_LABEL_CITY);
1739             } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE) {
1740                 label = strdup(ADDRESS_PARSER_LABEL_STATE);
1741             } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY) {
1742                 label = strdup(ADDRESS_PARSER_LABEL_COUNTRY);
1743             } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT) {
1744                 label = strdup(ADDRESS_PARSER_LABEL_STATE_DISTRICT);
1745             } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION) {
1746                 label = strdup(ADDRESS_PARSER_LABEL_COUNTRY_REGION);
1747             } else if (most_common == ADDRESS_PARSER_BOUNDARY_SUBURB) {
1748                 label = strdup(ADDRESS_PARSER_LABEL_SUBURB);
1749             } else if (most_common == ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT) {
1750                 label = strdup(ADDRESS_PARSER_LABEL_CITY_DISTRICT);
1751             } else if (most_common == ADDRESS_PARSER_BOUNDARY_WORLD_REGION) {
1752                 label = strdup(ADDRESS_PARSER_LABEL_WORLD_REGION);
1753             }
1754         } else {
1755             label = strdup(ADDRESS_PARSER_LABEL_POSTAL_CODE);
1756         }
1757 
1758         // Implicit: if most_common is not one of the above, ignore and parse regularly
1759         if (label != NULL) {
1760             char **single_label = malloc(sizeof(char *));
1761             single_label[0] = label;
1762             char **single_component = malloc(sizeof(char *));
1763             single_component[0] = strdup(normalized);
1764 
1765             response = address_parser_response_new();
1766 
1767             response->num_components = 1;
1768             response->labels = single_label;
1769             response->components = single_component;
1770 
1771             token_array_destroy(tokens);
1772             tokenized_string_destroy(tokenized_str);
1773 
1774             if (is_normalized) {
1775                 free(normalized);
1776             }
1777             return response;
1778         }
1779     }
1780 
1781     cstring_array *token_labels = cstring_array_new_size(tokens->n);
1782 
1783     char *prev_label = NULL;
1784 
1785     bool prediction_success = address_parser_predict(parser, context, token_labels, &address_parser_features, tokenized_str);
1786 
1787     if (prediction_success) {
1788         response = address_parser_response_new();
1789 
1790         size_t num_strings = cstring_array_num_strings(tokenized_str->strings);
1791 
1792         cstring_array *labels = cstring_array_new_size(num_strings);
1793         cstring_array *components = cstring_array_new_size(strlen(address) + num_strings);
1794 
1795         token_t *tokens = tokenized_str->tokens->a;
1796 
1797         for (size_t i = 0; i < num_strings; i++) {
1798             char *str = tokenized_string_get_token(tokenized_str, i);
1799 
1800             char *label = cstring_array_get_string(token_labels, i);
1801 
1802             if (prev_label == NULL || strcmp(label, prev_label) != 0) {
1803                 cstring_array_add_string(labels, label);
1804                 cstring_array_start_token(components);
1805 
1806             }
1807 
1808             if (prev_label != NULL && strcmp(label, prev_label) == 0) {
1809                 token = tokens[i];
1810                 prev_token = tokens[i - 1];
1811                 if (token.offset > prev_token.offset + prev_token.len) {
1812                     cstring_array_cat_string(components, " ");
1813                 }
1814                 cstring_array_cat_string(components, str);
1815             } else {
1816                 cstring_array_append_string(components, str);
1817                 cstring_array_terminate(components);
1818             }
1819 
1820             prev_label = label;
1821         }
1822         response->num_components = cstring_array_num_strings(components);
1823         response->components = cstring_array_to_strings(components);
1824         response->labels = cstring_array_to_strings(labels);
1825 
1826     } else {
1827         log_error("Error in prediction\n");
1828     }
1829 
1830     token_array_destroy(tokens);
1831     tokenized_string_destroy(tokenized_str);
1832     cstring_array_destroy(token_labels);
1833 
1834     if (is_normalized) {
1835         free(normalized);
1836     }
1837 
1838     return response;
1839 }
1840 
1841 
1842 
address_parser_module_setup(char * dir)1843 bool address_parser_module_setup(char *dir) {
1844     if (parser == NULL) {
1845         return address_parser_load(dir);
1846     }
1847     return true;
1848 }
1849 
address_parser_module_teardown(void)1850 void address_parser_module_teardown(void) {
1851     if (parser != NULL) {
1852         address_parser_destroy(parser);
1853     }
1854     parser = NULL;
1855 }
1856