1 #include "address_parser.h"
2 #include "address_dictionary.h"
3 #include "features.h"
4 #include "ngrams.h"
5 #include "scanner.h"
6
7 #include "graph_builder.h"
8
9 #include "klib/ksort.h"
10 #include "log/log.h"
11
/* File names of the serialized parser components; each is joined onto the
   model directory when the parser is saved or loaded. */
#define ADDRESS_PARSER_MODEL_FILENAME "address_parser.dat"
#define ADDRESS_PARSER_MODEL_FILENAME_CRF "address_parser_crf.dat"
#define ADDRESS_PARSER_VOCAB_FILENAME "address_parser_vocab.trie"
#define ADDRESS_PARSER_PHRASE_FILENAME "address_parser_phrases.dat"
#define ADDRESS_PARSER_POSTAL_CODES_FILENAME "address_parser_postal_codes.dat"

/* Placeholder strings. NOTE(review): presumably substituted for rare or
   unseen tokens during feature extraction; confirm at the point of use. */
#define UNKNOWN_WORD "UNKNOWN"
#define UNKNOWN_NUMERIC "UNKNOWN_NUMERIC"

/* Default value for parser_options_t.rare_word_threshold. */
#define DEFAULT_RARE_WORD_THRESHOLD 50

/* Module-wide parser instance: assigned by address_parser_load() and handed
   out by get_address_parser(). NULL until a load succeeds. */
static address_parser_t *parser = NULL;
24
/* Kind of phrase a token was resolved to; the NULL value means the token
   stands on its own as a plain word. */
typedef enum {
    ADDRESS_PARSER_NULL_PHRASE       = 0,
    ADDRESS_PARSER_DICTIONARY_PHRASE = 1,
    ADDRESS_PARSER_COMPONENT_PHRASE  = 2,
    ADDRESS_PARSER_PREFIX_PHRASE     = 3,
    ADDRESS_PARSER_SUFFIX_PHRASE     = 4
} address_parser_phrase_type_t;
32
33 static parser_options_t PARSER_DEFAULT_OPTIONS = {
34 .rare_word_threshold = DEFAULT_RARE_WORD_THRESHOLD,
35 .print_features = false
36 };
37
address_parser_new_options(parser_options_t options)38 address_parser_t *address_parser_new_options(parser_options_t options) {
39 address_parser_t *parser = calloc(1, sizeof(address_parser_t));
40 parser->options = options;
41 return parser;
42 }
43
address_parser_new(void)44 address_parser_t *address_parser_new(void) {
45 return address_parser_new_options(PARSER_DEFAULT_OPTIONS);
46 }
47
get_address_parser(void)48 address_parser_t *get_address_parser(void) {
49 return parser;
50 }
51
address_parser_print_features(bool print_features)52 bool address_parser_print_features(bool print_features) {
53 if (parser == NULL) return false;
54
55 parser->options.print_features = print_features;
56 return true;
57 }
58
/* Writes the phrase trie followed by the phrase-type table (a uint64 count,
   then one uint32 per entry) to `filename`. The file is always closed. */
static bool address_parser_write_phrases_file(address_parser_t *self, const char *filename) {
    /* Validate before fopen so a doomed save does not truncate an existing file. */
    if (self->phrases == NULL || self->phrase_types == NULL) {
        return false;
    }

    /* Binary mode, matching the "rb" used by address_parser_load(). */
    FILE *f = fopen(filename, "wb+");
    if (f == NULL) {
        return false;
    }

    size_t num_phrase_types = self->phrase_types->n;

    bool ok = trie_write(self->phrases, f);
    ok = ok && file_write_uint64(f, (uint64_t)num_phrase_types);

    for (size_t i = 0; ok && i < num_phrase_types; i++) {
        address_parser_types_t phrase_type_value = self->phrase_types->a[i];
        ok = file_write_uint32(f, phrase_type_value.value);
    }

    /* fclose flushes buffered data, so its failure means the file is incomplete. */
    if (fclose(f) != 0) {
        ok = false;
    }
    return ok;
}

/* Writes the postal-code trie followed by the postal-code context graph to
   `filename`. The file is always closed. */
static bool address_parser_write_postal_codes_file(address_parser_t *self, const char *filename) {
    if (self->postal_codes == NULL || self->postal_code_contexts == NULL) {
        return false;
    }

    FILE *f = fopen(filename, "wb+");
    if (f == NULL) {
        return false;
    }

    bool ok = trie_write(self->postal_codes, f) &&
              graph_write(self->postal_code_contexts, f);

    if (fclose(f) != 0) {
        ok = false;
    }
    return ok;
}

/*
 * Serializes the parser into `output_dir` as four files: the model (averaged
 * perceptron or CRF, chosen by self->model_type), the vocabulary trie, the
 * phrases file and the postal-codes file.
 *
 * Returns true only if every component was written. Returns false when
 * `self` or `output_dir` is NULL, when the model type is unknown, or when any
 * write fails. Files written before the failing step are left in place.
 */
bool address_parser_save(address_parser_t *self, char *output_dir) {
    if (self == NULL || output_dir == NULL) return false;

    char *model_filename = NULL;
    if (self->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
        model_filename = ADDRESS_PARSER_MODEL_FILENAME;
    } else if (self->model_type == ADDRESS_PARSER_TYPE_CRF) {
        model_filename = ADDRESS_PARSER_MODEL_FILENAME_CRF;
    } else {
        return false;
    }

    /* One scratch buffer is reused for every path; all exits go through
       exit_address_parser_save so it is released exactly once. */
    char_array *path = char_array_new_size(strlen(output_dir));
    if (path == NULL) {
        return false;
    }

    bool saved = false;

    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, model_filename);
    char *model_path = char_array_get_string(path);

    if (self->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
        if (!averaged_perceptron_save(self->model.ap, model_path)) {
            log_info("Error in averaged_perceptron_save\n");
            goto exit_address_parser_save;
        }
    } else if (self->model_type == ADDRESS_PARSER_TYPE_CRF) {
        if (!crf_save(self->model.crf, model_path)) {
            log_info("Error in crf_save\n");
            goto exit_address_parser_save;
        }
    }

    char_array_clear(path);
    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_VOCAB_FILENAME);
    if (!trie_save(self->vocab, char_array_get_string(path))) {
        goto exit_address_parser_save;
    }

    char_array_clear(path);
    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_PHRASE_FILENAME);
    if (!address_parser_write_phrases_file(self, char_array_get_string(path))) {
        goto exit_address_parser_save;
    }

    char_array_clear(path);
    char_array_add_joined(path, PATH_SEPARATOR, true, 2, output_dir, ADDRESS_PARSER_POSTAL_CODES_FILENAME);
    if (!address_parser_write_postal_codes_file(self, char_array_get_string(path))) {
        goto exit_address_parser_save;
    }

    saved = true;

exit_address_parser_save:
    char_array_destroy(path);
    return saved;
}
159
/* True when the parser's postal-code context graph has an edge from
   `postal_code_id` to `admin_id`. */
static bool postal_code_context_exists(address_parser_t *self, uint32_t postal_code_id, uint32_t admin_id) {
    return graph_has_edge(self->postal_code_contexts, postal_code_id, admin_id);
}
165
address_parser_load(char * dir)166 bool address_parser_load(char *dir) {
167 if (parser != NULL) return false;
168 if (dir == NULL) {
169 dir = LIBPOSTAL_ADDRESS_PARSER_DIR;
170 }
171
172 char_array *path = char_array_new_size(strlen(dir));
173
174 char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_MODEL_FILENAME);
175 char *model_path = char_array_get_string(path);
176
177 if (file_exists(model_path)) {
178 averaged_perceptron_t *ap_model = averaged_perceptron_load(model_path);
179 if (ap_model != NULL) {
180 parser = address_parser_new();
181 parser->model_type = ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON;
182 parser->model.ap = ap_model;
183 } else {
184 char_array_destroy(path);
185 log_error("Averaged perceptron model could not be loaded\n");
186 return false;
187 }
188 } else {
189 model_path = NULL;
190 }
191
192 if (model_path == NULL) {
193 char_array_clear(path);
194 char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_MODEL_FILENAME_CRF);
195 model_path = char_array_get_string(path);
196
197 if (file_exists(model_path)) {
198 crf_t *crf_model = crf_load(model_path);
199 if (crf_model != NULL) {
200 parser = address_parser_new();
201 parser->model_type = ADDRESS_PARSER_TYPE_CRF;
202 parser->model.crf = crf_model;
203 } else {
204 char_array_destroy(path);
205 log_error("Averaged perceptron model could not be loaded\n");
206 return false;
207 }
208 } else {
209 model_path = NULL;
210 }
211 }
212
213 if (parser == NULL) {
214 char_array_destroy(path);
215 log_error("Could not find parser model file of known type\n");
216 return false;
217 }
218
219 char_array_clear(path);
220
221 char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_VOCAB_FILENAME);
222
223 char *vocab_path = char_array_get_string(path);
224
225 trie_t *vocab = trie_load(vocab_path);
226
227 if (vocab == NULL) {
228 goto exit_address_parser_created;
229 }
230
231 parser->vocab = vocab;
232
233 char_array_clear(path);
234
235 char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_PHRASE_FILENAME);
236 char *phrases_path = char_array_get_string(path);
237
238 FILE *phrases_file = fopen(phrases_path, "rb");
239 if (phrases_file == NULL) {
240 goto exit_address_parser_created;
241 }
242
243 parser->phrases = trie_read(phrases_file);
244 if (parser->phrases == NULL) {
245 goto exit_address_parser_created;
246 }
247
248 uint64_t num_phrase_types;
249
250 if (!file_read_uint64(phrases_file, &num_phrase_types)) {
251 goto exit_address_parser_created;
252 }
253
254 parser->phrase_types = address_parser_types_array_new_size(num_phrase_types);
255
256 uint32_array *phrase_type_values = uint32_array_new_size(num_phrase_types);
257 if (!file_read_uint32_array(phrases_file, phrase_type_values->a, num_phrase_types)) {
258 uint32_array_destroy(phrase_type_values);
259 goto exit_address_parser_created;
260 }
261 phrase_type_values->n = num_phrase_types;
262
263 for (size_t i = 0; i < phrase_type_values->n; i++) {
264 uint32_t phrase_type_value = phrase_type_values->a[i];
265 address_parser_types_t phrase_type = {.value = phrase_type_value};
266 address_parser_types_array_push(parser->phrase_types, phrase_type);
267 }
268
269 uint32_array_destroy(phrase_type_values);
270
271 fclose(phrases_file);
272
273 char_array_clear(path);
274
275 char_array_add_joined(path, PATH_SEPARATOR, true, 2, dir, ADDRESS_PARSER_POSTAL_CODES_FILENAME);
276
277 char *postal_codes_path = char_array_get_string(path);
278
279 FILE *postal_codes_file = fopen(postal_codes_path, "rb");
280 if (postal_codes_file == NULL) {
281 goto exit_address_parser_created;
282 }
283
284 parser->postal_codes = trie_read(postal_codes_file);
285 if (parser->postal_codes == NULL) {
286 goto exit_address_parser_created;
287 }
288
289 parser->postal_code_contexts = graph_read(postal_codes_file);
290
291 if (parser->postal_code_contexts == NULL) {
292 goto exit_address_parser_created;
293 }
294
295 fclose(postal_codes_file);
296
297 parser->context = address_parser_context_new();
298 if (parser->context == NULL) {
299 goto exit_address_parser_created;
300 }
301
302 char_array_destroy(path);
303 return true;
304
305 exit_address_parser_created:
306 address_parser_destroy(parser);
307 char_array_destroy(path);
308 return false;
309 }
310
/*
 * Releases a parser and everything it owns: the model (according to
 * model_type), the context, the vocabulary and phrase tries, the phrase-type
 * array, the postal-code trie and the postal-code context graph.
 * Passing NULL is a no-op.
 */
void address_parser_destroy(address_parser_t *self) {
    if (self == NULL) return;

    switch (self->model_type) {
        case ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON:
            if (self->model.ap != NULL) {
                averaged_perceptron_destroy(self->model.ap);
            }
            break;
        case ADDRESS_PARSER_TYPE_CRF:
            if (self->model.crf != NULL) {
                crf_destroy(self->model.crf);
            }
            break;
        default:
            /* Unknown model type: there is no model to release. */
            break;
    }

    if (self->context != NULL) address_parser_context_destroy(self->context);
    if (self->vocab != NULL) trie_destroy(self->vocab);
    if (self->phrases != NULL) trie_destroy(self->phrases);
    if (self->phrase_types != NULL) address_parser_types_array_destroy(self->phrase_types);
    if (self->postal_codes != NULL) trie_destroy(self->postal_codes);
    if (self->postal_code_contexts != NULL) graph_destroy(self->postal_code_contexts);

    free(self);
}
346
/*
 * Looks `word` up in the parser's vocabulary trie.
 *
 * Returns the value stored for the word, or 0 when the lookup reports that
 * the key is absent.
 */
static inline uint32_t word_vocab_frequency(address_parser_t *parser, char *word) {
    uint32_t count = 0;

    /* Return 0 explicitly on a miss instead of relying on the lookup leaving
       `count` untouched. */
    if (!trie_get_data(parser->vocab, word, &count)) {
        return 0;
    }

    return count;
}
352
/* Normalizes `token` (a span of `str`) with the parser's standard token
   options and appends the result to `array`. */
inline void address_parser_normalize_token(cstring_array *array, char *str, token_t token) {
    normalize_token(array,
                    str,
                    token,
                    ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS);
}
356
/* Same as address_parser_normalize_token() but uses the admin-token option
   set, which feeds the component and postal-code phrase searches. */
static inline void address_parser_normalize_phrase_token(cstring_array *array, char *str, token_t token) {
    normalize_token(array,
                    str,
                    token,
                    ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS);
}
360
address_parser_normalize_string(char * str)361 inline char *address_parser_normalize_string(char *str) {
362 return normalize_string_latin(str, strlen(str), ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS);
363 }
364
365
/* Calls destroy_fn on self->member when the member is set. Local to
   address_parser_context_destroy(). */
#define CONTEXT_DESTROY_MEMBER(member, destroy_fn)  \
    do {                                            \
        if (self->member != NULL) {                 \
            destroy_fn(self->member);               \
        }                                           \
    } while (0)

/*
 * Releases every buffer owned by the context and then the context itself.
 * Members that are NULL are skipped; passing a NULL context is a no-op.
 * The `language` and `country` pointers are not freed here.
 */
void address_parser_context_destroy(address_parser_context_t *self) {
    if (self == NULL) return;

    /* Phrase scratch buffers */
    CONTEXT_DESTROY_MEMBER(phrase, char_array_destroy);
    CONTEXT_DESTROY_MEMBER(context_phrase, char_array_destroy);
    CONTEXT_DESTROY_MEMBER(long_context_phrase, char_array_destroy);

    CONTEXT_DESTROY_MEMBER(component_phrase, char_array_destroy);
    CONTEXT_DESTROY_MEMBER(context_component_phrase, char_array_destroy);
    CONTEXT_DESTROY_MEMBER(long_context_component_phrase, char_array_destroy);

    CONTEXT_DESTROY_MEMBER(prefix_phrase, char_array_destroy);
    CONTEXT_DESTROY_MEMBER(context_prefix_phrase, char_array_destroy);
    CONTEXT_DESTROY_MEMBER(long_context_prefix_phrase, char_array_destroy);

    CONTEXT_DESTROY_MEMBER(suffix_phrase, char_array_destroy);
    CONTEXT_DESTROY_MEMBER(context_suffix_phrase, char_array_destroy);
    CONTEXT_DESTROY_MEMBER(long_context_suffix_phrase, char_array_destroy);

    /* N-gram and sub-token buffers */
    CONTEXT_DESTROY_MEMBER(ngrams, cstring_array_destroy);
    CONTEXT_DESTROY_MEMBER(sub_token, char_array_destroy);
    CONTEXT_DESTROY_MEMBER(sub_tokens, token_array_destroy);
    CONTEXT_DESTROY_MEMBER(separators, uint32_array_destroy);

    /* Normalized strings and tokens */
    CONTEXT_DESTROY_MEMBER(normalized, cstring_array_destroy);
    CONTEXT_DESTROY_MEMBER(normalized_tokens, token_array_destroy);
    CONTEXT_DESTROY_MEMBER(normalized_admin, cstring_array_destroy);
    CONTEXT_DESTROY_MEMBER(normalized_admin_tokens, token_array_destroy);

    /* Feature buffers */
    CONTEXT_DESTROY_MEMBER(features, cstring_array_destroy);
    CONTEXT_DESTROY_MEMBER(prev_tag_features, cstring_array_destroy);
    CONTEXT_DESTROY_MEMBER(prev2_tag_features, cstring_array_destroy);

    CONTEXT_DESTROY_MEMBER(tokenized_str, tokenized_string_destroy);

    /* Phrase matches and per-token memberships */
    CONTEXT_DESTROY_MEMBER(address_dictionary_phrases, phrase_array_destroy);
    CONTEXT_DESTROY_MEMBER(address_phrase_memberships, int64_array_destroy);
    CONTEXT_DESTROY_MEMBER(component_phrases, phrase_array_destroy);
    CONTEXT_DESTROY_MEMBER(component_phrase_memberships, int64_array_destroy);
    CONTEXT_DESTROY_MEMBER(postal_code_phrases, phrase_array_destroy);
    CONTEXT_DESTROY_MEMBER(postal_code_phrase_memberships, int64_array_destroy);
    CONTEXT_DESTROY_MEMBER(prefix_phrases, phrase_array_destroy);
    CONTEXT_DESTROY_MEMBER(suffix_phrases, phrase_array_destroy);

    free(self);
}

#undef CONTEXT_DESTROY_MEMBER
499
address_parser_context_new(void)500 address_parser_context_t *address_parser_context_new(void) {
501 address_parser_context_t *context = malloc(sizeof(address_parser_context_t));
502
503 if (context == NULL) return NULL;
504
505 context->language = NULL;
506 context->country = NULL;
507
508 context->phrase = char_array_new();
509 if (context->phrase == NULL) {
510 goto exit_address_parser_context_allocated;
511 }
512
513 context->context_phrase = char_array_new();
514 if (context->context_phrase == NULL) {
515 goto exit_address_parser_context_allocated;
516 }
517
518 context->long_context_phrase = char_array_new();
519 if (context->long_context_phrase == NULL) {
520 goto exit_address_parser_context_allocated;
521 }
522
523 context->component_phrase = char_array_new();
524 if (context->component_phrase == NULL) {
525 goto exit_address_parser_context_allocated;
526 }
527
528 context->context_component_phrase = char_array_new();
529 if (context->context_component_phrase == NULL) {
530 goto exit_address_parser_context_allocated;
531 }
532
533 context->long_context_component_phrase = char_array_new();
534 if (context->long_context_component_phrase == NULL) {
535 goto exit_address_parser_context_allocated;
536 }
537
538 context->prefix_phrase = char_array_new();
539 if (context->prefix_phrase == NULL) {
540 goto exit_address_parser_context_allocated;
541 }
542
543 context->context_prefix_phrase = char_array_new();
544 if (context->context_prefix_phrase == NULL) {
545 goto exit_address_parser_context_allocated;
546 }
547
548 context->long_context_prefix_phrase = char_array_new();
549 if (context->long_context_prefix_phrase == NULL) {
550 goto exit_address_parser_context_allocated;
551 }
552
553 context->suffix_phrase = char_array_new();
554 if (context->suffix_phrase == NULL) {
555 goto exit_address_parser_context_allocated;
556 }
557
558 context->context_suffix_phrase = char_array_new();
559 if (context->context_suffix_phrase == NULL) {
560 goto exit_address_parser_context_allocated;
561 }
562
563 context->long_context_suffix_phrase = char_array_new();
564 if (context->long_context_suffix_phrase == NULL) {
565 goto exit_address_parser_context_allocated;
566 }
567
568 context->ngrams = cstring_array_new();
569 if (context->ngrams == NULL) {
570 goto exit_address_parser_context_allocated;
571 }
572
573 context->sub_token = char_array_new();
574 if (context->sub_token == NULL) {
575 goto exit_address_parser_context_allocated;
576 }
577
578 context->sub_tokens = token_array_new();
579 if (context->sub_tokens == NULL) {
580 goto exit_address_parser_context_allocated;
581 }
582
583 context->separators = uint32_array_new();
584 if (context->separators == NULL) {
585 goto exit_address_parser_context_allocated;
586 }
587
588 context->normalized = cstring_array_new();
589 if (context->normalized == NULL) {
590 goto exit_address_parser_context_allocated;
591 }
592
593 context->normalized_tokens = token_array_new();
594 if (context->normalized_tokens == NULL) {
595 goto exit_address_parser_context_allocated;
596 }
597
598 context->normalized_admin = cstring_array_new();
599 if (context->normalized_admin == NULL) {
600 goto exit_address_parser_context_allocated;
601 }
602
603 context->normalized_admin_tokens = token_array_new();
604 if (context->normalized_admin_tokens == NULL) {
605 goto exit_address_parser_context_allocated;
606 }
607
608 context->features = cstring_array_new();
609 if (context->features == NULL) {
610 goto exit_address_parser_context_allocated;
611 }
612
613 context->prev_tag_features = cstring_array_new();
614 if (context->prev_tag_features == NULL) {
615 goto exit_address_parser_context_allocated;
616 }
617
618 context->prev2_tag_features = cstring_array_new();
619 if (context->prev2_tag_features == NULL) {
620 goto exit_address_parser_context_allocated;
621 }
622
623 context->tokenized_str = tokenized_string_new();
624 if (context->tokenized_str == NULL) {
625 goto exit_address_parser_context_allocated;
626 }
627
628 context->address_dictionary_phrases = phrase_array_new();
629 if (context->address_dictionary_phrases == NULL) {
630 goto exit_address_parser_context_allocated;
631 }
632
633 context->address_phrase_memberships = int64_array_new();
634 if (context->address_phrase_memberships == NULL) {
635 goto exit_address_parser_context_allocated;
636 }
637
638 context->component_phrases = phrase_array_new();
639 if (context->component_phrases == NULL) {
640 goto exit_address_parser_context_allocated;
641 }
642
643 context->component_phrase_memberships = int64_array_new();
644 if (context->component_phrase_memberships == NULL) {
645 goto exit_address_parser_context_allocated;
646 }
647
648 context->postal_code_phrases = phrase_array_new();
649 if (context->postal_code_phrases == NULL) {
650 goto exit_address_parser_context_allocated;
651 }
652
653 context->postal_code_phrase_memberships = int64_array_new();
654 if (context->postal_code_phrase_memberships == NULL) {
655 goto exit_address_parser_context_allocated;
656 }
657
658 context->prefix_phrases = phrase_array_new();
659 if (context->prefix_phrases == NULL) {
660 goto exit_address_parser_context_allocated;
661 }
662
663 context->suffix_phrases = phrase_array_new();
664 if (context->suffix_phrases == NULL) {
665 goto exit_address_parser_context_allocated;
666 }
667
668 return context;
669
670 exit_address_parser_context_allocated:
671 address_parser_context_destroy(context);
672 return NULL;
673 }
674
/*
 * A component phrase is valid when at least one of its tokens is not purely
 * digits (as judged by string_is_digit).
 *
 * `strings` holds one string per token; the phrase covers token indices
 * [phrase.start, phrase.start + phrase.len). An empty phrase is not valid.
 */
bool is_valid_component_phrase(cstring_array *strings, phrase_t phrase) {
    uint32_t idx = phrase.start;

    while (idx < phrase.start + phrase.len) {
        char *token_str = cstring_array_get_string(strings, idx);
        if (!string_is_digit(token_str, strlen(token_str))) {
            return true;
        }
        idx++;
    }

    return false;
}
686
address_parser_context_fill(address_parser_context_t * context,address_parser_t * parser,tokenized_string_t * tokenized_str,char * language,char * country)687 void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country) {
688 uint32_t token_index;
689 char *word;
690 phrase_t phrase;
691
692 context->language = language;
693 context->country = country;
694
695 cstring_array *normalized = context->normalized;
696 token_array *normalized_tokens = context->normalized_tokens;
697 cstring_array_clear(normalized);
698 token_array_clear(normalized_tokens);
699
700 cstring_array *normalized_admin = context->normalized_admin;
701 token_array *normalized_admin_tokens = context->normalized_admin_tokens;
702 cstring_array_clear(normalized_admin);
703 token_array_clear(normalized_admin_tokens);
704
705 char *str = tokenized_str->str;
706 token_array *tokens = tokenized_str->tokens;
707
708 cstring_array_foreach(tokenized_str->strings, token_index, word, {
709 token_t token = tokens->a[token_index];
710
711 size_t token_offset = normalized->str->n;
712 address_parser_normalize_token(normalized, str, token);
713 size_t token_len;
714 if (normalized->str->n > token_offset) {
715 token_len = normalized->str->n - 1 - token_offset;
716 } else {
717 token_len = 0;
718 }
719 token_t normalized_token;
720 normalized_token.offset = token_offset;
721 normalized_token.len = token_len;
722 normalized_token.type = token.type;
723 token_array_push(normalized_tokens, normalized_token);
724
725 size_t admin_token_offset = normalized_admin->str->n;
726 address_parser_normalize_phrase_token(normalized_admin, str, token);
727 size_t admin_token_len;
728 if (normalized_admin->str->n > admin_token_offset) {
729 admin_token_len = normalized_admin->str->n - 1 - admin_token_offset;
730 } else {
731 admin_token_len = 0;
732 }
733 token_t normalized_admin_token;
734 normalized_admin_token.offset = admin_token_offset;
735 normalized_admin_token.len = admin_token_len;
736 normalized_admin_token.type = token.type;
737 token_array_push(normalized_admin_tokens, normalized_admin_token);
738 })
739
740 char *normalized_str = normalized->str->a;
741 char *normalized_str_admin = normalized_admin->str->a;
742
743 /*
744 Address dictionary phrases
745 --------------------------
746 Recognizing phrases that occur in libpostal's dictionaries.
747
748 Note: if the dictionaries are updates to try to improve the parser,
749 we'll need to retrain. This can be done without rebuilding the
750 training data (a long-running process which can take up to a week),
751 but will require running address_parser_train, the main training script.
752 */
753
754 phrase_array_clear(context->address_dictionary_phrases);
755 int64_array_clear(context->address_phrase_memberships);
756
757 phrase_array *address_dictionary_phrases = context->address_dictionary_phrases;
758 int64_array *address_phrase_memberships = context->address_phrase_memberships;
759
760 size_t num_tokens = tokens->n;
761
762 bool have_address_phrases = search_address_dictionaries_tokens_with_phrases(normalized_str, normalized_tokens, NULL, &address_dictionary_phrases);
763 token_phrase_memberships(address_dictionary_phrases, address_phrase_memberships, num_tokens);
764
765 phrase_array_clear(context->prefix_phrases);
766 phrase_array_clear(context->suffix_phrases);
767
768 for (size_t i = 0; i < num_tokens; i++) {
769 token_t token = tokens->a[i];
770 char *word_pre_norm = tokenized_string_get_token(tokenized_str, i);
771
772 phrase_t prefix_phrase = search_address_dictionaries_prefix(word_pre_norm, token.len, NULL);
773 phrase_array_push(context->prefix_phrases, prefix_phrase);
774
775 phrase_t suffix_phrase = search_address_dictionaries_suffix(word_pre_norm, token.len, NULL);
776 phrase_array_push(context->suffix_phrases, suffix_phrase);
777 }
778
779 /*
780 Component phrases
781 -----------------
782 Precomputed phrases for cities, states, countries, etc. from the training data
783
784 Note: if the training data has lots of mislabeled examples (e.g. Brooklyn as city
785 instead of a city_district), this may cause the parser to get confused. It will
786 penalize itself for getting the wrong answer when really the underlying data
787 is simply ambiguous. In the OSM training data a lot of work has been done to
788 ensure that there's little or no systematic mislabeling. As such, other data
789 sets shouldn't be added willy-nilly unless the labels are consistent.
790 */
791
792 phrase_array_clear(context->component_phrases);
793 int64_array_clear(context->component_phrase_memberships);
794
795 phrase_array *component_phrases = context->component_phrases;
796 int64_array *component_phrase_memberships = context->component_phrase_memberships;
797
798 bool have_component_phrases = trie_search_tokens_with_phrases(parser->phrases, normalized_str_admin, normalized_admin_tokens, &component_phrases);
799 token_phrase_memberships(component_phrases, component_phrase_memberships, num_tokens);
800
801 for (size_t i = 0; i < component_phrases->n; i++) {
802 phrase_t phrase = component_phrases->a[i];
803 if (!is_valid_component_phrase(context->normalized_admin, phrase)) {
804 for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
805 component_phrase_memberships->a[j] = NULL_PHRASE_MEMBERSHIP;
806 }
807 }
808 }
809
810 phrase_array_clear(context->postal_code_phrases);
811 int64_array_clear(context->postal_code_phrase_memberships);
812
813 phrase_array *postal_code_phrases = context->postal_code_phrases;
814 int64_array *postal_code_phrase_memberships = context->postal_code_phrase_memberships;
815
816 bool have_postal_code_phrases = trie_search_tokens_with_phrases(parser->postal_codes, normalized_str_admin, normalized_admin_tokens, &postal_code_phrases);
817 token_phrase_memberships(postal_code_phrases, postal_code_phrase_memberships, num_tokens);
818
819 }
820
phrase_at_index(phrase_array * phrases,int64_array * phrase_memberships,uint32_t i)821 static inline phrase_t phrase_at_index(phrase_array *phrases, int64_array *phrase_memberships, uint32_t i) {
822 if (phrases == NULL || phrase_memberships == NULL || i > phrase_memberships->n - 1) {
823 return NULL_PHRASE;
824 }
825
826 int64_t phrase_index = phrase_memberships->a[i];
827 if (phrase_index != NULL_PHRASE_MEMBERSHIP) {
828 phrase_t phrase = phrases->a[phrase_index];
829 return phrase;
830 }
831
832 return NULL_PHRASE;
833 }
834
/*
 * Copies the first prefix_phrase.len bytes of `word` into
 * `prefix_phrase_array` (cleared first) and returns the resulting string,
 * which lives in that buffer.
 *
 * `len` is the length of `word`; the copy never reads past it.
 */
char *phrase_prefix(char *word, size_t len, phrase_t prefix_phrase, char_array *prefix_phrase_array) {
    char_array_clear(prefix_phrase_array);

    size_t prefix_len = prefix_phrase.len;
    if (prefix_len > len) {
        /* Guard against a prefix length computed on a different form of the word. */
        prefix_len = len;
    }

    char_array_add_len(prefix_phrase_array, word, prefix_len);
    return char_array_get_string(prefix_phrase_array);
}
842
/*
 * Copies the last suffix_phrase.len bytes of `word` into
 * `suffix_phrase_array` (cleared first) and returns the resulting string,
 * which lives in that buffer.
 *
 * `len` is the length of `word`. A suffix longer than the word is clamped to
 * the whole word, so the start pointer never moves before `word`.
 */
char *phrase_suffix(char *word, size_t len, phrase_t suffix_phrase, char_array *suffix_phrase_array) {
    char_array_clear(suffix_phrase_array);

    size_t suffix_len = suffix_phrase.len;
    if (suffix_len > len) {
        /* Without this, len - suffix_len wraps around and points far outside `word`. */
        suffix_len = len;
    }

    char_array_add_len(suffix_phrase_array, word + (len - suffix_len), suffix_len);
    return char_array_get_string(suffix_phrase_array);
}
850
/*
 * Decides whether a dictionary phrase should be used as a phrase by the parser.
 *
 * The phrase's expansions (looked up via phrase.data) must carry at least one
 * of the address components listed below, and at least one individual
 * expansion must come from a dictionary other than DICTIONARY_TOPONYM.
 * Returns false, with a warning, when the expansion lookup yields NULL.
 */
bool is_valid_dictionary_phrase(phrase_t phrase) {
    uint32_t expansion_index = phrase.data;
    address_expansion_value_t *expansion_value = address_dictionary_get_expansions(expansion_index);

    if (expansion_value == NULL) {
        log_warn("expansion_value is NULL for index %u\n", expansion_index);
        return false;
    }

    uint32_t address_phrase_types = expansion_value->components;

    bool has_relevant_component = (address_phrase_types & (LIBPOSTAL_ADDRESS_STREET |
                                                           LIBPOSTAL_ADDRESS_HOUSE_NUMBER |
                                                           LIBPOSTAL_ADDRESS_NAME |
                                                           LIBPOSTAL_ADDRESS_CATEGORY |
                                                           LIBPOSTAL_ADDRESS_NEAR |
                                                           LIBPOSTAL_ADDRESS_UNIT |
                                                           LIBPOSTAL_ADDRESS_LEVEL |
                                                           LIBPOSTAL_ADDRESS_ENTRANCE |
                                                           LIBPOSTAL_ADDRESS_STAIRCASE |
                                                           LIBPOSTAL_ADDRESS_POSTAL_CODE |
                                                           LIBPOSTAL_ADDRESS_PO_BOX)) != 0;
    if (!has_relevant_component) {
        return false;
    }

    size_t num_expansions = expansion_value->expansions->n;
    for (size_t k = 0; k < num_expansions; k++) {
        address_expansion_t expansion = expansion_value->expansions->a[k];
        if (!address_expansion_in_dictionary(expansion, DICTIONARY_TOPONYM)) {
            return true;
        }
    }

    return false;
}
872
873 typedef struct address_parser_phrase {
874 char *str;
875 address_parser_phrase_type_t type;
876 phrase_t phrase;
877 } address_parser_phrase_t;
878
/* True for the phrase types that still denote a single word: no phrase at
   all, or a word that only matched a dictionary prefix or suffix. */
static inline bool is_plain_word_phrase_type(address_parser_phrase_type_t type) {
    switch (type) {
        case ADDRESS_PARSER_NULL_PHRASE:
        case ADDRESS_PARSER_SUFFIX_PHRASE:
        case ADDRESS_PARSER_PREFIX_PHRASE:
            return true;
        default:
            return false;
    }
}
882
word_or_phrase_at_index(address_parser_t * parser,tokenized_string_t * tokenized,address_parser_context_t * context,uint32_t i,bool long_context)883 static address_parser_phrase_t word_or_phrase_at_index(address_parser_t *parser, tokenized_string_t *tokenized, address_parser_context_t *context, uint32_t i, bool long_context) {
884 phrase_t phrase;
885 address_parser_phrase_t response;
886 char *phrase_string = NULL;
887
888 phrase = phrase_at_index(context->address_dictionary_phrases, context->address_phrase_memberships, i);
889
890 phrase_t component_phrase = phrase_at_index(context->component_phrases, context->component_phrase_memberships, i);
891
892 if (phrase.len > 0 && is_valid_dictionary_phrase(phrase) && component_phrase.len <= phrase.len) {
893 phrase_string = cstring_array_get_phrase(context->normalized, long_context ? context->long_context_phrase : context->context_phrase, phrase),
894
895 response = (address_parser_phrase_t){
896 phrase_string,
897 ADDRESS_PARSER_DICTIONARY_PHRASE,
898 phrase
899 };
900 return response;
901 }
902
903 phrase = component_phrase;
904
905 if (phrase.len > 0) {
906 phrase_string = cstring_array_get_phrase(context->normalized_admin, long_context ? context->long_context_component_phrase : context->context_component_phrase, phrase);
907
908 response = (address_parser_phrase_t){
909 phrase_string,
910 ADDRESS_PARSER_COMPONENT_PHRASE,
911 phrase
912 };
913 return response;
914 }
915
916 phrase_t prefix_phrase = context->prefix_phrases->a[i];
917 phrase_t suffix_phrase = context->suffix_phrases->a[i];
918
919 uint32_t expansion_index;
920 address_expansion_value_t *expansion_value;
921
922 cstring_array *normalized = context->normalized;
923
924 char *word = cstring_array_get_string(normalized, i);
925 token_t token = tokenized->tokens->a[i];
926
927 // Suffixes like straße, etc.
928 if (suffix_phrase.len > 0) {
929 expansion_index = suffix_phrase.data;
930 expansion_value = address_dictionary_get_expansions(expansion_index);
931
932 if (expansion_value->components & LIBPOSTAL_ADDRESS_STREET) {
933 response = (address_parser_phrase_t){
934 word,
935 ADDRESS_PARSER_SUFFIX_PHRASE,
936 suffix_phrase
937 };
938 return response;
939 }
940 }
941
942 // Prefixes like hinter, etc.
943 if (prefix_phrase.len > 0) {
944 expansion_index = prefix_phrase.data;
945 expansion_value = address_dictionary_get_expansions(expansion_index);
946
947 // Don't include elisions like l', d', etc. which are in the LIBPOSTAL_ADDRESS_ANY category
948 if (expansion_value->components ^ LIBPOSTAL_ADDRESS_ANY) {
949 response = (address_parser_phrase_t){
950 word,
951 ADDRESS_PARSER_PREFIX_PHRASE,
952 prefix_phrase
953 };
954 return response;
955 }
956 }
957
958 response = (address_parser_phrase_t){
959 word,
960 ADDRESS_PARSER_NULL_PHRASE,
961 NULL_PHRASE
962 };
963 return response;
964
965 }
966
/*
phrase_index
------------

Finds the nearest index, starting at `start` and moving in `direction`,
whose membership entry is not NULL_PHRASE_MEMBERSHIP.

Parameters:

int64_array *phrase_memberships: per-token phrase membership array.
size_t start: index to begin scanning from (inclusive).
int8_t direction: -1 scans towards index 0, 1 scans towards the end of the
    array; any other value performs no scan.

Returns the matching index, or -1 when the array is NULL, `start` is outside
the array, the direction is unrecognised, or no entry matches.
*/
static inline int64_t phrase_index(int64_array *phrase_memberships, size_t start, int8_t direction) {
    if (phrase_memberships == NULL || phrase_memberships->a == NULL) {
        return -1;
    }

    int64_t *memberships = phrase_memberships->a;
    size_t n = phrase_memberships->n;

    // The forward scan already yielded -1 for an out-of-range start; apply the
    // same rule to the backward scan, which previously read past the array.
    if (start >= n) {
        return -1;
    }

    if (direction == -1) {
        // Counts down from start to 0 inclusive without relying on a signed index
        for (size_t remaining = start + 1; remaining-- > 0; ) {
            if (memberships[remaining] != NULL_PHRASE_MEMBERSHIP) {
                return (int64_t)remaining;
            }
        }
    } else if (direction == 1) {
        for (size_t pos = start; pos < n; pos++) {
            if (memberships[pos] != NULL_PHRASE_MEMBERSHIP) {
                return (int64_t)pos;
            }
        }
    }

    return -1;
}
992
993
/*
next_numeric_token_index
------------------------

Scans forward from `start` for the first token that is numeric (NUMERIC or
IDEOGRAPHIC_NUMBER) and is not part of an address dictionary phrase or a
component phrase.

The previous implementation tested `!= NUMERIC && != IDEOGRAPHIC_NUMBER`,
i.e. it returned the first token that was NOT numeric, which contradicts
the function's name.

Parameters:

tokenized_string_t *tokenized: the token sequence.
address_parser_context_t *context: supplies the phrase membership arrays.
size_t start: first index to examine (inclusive).

Returns the index of the matching token, or -1 if an argument is NULL,
`start` is past the last token, or no such token exists.
*/
static inline int64_t next_numeric_token_index(tokenized_string_t *tokenized, address_parser_context_t *context, size_t start) {
    if (tokenized == NULL || context == NULL) return -1;

    token_array *tokens = tokenized->tokens;

    // start >= n also covers n == 0, where the old `n - 1` wrapped around
    if (tokens == NULL || start >= tokens->n) return -1;

    for (size_t i = start; i < tokens->n; i++) {
        bool in_phrase = context->address_phrase_memberships->a[i] != NULL_PHRASE_MEMBERSHIP ||
                         context->component_phrase_memberships->a[i] != NULL_PHRASE_MEMBERSHIP;
        if (in_phrase) continue;

        token_t token = tokens->a[i];
        if (token.type == NUMERIC || token.type == IDEOGRAPHIC_NUMBER) {
            return (int64_t)i;
        }
    }

    return -1;
}
1015
1016
/*
add_phrase_features
-------------------

Emits phrase-type features for one candidate component.

Parameters:

cstring_array *features: feature array to append to.
uint32_t phrase_types: bit set of components the phrase can belong to.
uint32_t component: the single component being tested.
char *phrase_type: label used in the feature for `component`.
char *phrase_string: the phrase text.

If `phrase_types` is exactly `component`, two "unambiguous" features are
added (with and without the phrase text). Otherwise, if the component bit
is merely present, a single "phrase type+phrase" feature is added. Nothing
is added when the bit is absent.
*/
static inline void add_phrase_features(cstring_array *features, uint32_t phrase_types, uint32_t component, char *phrase_type, char *phrase_string) {
    if (phrase_types == component) {
        // %u: phrase_types is unsigned, matching the other uint32_t log calls in this file
        log_debug("phrase=%s, phrase_types=%u\n", phrase_string, phrase_types);
        feature_array_add(features, 2, "unambiguous phrase type", phrase_type);
        feature_array_add(features, 3, "unambiguous phrase type+phrase", phrase_type, phrase_string);
    } else if (phrase_types & component) {
        feature_array_add(features, 3, "phrase type+phrase", phrase_type, phrase_string);
    }
}
1026
/*
add_ngram_features
------------------

Generates character n-grams of size n for `str` (with `prefix_len` leading
and `suffix_len` trailing bytes stripped first) and appends one feature per
n-gram to `features`.

Parameters:

cstring_array *features: feature array to append to.
char *feature_prefix: optional leading feature component; when NULL the
    features are emitted as ("ngrams", n, ngram) only.
cstring_array *ngrams: scratch array; cleared and refilled on each call.
char *str: the NUL-terminated string to take n-grams from.
size_t n: n-gram size; must be at least 1 and strictly less than strlen(str).
size_t prefix_len: bytes of known prefix to skip at the start of str.
size_t suffix_len: bytes of known suffix to drop from the end of str.

Returns true if n-grams were generated and added. Returns false when an
argument is NULL, n is out of range, the affix lengths do not fit inside
str, or add_ngrams fails.
*/
static bool add_ngram_features(cstring_array *features, char *feature_prefix, cstring_array *ngrams, char *str, size_t n, size_t prefix_len, size_t suffix_len) {
    if (features == NULL || ngrams == NULL || str == NULL) return false;

    size_t len = strlen(str);

    // n >= len is the old `n > len - 1` without the wrap-around on an empty string
    if (n == 0 || n >= len) return false;

    // The affixes must fit inside the string, otherwise the remaining length
    // below would wrap around to a huge value
    if (prefix_len > len || suffix_len > len - prefix_len) return false;

    char ngram_num_chars[INT64_MAX_STRING_SIZE];
    snprintf(ngram_num_chars, sizeof(ngram_num_chars), "%zu", n);

    bool known_prefix = prefix_len > 0;
    bool known_suffix = suffix_len > 0;

    cstring_array_clear(ngrams);
    if (!add_ngrams(ngrams, n, str + prefix_len, len - suffix_len - prefix_len, !known_prefix, !known_suffix)) {
        return false;
    }

    uint32_t idx;
    char *ngram;

    if (feature_prefix != NULL) {
        cstring_array_foreach(ngrams, idx, ngram, {
            feature_array_add(features, 4, feature_prefix, "ngrams", ngram_num_chars, ngram);
        })
    } else {
        cstring_array_foreach(ngrams, idx, ngram, {
            feature_array_add(features, 3, "ngrams", ngram_num_chars, ngram);
        })
    }

    return true;
}
1061
1062 /*
1063 address_parser_features
1064 -----------------------
1065
1066 This is a feature function similar to those found in MEMM and CRF models.
1067
1068 Follows the signature of a tagger_feature_function so it can be called
1069 as a function pointer by the averaged perceptron or CRF model.
1070
1071 Parameters:
1072
1073 address_parser_t *self: a pointer to the address_parser struct, which contains
1074 word frequencies and perhaps other useful corpus-wide statistics.
1075
1076 address_parser_context_t *context: The context struct containing:
1077 - phrase dictionary memberships for all the tokens
1078 - country (if knkown)
1079 - language (if known)
1080 - features array
1081
1082 tokenized_string_t *tokenized: the sequence of tokens for parsing
1083 uint32_t i: the current token index
1084 char *prev: the predicted tag at index i - 1
1085 char *prev2: the predicted tag at index i - 2
1086
1087 */
1088
address_parser_features(void * self,void * ctx,tokenized_string_t * tokenized,uint32_t idx)1089 bool address_parser_features(void *self, void *ctx, tokenized_string_t *tokenized, uint32_t idx) {
1090 if (self == NULL || ctx == NULL) return false;
1091
1092 address_parser_t *parser = (address_parser_t *)self;
1093 address_parser_context_t *context = (address_parser_context_t *)ctx;
1094
1095 cstring_array *features = context->features;
1096 cstring_array *prev_tag_features = context->prev_tag_features;
1097 cstring_array *prev2_tag_features = context->prev2_tag_features;
1098 char *language = context->language;
1099 char *country = context->country;
1100
1101 phrase_array *address_dictionary_phrases = context->address_dictionary_phrases;
1102 int64_array *address_phrase_memberships = context->address_phrase_memberships;
1103 phrase_array *component_phrases = context->component_phrases;
1104 int64_array *component_phrase_memberships = context->component_phrase_memberships;
1105 phrase_array *postal_code_phrases = context->postal_code_phrases;
1106 int64_array *postal_code_phrase_memberships = context->postal_code_phrase_memberships;
1107 cstring_array *normalized = context->normalized;
1108
1109 uint32_array *separators = context->separators;
1110
1111 cstring_array_clear(features);
1112 cstring_array_clear(prev_tag_features);
1113 cstring_array_clear(prev2_tag_features);
1114
1115 token_array *tokens = tokenized->tokens;
1116
1117 token_t token = tokens->a[idx];
1118
1119 ssize_t last_index = (ssize_t)idx - 1;
1120 ssize_t next_index = (ssize_t)idx + 1;
1121
1122 char *word_pre_norm = tokenized_string_get_token(tokenized, idx);
1123
1124 char *word = cstring_array_get_string(normalized, idx);
1125 if (word == NULL) {
1126 log_error("got NULL word at %d\n", idx);
1127 return false;
1128 }
1129
1130 size_t word_len = strlen(word);
1131
1132 log_debug("word=%s\n", word);
1133
1134 phrase_t phrase = NULL_PHRASE;
1135 phrase_t component_phrase = NULL_PHRASE;
1136
1137 char *phrase_string = NULL;
1138 char *component_phrase_string = NULL;
1139
1140 int64_t address_phrase_index = address_phrase_memberships->a[idx];
1141 int64_t component_phrase_index = component_phrase_memberships->a[idx];
1142
1143 if (address_phrase_index != NULL_PHRASE_MEMBERSHIP) {
1144 phrase = address_dictionary_phrases->a[address_phrase_index];
1145 }
1146
1147 if (component_phrase_index != NULL_PHRASE_MEMBERSHIP) {
1148 component_phrase = component_phrases->a[component_phrase_index];
1149 }
1150
1151 char_array *phrase_tokens = context->phrase;
1152 char_array *component_phrase_tokens = context->component_phrase;
1153
1154 uint32_t expansion_index;
1155 address_expansion_value_t *expansion_value;
1156
1157 bool add_word_feature = true;
1158
1159 size_t num_tokens = tokenized->tokens->n;
1160
1161 // Address dictionary phrases
1162 if (phrase.len > 0 && phrase.len >= component_phrase.len) {
1163 log_debug("phrase\n");
1164
1165 last_index = (ssize_t)phrase.start - 1;
1166 next_index = (ssize_t)phrase.start + phrase.len;
1167
1168 if(is_valid_dictionary_phrase(phrase)) {
1169 uint32_t expansion_index = phrase.data;
1170 address_expansion_value_t *expansion_value = address_dictionary_get_expansions(expansion_index);
1171
1172 if (expansion_value == NULL) {
1173 log_warn("expansion_value is NULL for index %u\n", expansion_index);
1174 return false;
1175 }
1176 uint32_t address_phrase_types = expansion_value->components;
1177
1178 phrase_string = cstring_array_get_phrase(context->normalized, phrase_tokens, phrase);
1179
1180 add_word_feature = false;
1181 log_debug("phrase_string=%s\n", phrase_string);
1182
1183 add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_STREET, "street", phrase_string);
1184 add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_NAME, "name", phrase_string);
1185 add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_CATEGORY, "category", phrase_string);
1186 add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_UNIT, "unit", phrase_string);
1187 add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_PO_BOX, "po_box", phrase_string);
1188 add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_LEVEL, "level", phrase_string);
1189 add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_ENTRANCE, "entrance", phrase_string);
1190 add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_STAIRCASE, "staircase", phrase_string);
1191 add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_HOUSE_NUMBER, "house_number", phrase_string);
1192 add_phrase_features(features, address_phrase_types, LIBPOSTAL_ADDRESS_POSTAL_CODE, "postal_code", phrase_string);
1193 }
1194 }
1195
1196
1197 address_parser_types_t types;
1198
1199 // Component phrases
1200 if (component_phrase.len > 0 && component_phrase.len >= phrase.len) {
1201 component_phrase = component_phrases->a[component_phrase_index];
1202
1203 component_phrase_string = cstring_array_get_phrase(context->normalized_admin, component_phrase_tokens, component_phrase);
1204
1205 uint32_t component_phrase_index = component_phrase.data;
1206 if (component_phrase_index > parser->phrase_types->n) {
1207 log_error("Invalid component_phrase_index: %u (parser->phrase_types->n=%zu)\n", component_phrase_index, parser->phrase_types->n);
1208 return false;
1209 }
1210
1211 types = parser->phrase_types->a[component_phrase_index];
1212
1213 uint32_t component_phrase_types = types.components;
1214 uint32_t most_common = types.most_common;
1215
1216 if (last_index >= (ssize_t)component_phrase.start - 1) {
1217 last_index = (ssize_t)component_phrase.start - 1;
1218 }
1219
1220 if (next_index < (ssize_t)component_phrase.start + component_phrase.len) {
1221 next_index = (ssize_t)component_phrase.start + component_phrase.len;
1222 }
1223
1224 if (component_phrase_string != NULL && component_phrase_types > 0) {
1225 feature_array_add(features, 2, "phrase", component_phrase_string);
1226 add_word_feature = false;
1227 }
1228
1229 if (component_phrase_types > 0) {
1230 add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_SUBURB, "suburb", component_phrase_string);
1231 add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY, "city", component_phrase_string);
1232 add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_CITY_DISTRICT, "city_district", component_phrase_string);
1233 add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_ISLAND, "island", component_phrase_string);
1234 add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE_DISTRICT, "state_district", component_phrase_string);
1235 add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_STATE, "state", component_phrase_string);
1236 add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY_REGION, "country_region", component_phrase_string);
1237 add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_COUNTRY, "country", component_phrase_string);
1238 add_phrase_features(features, component_phrase_types, ADDRESS_COMPONENT_WORLD_REGION, "world_region", component_phrase_string);
1239 }
1240
1241 if (component_phrase_types != most_common) {
1242 if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) {
1243 feature_array_add(features, 2, "commonly city", component_phrase_string);
1244 } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY) {
1245 feature_array_add(features, 2, "commonly country", component_phrase_string);
1246 } else if (most_common == ADDRESS_PARSER_BOUNDARY_SUBURB) {
1247 feature_array_add(features, 2, "commonly suburb", component_phrase_string);
1248 } else if (most_common == ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT) {
1249 feature_array_add(features, 2, "commonly city_district", component_phrase_string);
1250 } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE) {
1251 feature_array_add(features, 2, "commonly state", component_phrase_string);
1252 } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION) {
1253 feature_array_add(features, 2, "commonly country_region", component_phrase_string);
1254 } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT) {
1255 feature_array_add(features, 2, "commonly state_district", component_phrase_string);
1256 } else if (most_common == ADDRESS_PARSER_BOUNDARY_ISLAND) {
1257 feature_array_add(features, 2, "commonly island", component_phrase_string);
1258 }
1259 }
1260 }
1261
1262 bool possible_postal_code = false;
1263 bool postal_code_have_admin = false;
1264 int64_t postal_code_phrase_index = postal_code_phrase_memberships->a[idx];
1265 phrase_t postal_code_phrase = NULL_PHRASE;
1266
1267 if (postal_code_phrase_index != NULL_PHRASE_MEMBERSHIP) {
1268 postal_code_phrase = postal_code_phrases->a[postal_code_phrase_index];
1269
1270 uint32_t postal_code_id = postal_code_phrase.data;
1271
1272 possible_postal_code = true;
1273
1274 if (last_index >= (ssize_t)postal_code_phrase.start - 1) {
1275 last_index = (ssize_t)postal_code_phrase.start - 1;
1276 }
1277
1278 if (next_index < (ssize_t)postal_code_phrase.start + postal_code_phrase.len) {
1279 next_index = (ssize_t)postal_code_phrase.start + postal_code_phrase.len;
1280 }
1281
1282 uint32_t admin_id;
1283 uint64_t postal_code_context;
1284
1285 khiter_t k;
1286
1287 if (last_index >= 0) {
1288 int64_t last_component_phrase_index = component_phrase_memberships->a[last_index];
1289 if (last_component_phrase_index != NULL_PHRASE_MEMBERSHIP) {
1290 phrase_t last_component_phrase = component_phrases->a[last_component_phrase_index];
1291 admin_id = last_component_phrase.data;
1292
1293 if (postal_code_context_exists(parser, postal_code_id, admin_id)) {
1294 postal_code_have_admin = true;
1295 }
1296 }
1297 }
1298
1299 if (!postal_code_have_admin && next_index < num_tokens) {
1300 int64_t next_component_phrase_index = component_phrase_memberships->a[next_index];
1301 if (next_component_phrase_index != NULL_PHRASE_MEMBERSHIP) {
1302 phrase_t next_component_phrase = component_phrases->a[next_component_phrase_index];
1303 admin_id = next_component_phrase.data;
1304 if (postal_code_context_exists(parser, postal_code_id, admin_id)) {
1305 postal_code_have_admin = true;
1306 }
1307 }
1308 }
1309
1310 }
1311
1312 if (possible_postal_code) {
1313 if (postal_code_have_admin) {
1314 feature_array_add(features, 1, "postcode have context");
1315 feature_array_add(features, 2, "postcode have context", word);
1316 } else {
1317 feature_array_add(features, 2, "postcode no context", word);
1318 }
1319 }
1320
1321 uint32_t word_freq = word_vocab_frequency(parser, word);
1322
1323 bool is_word = is_word_token(token.type);
1324
1325 bool is_unknown_word = false;
1326 bool is_unknown = false;
1327
1328 bool known_prefix = false;
1329 bool known_suffix = false;
1330
1331 size_t prefix_len = 0;
1332 size_t suffix_len = 0;
1333
1334 char *prefix = NULL;
1335 char *suffix = NULL;
1336
1337 if (add_word_feature) {
1338 // Bias unit, acts as an intercept
1339 feature_array_add(features, 1, "bias");
1340
1341 phrase_t prefix_phrase = context->prefix_phrases->a[idx];
1342 phrase_t suffix_phrase = context->suffix_phrases->a[idx];
1343
1344 // Prefixes like hinter, etc.
1345 if (prefix_phrase.len > 0) {
1346 expansion_index = prefix_phrase.data;
1347 expansion_value = address_dictionary_get_expansions(expansion_index);
1348
1349 // Don't include elisions like l', d', etc. which are in the LIBPOSTAL_ADDRESS_ANY category
1350 if (expansion_value->components ^ LIBPOSTAL_ADDRESS_ANY) {
1351 known_prefix = true;
1352 char_array_clear(phrase_tokens);
1353 prefix_len = prefix_phrase.len;
1354 char_array_add_len(phrase_tokens, word_pre_norm, prefix_len);
1355 prefix = char_array_get_string(phrase_tokens);
1356 log_debug("got prefix: %s\n", prefix);
1357 feature_array_add(features, 2, "prefix", prefix);
1358 }
1359 }
1360
1361 // Suffixes like straße, etc.
1362 if (suffix_phrase.len > 0) {
1363 expansion_index = suffix_phrase.data;
1364 expansion_value = address_dictionary_get_expansions(expansion_index);
1365
1366 if (expansion_value->components & LIBPOSTAL_ADDRESS_STREET) {
1367 known_suffix = true;
1368 char_array_clear(context->suffix_phrase);
1369 suffix_len = suffix_phrase.len;
1370 size_t word_pre_norm_len = cstring_array_token_length(tokenized->strings, idx);
1371 size_t suffix_offset = word_pre_norm_len - suffix_len;
1372 char_array_add_len(context->suffix_phrase, word_pre_norm + suffix_offset, suffix_len);
1373 suffix = char_array_get_string(context->suffix_phrase);
1374 log_debug("got suffix: %s\n", suffix);
1375 feature_array_add(features, 2, "suffix", suffix);
1376 }
1377 }
1378
1379 bool is_hyphenated = false;
1380
1381 // For rare words and unknown words (so unknown words can benefit from statistics of known but super common words)
1382 if (word_freq <= parser->options.rare_word_threshold && is_word) {
1383 log_debug("rare word: %s\n", word);
1384 bool ngrams_added = false;
1385 size_t hyphenated_word_offset = 0;
1386 bool first_sub_token = true;
1387 bool last_sub_token = true;
1388
1389 ssize_t next_hyphen_index;
1390
1391 token_array_clear(context->sub_tokens);
1392
1393 do {
1394 next_hyphen_index = string_next_hyphen_index(word + hyphenated_word_offset, word_len - hyphenated_word_offset);
1395 char *sub_word = word;
1396 size_t sub_word_len = word_len;
1397
1398 if (next_hyphen_index >= 0) {
1399 is_hyphenated = true;
1400 char_array_clear(context->sub_token);
1401 char_array_add_len(context->sub_token, word + hyphenated_word_offset, next_hyphen_index);
1402 token_array_push(context->sub_tokens, (token_t){hyphenated_word_offset, next_hyphen_index, token.type});
1403 sub_word = char_array_get_string(context->sub_token);
1404 sub_word_len = context->sub_token->n;
1405 last_sub_token = false;
1406 } else if (is_hyphenated) {
1407 char_array_clear(context->sub_token);
1408 char_array_add_len(context->sub_token, word + hyphenated_word_offset, word_len - hyphenated_word_offset);
1409 sub_word = char_array_get_string(context->sub_token);
1410 sub_word_len = context->sub_token->n;
1411 last_sub_token = true;
1412 }
1413
1414 bool add_prefix = first_sub_token && prefix_len < sub_word_len;
1415 bool add_suffix = last_sub_token && suffix_len < sub_word_len;
1416
1417 uint32_t sub_word_freq = word_freq;
1418 if (is_hyphenated) {
1419 sub_word_freq = word_vocab_frequency(parser, sub_word);
1420 if (sub_word_freq > 0) {
1421 feature_array_add(features, 2, "sub_word", sub_word);
1422 }
1423
1424 }
1425
1426 if (sub_word_freq <= parser->options.rare_word_threshold) {
1427 // prefix/suffix features from 3-6 characters
1428 for (size_t ng = 3; ng <= 6; ng++) {
1429 ngrams_added = add_ngram_features(features, is_hyphenated ? "sub_word" : "word", context->ngrams, sub_word, ng, add_prefix ? prefix_len : 0, add_suffix ? suffix_len : 0);
1430 }
1431 }
1432
1433 hyphenated_word_offset += next_hyphen_index + 1;
1434 first_sub_token = false;
1435
1436 log_debug("next_hyphen_index=%zd\n", next_hyphen_index);
1437 } while(next_hyphen_index >= 0);
1438
1439 }
1440
1441 if (word_freq > 0) {
1442 // The individual word
1443 feature_array_add(features, 2, "word", word);
1444 } else {
1445 log_debug("word not in vocab: %s\n", word);
1446
1447 is_unknown = true;
1448 word = (token.type != NUMERIC && token.type != IDEOGRAPHIC_NUMBER) ? UNKNOWN_WORD : UNKNOWN_NUMERIC;
1449
1450 if (is_word_token(token.type)) {
1451 is_unknown_word = true;
1452 }
1453 }
1454
1455 if (idx == 0 && !is_unknown_word) {
1456 feature_array_add(features, 2, "first word", word);
1457 //feature_array_add(features, 3, "first word+next word", word, next_word);
1458 }
1459
1460 } else if (component_phrase_string != NULL) {
1461 word = component_phrase_string;
1462 } else if (phrase_string != NULL) {
1463 word = phrase_string;
1464 }
1465
1466 if (last_index == idx - 1) {
1467 // Previous tag and current word
1468 feature_array_add(prev_tag_features, 2, "word", word);
1469
1470 // Previous two tags and current word
1471 if (parser->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
1472 // In the CRF this is accounted for by the transition weights
1473 // so only need it for the averaged perceptron
1474 feature_array_add(prev_tag_features, 1, "trans");
1475
1476 // Averaged perceptron uses two tags of history, CRF uses one
1477 feature_array_add(prev2_tag_features, 2, "word", word);
1478 feature_array_add(prev2_tag_features, 1, "trans");
1479 }
1480 }
1481
1482 if (last_index >= 0) {
1483 address_parser_phrase_t prev_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, last_index, false);
1484 char *prev_word = prev_word_or_phrase.str;
1485
1486 if (is_plain_word_phrase_type(prev_word_or_phrase.type)) {
1487 uint32_t prev_word_freq = word_vocab_frequency(parser, prev_word);
1488 token_t prev_token = tokenized->tokens->a[last_index];
1489 bool prev_token_numeric = is_numeric_token(prev_token.type);
1490 if (prev_word_freq == 0) {
1491 prev_word = !prev_token_numeric ? UNKNOWN_WORD : UNKNOWN_NUMERIC;
1492 }
1493 }
1494
1495 // Previous word
1496 feature_array_add(features, 2, "prev word", prev_word);
1497
1498
1499 if (last_index == idx - 1) {
1500 feature_array_add(prev_tag_features, 2, "prev word", prev_word);
1501 }
1502
1503 // Previous word and current word
1504 feature_array_add(features, 3, "prev word+word", prev_word, word);
1505 }
1506
1507 if (next_index < num_tokens) {
1508 address_parser_phrase_t next_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, next_index, false);
1509 char *next_word = next_word_or_phrase.str;
1510 size_t next_word_len = 1;
1511
1512 if (is_plain_word_phrase_type(next_word_or_phrase.type)) {
1513 uint32_t next_word_freq = word_vocab_frequency(parser, next_word);
1514 token_t next_token = tokenized->tokens->a[next_index];
1515 bool next_token_numeric = is_numeric_token(next_token.type);
1516 if (next_word_freq == 0) {
1517 next_word = !next_token_numeric ? UNKNOWN_WORD : UNKNOWN_NUMERIC;
1518 }
1519 } else {
1520 next_word_len = next_word_or_phrase.phrase.len;
1521 }
1522
1523 // Next word e.g. if the current word is unknown and the next word is "street"
1524 feature_array_add(features, 2, "next word", next_word);
1525
1526 // Current word and next word
1527 feature_array_add(features, 3, "word+next word", word, next_word);
1528
1529 // Prev tag, current word and next word
1530 //feature_array_add(features, 4, "prev tag+word+next word", prev || "START", word, next_word);
1531
1532 // Venue names ("house") are almost always at the beginning of the string
1533 // and often contain out-of-vocabulary words. Consider a case like "Barboncino 781 Franklin Ave".
1534 // The features available to classify "Barboncino" are going to be unknown word featuers (n-grams),
1535 // next word features (unknown word where next word=DDD is just as likely to be a street)
1536 // and no previous tags of history since it's the first word. If the parser predicts the
1537 // first token correctly, it's going to have an easier time getting the rest of the sequence
1538 // correct (unknown word + prev tag was "house" is probably still part of the venue, etc.) so
1539 // we're only really worried about that first token. This group of features, called
1540 // "long-context features" finds the relative position of the next numeric token as well
1541 // as the next street-level phrase (words like "ave", "street", etc.) in the right context.
1542 // In an English or French address, if we know there's a number somewhere to our right,
1543 // and that a word like "Ave" appears to the right of the number, it's very likely that
1544 // the current unknown word is part of a venue name. Similarly, if a venue-word like "Pizzeria"
1545 // occurred prior to the number, that would also be strong evidence that we're in a venue name.
1546 // Conversely, if we're in a Spanish address and a word like "Calle" comes before the first number
1547 // to our right, it's also likely that we're in a venue name, but we'd need to note that the
1548 // phrase we saw was "Calle" and not an English thoroughfare type.
1549
1550 if (idx == 0 && add_word_feature && is_unknown_word) {
1551 bool seen_number = false;
1552 bool seen_phrase = false;
1553 for (uint32_t right_idx = idx + 1; right_idx < num_tokens; right_idx++) {
1554 token_t right_token = tokens->a[right_idx];
1555
1556 /* Check */
1557 address_parser_phrase_t right_context_word_or_phrase = word_or_phrase_at_index(parser, tokenized, context, right_idx, true);
1558 address_parser_phrase_type_t right_context_phrase_type = right_context_word_or_phrase.type;
1559 if (right_context_phrase_type != ADDRESS_PARSER_NULL_PHRASE &&
1560 right_context_phrase_type != ADDRESS_PARSER_DICTIONARY_PHRASE &&
1561 right_context_phrase_type != ADDRESS_PARSER_SUFFIX_PHRASE &&
1562 right_context_phrase_type != ADDRESS_PARSER_PREFIX_PHRASE) {
1563 continue;
1564 }
1565 char *right_context_word = right_context_word_or_phrase.str;
1566 phrase_t right_context_phrase = right_context_word_or_phrase.phrase;
1567
1568 phrase_t suffix_phrase = context->suffix_phrases->a[right_idx];
1569
1570 uint32_t right_context_expansion_index;
1571 address_expansion_value_t *right_context_expansion_value;
1572
1573 uint32_t right_context_components = 0;
1574 bool right_context_name = false;
1575 bool right_context_street = false;
1576
1577 if (right_context_phrase.len > 0) {
1578 right_context_expansion_index = right_context_phrase.data;
1579 right_context_expansion_value = address_dictionary_get_expansions(right_context_expansion_index);
1580 right_context_components = right_context_expansion_value->components;
1581
1582 char *right_affix_type = NULL;
1583 char *right_context_affix = NULL;
1584
1585 char *relation_to_number = seen_number ? "after number" : "before number";
1586
1587 seen_phrase = true;
1588
1589 char *right_context_word_pre_norm;
1590
1591 if (right_context_phrase_type == ADDRESS_PARSER_SUFFIX_PHRASE) {
1592 right_affix_type = "suffix";
1593 right_context_word_pre_norm = tokenized_string_get_token(tokenized, right_idx);
1594 right_context_affix = phrase_suffix(right_context_word, strlen(right_context_word_pre_norm), right_context_phrase, context->long_context_suffix_phrase);
1595 } else if (right_context_word_or_phrase.type == ADDRESS_PARSER_PREFIX_PHRASE) {
1596 right_affix_type = "prefix";
1597 right_context_word_pre_norm = tokenized_string_get_token(tokenized, right_idx);
1598 right_context_affix = phrase_prefix(right_context_word, strlen(right_context_word_pre_norm), right_context_phrase, context->long_context_suffix_phrase);
1599 }
1600
1601 if (right_context_components & LIBPOSTAL_ADDRESS_STREET && !(right_context_components & LIBPOSTAL_ADDRESS_NAME)) {
1602 feature_array_add(features, 2, "first word unknown+street phrase right", relation_to_number);
1603 feature_array_add(features, 3, "first word unknown+street phrase right", relation_to_number, right_context_word);
1604 if (right_context_affix != NULL && right_affix_type != NULL) {
1605 feature_array_add(features, 4, "first word unknown+street affix right", relation_to_number, right_affix_type, right_context_affix);
1606 }
1607 break;
1608 } else if (right_context_components & LIBPOSTAL_ADDRESS_NAME && !(right_context_components & LIBPOSTAL_ADDRESS_STREET)) {
1609 feature_array_add(features, 2, "first word unknown+venue phrase right", relation_to_number);
1610 feature_array_add(features, 3, "first word unknown+venue phrase right", relation_to_number, right_context_word);
1611 if (right_context_affix != NULL && right_affix_type != NULL) {
1612 feature_array_add(features, 4, "first word unknown+venue affix right", relation_to_number, right_affix_type, right_context_affix);
1613 }
1614 } else if (right_context_components & (LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET)) {
1615 if (seen_number) {
1616 feature_array_add(features, 1, "first word unknown+number+ambiguous phrase right");
1617 feature_array_add(features, 2, "first word unknown+number+ambiguous phrase right", right_context_word);
1618 if (right_context_affix != NULL && right_affix_type != NULL) {
1619 feature_array_add(features, 3, "first word unknown+number+ambiguous affix right", right_affix_type, right_context_affix);
1620 }
1621 break;
1622 } else {
1623 continue;
1624 }
1625 }
1626
1627 if (seen_number) break;
1628 }
1629
1630 if (is_numeric_token(right_token.type)) {
1631 seen_number = true;
1632 char *relation_to_phrase = seen_phrase ? "after phrase" : "before phrase";
1633 feature_array_add(features, 2, "first word unknown+number right", relation_to_phrase);
1634 feature_array_add(features, 3, "first word unknown+number right", relation_to_phrase, right_context_word);
1635 if (seen_phrase) break;
1636 }
1637 }
1638 }
1639 }
1640
1641 return true;
1642
1643 }
1644
/*
address_parser_predict
----------------------

Runs sequence prediction over `tokenized_str` with whichever model the
parser holds, writing one label per token into `token_labels`.

Parameters:

address_parser_t *self: the parser; its model_type selects the tagger.
address_parser_context_t *context: supplies the feature arrays passed to the tagger.
cstring_array *token_labels: output array for the predicted labels.
tagger_feature_function feature_function: feature extractor called by the tagger.
tokenized_string_t *tokenized_str: the token sequence to label.

Returns the tagger's result, or false if self or context is NULL or the
model type is not recognised (the latter is logged).
*/
bool address_parser_predict(address_parser_t *self, address_parser_context_t *context, cstring_array *token_labels, tagger_feature_function feature_function, tokenized_string_t *tokenized_str) {
    // Both pointers are dereferenced below to build the tagger arguments
    if (self == NULL || context == NULL) return false;

    if (self->model_type == ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON) {
        // The averaged perceptron uses two tags of history, hence prev2_tag_features
        return averaged_perceptron_tagger_predict(self->model.ap, self, context, context->features, context->prev_tag_features, context->prev2_tag_features, token_labels, feature_function, tokenized_str, self->options.print_features);
    }

    if (self->model_type == ADDRESS_PARSER_TYPE_CRF) {
        return crf_tagger_predict(self->model.crf, self, context, context->features, context->prev_tag_features, token_labels, feature_function, tokenized_str, self->options.print_features);
    }

    log_error("Parser has unknown model type\n");
    return false;
}
1655
address_parser_response_new(void)1656 libpostal_address_parser_response_t *address_parser_response_new(void) {
1657 libpostal_address_parser_response_t *response = malloc(sizeof(libpostal_address_parser_response_t));
1658 return response;
1659 }
1660
address_parser_parse(char * address,char * language,char * country)1661 libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country) {
1662 if (address == NULL) return NULL;
1663
1664 address_parser_t *parser = get_address_parser();
1665 if (parser == NULL || parser->context == NULL) {
1666 log_error("parser is not setup, call libpostal_setup_address_parser()\n");
1667 return NULL;
1668 }
1669
1670 address_parser_context_t *context = parser->context;
1671
1672 char *normalized = address_parser_normalize_string(address);
1673 bool is_normalized = normalized != NULL;
1674 if (!is_normalized) {
1675 normalized = address;
1676 }
1677
1678 token_array *tokens = tokenize(normalized);
1679
1680 tokenized_string_t *tokenized_str = tokenized_string_new_from_str_size(normalized, strlen(normalized), tokens->n);
1681
1682 for (size_t i = 0; i < tokens->n; i++) {
1683 token_t token = tokens->a[i];
1684 if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
1685 uint32_array_pop(context->separators);
1686 uint32_array_push(context->separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
1687 continue;
1688 } else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
1689 continue;
1690 }
1691
1692 tokenized_string_add_token(tokenized_str, (const char *)normalized, token.len, token.type, token.offset);
1693 uint32_array_push(context->separators, ADDRESS_SEPARATOR_NONE);
1694 }
1695
1696 // This parser was trained without knowing language/country.
1697 // If at some point we build country-specific/language-specific
1698 // parsers, these parameters could be used to select a model.
1699 // The language parameter does technically control which dictionaries
1700 // are searched at the street level. It's possible with e.g. a phrase
1701 // like "de", which can be either the German country code or a stopword
1702 // in Spanish, that even in the case where it's being used as a country code,
1703 // it's possible that both the street-level and admin-level phrase features
1704 // may be working together as a kind of intercept. Depriving the model
1705 // of the street-level phrase features by passing in a known language
1706 // may change the decision threshold so explicitly ignore these
1707 // options until there's a use for them (country-specific or language-specific
1708 // parser models).
1709
1710 language = NULL;
1711 country = NULL;
1712 address_parser_context_fill(context, parser, tokenized_str, language, country);
1713
1714 libpostal_address_parser_response_t *response = NULL;
1715
1716 // If the whole input string is a single known phrase at the SUBURB level or higher, bypass sequence prediction altogether
1717 phrase_t only_phrase = NULL_PHRASE;
1718 token_t token, prev_token;
1719 bool is_postal = false;
1720 if (context->component_phrases->n == 1) {
1721 only_phrase = context->component_phrases->a[0];
1722 } else if (context->postal_code_phrases->n == 1) {
1723 only_phrase = context->postal_code_phrases->a[0];
1724 is_postal = true;
1725 }
1726
1727 if (only_phrase.start == 0 && only_phrase.len == tokenized_str->tokens->n && only_phrase.len > 0) {
1728 uint32_t most_common = 0;
1729
1730 char *label = NULL;
1731
1732 if (!is_postal) {
1733 uint32_t component_phrase_index = only_phrase.data;
1734 address_parser_types_t types = parser->phrase_types->a[component_phrase_index];
1735 most_common = types.most_common;
1736
1737 if (most_common == ADDRESS_PARSER_BOUNDARY_CITY) {
1738 label = strdup(ADDRESS_PARSER_LABEL_CITY);
1739 } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE) {
1740 label = strdup(ADDRESS_PARSER_LABEL_STATE);
1741 } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY) {
1742 label = strdup(ADDRESS_PARSER_LABEL_COUNTRY);
1743 } else if (most_common == ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT) {
1744 label = strdup(ADDRESS_PARSER_LABEL_STATE_DISTRICT);
1745 } else if (most_common == ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION) {
1746 label = strdup(ADDRESS_PARSER_LABEL_COUNTRY_REGION);
1747 } else if (most_common == ADDRESS_PARSER_BOUNDARY_SUBURB) {
1748 label = strdup(ADDRESS_PARSER_LABEL_SUBURB);
1749 } else if (most_common == ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT) {
1750 label = strdup(ADDRESS_PARSER_LABEL_CITY_DISTRICT);
1751 } else if (most_common == ADDRESS_PARSER_BOUNDARY_WORLD_REGION) {
1752 label = strdup(ADDRESS_PARSER_LABEL_WORLD_REGION);
1753 }
1754 } else {
1755 label = strdup(ADDRESS_PARSER_LABEL_POSTAL_CODE);
1756 }
1757
1758 // Implicit: if most_common is not one of the above, ignore and parse regularly
1759 if (label != NULL) {
1760 char **single_label = malloc(sizeof(char *));
1761 single_label[0] = label;
1762 char **single_component = malloc(sizeof(char *));
1763 single_component[0] = strdup(normalized);
1764
1765 response = address_parser_response_new();
1766
1767 response->num_components = 1;
1768 response->labels = single_label;
1769 response->components = single_component;
1770
1771 token_array_destroy(tokens);
1772 tokenized_string_destroy(tokenized_str);
1773
1774 if (is_normalized) {
1775 free(normalized);
1776 }
1777 return response;
1778 }
1779 }
1780
1781 cstring_array *token_labels = cstring_array_new_size(tokens->n);
1782
1783 char *prev_label = NULL;
1784
1785 bool prediction_success = address_parser_predict(parser, context, token_labels, &address_parser_features, tokenized_str);
1786
1787 if (prediction_success) {
1788 response = address_parser_response_new();
1789
1790 size_t num_strings = cstring_array_num_strings(tokenized_str->strings);
1791
1792 cstring_array *labels = cstring_array_new_size(num_strings);
1793 cstring_array *components = cstring_array_new_size(strlen(address) + num_strings);
1794
1795 token_t *tokens = tokenized_str->tokens->a;
1796
1797 for (size_t i = 0; i < num_strings; i++) {
1798 char *str = tokenized_string_get_token(tokenized_str, i);
1799
1800 char *label = cstring_array_get_string(token_labels, i);
1801
1802 if (prev_label == NULL || strcmp(label, prev_label) != 0) {
1803 cstring_array_add_string(labels, label);
1804 cstring_array_start_token(components);
1805
1806 }
1807
1808 if (prev_label != NULL && strcmp(label, prev_label) == 0) {
1809 token = tokens[i];
1810 prev_token = tokens[i - 1];
1811 if (token.offset > prev_token.offset + prev_token.len) {
1812 cstring_array_cat_string(components, " ");
1813 }
1814 cstring_array_cat_string(components, str);
1815 } else {
1816 cstring_array_append_string(components, str);
1817 cstring_array_terminate(components);
1818 }
1819
1820 prev_label = label;
1821 }
1822 response->num_components = cstring_array_num_strings(components);
1823 response->components = cstring_array_to_strings(components);
1824 response->labels = cstring_array_to_strings(labels);
1825
1826 } else {
1827 log_error("Error in prediction\n");
1828 }
1829
1830 token_array_destroy(tokens);
1831 tokenized_string_destroy(tokenized_str);
1832 cstring_array_destroy(token_labels);
1833
1834 if (is_normalized) {
1835 free(normalized);
1836 }
1837
1838 return response;
1839 }
1840
1841
1842
address_parser_module_setup(char * dir)1843 bool address_parser_module_setup(char *dir) {
1844 if (parser == NULL) {
1845 return address_parser_load(dir);
1846 }
1847 return true;
1848 }
1849
address_parser_module_teardown(void)1850 void address_parser_module_teardown(void) {
1851 if (parser != NULL) {
1852 address_parser_destroy(parser);
1853 }
1854 parser = NULL;
1855 }
1856