1 #include <stdlib.h>
2 
3 #include "libpostal.h"
4 
5 #include "klib/khash.h"
6 #include "klib/ksort.h"
7 #include "log/log.h"
8 
9 #include "address_dictionary.h"
10 #include "address_parser.h"
11 #include "dedupe.h"
12 #include "expand.h"
13 
14 #include "language_classifier.h"
15 #include "near_dupe.h"
16 #include "normalize.h"
17 #include "place.h"
18 #include "scanner.h"
19 #include "string_utils.h"
20 #include "token_types.h"
21 
22 static libpostal_normalize_options_t LIBPOSTAL_DEFAULT_OPTIONS = {
23         .languages = NULL,
24         .num_languages = 0,
25         .address_components = LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_POSTAL_CODE,
26         .latin_ascii = true,
27         .transliterate = true,
28         .strip_accents = true,
29         .decompose = true,
30         .lowercase = true,
31         .trim_string = true,
32         .drop_parentheticals = true,
33         .replace_numeric_hyphens = false,
34         .delete_numeric_hyphens = false,
35         .split_alpha_from_numeric = true,
36         .replace_word_hyphens = true,
37         .delete_word_hyphens = true,
38         .delete_final_periods = true,
39         .delete_acronym_periods = true,
40         .drop_english_possessives = true,
41         .delete_apostrophes = true,
42         .expand_numex = true,
43         .roman_numerals = true
44 };
45 
libpostal_get_default_options(void)46 libpostal_normalize_options_t libpostal_get_default_options(void) {
47     return LIBPOSTAL_DEFAULT_OPTIONS;
48 }
49 
/*
 * Public wrapper around expand_address().
 *
 * input   - string to expand; passed through untouched.
 * options - normalization options; passed through untouched.
 * n       - out-parameter handed to expand_address() for the result count.
 *
 * Returns the array produced by cstring_array_to_strings(), or NULL when
 * expand_address() returns NULL. In the NULL case *n is reset to 0 (when n is
 * non-NULL) so that callers never observe an unset count, matching the
 * near-dupe wrappers in this file.
 */
char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n) {
    cstring_array *strings = expand_address(input, options, n);
    if (strings == NULL) {
        if (n != NULL) {
            *n = 0;
        }
        return NULL;
    }
    return cstring_array_to_strings(strings);
}
55 
/*
 * Public wrapper around expand_address_root().
 *
 * input   - string to expand; passed through untouched.
 * options - normalization options; passed through untouched.
 * n       - out-parameter handed to expand_address_root() for the result count.
 *
 * Returns the array produced by cstring_array_to_strings(), or NULL when
 * expand_address_root() returns NULL. In the NULL case *n is reset to 0 (when
 * n is non-NULL) so that callers never observe an unset count, matching the
 * near-dupe wrappers in this file.
 */
char **libpostal_expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) {
    cstring_array *strings = expand_address_root(input, options, n);
    if (strings == NULL) {
        if (n != NULL) {
            *n = 0;
        }
        return NULL;
    }
    return cstring_array_to_strings(strings);
}
61 
/*
 * Public wrapper that hands the expansions array and its element count to
 * expansion_array_destroy().
 */
void libpostal_expansion_array_destroy(char **expansions, size_t n) {
    expansion_array_destroy(expansions, n);
}
65 
66 #define DEFAULT_NEAR_DUPE_GEOHASH_PRECISION 6
67 
68 static libpostal_near_dupe_hash_options_t LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS = {
69     .with_name = true,
70     .with_address = true,
71     .with_unit = false,
72     .with_city_or_equivalent = true,
73     .with_small_containing_boundaries = true,
74     .with_postal_code = true,
75     .with_latlon = false,
76     .latitude = 0.0,
77     .longitude = 0.0,
78     .geohash_precision = DEFAULT_NEAR_DUPE_GEOHASH_PRECISION,
79     .name_and_address_keys = true,
80     .name_only_keys = false,
81     .address_only_keys = false
82 };
83 
libpostal_get_near_dupe_hash_default_options(void)84 libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void) {
85     return LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS;
86 }
87 
/*
 * Public wrapper around near_dupe_hashes().
 *
 * Writes the number of hashes to *num_hashes and returns them converted with
 * cstring_array_to_strings(). When near_dupe_hashes() returns NULL, *num_hashes
 * is set to 0 and NULL is returned.
 */
char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes) {
    cstring_array *hashes = near_dupe_hashes(num_components, labels, values, options);

    if (hashes != NULL) {
        *num_hashes = cstring_array_num_strings(hashes);
        return cstring_array_to_strings(hashes);
    }

    *num_hashes = 0;
    return NULL;
}
97 
98 
/*
 * Public wrapper around near_dupe_hashes_languages(), the variant that takes an
 * explicit language list.
 *
 * Writes the number of hashes to *num_hashes and returns them converted with
 * cstring_array_to_strings(). When the underlying call returns NULL,
 * *num_hashes is set to 0 and NULL is returned.
 */
char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes) {
    cstring_array *hashes = near_dupe_hashes_languages(num_components, labels, values, options, num_languages, languages);

    if (hashes != NULL) {
        *num_hashes = cstring_array_num_strings(hashes);
        return cstring_array_to_strings(hashes);
    }

    *num_hashes = 0;
    return NULL;
}
108 
109 
libpostal_place_languages(size_t num_components,char ** labels,char ** values,size_t * num_languages)110 char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) {
111     language_classifier_response_t *lang_response = place_languages(num_components, labels, values);
112     if (lang_response == NULL) {
113         *num_languages = 0;
114         return NULL;
115     }
116 
117     char **languages = lang_response->languages;
118     lang_response->languages = NULL;
119     *num_languages = lang_response->num_languages;
120     lang_response->num_languages = 0;
121 
122     language_classifier_response_destroy(lang_response);
123     return languages;
124 }
125 
126 static libpostal_duplicate_options_t LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS = {
127     .num_languages = 0,
128     .languages = NULL
129 };
130 
libpostal_get_default_duplicate_options(void)131 libpostal_duplicate_options_t libpostal_get_default_duplicate_options(void) {
132     return LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS;
133 }
134 
libpostal_get_duplicate_options_with_languages(size_t num_languages,char ** languages)135 libpostal_duplicate_options_t libpostal_get_duplicate_options_with_languages(size_t num_languages, char **languages) {
136     libpostal_duplicate_options_t options = LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS;
137     options.num_languages = num_languages;
138     options.languages = languages;
139     return options;
140 }
141 
libpostal_is_name_duplicate(char * value1,char * value2,libpostal_duplicate_options_t options)142 libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
143     return is_name_duplicate(value1, value2, options);
144 }
145 
libpostal_is_street_duplicate(char * value1,char * value2,libpostal_duplicate_options_t options)146 libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
147     return is_street_duplicate(value1, value2, options);
148 }
149 
libpostal_is_house_number_duplicate(char * value1,char * value2,libpostal_duplicate_options_t options)150 libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
151     return is_house_number_duplicate(value1, value2, options);
152 }
153 
libpostal_is_po_box_duplicate(char * value1,char * value2,libpostal_duplicate_options_t options)154 libpostal_duplicate_status_t libpostal_is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
155     return is_po_box_duplicate(value1, value2, options);
156 }
157 
libpostal_is_unit_duplicate(char * value1,char * value2,libpostal_duplicate_options_t options)158 libpostal_duplicate_status_t libpostal_is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
159     return is_unit_duplicate(value1, value2, options);
160 }
161 
libpostal_is_floor_duplicate(char * value1,char * value2,libpostal_duplicate_options_t options)162 libpostal_duplicate_status_t libpostal_is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
163     return is_floor_duplicate(value1, value2, options);
164 }
165 
libpostal_is_postal_code_duplicate(char * value1,char * value2,libpostal_duplicate_options_t options)166 libpostal_duplicate_status_t libpostal_is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
167     return is_postal_code_duplicate(value1, value2, options);
168 }
169 
libpostal_is_toponym_duplicate(size_t num_components1,char ** labels1,char ** values1,size_t num_components2,char ** labels2,char ** values2,libpostal_duplicate_options_t options)170 libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options) {
171     return is_toponym_duplicate(num_components1, labels1, values1, num_components2, labels2, values2, options);
172 }
173 
174 #define DEFAULT_FUZZY_DUPLICATE_NEEDS_REVIEW_THRESHOLD 0.7
175 #define DEFAULT_FUZZY_DUPLICATE_LIKELY_DUPE_THRESHOLD 0.9
176 
177 static libpostal_fuzzy_duplicate_options_t DEFAULT_FUZZY_DUPLICATE_OPTIONS = {
178     .num_languages = 0,
179     .languages = NULL,
180     .needs_review_threshold = DEFAULT_FUZZY_DUPLICATE_NEEDS_REVIEW_THRESHOLD,
181     .likely_dupe_threshold = DEFAULT_FUZZY_DUPLICATE_LIKELY_DUPE_THRESHOLD
182 };
183 
184 
libpostal_get_default_fuzzy_duplicate_options(void)185 libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options(void) {
186     return DEFAULT_FUZZY_DUPLICATE_OPTIONS;
187 }
188 
libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages,char ** languages)189 libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages, char **languages) {
190     libpostal_fuzzy_duplicate_options_t options = DEFAULT_FUZZY_DUPLICATE_OPTIONS;
191     options.num_languages = num_languages;
192     options.languages = languages;
193     return options;
194 }
195 
196 
libpostal_is_name_duplicate_fuzzy(size_t num_tokens1,char ** tokens1,double * token_scores1,size_t num_tokens2,char ** tokens2,double * token_scores2,libpostal_fuzzy_duplicate_options_t options)197 libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) {
198     return is_name_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options);
199 }
200 
libpostal_is_street_duplicate_fuzzy(size_t num_tokens1,char ** tokens1,double * token_scores1,size_t num_tokens2,char ** tokens2,double * token_scores2,libpostal_fuzzy_duplicate_options_t options)201 libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) {
202     return is_street_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options);
203 }
204 
205 
/*
 * Frees a parser response: each component string, each label string, both
 * arrays, and finally the response struct itself. A NULL response is a no-op,
 * and either array may be NULL.
 */
void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) {
    if (self == NULL) {
        return;
    }

    const size_t count = self->num_components;
    char **components = self->components;
    char **labels = self->labels;

    /* Only index into an array that actually exists. */
    if (components != NULL) {
        for (size_t idx = 0; idx < count; idx++) {
            free(components[idx]);
        }
    }

    if (labels != NULL) {
        for (size_t idx = 0; idx < count; idx++) {
            free(labels[idx]);
        }
    }

    /* free(NULL) is a no-op, so the arrays need no guard. */
    free(components);
    free(labels);
    free(self);
}
229 
230 static libpostal_address_parser_options_t LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS =  {
231     .language = NULL,
232     .country = NULL
233 };
234 
libpostal_get_address_parser_default_options(void)235 inline libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void) {
236     return LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS;
237 }
238 
libpostal_parse_address(char * address,libpostal_address_parser_options_t options)239 libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options) {
240     libpostal_address_parser_response_t *parsed = address_parser_parse(address, options.language, options.country);
241 
242     if (parsed == NULL) {
243         log_error("Parser returned NULL\n");
244         return NULL;
245     }
246 
247     return parsed;
248 }
249 
/* Public wrapper: forwards the flag to address_parser_print_features() and returns its result. */
bool libpostal_parser_print_features(bool print_features) {
    bool result = address_parser_print_features(print_features);
    return result;
}
253 
libpostal_setup_datadir(char * datadir)254 bool libpostal_setup_datadir(char *datadir) {
255     char *transliteration_path = NULL;
256     char *numex_path = NULL;
257     char *address_dictionary_path = NULL;
258 
259     if (datadir != NULL) {
260         transliteration_path = path_join(3, datadir, LIBPOSTAL_TRANSLITERATION_SUBDIR, TRANSLITERATION_DATA_FILE);
261         numex_path = path_join(3, datadir, LIBPOSTAL_NUMEX_SUBDIR, NUMEX_DATA_FILE);
262         address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE);
263     }
264 
265     if (!transliteration_module_setup(transliteration_path)) {
266         log_error("Error loading transliteration module, dir=%s\n", transliteration_path);
267         return false;
268     }
269 
270     if (!numex_module_setup(numex_path)) {
271         log_error("Error loading numex module, dir=%s\n", numex_path);
272         return false;
273     }
274 
275     if (!address_dictionary_module_setup(address_dictionary_path)) {
276         log_error("Error loading dictionary module, dir=%s\n", address_dictionary_path);
277         return false;
278     }
279 
280     if (transliteration_path != NULL) {
281         free(transliteration_path);
282     }
283 
284     if (numex_path != NULL) {
285         free(numex_path);
286     }
287 
288     if (address_dictionary_path != NULL) {
289         free(address_dictionary_path);
290     }
291 
292     return true;
293 }
294 
/* Convenience entry point: calls libpostal_setup_datadir() with a NULL datadir. */
bool libpostal_setup(void) {
    char *no_datadir = NULL;
    return libpostal_setup_datadir(no_datadir);
}
298 
libpostal_setup_language_classifier_datadir(char * datadir)299 bool libpostal_setup_language_classifier_datadir(char *datadir) {
300     char *language_classifier_dir = NULL;
301 
302     if (datadir != NULL) {
303         language_classifier_dir = path_join(2, datadir, LIBPOSTAL_LANGUAGE_CLASSIFIER_SUBDIR);
304     }
305 
306     if (!language_classifier_module_setup(language_classifier_dir)) {
307         log_error("Error loading language classifier, dir=%s\n", language_classifier_dir);
308         return false;
309     }
310 
311     if (language_classifier_dir != NULL) {
312         free(language_classifier_dir);
313     }
314 
315     return true;
316 }
317 
318 
libpostal_tokenize(char * input,bool whitespace,size_t * n)319 libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n) {
320     token_array *tokens = NULL;
321     if (!whitespace) {
322         tokens = tokenize(input);
323     } else {
324         tokens = tokenize_keep_whitespace(input);
325     }
326 
327     if (tokens == NULL) {
328         return NULL;
329     }
330 
331     libpostal_token_t *a = tokens->a;
332     *n = tokens->n;
333     free(tokens);
334     return a;
335 }
336 
337 
libpostal_normalize_string_languages(char * str,uint64_t options,size_t num_languages,char ** languages)338 char *libpostal_normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) {
339     if (options & LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII) {
340         return normalize_string_latin_languages(str, strlen(str), options, num_languages, languages);
341     } else {
342         return normalize_string_utf8_languages(str, options, num_languages, languages);
343     }
344 }
345 
/*
 * Normalizes str without an explicit language list: calls
 * libpostal_normalize_string_languages() with zero languages and a NULL list.
 *
 * Deliberately not declared `inline`: this is an exported entry point, and in
 * C99/C11 a plain `inline` definition only yields an external symbol when some
 * other file-scope declaration in the translation unit omits `inline`.
 */
char *libpostal_normalize_string(char *str, uint64_t options) {
    return libpostal_normalize_string_languages(str, options, 0, NULL);
}
349 
libpostal_normalized_tokens_languages(char * input,uint64_t string_options,uint64_t token_options,bool whitespace,size_t num_languages,char ** languages,size_t * n)350 libpostal_normalized_token_t *libpostal_normalized_tokens_languages(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t num_languages, char **languages, size_t *n) {
351     if (input == NULL) {
352         return NULL;
353     }
354     char *normalized = libpostal_normalize_string_languages(input, string_options, num_languages, languages);
355     if (normalized == NULL) {
356         return NULL;
357     }
358 
359     token_array *tokens = NULL;
360     if (!whitespace) {
361         tokens = tokenize(normalized);
362     } else {
363         tokens = tokenize_keep_whitespace(normalized);
364     }
365 
366     if (tokens == NULL || tokens->a == NULL) {
367         free(normalized);
368         return NULL;
369     }
370 
371     size_t num_tokens = tokens->n;
372     token_t *token_array = tokens->a;
373     char_array *normalized_token = char_array_new_size(strlen(normalized));
374 
375     libpostal_normalized_token_t *result = malloc(sizeof(libpostal_normalized_token_t) * num_tokens);
376 
377     for (size_t i = 0; i < num_tokens; i++) {
378         token_t token = token_array[i];
379         char_array_clear(normalized_token);
380         add_normalized_token(normalized_token, normalized, token, token_options);
381         char *token_str = strdup(char_array_get_string(normalized_token));
382         result[i] = (libpostal_normalized_token_t){token_str, token};
383     }
384 
385     free(normalized);
386     token_array_destroy(tokens);
387     char_array_destroy(normalized_token);
388 
389     *n = num_tokens;
390     return result;
391 }
392 
libpostal_normalized_tokens(char * input,uint64_t string_options,uint64_t token_options,bool whitespace,size_t * n)393 inline libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n) {
394     return libpostal_normalized_tokens_languages(input, string_options, token_options, whitespace, 0, NULL, n);
395 }
396 
397 
/* Convenience entry point: calls libpostal_setup_language_classifier_datadir() with a NULL datadir. */
bool libpostal_setup_language_classifier(void) {
    char *no_datadir = NULL;
    return libpostal_setup_language_classifier_datadir(no_datadir);
}
401 
libpostal_setup_parser_datadir(char * datadir)402 bool libpostal_setup_parser_datadir(char *datadir) {
403     char *parser_dir = NULL;
404 
405     if (datadir != NULL) {
406         parser_dir = path_join(2, datadir, LIBPOSTAL_ADDRESS_PARSER_SUBDIR);
407     }
408 
409     if (!address_parser_module_setup(parser_dir)) {
410         log_error("Error loading address parser module, dir=%s\n", parser_dir);
411         return false;
412     }
413 
414     if (parser_dir != NULL) {
415         free(parser_dir);
416     }
417 
418     return true;
419 }
420 
/* Convenience entry point: calls libpostal_setup_parser_datadir() with a NULL datadir. */
bool libpostal_setup_parser(void) {
    char *no_datadir = NULL;
    return libpostal_setup_parser_datadir(no_datadir);
}
424 
/*
 * Tears down the three modules that libpostal_setup_datadir() sets up:
 * transliteration, numex, then the address dictionary.
 */
void libpostal_teardown(void) {
    transliteration_module_teardown();
    numex_module_teardown();
    address_dictionary_module_teardown();
}
432 
/* Tears down the language classifier module via language_classifier_module_teardown(). */
void libpostal_teardown_language_classifier(void) {
    language_classifier_module_teardown();
}
436 
/* Tears down the address parser module via address_parser_module_teardown(). */
void libpostal_teardown_parser(void) {
    address_parser_module_teardown();
}
440