1 #include <stdlib.h>
2
3 #include "libpostal.h"
4
5 #include "klib/khash.h"
6 #include "klib/ksort.h"
7 #include "log/log.h"
8
9 #include "address_dictionary.h"
10 #include "address_parser.h"
11 #include "dedupe.h"
12 #include "expand.h"
13
14 #include "language_classifier.h"
15 #include "near_dupe.h"
16 #include "normalize.h"
17 #include "place.h"
18 #include "scanner.h"
19 #include "string_utils.h"
20 #include "token_types.h"
21
22 static libpostal_normalize_options_t LIBPOSTAL_DEFAULT_OPTIONS = {
23 .languages = NULL,
24 .num_languages = 0,
25 .address_components = LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_POSTAL_CODE,
26 .latin_ascii = true,
27 .transliterate = true,
28 .strip_accents = true,
29 .decompose = true,
30 .lowercase = true,
31 .trim_string = true,
32 .drop_parentheticals = true,
33 .replace_numeric_hyphens = false,
34 .delete_numeric_hyphens = false,
35 .split_alpha_from_numeric = true,
36 .replace_word_hyphens = true,
37 .delete_word_hyphens = true,
38 .delete_final_periods = true,
39 .delete_acronym_periods = true,
40 .drop_english_possessives = true,
41 .delete_apostrophes = true,
42 .expand_numex = true,
43 .roman_numerals = true
44 };
45
libpostal_get_default_options(void)46 libpostal_normalize_options_t libpostal_get_default_options(void) {
47 return LIBPOSTAL_DEFAULT_OPTIONS;
48 }
49
/*
 * Public wrapper around expand_address().
 *
 * Forwards input, options and n to expand_address() and converts the
 * resulting cstring_array with cstring_array_to_strings().
 * Returns NULL when expand_address() returns NULL.
 */
char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n) {
    cstring_array *expansions = expand_address(input, options, n);
    return (expansions != NULL) ? cstring_array_to_strings(expansions) : NULL;
}
55
/*
 * Public wrapper around expand_address_root().
 *
 * Forwards input, options and n to expand_address_root() and converts the
 * resulting cstring_array with cstring_array_to_strings().
 * Returns NULL when expand_address_root() returns NULL.
 */
char **libpostal_expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) {
    cstring_array *expansions = expand_address_root(input, options, n);
    return (expansions != NULL) ? cstring_array_to_strings(expansions) : NULL;
}
61
/*
 * Public wrapper: hands the expansions array and its element count n to
 * expansion_array_destroy().
 */
void libpostal_expansion_array_destroy(char **expansions, size_t n) {
    expansion_array_destroy(expansions, n);
    return;
}
65
66 #define DEFAULT_NEAR_DUPE_GEOHASH_PRECISION 6
67
68 static libpostal_near_dupe_hash_options_t LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS = {
69 .with_name = true,
70 .with_address = true,
71 .with_unit = false,
72 .with_city_or_equivalent = true,
73 .with_small_containing_boundaries = true,
74 .with_postal_code = true,
75 .with_latlon = false,
76 .latitude = 0.0,
77 .longitude = 0.0,
78 .geohash_precision = DEFAULT_NEAR_DUPE_GEOHASH_PRECISION,
79 .name_and_address_keys = true,
80 .name_only_keys = false,
81 .address_only_keys = false
82 };
83
libpostal_get_near_dupe_hash_default_options(void)84 libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void) {
85 return LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS;
86 }
87
/*
 * Public wrapper around near_dupe_hashes().
 *
 * On success, stores the number of strings in *num_hashes and returns the
 * result of cstring_array_to_strings(). When near_dupe_hashes() returns NULL,
 * *num_hashes is set to 0 and NULL is returned.
 */
char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes) {
    cstring_array *hashes = near_dupe_hashes(num_components, labels, values, options);

    if (hashes != NULL) {
        /* Read the count before the array is converted. */
        *num_hashes = cstring_array_num_strings(hashes);
        return cstring_array_to_strings(hashes);
    }

    *num_hashes = 0;
    return NULL;
}
97
98
/*
 * Public wrapper around near_dupe_hashes_languages(), which additionally
 * receives an explicit list of num_languages language strings.
 *
 * On success, stores the number of strings in *num_hashes and returns the
 * result of cstring_array_to_strings(). When the underlying call returns
 * NULL, *num_hashes is set to 0 and NULL is returned.
 */
char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes) {
    cstring_array *hashes = near_dupe_hashes_languages(num_components, labels, values, options, num_languages, languages);

    if (hashes != NULL) {
        /* Read the count before the array is converted. */
        *num_hashes = cstring_array_num_strings(hashes);
        return cstring_array_to_strings(hashes);
    }

    *num_hashes = 0;
    return NULL;
}
108
109
libpostal_place_languages(size_t num_components,char ** labels,char ** values,size_t * num_languages)110 char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) {
111 language_classifier_response_t *lang_response = place_languages(num_components, labels, values);
112 if (lang_response == NULL) {
113 *num_languages = 0;
114 return NULL;
115 }
116
117 char **languages = lang_response->languages;
118 lang_response->languages = NULL;
119 *num_languages = lang_response->num_languages;
120 lang_response->num_languages = 0;
121
122 language_classifier_response_destroy(lang_response);
123 return languages;
124 }
125
126 static libpostal_duplicate_options_t LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS = {
127 .num_languages = 0,
128 .languages = NULL
129 };
130
libpostal_get_default_duplicate_options(void)131 libpostal_duplicate_options_t libpostal_get_default_duplicate_options(void) {
132 return LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS;
133 }
134
libpostal_get_duplicate_options_with_languages(size_t num_languages,char ** languages)135 libpostal_duplicate_options_t libpostal_get_duplicate_options_with_languages(size_t num_languages, char **languages) {
136 libpostal_duplicate_options_t options = LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS;
137 options.num_languages = num_languages;
138 options.languages = languages;
139 return options;
140 }
141
libpostal_is_name_duplicate(char * value1,char * value2,libpostal_duplicate_options_t options)142 libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
143 return is_name_duplicate(value1, value2, options);
144 }
145
libpostal_is_street_duplicate(char * value1,char * value2,libpostal_duplicate_options_t options)146 libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
147 return is_street_duplicate(value1, value2, options);
148 }
149
libpostal_is_house_number_duplicate(char * value1,char * value2,libpostal_duplicate_options_t options)150 libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
151 return is_house_number_duplicate(value1, value2, options);
152 }
153
libpostal_is_po_box_duplicate(char * value1,char * value2,libpostal_duplicate_options_t options)154 libpostal_duplicate_status_t libpostal_is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
155 return is_po_box_duplicate(value1, value2, options);
156 }
157
libpostal_is_unit_duplicate(char * value1,char * value2,libpostal_duplicate_options_t options)158 libpostal_duplicate_status_t libpostal_is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
159 return is_unit_duplicate(value1, value2, options);
160 }
161
libpostal_is_floor_duplicate(char * value1,char * value2,libpostal_duplicate_options_t options)162 libpostal_duplicate_status_t libpostal_is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
163 return is_floor_duplicate(value1, value2, options);
164 }
165
libpostal_is_postal_code_duplicate(char * value1,char * value2,libpostal_duplicate_options_t options)166 libpostal_duplicate_status_t libpostal_is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
167 return is_postal_code_duplicate(value1, value2, options);
168 }
169
libpostal_is_toponym_duplicate(size_t num_components1,char ** labels1,char ** values1,size_t num_components2,char ** labels2,char ** values2,libpostal_duplicate_options_t options)170 libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options) {
171 return is_toponym_duplicate(num_components1, labels1, values1, num_components2, labels2, values2, options);
172 }
173
174 #define DEFAULT_FUZZY_DUPLICATE_NEEDS_REVIEW_THRESHOLD 0.7
175 #define DEFAULT_FUZZY_DUPLICATE_LIKELY_DUPE_THRESHOLD 0.9
176
177 static libpostal_fuzzy_duplicate_options_t DEFAULT_FUZZY_DUPLICATE_OPTIONS = {
178 .num_languages = 0,
179 .languages = NULL,
180 .needs_review_threshold = DEFAULT_FUZZY_DUPLICATE_NEEDS_REVIEW_THRESHOLD,
181 .likely_dupe_threshold = DEFAULT_FUZZY_DUPLICATE_LIKELY_DUPE_THRESHOLD
182 };
183
184
libpostal_get_default_fuzzy_duplicate_options(void)185 libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options(void) {
186 return DEFAULT_FUZZY_DUPLICATE_OPTIONS;
187 }
188
libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages,char ** languages)189 libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages, char **languages) {
190 libpostal_fuzzy_duplicate_options_t options = DEFAULT_FUZZY_DUPLICATE_OPTIONS;
191 options.num_languages = num_languages;
192 options.languages = languages;
193 return options;
194 }
195
196
libpostal_is_name_duplicate_fuzzy(size_t num_tokens1,char ** tokens1,double * token_scores1,size_t num_tokens2,char ** tokens2,double * token_scores2,libpostal_fuzzy_duplicate_options_t options)197 libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) {
198 return is_name_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options);
199 }
200
libpostal_is_street_duplicate_fuzzy(size_t num_tokens1,char ** tokens1,double * token_scores1,size_t num_tokens2,char ** tokens2,double * token_scores2,libpostal_fuzzy_duplicate_options_t options)201 libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) {
202 return is_street_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options);
203 }
204
205
/*
 * Frees a parser response: each of the num_components entries in the
 * components and labels arrays, the two arrays themselves, and finally the
 * response struct. A NULL response is ignored, and either array may be NULL.
 */
void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) {
    if (self == NULL) return;

    if (self->components != NULL) {
        for (size_t idx = 0; idx < self->num_components; idx++) {
            free(self->components[idx]);
        }
        free(self->components);
    }

    if (self->labels != NULL) {
        for (size_t idx = 0; idx < self->num_components; idx++) {
            free(self->labels[idx]);
        }
        free(self->labels);
    }

    free(self);
}
229
230 static libpostal_address_parser_options_t LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS = {
231 .language = NULL,
232 .country = NULL
233 };
234
libpostal_get_address_parser_default_options(void)235 inline libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void) {
236 return LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS;
237 }
238
libpostal_parse_address(char * address,libpostal_address_parser_options_t options)239 libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options) {
240 libpostal_address_parser_response_t *parsed = address_parser_parse(address, options.language, options.country);
241
242 if (parsed == NULL) {
243 log_error("Parser returned NULL\n");
244 return NULL;
245 }
246
247 return parsed;
248 }
249
/*
 * Public wrapper: forwards the flag to address_parser_print_features() and
 * returns its result.
 */
bool libpostal_parser_print_features(bool print_features) {
    bool result = address_parser_print_features(print_features);
    return result;
}
253
libpostal_setup_datadir(char * datadir)254 bool libpostal_setup_datadir(char *datadir) {
255 char *transliteration_path = NULL;
256 char *numex_path = NULL;
257 char *address_dictionary_path = NULL;
258
259 if (datadir != NULL) {
260 transliteration_path = path_join(3, datadir, LIBPOSTAL_TRANSLITERATION_SUBDIR, TRANSLITERATION_DATA_FILE);
261 numex_path = path_join(3, datadir, LIBPOSTAL_NUMEX_SUBDIR, NUMEX_DATA_FILE);
262 address_dictionary_path = path_join(3, datadir, LIBPOSTAL_ADDRESS_EXPANSIONS_SUBDIR, ADDRESS_DICTIONARY_DATA_FILE);
263 }
264
265 if (!transliteration_module_setup(transliteration_path)) {
266 log_error("Error loading transliteration module, dir=%s\n", transliteration_path);
267 return false;
268 }
269
270 if (!numex_module_setup(numex_path)) {
271 log_error("Error loading numex module, dir=%s\n", numex_path);
272 return false;
273 }
274
275 if (!address_dictionary_module_setup(address_dictionary_path)) {
276 log_error("Error loading dictionary module, dir=%s\n", address_dictionary_path);
277 return false;
278 }
279
280 if (transliteration_path != NULL) {
281 free(transliteration_path);
282 }
283
284 if (numex_path != NULL) {
285 free(numex_path);
286 }
287
288 if (address_dictionary_path != NULL) {
289 free(address_dictionary_path);
290 }
291
292 return true;
293 }
294
/* Calls libpostal_setup_datadir() with a NULL data directory. */
bool libpostal_setup(void) {
    char *no_datadir = NULL;
    return libpostal_setup_datadir(no_datadir);
}
298
libpostal_setup_language_classifier_datadir(char * datadir)299 bool libpostal_setup_language_classifier_datadir(char *datadir) {
300 char *language_classifier_dir = NULL;
301
302 if (datadir != NULL) {
303 language_classifier_dir = path_join(2, datadir, LIBPOSTAL_LANGUAGE_CLASSIFIER_SUBDIR);
304 }
305
306 if (!language_classifier_module_setup(language_classifier_dir)) {
307 log_error("Error loading language classifier, dir=%s\n", language_classifier_dir);
308 return false;
309 }
310
311 if (language_classifier_dir != NULL) {
312 free(language_classifier_dir);
313 }
314
315 return true;
316 }
317
318
libpostal_tokenize(char * input,bool whitespace,size_t * n)319 libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n) {
320 token_array *tokens = NULL;
321 if (!whitespace) {
322 tokens = tokenize(input);
323 } else {
324 tokens = tokenize_keep_whitespace(input);
325 }
326
327 if (tokens == NULL) {
328 return NULL;
329 }
330
331 libpostal_token_t *a = tokens->a;
332 *n = tokens->n;
333 free(tokens);
334 return a;
335 }
336
337
libpostal_normalize_string_languages(char * str,uint64_t options,size_t num_languages,char ** languages)338 char *libpostal_normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) {
339 if (options & LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII) {
340 return normalize_string_latin_languages(str, strlen(str), options, num_languages, languages);
341 } else {
342 return normalize_string_utf8_languages(str, options, num_languages, languages);
343 }
344 }
345
/*
 * Calls libpostal_normalize_string_languages() with an empty language list
 * (count 0, NULL array).
 */
inline char *libpostal_normalize_string(char *str, uint64_t options) {
    char **no_languages = NULL;
    return libpostal_normalize_string_languages(str, options, 0, no_languages);
}
349
libpostal_normalized_tokens_languages(char * input,uint64_t string_options,uint64_t token_options,bool whitespace,size_t num_languages,char ** languages,size_t * n)350 libpostal_normalized_token_t *libpostal_normalized_tokens_languages(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t num_languages, char **languages, size_t *n) {
351 if (input == NULL) {
352 return NULL;
353 }
354 char *normalized = libpostal_normalize_string_languages(input, string_options, num_languages, languages);
355 if (normalized == NULL) {
356 return NULL;
357 }
358
359 token_array *tokens = NULL;
360 if (!whitespace) {
361 tokens = tokenize(normalized);
362 } else {
363 tokens = tokenize_keep_whitespace(normalized);
364 }
365
366 if (tokens == NULL || tokens->a == NULL) {
367 free(normalized);
368 return NULL;
369 }
370
371 size_t num_tokens = tokens->n;
372 token_t *token_array = tokens->a;
373 char_array *normalized_token = char_array_new_size(strlen(normalized));
374
375 libpostal_normalized_token_t *result = malloc(sizeof(libpostal_normalized_token_t) * num_tokens);
376
377 for (size_t i = 0; i < num_tokens; i++) {
378 token_t token = token_array[i];
379 char_array_clear(normalized_token);
380 add_normalized_token(normalized_token, normalized, token, token_options);
381 char *token_str = strdup(char_array_get_string(normalized_token));
382 result[i] = (libpostal_normalized_token_t){token_str, token};
383 }
384
385 free(normalized);
386 token_array_destroy(tokens);
387 char_array_destroy(normalized_token);
388
389 *n = num_tokens;
390 return result;
391 }
392
libpostal_normalized_tokens(char * input,uint64_t string_options,uint64_t token_options,bool whitespace,size_t * n)393 inline libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n) {
394 return libpostal_normalized_tokens_languages(input, string_options, token_options, whitespace, 0, NULL, n);
395 }
396
397
/* Calls libpostal_setup_language_classifier_datadir() with a NULL data directory. */
bool libpostal_setup_language_classifier(void) {
    char *no_datadir = NULL;
    return libpostal_setup_language_classifier_datadir(no_datadir);
}
401
libpostal_setup_parser_datadir(char * datadir)402 bool libpostal_setup_parser_datadir(char *datadir) {
403 char *parser_dir = NULL;
404
405 if (datadir != NULL) {
406 parser_dir = path_join(2, datadir, LIBPOSTAL_ADDRESS_PARSER_SUBDIR);
407 }
408
409 if (!address_parser_module_setup(parser_dir)) {
410 log_error("Error loading address parser module, dir=%s\n", parser_dir);
411 return false;
412 }
413
414 if (parser_dir != NULL) {
415 free(parser_dir);
416 }
417
418 return true;
419 }
420
/* Calls libpostal_setup_parser_datadir() with a NULL data directory. */
bool libpostal_setup_parser(void) {
    char *no_datadir = NULL;
    return libpostal_setup_parser_datadir(no_datadir);
}
424
/*
 * Tears down the transliteration, numex and address dictionary modules, in
 * that order (the same three modules libpostal_setup_datadir() sets up).
 */
void libpostal_teardown(void) {
    transliteration_module_teardown();
    numex_module_teardown();
    address_dictionary_module_teardown();
}
432
/* Tears down the language classifier module. */
void libpostal_teardown_language_classifier(void) {
    language_classifier_module_teardown();
    return;
}
436
/* Tears down the address parser module. */
void libpostal_teardown_parser(void) {
    address_parser_module_teardown();
    return;
}
440