1 #ifndef LIBPOSTAL_H
2 #define LIBPOSTAL_H
3 
// The standard headers are pulled in BEFORE the C linkage block is opened.
// C++ requires library headers to be included outside of any declaration,
// and a linkage specification is a declaration; a C++ implementation's
// versions of these headers may also declare overloads or templates that
// cannot have C linkage.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>

// Everything declared below gets C linkage when compiled as C++.
#ifdef __cplusplus
extern "C" {
#endif
12 
// LIBPOSTAL_EXPORT decorates every public declaration in this header:
//   - on Windows it expands to __declspec(dllexport) when LIBPOSTAL_EXPORTS
//     is defined and to __declspec(dllimport) otherwise;
//   - on GCC-compatible compilers (major version >= 4) it requests default
//     symbol visibility;
//   - anywhere else it expands to nothing.
// __GNUC__ is tested with defined() first so that compilers which do not
// define it never evaluate an undefined macro (clean under -Wundef).
#if defined(_WIN32)
#  if defined(LIBPOSTAL_EXPORTS)
#    define LIBPOSTAL_EXPORT __declspec(dllexport)
#  else
#    define LIBPOSTAL_EXPORT __declspec(dllimport)
#  endif
#elif defined(__GNUC__) && (__GNUC__ >= 4)
#  define LIBPOSTAL_EXPORT __attribute__ ((visibility("default")))
#else
#  define LIBPOSTAL_EXPORT
#endif
24 
// Judging by the name, an upper bound on the length of a language code.
// NOTE(review): whether this count includes a terminating NUL byte is not
// visible in this header — confirm against the code that uses it.
#define LIBPOSTAL_MAX_LANGUAGE_LEN 4
26 
// Token types are given explicit numeric values so we can duplicate the values exactly in Python
28 
29 
/*
 * Token classes. Every enumerator has an explicitly assigned value; the
 * numbering is grouped by category, with unused gaps between the groups.
 */
typedef enum {
    // Null byte
    LIBPOSTAL_TOKEN_TYPE_END = 0,

    /* ---- Word types ---- */

    // Any letter-only word (all unicode letters count)
    LIBPOSTAL_TOKEN_TYPE_WORD = 1,
    // Loose abbreviations: roughly anything containing a ".", since
    // sentences are not a concern in addresses
    LIBPOSTAL_TOKEN_TYPE_ABBREVIATION = 2,
    // For languages that do not separate on whitespace (e.g. Chinese,
    // Japanese, Korean), tokens are separated by character
    LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_CHAR = 3,
    // Hangul syllable sequences made of more than one codepoint
    LIBPOSTAL_TOKEN_TYPE_HANGUL_SYLLABLE = 4,
    // Specifically things like U.N., where internal periods may be deleted
    LIBPOSTAL_TOKEN_TYPE_ACRONYM = 5,

    // Not produced by the first stage tokenizer, but may be used after
    // phrase parsing
    LIBPOSTAL_TOKEN_TYPE_PHRASE = 10,

    /* ---- Special tokens ---- */

    // Emails are tokenized altogether
    LIBPOSTAL_TOKEN_TYPE_EMAIL = 20,
    // URLs are tokenized altogether
    LIBPOSTAL_TOKEN_TYPE_URL = 21,
    // US phone number (with or without country code)
    LIBPOSTAL_TOKEN_TYPE_US_PHONE = 22,
    // Non-US phone number (must have country code)
    LIBPOSTAL_TOKEN_TYPE_INTL_PHONE = 23,

    /* ---- Numbers and numeric types ---- */

    // Any sequence containing a digit
    LIBPOSTAL_TOKEN_TYPE_NUMERIC = 50,
    // Ordinals such as 1st, 2nd, 1er, etc.
    LIBPOSTAL_TOKEN_TYPE_ORDINAL = 51,
    // II, III, VI, etc.
    LIBPOSTAL_TOKEN_TYPE_ROMAN_NUMERAL = 52,
    // All numeric ideographic characters, e.g. Han numbers and chars
    // like "²"
    LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_NUMBER = 53,

    /* ---- Punctuation types (may separate a phrase) ---- */

    LIBPOSTAL_TOKEN_TYPE_PERIOD        = 100,
    LIBPOSTAL_TOKEN_TYPE_EXCLAMATION   = 101,
    LIBPOSTAL_TOKEN_TYPE_QUESTION_MARK = 102,
    LIBPOSTAL_TOKEN_TYPE_COMMA         = 103,
    LIBPOSTAL_TOKEN_TYPE_COLON         = 104,
    LIBPOSTAL_TOKEN_TYPE_SEMICOLON     = 105,
    LIBPOSTAL_TOKEN_TYPE_PLUS          = 106,
    LIBPOSTAL_TOKEN_TYPE_AMPERSAND     = 107,
    LIBPOSTAL_TOKEN_TYPE_AT_SIGN       = 108,
    LIBPOSTAL_TOKEN_TYPE_POUND         = 109,
    LIBPOSTAL_TOKEN_TYPE_ELLIPSIS      = 110,
    LIBPOSTAL_TOKEN_TYPE_DASH          = 111,
    LIBPOSTAL_TOKEN_TYPE_BREAKING_DASH = 112,
    LIBPOSTAL_TOKEN_TYPE_HYPHEN        = 113,
    LIBPOSTAL_TOKEN_TYPE_PUNCT_OPEN    = 114,
    LIBPOSTAL_TOKEN_TYPE_PUNCT_CLOSE   = 115,
    // 116-118 are not assigned
    LIBPOSTAL_TOKEN_TYPE_DOUBLE_QUOTE  = 119,
    LIBPOSTAL_TOKEN_TYPE_SINGLE_QUOTE  = 120,
    LIBPOSTAL_TOKEN_TYPE_OPEN_QUOTE    = 121,
    LIBPOSTAL_TOKEN_TYPE_CLOSE_QUOTE   = 122,
    // 123 is not assigned
    LIBPOSTAL_TOKEN_TYPE_SLASH         = 124,
    LIBPOSTAL_TOKEN_TYPE_BACKSLASH     = 125,
    LIBPOSTAL_TOKEN_TYPE_GREATER_THAN  = 126,
    LIBPOSTAL_TOKEN_TYPE_LESS_THAN     = 127,

    /* ---- Non-letters and whitespace ---- */

    LIBPOSTAL_TOKEN_TYPE_OTHER      = 200,
    LIBPOSTAL_TOKEN_TYPE_WHITESPACE = 300,
    LIBPOSTAL_TOKEN_TYPE_NEWLINE    = 301,

    LIBPOSTAL_TOKEN_TYPE_INVALID_CHAR = 500
} libpostal_token_type_t;
86 
87 
/*
Address dictionaries
*/
// One bit per address component, sized to fit a short (uint16_t).
// Bits 10 through 12 are not assigned to any flag.
#define LIBPOSTAL_ADDRESS_NONE          0
#define LIBPOSTAL_ADDRESS_ANY           (1 << 0)
#define LIBPOSTAL_ADDRESS_NAME          (1 << 1)
#define LIBPOSTAL_ADDRESS_HOUSE_NUMBER  (1 << 2)
#define LIBPOSTAL_ADDRESS_STREET        (1 << 3)
#define LIBPOSTAL_ADDRESS_UNIT          (1 << 4)
#define LIBPOSTAL_ADDRESS_LEVEL         (1 << 5)
#define LIBPOSTAL_ADDRESS_STAIRCASE     (1 << 6)
#define LIBPOSTAL_ADDRESS_ENTRANCE      (1 << 7)

#define LIBPOSTAL_ADDRESS_CATEGORY      (1 << 8)
#define LIBPOSTAL_ADDRESS_NEAR          (1 << 9)

#define LIBPOSTAL_ADDRESS_TOPONYM       (1 << 13)
#define LIBPOSTAL_ADDRESS_POSTAL_CODE   (1 << 14)
#define LIBPOSTAL_ADDRESS_PO_BOX        (1 << 15)
// All sixteen bits set (0xFFFF)
#define LIBPOSTAL_ADDRESS_ALL           ((1 << 16) - 1)
109 
// Options record passed by value to the address expansion functions
// declared in this header.
typedef struct libpostal_normalize_options {
    // List of language codes.
    // NOTE(review): num_languages is presumably the length of `languages` — confirm.
    char **languages;
    size_t num_languages;

    // 16-bit set of flags.
    // NOTE(review): presumably a combination of the LIBPOSTAL_ADDRESS_* bits — confirm.
    uint16_t address_components;

    // String options: independent on/off switches
    bool latin_ascii;
    bool transliterate;
    bool strip_accents;
    bool decompose;
    bool lowercase;
    bool trim_string;
    bool drop_parentheticals;
    bool replace_numeric_hyphens;
    bool delete_numeric_hyphens;
    bool split_alpha_from_numeric;
    bool replace_word_hyphens;
    bool delete_word_hyphens;
    bool delete_final_periods;
    bool delete_acronym_periods;
    bool drop_english_possessives;
    bool delete_apostrophes;
    bool expand_numex;
    bool roman_numerals;
} libpostal_normalize_options_t;
137 
138 LIBPOSTAL_EXPORT libpostal_normalize_options_t libpostal_get_default_options(void);
139 
140 LIBPOSTAL_EXPORT char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n);
141 LIBPOSTAL_EXPORT char **libpostal_expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n);
142 
143 LIBPOSTAL_EXPORT void libpostal_expansion_array_destroy(char **expansions, size_t n);
144 
/*
Address parser
*/

// Parser result: a component count and two string arrays.
// NOTE(review): `components` and `labels` presumably hold num_components
// entries each, paired by index — confirm in the implementation.
typedef struct libpostal_address_parser_response {
    size_t num_components;
    char **components;
    char **labels;
} libpostal_address_parser_response_t;

// Second name for the same response type.
typedef libpostal_address_parser_response_t libpostal_parsed_address_components_t;

// Parser options: two strings, passed by value to libpostal_parse_address.
// NOTE(review): the expected format of `language` and `country` is not
// visible in this header.
typedef struct libpostal_address_parser_options {
    char *language;
    char *country;
} libpostal_address_parser_options_t;
161 
162 LIBPOSTAL_EXPORT void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self);
163 
164 LIBPOSTAL_EXPORT libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void);
165 
166 LIBPOSTAL_EXPORT libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options);
167 
168 LIBPOSTAL_EXPORT bool libpostal_parser_print_features(bool print_features);
169 
170 
/*
Deduping
*/


// Near-dupe hashing methods

// Options record passed by value to the near-dupe hashing functions.
typedef struct libpostal_near_dupe_hash_options {
    // On/off switches, one per "with_*" field
    bool with_name;
    bool with_address;
    bool with_unit;
    bool with_city_or_equivalent;
    bool with_small_containing_boundaries;
    bool with_postal_code;
    bool with_latlon;

    // Coordinates and geohash precision.
    // NOTE(review): presumably only consulted when with_latlon is set, with
    // coordinates in decimal degrees — confirm in the implementation.
    double latitude;
    double longitude;
    uint32_t geohash_precision;

    // On/off switches, one per "*_keys" field
    bool name_and_address_keys;
    bool name_only_keys;
    bool address_only_keys;
} libpostal_near_dupe_hash_options_t;
193 
194 
195 LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void);
196 LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes);
197 LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes);
198 
199 // Dupe language classification
200 
201 LIBPOSTAL_EXPORT char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages);
202 
// Pairwise dupe methods

// Result of a pairwise duplicate check. The non-negative values increase
// from NON_DUPLICATE (0) up to EXACT_DUPLICATE (9); -1 is the null status.
typedef enum {
    LIBPOSTAL_NULL_DUPLICATE_STATUS           = -1,
    LIBPOSTAL_NON_DUPLICATE                   = 0,
    LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW = 3,
    LIBPOSTAL_LIKELY_DUPLICATE                = 6,
    LIBPOSTAL_EXACT_DUPLICATE                 = 9,
} libpostal_duplicate_status_t;

// Options for the pairwise duplicate checks: a language list.
// NOTE(review): num_languages is presumably the length of `languages` — confirm.
typedef struct libpostal_duplicate_options {
    size_t num_languages;
    char **languages;
} libpostal_duplicate_options_t;
217 
218 
219 LIBPOSTAL_EXPORT libpostal_duplicate_options_t libpostal_get_default_duplicate_options(void);
220 LIBPOSTAL_EXPORT libpostal_duplicate_options_t libpostal_get_duplicate_options_with_languages(size_t num_languages, char **languages);
221 
222 LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
223 LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
224 LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
225 LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
226 LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
227 LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
228 LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
229 LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options);
230 
// Pairwise fuzzy dupe methods, return status & similarity

// Options for the fuzzy duplicate checks: a language list plus two
// floating-point thresholds.
// NOTE(review): the thresholds are presumably compared against the computed
// similarity to choose between the "needs review" and "likely duplicate"
// statuses — confirm the exact comparison in the implementation.
typedef struct libpostal_fuzzy_duplicate_options {
    size_t num_languages;
    char **languages;
    double needs_review_threshold;
    double likely_dupe_threshold;
} libpostal_fuzzy_duplicate_options_t;
239 
240 typedef struct libpostal_fuzzy_duplicate_status {
241     libpostal_duplicate_status_t status;
242     double similarity;
243 } libpostal_fuzzy_duplicate_status_t;
244 
245 LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options(void);
246 LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages, char **languages);
247 
248 LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options);
249 LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options);
250 
251 // Setup/teardown methods
252 
253 LIBPOSTAL_EXPORT bool libpostal_setup(void);
254 LIBPOSTAL_EXPORT bool libpostal_setup_datadir(char *datadir);
255 LIBPOSTAL_EXPORT void libpostal_teardown(void);
256 
257 LIBPOSTAL_EXPORT bool libpostal_setup_parser(void);
258 LIBPOSTAL_EXPORT bool libpostal_setup_parser_datadir(char *datadir);
259 LIBPOSTAL_EXPORT void libpostal_teardown_parser(void);
260 
261 LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier(void);
262 LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier_datadir(char *datadir);
263 LIBPOSTAL_EXPORT void libpostal_teardown_language_classifier(void);
264 
/* Tokenization and token normalization APIs */

// One token: an offset, a length and a 16-bit type code.
// NOTE(review): `offset` and `len` presumably index into the tokenized input
// in bytes, and `type` presumably holds a libpostal_token_type_t value —
// confirm both in the implementation.
typedef struct libpostal_token {
    size_t offset;
    size_t len;
    uint16_t type;
} libpostal_token_t;
272 
273 LIBPOSTAL_EXPORT libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n);
274 
// Normalize string options
//
// Each flag is a single bit. Every replacement list is fully parenthesized
// so that a flag behaves as one operand inside any expression, for example
// `opts & ~FLAG`, `FLAG + 1` or `FLAG * 2`; without the parentheses those
// would bind to the `1` before the shift is applied.
#define LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII (1 << 0)
#define LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE (1 << 1)
#define LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS (1 << 2)
#define LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE (1 << 3)
#define LIBPOSTAL_NORMALIZE_STRING_LOWERCASE (1 << 4)
#define LIBPOSTAL_NORMALIZE_STRING_TRIM (1 << 5)
#define LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS (1 << 6)
#define LIBPOSTAL_NORMALIZE_STRING_COMPOSE (1 << 7)
#define LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII (1 << 8)
#define LIBPOSTAL_NORMALIZE_STRING_REPLACE_NUMEX (1 << 9)

// Normalize token options (single bits, parenthesized for the same reason)
#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS (1 << 0)
#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS (1 << 1)
#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD (1 << 2)
#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS (1 << 3)
#define LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES (1 << 4)
#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE (1 << 5)
#define LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC (1 << 6)
#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS (1 << 7)
#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS (1 << 8)
#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS (1 << 9)

// Default string options: LATIN_ASCII | COMPOSE | TRIM | REPLACE_HYPHENS |
// STRIP_ACCENTS | LOWERCASE
#define LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS (LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII | LIBPOSTAL_NORMALIZE_STRING_COMPOSE | LIBPOSTAL_NORMALIZE_STRING_TRIM | LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS | LIBPOSTAL_NORMALIZE_STRING_LOWERCASE)

// Default token options: REPLACE_HYPHENS | DELETE_FINAL_PERIOD |
// DELETE_ACRONYM_PERIODS | DROP_ENGLISH_POSSESSIVES | DELETE_OTHER_APOSTROPHE
#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS (LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE)

// Both period-deleting token options combined
#define LIBPOSTAL_NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS (LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS)

// Default token options plus SPLIT_ALPHA_FROM_NUMERIC
#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC (LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS | LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)
306 
307 LIBPOSTAL_EXPORT char *libpostal_normalize_string_languages(char *input, uint64_t options, size_t num_languages, char **languages);
308 LIBPOSTAL_EXPORT char *libpostal_normalize_string(char *input, uint64_t options);
309 
310 
311 typedef struct libpostal_normalized_token {
312     char *str;
313     libpostal_token_t token;
314 } libpostal_normalized_token_t;
315 
316 LIBPOSTAL_EXPORT libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n);
317 LIBPOSTAL_EXPORT libpostal_normalized_token_t *libpostal_normalized_tokens_languages(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t num_languages, char **languages, size_t *n);
318 
319 
320 #ifdef __cplusplus
321 }
322 #endif
323 
324 #endif
325