#ifndef LIBPOSTAL_H
#define LIBPOSTAL_H

/*
Public C API declarations for libpostal: address expansion, address parsing,
near-duplicate hashing, pairwise duplicate detection, tokenization and
string/token normalization.

Standard headers are included before the extern "C" block so that they are
never pulled in under C language linkage when this header is used from C++.
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
LIBPOSTAL_EXPORT decorates every public function declaration:
  - on Windows it is dllexport when LIBPOSTAL_EXPORTS is defined, dllimport otherwise
  - on GCC-compatible compilers (version >= 4) it sets default symbol visibility
  - elsewhere it expands to nothing
*/
#ifdef _WIN32
#ifdef LIBPOSTAL_EXPORTS
#define LIBPOSTAL_EXPORT __declspec(dllexport)
#else
#define LIBPOSTAL_EXPORT __declspec(dllimport)
#endif
#elif defined(__GNUC__) && __GNUC__ >= 4
#define LIBPOSTAL_EXPORT __attribute__ ((visibility("default")))
#else
#define LIBPOSTAL_EXPORT
#endif

// NOTE(review): presumably the maximum length of a language code string - confirm against the implementation
#define LIBPOSTAL_MAX_LANGUAGE_LEN 4

// Token types. Every enumerator is given an explicit value so the values can be duplicated exactly in Python

typedef enum {
    LIBPOSTAL_TOKEN_TYPE_END = 0,                   // Null byte
    // Word types
    LIBPOSTAL_TOKEN_TYPE_WORD = 1,                  // Any letter-only word (includes all unicode letters)
    LIBPOSTAL_TOKEN_TYPE_ABBREVIATION = 2,          // Loose abbreviations (roughly anything containing a "." as we don't care about sentences in addresses)
    LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_CHAR = 3,      // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character
    LIBPOSTAL_TOKEN_TYPE_HANGUL_SYLLABLE = 4,       // Hangul syllable sequences which contain more than one codepoint
    LIBPOSTAL_TOKEN_TYPE_ACRONYM = 5,               // Specifically things like U.N. where we may delete internal periods

    LIBPOSTAL_TOKEN_TYPE_PHRASE = 10,               // Not part of the first stage tokenizer, but may be used after phrase parsing

    // Special tokens
    LIBPOSTAL_TOKEN_TYPE_EMAIL = 20,                // Make sure emails are tokenized altogether
    LIBPOSTAL_TOKEN_TYPE_URL = 21,                  // Make sure urls are tokenized altogether
    LIBPOSTAL_TOKEN_TYPE_US_PHONE = 22,             // US phone number (with or without country code)
    LIBPOSTAL_TOKEN_TYPE_INTL_PHONE = 23,           // A non-US phone number (must have country code)

    // Numbers and numeric types
    LIBPOSTAL_TOKEN_TYPE_NUMERIC = 50,              // Any sequence containing a digit
    LIBPOSTAL_TOKEN_TYPE_ORDINAL = 51,              // 1st, 2nd, 1er, 1 etc.
    LIBPOSTAL_TOKEN_TYPE_ROMAN_NUMERAL = 52,        // II, III, VI, etc.
    LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_NUMBER = 53,   // All numeric ideographic characters, includes e.g. Han numbers and chars like "²"

    // Punctuation types, may separate a phrase
    LIBPOSTAL_TOKEN_TYPE_PERIOD = 100,
    LIBPOSTAL_TOKEN_TYPE_EXCLAMATION = 101,
    LIBPOSTAL_TOKEN_TYPE_QUESTION_MARK = 102,
    LIBPOSTAL_TOKEN_TYPE_COMMA = 103,
    LIBPOSTAL_TOKEN_TYPE_COLON = 104,
    LIBPOSTAL_TOKEN_TYPE_SEMICOLON = 105,
    LIBPOSTAL_TOKEN_TYPE_PLUS = 106,
    LIBPOSTAL_TOKEN_TYPE_AMPERSAND = 107,
    LIBPOSTAL_TOKEN_TYPE_AT_SIGN = 108,
    LIBPOSTAL_TOKEN_TYPE_POUND = 109,
    LIBPOSTAL_TOKEN_TYPE_ELLIPSIS = 110,
    LIBPOSTAL_TOKEN_TYPE_DASH = 111,
    LIBPOSTAL_TOKEN_TYPE_BREAKING_DASH = 112,
    LIBPOSTAL_TOKEN_TYPE_HYPHEN = 113,
    LIBPOSTAL_TOKEN_TYPE_PUNCT_OPEN = 114,
    LIBPOSTAL_TOKEN_TYPE_PUNCT_CLOSE = 115,
    LIBPOSTAL_TOKEN_TYPE_DOUBLE_QUOTE = 119,
    LIBPOSTAL_TOKEN_TYPE_SINGLE_QUOTE = 120,
    LIBPOSTAL_TOKEN_TYPE_OPEN_QUOTE = 121,
    LIBPOSTAL_TOKEN_TYPE_CLOSE_QUOTE = 122,
    LIBPOSTAL_TOKEN_TYPE_SLASH = 124,
    LIBPOSTAL_TOKEN_TYPE_BACKSLASH = 125,
    LIBPOSTAL_TOKEN_TYPE_GREATER_THAN = 126,
    LIBPOSTAL_TOKEN_TYPE_LESS_THAN = 127,

    // Non-letters and whitespace
    LIBPOSTAL_TOKEN_TYPE_OTHER = 200,
    LIBPOSTAL_TOKEN_TYPE_WHITESPACE = 300,
    LIBPOSTAL_TOKEN_TYPE_NEWLINE = 301,

    LIBPOSTAL_TOKEN_TYPE_INVALID_CHAR = 500
} libpostal_token_type_t;


/*
Address dictionaries
*/
// Bit set, should be able to keep it at a short (uint16_t)
#define LIBPOSTAL_ADDRESS_NONE 0
#define LIBPOSTAL_ADDRESS_ANY (1 << 0)
#define LIBPOSTAL_ADDRESS_NAME (1 << 1)
#define LIBPOSTAL_ADDRESS_HOUSE_NUMBER (1 << 2)
#define LIBPOSTAL_ADDRESS_STREET (1 << 3)
#define LIBPOSTAL_ADDRESS_UNIT (1 << 4)
#define LIBPOSTAL_ADDRESS_LEVEL (1 << 5)
#define LIBPOSTAL_ADDRESS_STAIRCASE (1 << 6)
#define LIBPOSTAL_ADDRESS_ENTRANCE (1 << 7)

#define LIBPOSTAL_ADDRESS_CATEGORY (1 << 8)
#define LIBPOSTAL_ADDRESS_NEAR (1 << 9)

// Bits 10-12 are not assigned in this header
#define LIBPOSTAL_ADDRESS_TOPONYM (1 << 13)
#define LIBPOSTAL_ADDRESS_POSTAL_CODE (1 << 14)
#define LIBPOSTAL_ADDRESS_PO_BOX (1 << 15)
// All 16 bits set
#define LIBPOSTAL_ADDRESS_ALL ((1 << 16) - 1)

/*
Options for address expansion. Passed by value to libpostal_expand_address
and libpostal_expand_address_root.
*/
typedef struct libpostal_normalize_options {
    // List of language codes
    char **languages;
    size_t num_languages;
    // Bit set of LIBPOSTAL_ADDRESS_* values
    uint16_t address_components;

    // String options
    bool latin_ascii;
    bool transliterate;
    bool strip_accents;
    bool decompose;
    bool lowercase;
    bool trim_string;
    bool drop_parentheticals;
    bool replace_numeric_hyphens;
    bool delete_numeric_hyphens;
    bool split_alpha_from_numeric;
    bool replace_word_hyphens;
    bool delete_word_hyphens;
    bool delete_final_periods;
    bool delete_acronym_periods;
    bool drop_english_possessives;
    bool delete_apostrophes;
    bool expand_numex;
    bool roman_numerals;

} libpostal_normalize_options_t;

// Returns a libpostal_normalize_options_t by value holding the library defaults
LIBPOSTAL_EXPORT libpostal_normalize_options_t libpostal_get_default_options(void);

// Expansion entry points: take an input string and options, return an array of strings and report a count through n.
// NOTE(review): the returned array is presumably released with libpostal_expansion_array_destroy - confirm
LIBPOSTAL_EXPORT char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n);
LIBPOSTAL_EXPORT char **libpostal_expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n);

LIBPOSTAL_EXPORT void libpostal_expansion_array_destroy(char **expansions, size_t n);

/*
Address parser
*/

// Parser result: two string arrays (components and labels) and a component count.
// NOTE(review): components[i] and labels[i] presumably correspond, with num_components entries each - confirm
typedef struct libpostal_address_parser_response {
    size_t num_components;
    char **components;
    char **labels;
} libpostal_address_parser_response_t;

// Alias for the parser response type
typedef libpostal_address_parser_response_t libpostal_parsed_address_components_t;

typedef struct libpostal_address_parser_options {
    char *language;
    char *country;
} libpostal_address_parser_options_t;

LIBPOSTAL_EXPORT void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self);

LIBPOSTAL_EXPORT libpostal_address_parser_options_t libpostal_get_address_parser_default_options(void);

// NOTE(review): the returned response is presumably released with libpostal_address_parser_response_destroy - confirm
LIBPOSTAL_EXPORT libpostal_address_parser_response_t *libpostal_parse_address(char *address, libpostal_address_parser_options_t options);

LIBPOSTAL_EXPORT bool libpostal_parser_print_features(bool print_features);


/*
Deduping
*/


// Near-dupe hashing methods

typedef struct libpostal_near_dupe_hash_options {
    bool with_name;
    bool with_address;
    bool with_unit;
    bool with_city_or_equivalent;
    bool with_small_containing_boundaries;
    bool with_postal_code;
    bool with_latlon;
    double latitude;
    double longitude;
    uint32_t geohash_precision;
    bool name_and_address_keys;
    bool name_only_keys;
    bool address_only_keys;
} libpostal_near_dupe_hash_options_t;


LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void);
// Both hashing functions take parallel labels/values arrays of num_components entries and report a count through num_hashes
LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes);
LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes);

// Dupe language classification

LIBPOSTAL_EXPORT char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages);

// Pairwise dupe methods

typedef enum {
    LIBPOSTAL_NULL_DUPLICATE_STATUS = -1,
    LIBPOSTAL_NON_DUPLICATE = 0,
    LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW = 3,
    LIBPOSTAL_LIKELY_DUPLICATE = 6,
    LIBPOSTAL_EXACT_DUPLICATE = 9
} libpostal_duplicate_status_t;

typedef struct libpostal_duplicate_options {
    size_t num_languages;
    char **languages;
} libpostal_duplicate_options_t;


LIBPOSTAL_EXPORT libpostal_duplicate_options_t libpostal_get_default_duplicate_options(void);
LIBPOSTAL_EXPORT libpostal_duplicate_options_t libpostal_get_duplicate_options_with_languages(size_t num_languages, char **languages);

// Each function compares two values of one address component type and returns a duplicate status
LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options);

// Pairwise fuzzy dupe methods, return status & similarity

typedef struct libpostal_fuzzy_duplicate_options {
    size_t num_languages;
    char **languages;
    double needs_review_threshold;
    double likely_dupe_threshold;
} libpostal_fuzzy_duplicate_options_t;

typedef struct libpostal_fuzzy_duplicate_status {
    libpostal_duplicate_status_t status;
    double similarity;
} libpostal_fuzzy_duplicate_status_t;

LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options(void);
LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages, char **languages);

// Each side of the comparison is given as a token array with a parallel array of per-token scores
LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options);
LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options);

// Setup/teardown methods
// Each module has a default setup, a setup taking an explicit data directory, and a teardown

LIBPOSTAL_EXPORT bool libpostal_setup(void);
LIBPOSTAL_EXPORT bool libpostal_setup_datadir(char *datadir);
LIBPOSTAL_EXPORT void libpostal_teardown(void);

LIBPOSTAL_EXPORT bool libpostal_setup_parser(void);
LIBPOSTAL_EXPORT bool libpostal_setup_parser_datadir(char *datadir);
LIBPOSTAL_EXPORT void libpostal_teardown_parser(void);

LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier(void);
LIBPOSTAL_EXPORT bool libpostal_setup_language_classifier_datadir(char *datadir);
LIBPOSTAL_EXPORT void libpostal_teardown_language_classifier(void);

/* Tokenization and token normalization APIs */

// NOTE(review): offset/len presumably locate the token within the input string and type holds a libpostal_token_type_t value - confirm
typedef struct libpostal_token {
    size_t offset;
    size_t len;
    uint16_t type;
} libpostal_token_t;

LIBPOSTAL_EXPORT libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n);

// Normalize string options
// Each flag expansion is parenthesized so it stays a single operand in any surrounding expression
#define LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII (1 << 0)
#define LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE (1 << 1)
#define LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS (1 << 2)
#define LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE (1 << 3)
#define LIBPOSTAL_NORMALIZE_STRING_LOWERCASE (1 << 4)
#define LIBPOSTAL_NORMALIZE_STRING_TRIM (1 << 5)
#define LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS (1 << 6)
#define LIBPOSTAL_NORMALIZE_STRING_COMPOSE (1 << 7)
#define LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII (1 << 8)
#define LIBPOSTAL_NORMALIZE_STRING_REPLACE_NUMEX (1 << 9)

// Normalize token options
#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS (1 << 0)
#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS (1 << 1)
#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD (1 << 2)
#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS (1 << 3)
#define LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES (1 << 4)
#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE (1 << 5)
#define LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC (1 << 6)
#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS (1 << 7)
#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS (1 << 8)
#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS (1 << 9)

#define LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS (LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII | LIBPOSTAL_NORMALIZE_STRING_COMPOSE | LIBPOSTAL_NORMALIZE_STRING_TRIM | LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS | LIBPOSTAL_NORMALIZE_STRING_LOWERCASE)

#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS (LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE)

#define LIBPOSTAL_NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS (LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS)

#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC (LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS | LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)

// options is a bit set of LIBPOSTAL_NORMALIZE_STRING_* flags
LIBPOSTAL_EXPORT char *libpostal_normalize_string_languages(char *input, uint64_t options, size_t num_languages, char **languages);
LIBPOSTAL_EXPORT char *libpostal_normalize_string(char *input, uint64_t options);


// A token paired with a string (NOTE(review): presumably its normalized text - confirm)
typedef struct libpostal_normalized_token {
    char *str;
    libpostal_token_t token;
} libpostal_normalized_token_t;

// string_options takes LIBPOSTAL_NORMALIZE_STRING_* flags, token_options takes LIBPOSTAL_NORMALIZE_TOKEN_* flags
LIBPOSTAL_EXPORT libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n);
LIBPOSTAL_EXPORT libpostal_normalized_token_t *libpostal_normalized_tokens_languages(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t num_languages, char **languages, size_t *n);


#ifdef __cplusplus
}
#endif

#endif