1 /* 2 address_parser.h 3 ---------------- 4 5 International address parser, designed to use OSM training data, 6 over 40M addresses formatted with the OpenCage address formatting 7 templates: https://github.com/OpenCageData/address-formatting. 8 9 This is a sequence modeling problem similar to e.g. part-of-speech 10 tagging, named entity recognition, etc. in which we have a sequence 11 of inputs (words/tokens) and want to predict a sequence of outputs 12 (labeled part-of-address tags). This is a supervised learning model 13 and the training data is created in the Python geodata package 14 included with this repo. Example record: 15 16 en us 123/house_number Fake/road Street/road Brooklyn/city NY/state 12345/postcode 17 18 Where the fields are: {language, country, tagged address}. 19 20 After training, the address parser can take as input a tokenized 21 input string e.g. "123 Fake Street Brooklyn NY 12345" and parse 22 it into: 23 24 { 25 "house_number": "123", 26 "road": "Fake Street", 27 "city": "Brooklyn", 28 "state": "NY", 29 "postcode": "12345" 30 } 31 32 The model used is a greedy averaged perceptron rather than something 33 like a CRF since there's ample training data from OSM and the accuracy 34 on this task is already very high with the simpler model. 35 36 However, it is still worth investigating CRFs as they are relatively fast 37 at prediction time for a small number of tags, can often achieve better 38 performance and are robust to correlated features, which may not be true 39 with the general error-driven averaged perceptron. 40 41 */ 42 #ifndef ADDRESS_PARSER_H 43 #define ADDRESS_PARSER_H 44 45 #include <stdlib.h> 46 #include <stdint.h> 47 #include <stdbool.h> 48 49 #include "libpostal.h" 50 #include "libpostal_config.h" 51 52 #include "averaged_perceptron.h" 53 #include "averaged_perceptron_tagger.h" 54 #include "collections.h" 55 #include "crf.h" 56 #include "graph.h" 57 #include "normalize.h" 58 #include "string_utils.h" 59 60 #define DEFAULT_ADDRESS_PARSER_PATH LIBPOSTAL_ADDRESS_PARSER_DIR PATH_SEPARATOR "address_parser.dat" 61 62 #define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_SIMPLE_LATIN_ASCII 63 #define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_LATIN NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII 64 #define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_UTF8 NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_STRIP_ACCENTS 65 66 #define ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | NORMALIZE_TOKEN_REPLACE_DIGITS 67 #define ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS 68 #define ADDRESS_PARSER_NORMALIZE_POSTAL_CODE_TOKEN_OPTIONS ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC 69 70 #define ADDRESS_SEPARATOR_NONE 0 71 #define ADDRESS_SEPARATOR_FIELD_INTERNAL 1 << 0 72 #define ADDRESS_SEPARATOR_FIELD 1 << 1 73 74 #define ADDRESS_PARSER_IS_SEPARATOR(token_type) ((token_type) == COMMA || (token_type) == NEWLINE || (token_type) == HYPHEN || (token_type) == DASH || (token_type) == BREAKING_DASH|| (token_type) == SEMICOLON || (token_type) == PUNCT_OPEN || (token_type) == PUNCT_CLOSE ) 75 #define ADDRESS_PARSER_IS_IGNORABLE(token_type) ((token.type) == INVALID_CHAR || (token.type) == PERIOD || (token_type) == COLON ) 76 77 #define SEPARATOR_LABEL "sep" 78 #define FIELD_SEPARATOR_LABEL "fsep" 79 80 #define ADDRESS_COMPONENT_NON_BOUNDARY 0 81 #define ADDRESS_COMPONENT_SUBURB 1 << 3 82 #define ADDRESS_COMPONENT_CITY_DISTRICT 1 << 4 83 #define ADDRESS_COMPONENT_CITY 1 << 5 84 #define ADDRESS_COMPONENT_ISLAND 1 << 7 85 #define ADDRESS_COMPONENT_STATE_DISTRICT 1 << 8 86 #define ADDRESS_COMPONENT_STATE 1 << 9 87 #define ADDRESS_COMPONENT_COUNTRY_REGION 1 << 11 88 #define ADDRESS_COMPONENT_COUNTRY 1 << 13 89 #define ADDRESS_COMPONENT_WORLD_REGION 1 << 14 90 91 typedef enum { 92 ADDRESS_PARSER_BOUNDARY_NONE, 93 ADDRESS_PARSER_BOUNDARY_SUBURB, 94 ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT, 95 ADDRESS_PARSER_BOUNDARY_CITY, 96 ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT, 97 ADDRESS_PARSER_BOUNDARY_ISLAND, 98 ADDRESS_PARSER_BOUNDARY_STATE, 99 ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION, 100 ADDRESS_PARSER_BOUNDARY_COUNTRY, 101 ADDRESS_PARSER_BOUNDARY_WORLD_REGION, 102 NUM_ADDRESS_PARSER_BOUNDARY_TYPES 103 } address_parser_boundary_components; 104 105 106 #define ADDRESS_PARSER_LABEL_HOUSE "house" 107 #define ADDRESS_PARSER_LABEL_HOUSE_NUMBER "house_number" 108 #define ADDRESS_PARSER_LABEL_PO_BOX "po_box" 109 #define ADDRESS_PARSER_LABEL_BUILDING "building" 110 #define ADDRESS_PARSER_LABEL_ENTRANCE "entrance" 111 #define ADDRESS_PARSER_LABEL_STAIRCASE "staircase" 112 #define ADDRESS_PARSER_LABEL_LEVEL "level" 113 #define ADDRESS_PARSER_LABEL_UNIT "unit" 114 #define ADDRESS_PARSER_LABEL_ROAD "road" 115 #define ADDRESS_PARSER_LABEL_METRO_STATION "metro_station" 116 #define ADDRESS_PARSER_LABEL_SUBURB "suburb" 117 #define ADDRESS_PARSER_LABEL_CITY_DISTRICT "city_district" 118 #define ADDRESS_PARSER_LABEL_CITY "city" 119 #define ADDRESS_PARSER_LABEL_STATE_DISTRICT "state_district" 120 #define ADDRESS_PARSER_LABEL_ISLAND "island" 121 #define ADDRESS_PARSER_LABEL_STATE "state" 122 #define ADDRESS_PARSER_LABEL_POSTAL_CODE "postcode" 123 #define ADDRESS_PARSER_LABEL_COUNTRY_REGION "country_region" 124 #define ADDRESS_PARSER_LABEL_COUNTRY "country" 125 #define ADDRESS_PARSER_LABEL_WORLD_REGION "world_region" 126 127 #define ADDRESS_PARSER_LABEL_WEBSITE "website" 128 #define ADDRESS_PARSER_LABEL_TELEPHONE "phone" 129 130 typedef union address_parser_types { 131 uint32_t value; 132 struct { 133 uint32_t components:16; // Bitset of components 134 uint32_t most_common:16; // Most common component as short integer enum value 135 }; 136 } address_parser_types_t; 137 138 VECTOR_INIT(address_parser_types_array, address_parser_types_t) 139 140 typedef struct address_parser_context { 141 char *language; 142 char *country; 143 cstring_array *features; 144 cstring_array *prev_tag_features; 145 cstring_array *prev2_tag_features; 146 // Temporary strings used at each token during feature extraction 147 char_array *phrase; 148 char_array *context_phrase; 149 char_array *long_context_phrase; 150 char_array *prefix_phrase; 151 char_array *context_prefix_phrase; 152 char_array *long_context_prefix_phrase; 153 char_array *suffix_phrase; 154 char_array *context_suffix_phrase; 155 char_array *long_context_suffix_phrase; 156 char_array *component_phrase; 157 char_array *context_component_phrase; 158 char_array *long_context_component_phrase; 159 // ngrams and prefix/suffix features 160 cstring_array *ngrams; 161 // For hyphenated words 162 char_array *sub_token; 163 token_array *sub_tokens; 164 // Strings/arrays relating to the sentence 165 uint32_array *separators; 166 cstring_array *normalized; 167 token_array *normalized_tokens; 168 cstring_array *normalized_admin; 169 token_array *normalized_admin_tokens; 170 // Known phrases 171 phrase_array *address_dictionary_phrases; 172 int64_array *address_phrase_memberships; // Index in address_dictionary_phrases or -1 173 phrase_array *component_phrases; 174 int64_array *component_phrase_memberships; // Index in component_phrases or -1 175 phrase_array *postal_code_phrases; 176 int64_array *postal_code_phrase_memberships; // Index in postal_code_phrases or -1 177 phrase_array *prefix_phrases; 178 phrase_array *suffix_phrases; 179 // The tokenized string used to conveniently access both words as C strings and tokens by index 180 tokenized_string_t *tokenized_str; 181 } address_parser_context_t; 182 183 typedef union postal_code_context_value { 184 uint64_t value; 185 struct { 186 uint64_t postcode:32; 187 uint64_t admin:32; 188 }; 189 } postal_code_context_value_t; 190 191 #define POSTAL_CODE_CONTEXT(pc, ad) ((postal_code_context_value_t){.postcode = (pc), .admin = (ad) }) 192 193 typedef enum address_parser_model_type { 194 ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON, 195 ADDRESS_PARSER_TYPE_CRF 196 } address_parser_model_type_t; 197 198 typedef struct parser_options { 199 uint64_t rare_word_threshold; 200 bool print_features; 201 } parser_options_t; 202 203 // Can add other gazetteers as well 204 typedef struct address_parser { 205 parser_options_t options; 206 size_t num_classes; 207 address_parser_model_type_t model_type; 208 union { 209 averaged_perceptron_t *ap; 210 crf_t *crf; 211 } model; 212 address_parser_context_t *context; 213 trie_t *vocab; 214 trie_t *phrases; 215 address_parser_types_array *phrase_types; 216 trie_t *postal_codes; 217 graph_t *postal_code_contexts; 218 } address_parser_t; 219 220 // General usage 221 222 address_parser_t *address_parser_new(void); 223 address_parser_t *address_parser_new_options(parser_options_t options); 224 address_parser_t *get_address_parser(void); 225 bool address_parser_load(char *dir); 226 227 bool address_parser_print_features(bool print_features); 228 libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country); 229 void address_parser_destroy(address_parser_t *self); 230 231 char *address_parser_normalize_string(char *str); 232 void address_parser_normalize_token(cstring_array *array, char *str, token_t token); 233 234 bool address_parser_predict(address_parser_t *self, address_parser_context_t *context, cstring_array *token_labels, tagger_feature_function feature_function, tokenized_string_t *tokenized_str); 235 236 address_parser_context_t *address_parser_context_new(void); 237 void address_parser_context_destroy(address_parser_context_t *self); 238 239 void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country); 240 241 // Feature function 242 bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i); 243 244 // I/O methods 245 246 bool address_parser_load(char *dir); 247 bool address_parser_save(address_parser_t *self, char *output_dir); 248 249 // Module setup/teardown 250 251 bool address_parser_module_setup(char *dir); 252 void address_parser_module_teardown(void); 253 254 255 #endif 256