1 /*
2 address_parser.h
3 ----------------
4 
5 International address parser, designed to use OSM training data,
6 over 40M addresses formatted with the OpenCage address formatting
7 templates: https://github.com/OpenCageData/address-formatting.
8 
9 This is a sequence modeling problem similar to e.g. part-of-speech
10 tagging, named entity recognition, etc. in which we have a sequence
11 of inputs (words/tokens) and want to predict a sequence of outputs
12 (labeled part-of-address tags). This is a supervised learning model
13 and the training data is created in the Python geodata package
14 included with this repo. Example record:
15 
16 en  us  123/house_number Fake/road Street/road Brooklyn/city NY/state 12345/postcode
17 
18 Where the fields are: {language, country, tagged address}.
19 
20 After training, the address parser can take as input a tokenized
21 input string e.g. "123 Fake Street Brooklyn NY 12345" and parse
22 it into:
23 
24 {
25     "house_number": "123",
26     "road": "Fake Street",
27     "city": "Brooklyn",
28     "state": "NY",
29     "postcode": "12345"
30 }
31 
The model was originally a greedy averaged perceptron rather than something
like a CRF, since there's ample training data from OSM and the accuracy
on this task is already very high with the simpler model.

A CRF-backed parser is also declared in this header (see
address_parser_model_type_t and the model union in address_parser_t below).
CRFs are relatively fast at prediction time for a small number of tags, can
often achieve better performance and are robust to correlated features, which
may not be true with the general error-driven averaged perceptron.
40 
41 */
42 #ifndef ADDRESS_PARSER_H
43 #define ADDRESS_PARSER_H
44 
45 #include <stdlib.h>
46 #include <stdint.h>
47 #include <stdbool.h>
48 
49 #include "libpostal.h"
50 #include "libpostal_config.h"
51 
52 #include "averaged_perceptron.h"
53 #include "averaged_perceptron_tagger.h"
54 #include "collections.h"
55 #include "crf.h"
56 #include "graph.h"
57 #include "normalize.h"
58 #include "string_utils.h"
59 
/*
 * Default path of the address parser data file: "address_parser.dat" inside
 * LIBPOSTAL_ADDRESS_PARSER_DIR. The value is assembled by adjacent
 * string-literal concatenation, so LIBPOSTAL_ADDRESS_PARSER_DIR and
 * PATH_SEPARATOR must both expand to string literals.
 */
#define DEFAULT_ADDRESS_PARSER_PATH \
    LIBPOSTAL_ADDRESS_PARSER_DIR PATH_SEPARATOR "address_parser.dat"
61 
/*
 * String-level normalization option sets used by the address parser.
 * All three compose and lowercase the input; they differ only in the last
 * flag (simple Latin-ASCII, full Latin-ASCII, or accent stripping).
 *
 * Each replacement list is wrapped in parentheses so the set behaves as a
 * single value inside a larger expression. Without them,
 * `opts & ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS` would bind `&` to the
 * last flag only, because `&` has higher precedence than `|`.
 */
#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS \
    (NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_SIMPLE_LATIN_ASCII)
#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_LATIN \
    (NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII)
#define ADDRESS_PARSER_NORMALIZE_STRING_OPTIONS_UTF8 \
    (NORMALIZE_STRING_COMPOSE | NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_STRIP_ACCENTS)
65 
/*
 * Token-level normalization option sets.
 *
 *   TOKEN_OPTIONS             final-period and acronym-period deletion plus
 *                             digit replacement.
 *   ADMIN_TOKEN_OPTIONS       TOKEN_OPTIONS with REPLACE_DIGITS toggled off
 *                             (XOR clears it because the base set always
 *                             contains it).
 *   POSTAL_CODE_TOKEN_OPTIONS ADMIN_TOKEN_OPTIONS plus alpha/numeric splitting.
 *
 * Every replacement list is parenthesized. The unparenthesized originals
 * only produced the intended ADMIN value by accident: `A | B | C ^ C` parses
 * as `A | B | (C ^ C)`. They also mis-associated when combined with `&`,
 * `==`, etc. at the use site.
 * NOTE(review): assumes the NORMALIZE_TOKEN_* flags are distinct bits, as the
 * XOR-to-clear idiom already required — confirm against normalize.h.
 */
#define ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS \
    (NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | NORMALIZE_TOKEN_REPLACE_DIGITS)
#define ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS \
    (ADDRESS_PARSER_NORMALIZE_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS)
#define ADDRESS_PARSER_NORMALIZE_POSTAL_CODE_TOKEN_OPTIONS \
    (ADDRESS_PARSER_NORMALIZE_ADMIN_TOKEN_OPTIONS | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)
69 
/*
 * Separator bit flags.
 * NOTE(review): presumably the per-token values stored in
 * address_parser_context_t.separators — confirm against address_parser.c.
 *
 * The shift expressions are parenthesized so each flag is a single value at
 * the use site. Unparenthesized, `ADDRESS_SEPARATOR_FIELD + 1` expands to
 * `1 << 1 + 1` (== 4) and `!ADDRESS_SEPARATOR_FIELD` to `!1 << 1`.
 */
#define ADDRESS_SEPARATOR_NONE 0
#define ADDRESS_SEPARATOR_FIELD_INTERNAL (1 << 0)
#define ADDRESS_SEPARATOR_FIELD (1 << 1)
73 
/*
 * Token-type predicates. Both take a token type value (not a token) and
 * compare it against token type constants defined outside this header.
 *
 * The argument is expanded once per comparison, so pass an expression
 * without side effects.
 */

/* True for punctuation treated as a separator between address parts. */
#define ADDRESS_PARSER_IS_SEPARATOR(token_type) \
    ((token_type) == COMMA || \
     (token_type) == NEWLINE || \
     (token_type) == HYPHEN || \
     (token_type) == DASH || \
     (token_type) == BREAKING_DASH || \
     (token_type) == SEMICOLON || \
     (token_type) == PUNCT_OPEN || \
     (token_type) == PUNCT_CLOSE)

/*
 * True for token types that can be skipped: invalid characters, periods and
 * colons. All three comparisons use the macro parameter. Previously the
 * first two read `token.type`, so the macro ignored its argument for those
 * cases and only compiled where a variable named `token` was in scope.
 */
#define ADDRESS_PARSER_IS_IGNORABLE(token_type) \
    ((token_type) == INVALID_CHAR || \
     (token_type) == PERIOD || \
     (token_type) == COLON)
76 
/*
 * Label strings for separators.
 * NOTE(review): presumably the tags used for the two ADDRESS_SEPARATOR_*
 * kinds — confirm against address_parser.c.
 */
#define SEPARATOR_LABEL       "sep"
#define FIELD_SEPARATOR_LABEL "fsep"
79 
/*
 * Bit flags for administrative boundary components.
 *
 * The bit positions are deliberately left as in the original (3, 4, 5, 7, 8,
 * 9, 11, 13, 14); the gaps are not explained in this header, so do not
 * renumber. NOTE(review): the values look like they mirror component flags
 * defined elsewhere and feed address_parser_types_t.components — confirm.
 *
 * Each shift is parenthesized so a flag stays a single value in any
 * expression. Unparenthesized, `ADDRESS_COMPONENT_CITY + 1` expanded to
 * `1 << 5 + 1` (== 64).
 */
#define ADDRESS_COMPONENT_NON_BOUNDARY 0
#define ADDRESS_COMPONENT_SUBURB (1 << 3)
#define ADDRESS_COMPONENT_CITY_DISTRICT (1 << 4)
#define ADDRESS_COMPONENT_CITY (1 << 5)
#define ADDRESS_COMPONENT_ISLAND (1 << 7)
#define ADDRESS_COMPONENT_STATE_DISTRICT (1 << 8)
#define ADDRESS_COMPONENT_STATE (1 << 9)
#define ADDRESS_COMPONENT_COUNTRY_REGION (1 << 11)
#define ADDRESS_COMPONENT_COUNTRY (1 << 13)
#define ADDRESS_COMPONENT_WORLD_REGION (1 << 14)
90 
/*
 * Dense, zero-based index for each administrative boundary type.
 * The values are spelled out to make the numbering explicit; they are the
 * same ones the compiler assigned implicitly. Note that STATE_DISTRICT comes
 * before ISLAND here. NUM_ADDRESS_PARSER_BOUNDARY_TYPES stays last and
 * implicit so it keeps counting the entries above it.
 */
typedef enum {
    ADDRESS_PARSER_BOUNDARY_NONE           = 0,
    ADDRESS_PARSER_BOUNDARY_SUBURB         = 1,
    ADDRESS_PARSER_BOUNDARY_CITY_DISTRICT  = 2,
    ADDRESS_PARSER_BOUNDARY_CITY           = 3,
    ADDRESS_PARSER_BOUNDARY_STATE_DISTRICT = 4,
    ADDRESS_PARSER_BOUNDARY_ISLAND         = 5,
    ADDRESS_PARSER_BOUNDARY_STATE          = 6,
    ADDRESS_PARSER_BOUNDARY_COUNTRY_REGION = 7,
    ADDRESS_PARSER_BOUNDARY_COUNTRY        = 8,
    ADDRESS_PARSER_BOUNDARY_WORLD_REGION   = 9,
    NUM_ADDRESS_PARSER_BOUNDARY_TYPES
} address_parser_boundary_components;
104 
105 
/*
 * Label strings for the parser's output tags, e.g. "house_number", "road",
 * "city" as in the example at the top of this file.
 */

/* Venue / building-level components */
#define ADDRESS_PARSER_LABEL_HOUSE          "house"
#define ADDRESS_PARSER_LABEL_HOUSE_NUMBER   "house_number"
#define ADDRESS_PARSER_LABEL_PO_BOX         "po_box"
#define ADDRESS_PARSER_LABEL_BUILDING       "building"
#define ADDRESS_PARSER_LABEL_ENTRANCE       "entrance"
#define ADDRESS_PARSER_LABEL_STAIRCASE      "staircase"
#define ADDRESS_PARSER_LABEL_LEVEL          "level"
#define ADDRESS_PARSER_LABEL_UNIT           "unit"

/* Street-level components */
#define ADDRESS_PARSER_LABEL_ROAD           "road"
#define ADDRESS_PARSER_LABEL_METRO_STATION  "metro_station"

/* Administrative components and postal code */
#define ADDRESS_PARSER_LABEL_SUBURB         "suburb"
#define ADDRESS_PARSER_LABEL_CITY_DISTRICT  "city_district"
#define ADDRESS_PARSER_LABEL_CITY           "city"
#define ADDRESS_PARSER_LABEL_STATE_DISTRICT "state_district"
#define ADDRESS_PARSER_LABEL_ISLAND         "island"
#define ADDRESS_PARSER_LABEL_STATE          "state"
#define ADDRESS_PARSER_LABEL_POSTAL_CODE    "postcode"
#define ADDRESS_PARSER_LABEL_COUNTRY_REGION "country_region"
#define ADDRESS_PARSER_LABEL_COUNTRY        "country"
#define ADDRESS_PARSER_LABEL_WORLD_REGION   "world_region"

/* Non-address fields */
#define ADDRESS_PARSER_LABEL_WEBSITE        "website"
#define ADDRESS_PARSER_LABEL_TELEPHONE      "phone"
129 
130 typedef union address_parser_types {
131     uint32_t value;
132     struct {
133         uint32_t components:16;     // Bitset of components
134         uint32_t most_common:16;    // Most common component as short integer enum value
135     };
136 } address_parser_types_t;
137 
138 VECTOR_INIT(address_parser_types_array, address_parser_types_t)
139 
140 typedef struct address_parser_context {
141     char *language;
142     char *country;
143     cstring_array *features;
144     cstring_array *prev_tag_features;
145     cstring_array *prev2_tag_features;
146     // Temporary strings used at each token during feature extraction
147     char_array *phrase;
148     char_array *context_phrase;
149     char_array *long_context_phrase;
150     char_array *prefix_phrase;
151     char_array *context_prefix_phrase;
152     char_array *long_context_prefix_phrase;
153     char_array *suffix_phrase;
154     char_array *context_suffix_phrase;
155     char_array *long_context_suffix_phrase;
156     char_array *component_phrase;
157     char_array *context_component_phrase;
158     char_array *long_context_component_phrase;
159     // ngrams and prefix/suffix features
160     cstring_array *ngrams;
161     // For hyphenated words
162     char_array *sub_token;
163     token_array *sub_tokens;
164     // Strings/arrays relating to the sentence
165     uint32_array *separators;
166     cstring_array *normalized;
167     token_array *normalized_tokens;
168     cstring_array *normalized_admin;
169     token_array *normalized_admin_tokens;
170     // Known phrases
171     phrase_array *address_dictionary_phrases;
172     int64_array *address_phrase_memberships; // Index in address_dictionary_phrases or -1
173     phrase_array *component_phrases;
174     int64_array *component_phrase_memberships; // Index in component_phrases or -1
175     phrase_array *postal_code_phrases;
176     int64_array *postal_code_phrase_memberships; // Index in postal_code_phrases or -1
177     phrase_array *prefix_phrases;
178     phrase_array *suffix_phrases;
179     // The tokenized string used to conveniently access both words as C strings and tokens by index
180     tokenized_string_t *tokenized_str;
181 } address_parser_context_t;
182 
/*
 * A (postcode, admin) pair of 32-bit fields packed into one 64-bit word;
 * `value` overlays both halves.
 * NOTE(review): presumably the two fields are ids relating a postal code to
 * an admin area in address_parser_t.postal_code_contexts — confirm.
 */
typedef union postal_code_context_value {
    uint64_t value;
    struct {
        uint64_t postcode:32;
        uint64_t admin:32;
    };
} postal_code_context_value_t;

/* Build a postal_code_context_value_t from its two halves (compound literal). */
#define POSTAL_CODE_CONTEXT(pc, ad) \
    ((postal_code_context_value_t){.postcode = (pc), .admin = (ad) })
192 
/*
 * Which kind of sequence model backs a parser; stored in
 * address_parser_t.model_type. Values are written out explicitly and match
 * the implicit numbering.
 */
typedef enum address_parser_model_type {
    ADDRESS_PARSER_TYPE_GREEDY_AVERAGED_PERCEPTRON = 0,
    ADDRESS_PARSER_TYPE_CRF = 1
} address_parser_model_type_t;
197 
/*
 * Parser options, stored by value in address_parser_t.options and accepted
 * by address_parser_new_options().
 */
typedef struct parser_options {
    /* NOTE(review): presumably a minimum word count threshold — confirm. */
    uint64_t rare_word_threshold;
    /* Flag toggled via address_parser_print_features(). */
    bool print_features;
} parser_options_t;
202 
203 // Can add other gazetteers as well
204 typedef struct address_parser {
205     parser_options_t options;
206     size_t num_classes;
207     address_parser_model_type_t model_type;
208     union {
209         averaged_perceptron_t *ap;
210         crf_t *crf;
211     } model;
212     address_parser_context_t *context;
213     trie_t *vocab;
214     trie_t *phrases;
215     address_parser_types_array *phrase_types;
216     trie_t *postal_codes;
217     graph_t *postal_code_contexts;
218 } address_parser_t;
219 
220 // General usage
221 
222 address_parser_t *address_parser_new(void);
223 address_parser_t *address_parser_new_options(parser_options_t options);
224 address_parser_t *get_address_parser(void);
225 bool address_parser_load(char *dir);
226 
227 bool address_parser_print_features(bool print_features);
228 libpostal_address_parser_response_t *address_parser_parse(char *address, char *language, char *country);
229 void address_parser_destroy(address_parser_t *self);
230 
231 char *address_parser_normalize_string(char *str);
232 void address_parser_normalize_token(cstring_array *array, char *str, token_t token);
233 
234 bool address_parser_predict(address_parser_t *self, address_parser_context_t *context, cstring_array *token_labels, tagger_feature_function feature_function, tokenized_string_t *tokenized_str);
235 
236 address_parser_context_t *address_parser_context_new(void);
237 void address_parser_context_destroy(address_parser_context_t *self);
238 
239 void address_parser_context_fill(address_parser_context_t *context, address_parser_t *parser, tokenized_string_t *tokenized_str, char *language, char *country);
240 
241 // Feature function
242 bool address_parser_features(void *self, void *ctx, tokenized_string_t *str, uint32_t i);
243 
244 // I/O methods
245 
246 bool address_parser_load(char *dir);
247 bool address_parser_save(address_parser_t *self, char *output_dir);
248 
// Module setup/teardown

/*
 * Module-level initialization from `dir` and the matching teardown.
 * NOTE(review): presumably setup wraps address_parser_load() and teardown
 * destroys that parser — confirm against address_parser.c.
 */
bool address_parser_module_setup(char *dir);

void address_parser_module_teardown(void);
253 
254 
255 #endif
256