import random
import re
import six

from itertools import izip

from geodata.address_expansions.gazetteers import *
from geodata.encoding import safe_decode, safe_encode
from geodata.text.normalize import normalized_tokens
from geodata.text.tokenize import tokenize_raw, token_types
from geodata.text.utils import non_breaking_dash_regex


def canonicals_for_language(data, language):
    '''
    Return the set of canonical forms for the given language.

    :param data: iterable of pipe-delimited byte records of the form
                 lang|dictionary|is_canonical|canonical
    :param language: language code to filter on, or None to accept
                     records for every language
    :return: set of canonical byte strings
    '''
    canonicals = set()

    for d in data:
        # Records are byte strings; six.b keeps the delimiter bytes on Py2/Py3
        lang, dictionary, is_canonical, canonical = d.split(six.b('|'))
        if language is None or lang == language:
            canonicals.add(canonical)

    return canonicals


def equivalent(s1, s2, gazetteer, language):
    '''
    Address/place equivalence
    -------------------------

    OSM discourages abbreviations, but to make our training data map better
    to real-world input, we can safely replace the canonical phrase with an
    abbreviated version and retain the meaning of the words.

    Two strings are equivalent when, token for token, every non-phrase
    token matches exactly and every phrase token shares at least one
    canonical expansion in the given language.

    :param s1: first string
    :param s2: second string
    :param gazetteer: unused here; kept for interface compatibility with
                      callers that pass a gazetteer (filtering uses the
                      module-level abbreviations_gazetteer)
    :param language: language code passed through to
                     canonicals_for_language, or None for any language
    :return: True if the strings are equivalent, False otherwise
    '''

    tokens_s1 = normalized_tokens(s1)
    tokens_s2 = normalized_tokens(s2)

    abbreviated_s1 = list(abbreviations_gazetteer.filter(tokens_s1))
    abbreviated_s2 = list(abbreviations_gazetteer.filter(tokens_s2))

    # Different token counts can never be equivalent
    if len(abbreviated_s1) != len(abbreviated_s2):
        return False

    for ((t1, c1, l1, d1), (t2, c2, l2, d2)) in izip(abbreviated_s1, abbreviated_s2):
        if c1 == token_types.PHRASE and c2 == token_types.PHRASE:
            # Both sides are phrases: they are equivalent iff they share
            # at least one canonical expansion for this language.
            # Bug fix: the original tested c2 twice here, so c1 was never
            # checked and (plain-token, phrase) pairs fell into this branch.
            canonicals_s1 = canonicals_for_language(d1, language)
            canonicals_s2 = canonicals_for_language(d2, language)

            if not canonicals_s1 & canonicals_s2:
                return False
        elif c1 != token_types.PHRASE and c2 != token_types.PHRASE:
            # Both plain tokens: require an exact match
            if t1 != t2:
                return False
        else:
            # One side is a phrase and the other is not: not equivalent
            return False

    return True