1import random
2import re
3import six
4
5from itertools import izip
6
7from geodata.address_expansions.gazetteers import *
8from geodata.encoding import safe_decode, safe_encode
9from geodata.text.normalize import normalized_tokens
10from geodata.text.tokenize import tokenize_raw, token_types
11from geodata.text.utils import non_breaking_dash_regex
12
13
def canonicals_for_language(data, language):
    '''Return the set of canonical phrases in *data* for *language*.

    Each element of *data* is a pipe-delimited bytestring of the form
    lang|dictionary|is_canonical|canonical (exactly four fields; a
    malformed entry raises ValueError). When *language* is None, the
    canonicals for every language are returned.
    '''
    separator = six.b('|')
    fields = (entry.split(separator) for entry in data)
    return set(canonical
               for lang, _dictionary, _is_canonical, canonical in fields
               if language is None or lang == language)
23
def equivalent(s1, s2, gazetteer, language):
    '''
    Address/place equivalence
    -------------------------

    OSM discourages abbreviations, but to make our training data map better
    to real-world input, we can safely replace the canonical phrase with an
    abbreviated version and retain the meaning of the words.

    Returns True if s1 and s2 tokenize to the same sequence, where
    non-phrase tokens must match exactly and phrase tokens must share at
    least one canonical form for the given language (None = any language).

    NOTE(review): the *gazetteer* parameter is unused — the module-level
    abbreviations_gazetteer is used instead; confirm whether callers
    expect the passed-in gazetteer to be honored.
    '''

    tokens_s1 = normalized_tokens(s1)
    tokens_s2 = normalized_tokens(s2)

    abbreviated_s1 = list(abbreviations_gazetteer.filter(tokens_s1))
    abbreviated_s2 = list(abbreviations_gazetteer.filter(tokens_s2))

    # Different token counts can never be equivalent (izip would silently
    # truncate to the shorter sequence otherwise).
    if len(abbreviated_s1) != len(abbreviated_s2):
        return False

    for ((t1, c1, l1, d1), (t2, c2, l2, d2)) in izip(abbreviated_s1, abbreviated_s2):
        if c1 != token_types.PHRASE and c2 != token_types.PHRASE:
            # Plain tokens: must match exactly.
            if t1 != t2:
                return False
        # Fixed: original tested c2 twice, letting a (non-phrase, phrase)
        # pair into this branch with a non-phrase d1.
        elif c1 == token_types.PHRASE and c2 == token_types.PHRASE:
            # Phrases are equivalent iff they share a canonical form.
            canonicals_s1 = canonicals_for_language(d1, language)
            canonicals_s2 = canonicals_for_language(d2, language)

            if not canonicals_s1 & canonicals_s2:
                return False
        else:
            # One side is a phrase, the other is not: not equivalent.
            return False

    return True
57