1import random
2import six
3
4from collections import namedtuple
5
6from geodata.addresses.config import address_config
7from geodata.address_expansions.gazetteers import chains_gazetteer
8from geodata.categories.config import category_config
9from geodata.categories.preposition import CategoryPreposition
10from geodata.math.sampling import weighted_choice, cdf
11from geodata.text.normalize import normalized_tokens
12from geodata.text.tokenize import tokenize, token_types
13from geodata.encoding import safe_decode
14
# Value returned by Chain.phrase: the chain name, the chosen preposition
# phrase (or None) and the add_place_name / add_address flags.
ChainQuery = namedtuple('ChainQuery', 'name, prep, add_place_name, add_address')

# Sentinel returned by Chain.phrase when it is given no chain name.
NULL_CHAIN_QUERY = ChainQuery(name=None, prep=None, add_place_name=False, add_address=False)
18
19
class Chain(object):
    '''
    Class-level helpers for spotting known chain store names inside venue
    names and for building randomized ChainQuery tuples.
    '''

    @classmethod
    def tokenize_name(cls, name):
        '''
        Normalize and tokenize a venue name.

        Returns an empty list when the name is empty or None, otherwise
        the result of normalized_tokens(name).
        '''
        return normalized_tokens(name) if name else []

    @classmethod
    def possible_chain(cls, name):
        '''
        Checks whether a venue name contains the name of a known chain store.

        Returns a 3-tuple:

        (found, chain phrases, other tokens)

        where found is True if the chains gazetteer matched at least one
        phrase, chain phrases holds the matched 4-tuples and other tokens
        holds (token, class) pairs for everything else. The other tokens are
        only reported when a phrase was found; otherwise that list is empty.

        This covers names such as "Hard Rock Cafe Times Square" and leaves the
        final decision to the caller (e.g. if the remaining tokens have a low
        IDF in the local area they might still be treated as a chain).
        '''
        name_tokens = cls.tokenize_name(name)
        if not name_tokens:
            return False, [], []

        chain_phrases = []
        remainder = []
        # The last two fields of every match are carried through untouched
        # (presumably phrase length and gazetteer data - confirm against
        # chains_gazetteer.filter).
        for tok, tok_class, length, data in chains_gazetteer.filter(name_tokens):
            if tok_class == token_types.PHRASE:
                chain_phrases.append((tok, tok_class, length, data))
            else:
                remainder.append((tok, tok_class))

        if not chain_phrases:
            # Nothing matched, so the leftover tokens are not reported
            return False, chain_phrases, []
        return True, chain_phrases, remainder

    @classmethod
    def extract(cls, name):
        '''
        Checks whether a whole venue name is a known chain store.

        Returns (is_chain, phrases). is_chain is True only when
        possible_chain found a phrase and none of the remaining tokens has a
        class in token_types.WORD_TOKEN_TYPES; phrases is empty otherwise.

        Note: this is deliberately strict to avoid false positives, so names
        like "Hard Rock Cafe Times Square" are not reported as chains here.

        Compound chain stores like Subway/Taco Bell are still handled.
        '''
        possible, phrases, other_tokens = cls.possible_chain(name)
        if not possible:
            return False, []

        # Any leftover word-like token means the name is more than a chain
        for _, token_class in other_tokens:
            if token_class in token_types.WORD_TOKEN_TYPES:
                return False, []

        return True, phrases

    @classmethod
    def alternate_form(cls, language, dictionary, canonical):
        '''
        Pick a random alternate phrase for a canonical form.

        Looks up canonical under the (language, dictionary) key of
        address_config.sample_phrases and returns one of the listed choices
        at random, or canonical itself when there are none.
        '''
        phrases_for_dictionary = address_config.sample_phrases.get((language, dictionary), {})
        choices = phrases_for_dictionary.get(canonical)
        return random.choice(choices) if choices else canonical

    @classmethod
    def phrase(cls, chain, language, country=None):
        '''
        Build a ChainQuery for a chain name with a randomly chosen
        preposition phrase.

        Returns NULL_CHAIN_QUERY when chain is empty. When no preposition
        type is drawn (None or CategoryPreposition.NULL), or no alternatives
        are configured for the drawn type, the query has prep=None and both
        add_place_name and add_address set to True.

        Otherwise a phrase is sampled with weighted_choice, and the flags are
        switched off for some types: NEARBY and NEAR_ME disable both
        add_place_name and add_address, IN disables add_address only.
        '''
        if not chain:
            return NULL_CHAIN_QUERY

        chain_phrase = safe_decode(chain)
        prep_type = CategoryPreposition.random(language, country=country)

        # Only consult the config when a real preposition type was drawn
        candidates = None
        if prep_type not in (None, CategoryPreposition.NULL):
            config_key = 'categories.{}'.format(prep_type)
            candidates, weights = address_config.alternative_probabilities(config_key, language, country=country)

        if not candidates:
            return ChainQuery(chain_phrase, prep=None, add_place_name=True, add_address=True)

        # weighted_choice yields a pair; only its first item is needed here
        chosen, _ = weighted_choice(candidates, weights)
        prep_phrase = safe_decode(chosen)

        without_place_name = (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
        without_address = without_place_name + (CategoryPreposition.IN,)

        return ChainQuery(
            chain_phrase,
            prep=prep_phrase,
            add_place_name=prep_type not in without_place_name,
            add_address=prep_type not in without_address
        )
101