import random
import six

from collections import namedtuple

from geodata.addresses.config import address_config
from geodata.address_expansions.gazetteers import chains_gazetteer
from geodata.categories.config import category_config
from geodata.categories.preposition import CategoryPreposition
from geodata.math.sampling import weighted_choice, cdf
from geodata.text.normalize import normalized_tokens
from geodata.text.tokenize import tokenize, token_types
from geodata.encoding import safe_decode

# Value returned by Chain.phrase:
#   name            - the decoded chain name (None in NULL_CHAIN_QUERY)
#   prep            - the chosen prepositional phrase, or None
#   add_place_name  - flag set by Chain.phrase from the preposition type
#   add_address     - flag set by Chain.phrase from the preposition type
ChainQuery = namedtuple('ChainQuery', 'name, prep, add_place_name, add_address')

# Returned by Chain.phrase when it is given a falsy chain name.
NULL_CHAIN_QUERY = ChainQuery(None, None, False, False)


class Chain(object):
    '''
    Class-level helpers for recognizing chain store names inside venue
    names and for building ChainQuery tuples from a chain name.
    '''

    @classmethod
    def tokenize_name(cls, name):
        '''
        Tokenizes a venue name with normalized_tokens.

        A falsy name (None, empty string) yields an empty list.
        '''
        return normalized_tokens(name) if name else []

    @classmethod
    def possible_chain(cls, name):
        '''
        Determines if a venue name contains the name of a known chain store.

        Returns a tuple of:

        (True/False, known chain phrases, other tokens)

        Handles cases like "Hard Rock Cafe Times Square" and allows for
        downstream decision making (i.e. if the tokens have a low IDF in the
        local area we might want to consider it a chain).

        When no chain phrase is found both lists in the result are empty.
        '''
        tokens = cls.tokenize_name(name)
        if not tokens:
            return False, [], []

        chain_phrases = []
        remaining = []
        # Each gazetteer match is a 4-tuple; the last two fields are passed
        # through untouched (presumably phrase length and gazetteer data --
        # TODO confirm against chains_gazetteer.filter).
        for tok, tok_type, length, data in chains_gazetteer.filter(tokens):
            if tok_type == token_types.PHRASE:
                chain_phrases.append((tok, tok_type, length, data))
            else:
                remaining.append((tok, tok_type))

        if not chain_phrases:
            # Non-phrase tokens are only reported alongside a phrase match.
            return False, chain_phrases, []
        return True, chain_phrases, remaining

    @classmethod
    def extract(cls, name):
        '''
        Determines if an entire venue name matches a known chain store.

        Note: to avoid false positives, only return True if all of the tokens
        in the venue's name are part of a single chain store phrase. This will
        miss a few things like "Hard Rock Cafe Times Square" and the like.

        It will however handle compound chain stores like Subway/Taco Bell

        Returns a tuple of (True/False, chain phrases); the phrase list is
        empty whenever the first element is False.
        '''
        possible, phrases, other_tokens = cls.possible_chain(name)
        if not possible:
            return False, []

        # Any leftover token of a word type means the name is more than just
        # the chain phrase(s), so it is rejected.
        for _, tok_type in other_tokens:
            if tok_type in token_types.WORD_TOKEN_TYPES:
                return False, []

        return True, phrases

    @classmethod
    def alternate_form(cls, language, dictionary, canonical):
        '''
        Picks a random alternate for a canonical phrase.

        Looks up address_config.sample_phrases under the (language,
        dictionary) key and then under canonical. If nothing is found there,
        canonical itself is returned unchanged.
        '''
        per_dictionary = address_config.sample_phrases.get((language, dictionary), {})
        alternates = per_dictionary.get(canonical)
        return random.choice(alternates) if alternates else canonical

    @classmethod
    def phrase(cls, chain, language, country=None):
        '''
        Builds a ChainQuery for a chain name.

        A preposition type is drawn with CategoryPreposition.random. If there
        is none (None or CategoryPreposition.NULL), or no phrases are
        configured for it, the query carries only the chain name with both
        add_place_name and add_address set to True. Otherwise a phrase is
        picked with weighted_choice and the flags depend on the type:

        NEARBY / NEAR_ME: add_place_name=False, add_address=False
        IN:               add_place_name=True,  add_address=False
        anything else:    add_place_name=True,  add_address=True

        A falsy chain returns NULL_CHAIN_QUERY.
        '''
        if not chain:
            return NULL_CHAIN_QUERY

        chain_phrase = safe_decode(chain)
        prep_phrase_type = CategoryPreposition.random(language, country=country)

        # Fallback used by both "no preposition" exits below.
        name_only = ChainQuery(chain_phrase, prep=None, add_place_name=True, add_address=True)

        if prep_phrase_type in (None, CategoryPreposition.NULL):
            return name_only

        config_key = 'categories.{}'.format(prep_phrase_type)
        values, probs = address_config.alternative_probabilities(config_key, language, country=country)
        if not values:
            return name_only

        # weighted_choice returns a pair; only its first element is used.
        prep_phrase, _ = weighted_choice(values, probs)
        prep_phrase = safe_decode(prep_phrase)

        without_place_name = (CategoryPreposition.NEARBY, CategoryPreposition.NEAR_ME)
        without_address = without_place_name + (CategoryPreposition.IN,)

        return ChainQuery(
            chain_phrase,
            prep=prep_phrase,
            add_place_name=prep_phrase_type not in without_place_name,
            add_address=prep_phrase_type not in without_address,
        )