# -*- coding: utf-8 -*-
'''
Localized country names derived from the Unicode CLDR territory data.

Importing this module builds the module-level ``country_names`` singleton,
which parses the CLDR XML files found under ``CLDR_MAIN_PATH``.
'''
import os
import six
import sys

import pycountry

from collections import OrderedDict, defaultdict

from lxml import etree

this_dir = os.path.realpath(os.path.dirname(__file__))
# NOTE(review): this path is resolved against the current working directory,
# not against this_dir -- confirm that is intended before changing it.
sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))

from geodata.i18n.unicode_paths import CLDR_DIR
from geodata.i18n.languages import *
from geodata.encoding import safe_decode

CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')

COUNTRY_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                              'resources', 'countries', 'names.yaml')

# Territory codes that are never emitted by cldr_country_names
IGNORE_COUNTRIES = set([six.u('ZZ')])

# Countries for which the CLDR alt="short" / alt="variant" name is preferred
# over the default name (falling back to the default when it is absent)
COUNTRY_USE_SHORT_NAME = set([six.u('HK'), six.u('MM'), six.u('MO'), six.u('PS')])
COUNTRY_USE_VARIANT_NAME = set([six.u('CD'), six.u('CG'), six.u('CI'), six.u('TL')])

# {language: {country_code: name}} entries that take precedence over CLDR
LANGUAGE_COUNTRY_OVERRIDES = {
    'en': {
        'CD': safe_decode('Democratic Republic of the Congo'),
        'CG': safe_decode('Republic of the Congo'),
    },

    # Countries where the local language is absent from CLDR

    # Tajik / Tajikistan
    'tg': {
        'TJ': safe_decode('Тоҷикистон'),
    },

    # Maldivian / Maldives
    'dv': {
        'MV': safe_decode('ދިވެހިރާއްޖެ'),
    }

}


class CountryNames(object):
    '''
    Lookup tables of country names keyed by country code and language.

    Attributes populated by the constructor (all keys are lower-cased):
        country_alpha3_codes:   {alpha2: alpha3} from pycountry
        iso_3166_names:         {alpha2: name} from pycountry
        language_country_names: {language: {COUNTRY_CODE: name}}
        country_language_names: {country: {language: name}}
        country_official_names: {country: OrderedDict(language -> name)}
        country_local_names:    {country: OrderedDict(language -> name)}
    '''

    def __init__(self, base_dir=CLDR_MAIN_PATH):
        '''
        Parse every language file in base_dir and build the lookup tables.

        :param base_dir: directory containing the CLDR "<language>.xml" files
        '''
        self.base_dir = base_dir

        self.country_alpha3_codes = {c.alpha2.lower(): c.alpha3.lower() for c in pycountry.countries}
        self.iso_3166_names = {c.alpha2.lower(): c.name for c in pycountry.countries}

        self.language_country_names = {}
        self.country_language_names = defaultdict(dict)

        self.country_official_names = defaultdict(OrderedDict)
        self.country_local_names = defaultdict(OrderedDict)

        # {country: ordered {language: default flag}} as returned by
        # get_country_languages, or English-only when nothing is returned
        local_languages = {}

        # {country: {language: name}} restricted to the country's own languages
        country_local_language_names = defaultdict(dict)

        for filename in os.listdir(base_dir):
            lang = filename.split('.xml')[0]
            # Only files whose stem is at most three characters are used
            if len(lang) > 3:
                continue

            names = self.cldr_country_names(lang)
            lang = lang.lower()

            self.language_country_names[lang] = names

            for country, name in six.iteritems(names):
                country = country.lower()

                languages = get_country_languages(country, official=False) or OrderedDict([('en', 1)])
                local_languages[country] = languages

                self.country_language_names[country][lang] = name

                if lang in languages:
                    country_local_language_names[country][lang] = name

        # Apply the hand-maintained overrides on top of the CLDR data
        for override_lang, override_names in six.iteritems(LANGUAGE_COUNTRY_OVERRIDES):
            if override_lang not in self.language_country_names:
                self.language_country_names[override_lang.lower()] = override_names

            for code, name in six.iteritems(override_names):
                code = code.lower()
                self.country_language_names[code][override_lang.lower()] = name
                # Only used as the local name when CLDR supplied none at all
                if code not in country_local_language_names:
                    country_local_language_names[code][override_lang.lower()] = name

        for country, langs in six.iteritems(local_languages):
            local_names = country_local_language_names[country]
            # Number of named languages flagged as default for this country
            num_defaults = sum(1 for lang in local_names if langs.get(lang))
            for lang, default in six.iteritems(langs):
                name = local_names.get(lang)
                if not name:
                    continue
                if default or num_defaults == 0:
                    self.country_official_names[country][lang] = name
                    # With no default language, the first named one is taken
                    # as the single official name and the scan stops
                    if num_defaults == 0:
                        break
                self.country_local_names[country][lang] = name

    def cldr_country_names(self, language):
        '''
        Country names are tricky as there can be several versions
        and levels of verbosity e.g. United States of America
        vs. the more commonly used United States. Most countries
        have a similarly verbose form.

        The CLDR repo (http://cldr.unicode.org/) has the most
        comprehensive localized database of country names
        (among other things), organized by language. This function
        parses CLDR XML for a given language and returns a dictionary
        of {country_code: name} for that language.

        :param language: file stem of the CLDR file, i.e. "<language>.xml"
                         inside self.base_dir
        :return: dict of {country_code: name}; codes are kept exactly as
                 they appear in the "type" attribute of the XML
        '''
        filename = os.path.join(self.base_dir, '{}.xml'.format(language))
        # Context manager so the handle is closed as soon as parsing is done
        with open(filename, 'rb') as f:
            xml = etree.parse(f)

        # {country_code: {alt attribute or None: name}}
        country_names = defaultdict(dict)

        for territory in xml.xpath('*//territories/*'):
            country_code = territory.attrib['type']

            # All-digit codes are skipped along with the ignore list
            if country_code in IGNORE_COUNTRIES or country_code.isdigit():
                continue

            country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)

        overrides = LANGUAGE_COUNTRY_OVERRIDES.get(language, {})
        display_names = {}

        for country_code, names in six.iteritems(country_names):
            if country_code in overrides:
                display_names[country_code] = safe_decode(overrides[country_code])
                continue

            # The entry without an "alt" attribute is the default name
            default_name = names.get(None)

            if country_code in COUNTRY_USE_SHORT_NAME:
                display_names[country_code] = names.get('short', default_name)
            elif country_code in COUNTRY_USE_VARIANT_NAME:
                display_names[country_code] = names.get('variant', default_name)
            elif default_name is not None:
                display_names[country_code] = default_name

        return display_names

    def localized_name(self, country_code, language=None):
        '''
        Get the display name for a country code in the local language
        e.g. Россия for Russia, España for Spain, etc.

        For most countries there is a single official name. For countries
        with more than one official language, this will return a concatenated
        version separated by a slash e.g. Maroc / المغرب for Morocco.

        Note that all of the exceptions in road_sign_languages.tsv are also
        taken into account here so India for example uses the English name
        rather than concatenating all 27 toponyms.

        This method should be roughly consistent with OSM's display names.

        Usage:
            >>> country_names.localized_name('jp')     # returns '日本'
            >>> country_names.localized_name('be')     # returns 'België / Belgique / Belgien'

        :param country_code: country code, matched case-insensitively
        :param language: optional language code, matched case-insensitively;
                         when given, the name in that language is returned
        :return: the joined official names (an empty string when the country
                 is unknown), or, when language is given, that language's
                 name or None if there is none
        '''

        country_code = country_code.lower()
        if language is None:
            # .get() keeps a lookup of an unknown country from inserting an
            # empty entry into the defaultdict
            official = self.country_official_names.get(country_code, {})
            # Hyphens become spaces; OrderedDict.fromkeys drops duplicate
            # names while preserving their order
            unique_names = OrderedDict.fromkeys(n.replace(six.u('-'), six.u(' '))
                                                for n in official.values())
            return six.u(' / ').join(unique_names.keys())
        else:
            # Every language key is stored lower-cased
            return self.country_language_names.get(country_code, {}).get(language.lower())

    def alpha3_code(self, alpha2_code):
        '''
        Return the upper-cased alpha-3 code for an alpha-2 code,
        or None when the alpha-2 code is unknown.
        '''
        alpha3 = self.country_alpha3_codes.get(alpha2_code.lower())
        return alpha3.upper() if alpha3 else None

    def iso_3166_name(self, alpha2_code):
        '''
        Return the pycountry name for an alpha-2 code,
        or None when the alpha-2 code is unknown.
        '''
        return self.iso_3166_names.get(alpha2_code.lower())

# Module-level singleton; constructing it parses the CLDR files at import time
country_names = CountryNames()