1# -*- coding: utf-8 -*-
2import os
3import six
4import sys
5
6import pycountry
7
8from collections import OrderedDict
9
10from lxml import etree
11
12this_dir = os.path.realpath(os.path.dirname(__file__))
13sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir)))
14
15from geodata.i18n.unicode_paths import CLDR_DIR
16from geodata.i18n.languages import *
17from geodata.encoding import safe_decode
18
# Directory containing one CLDR XML file per language (e.g. en.xml)
CLDR_MAIN_PATH = os.path.join(CLDR_DIR, 'common', 'main')

# Path to the country names YAML config under the resources directory
COUNTRY_CONFIG = os.path.join(
    this_dir, os.pardir, os.pardir, os.pardir,
    'resources', 'countries', 'names.yaml',
)

# Territory codes skipped entirely when parsing CLDR
IGNORE_COUNTRIES = {six.u('ZZ')}

# Countries whose alt="short" CLDR name is preferred over the default one
COUNTRY_USE_SHORT_NAME = {
    six.u('HK'),
    six.u('MM'),
    six.u('MO'),
    six.u('PS'),
}

# Countries whose alt="variant" CLDR name is preferred over the default one
COUNTRY_USE_VARIANT_NAME = {
    six.u('CD'),
    six.u('CG'),
    six.u('CI'),
    six.u('TL'),
}

# Hand-maintained names, keyed by language code and then by country code.
# These take precedence over whatever CLDR provides for the same pair.
LANGUAGE_COUNTRY_OVERRIDES = {
    'en': {
        'CD': safe_decode('Democratic Republic of the Congo'),
        'CG': safe_decode('Republic of the Congo'),
    },

    # Countries where the local language is absent from CLDR

    # Tajik / Tajikistan
    'tg': {
        'TJ': safe_decode('Тоҷикистон'),
    },

    # Maldivian (Dhivehi) / Maldives
    'dv': {
        'MV': safe_decode('ދިވެހިރާއްޖެ'),
    },
}
48
49
class CountryNames(object):
    '''
    Localized country names built from the CLDR territory data.

    On construction, every language file in base_dir whose name
    (minus the .xml suffix) is at most three characters long is parsed,
    and the results are indexed several ways:

        language_country_names: {language: {COUNTRY_CODE: name}}
        country_language_names: {country: {language: name}}
        country_official_names: {country: OrderedDict(language -> name)}
        country_local_names:    {country: OrderedDict(language -> name)}

    Country keys in the last three mappings are lowercased; the inner keys
    of language_country_names keep the case used by CLDR / the overrides.
    '''

    def __init__(self, base_dir=CLDR_MAIN_PATH):
        '''
        Parse all language files in base_dir and build the lookup tables.

        base_dir is the directory containing the per-language CLDR XML
        files. Construction reads and parses every matching file, so it
        is comparatively expensive.
        '''
        self.base_dir = base_dir

        self.country_alpha3_codes = {c.alpha2.lower(): c.alpha3.lower() for c in pycountry.countries}
        self.iso_3166_names = {c.alpha2.lower(): c.name for c in pycountry.countries}

        self.language_country_names = {}
        self.country_language_names = defaultdict(dict)

        self.country_official_names = defaultdict(OrderedDict)
        self.country_local_names = defaultdict(OrderedDict)

        # country -> languages spoken there, as returned by
        # get_country_languages (English is the fallback when it
        # returns nothing)
        local_languages = {}

        # country -> {language: name}, restricted to the country's own languages
        country_local_language_names = defaultdict(dict)

        for filename in os.listdir(base_dir):
            lang = filename.split('.xml')[0]
            # Skip anything longer than a 2-3 letter language code
            # (e.g. locale variants such as en_GB)
            if len(lang) > 3:
                continue

            names = self.cldr_country_names(lang)
            lang = lang.lower()
            self.language_country_names[lang] = names

            for country, name in six.iteritems(names):
                country = country.lower()

                # The language list depends only on the country, so look it
                # up once per country rather than once per (language, country).
                # NOTE(review): assumes get_country_languages is deterministic.
                if country not in local_languages:
                    local_languages[country] = (get_country_languages(country, official=False) or
                                                OrderedDict([('en', 1)]))

                self.country_language_names[country][lang] = name

                if lang in local_languages[country]:
                    country_local_language_names[country][lang] = name

        for override_lang, names in six.iteritems(LANGUAGE_COUNTRY_OVERRIDES):
            # Languages with no CLDR file at all get their override table as-is
            if override_lang not in self.language_country_names:
                self.language_country_names[override_lang.lower()] = names

            for c, name in six.iteritems(names):
                c = c.lower()
                self.country_language_names[c][override_lang.lower()] = name
                # Only used as a local name when CLDR contributed no local
                # names for this country
                if c not in country_local_language_names:
                    country_local_language_names[c][override_lang.lower()] = name

        for country, langs in six.iteritems(local_languages):
            names = country_local_language_names[country]
            # Number of named languages flagged as default for this country
            num_defaults = sum(1 for lang in names if langs.get(lang))
            for lang, default in six.iteritems(langs):
                name = names.get(lang)
                if not name:
                    continue
                if default or num_defaults == 0:
                    self.country_official_names[country][lang] = name
                    # With no default language, only the first named
                    # language is taken as the official name
                    if num_defaults == 0:
                        break
                self.country_local_names[country][lang] = name

    def cldr_country_names(self, language):
        '''
        Country names are tricky as there can be several versions
        and levels of verbosity e.g. United States of America
        vs. the more commonly used United States. Most countries
        have a similarly verbose form.

        The CLDR repo (http://cldr.unicode.org/) has the most
        comprehensive localized database of country names
        (among other things), organized by language. This function
        parses CLDR XML for a given language and returns a dictionary
        of {country_code: name} for that language.

        Entries in LANGUAGE_COUNTRY_OVERRIDES for the language win over
        the CLDR value. Otherwise the "short" or "variant" alternative is
        used for the countries listed in COUNTRY_USE_SHORT_NAME /
        COUNTRY_USE_VARIANT_NAME, falling back to the default name.
        '''
        filename = os.path.join(self.base_dir, '{}.xml'.format(language))
        # Read as bytes and let the XML parser handle the encoding; the
        # context manager makes sure the handle is closed after parsing
        with open(filename, 'rb') as f:
            xml = etree.parse(f)

        # country_code -> {alt attribute (None for the default form): name}
        country_names = defaultdict(dict)

        for territory in xml.xpath('*//territories/*'):
            country_code = territory.attrib['type']

            # Numeric codes are not countries in the alpha-2 sense; skip them
            if country_code in IGNORE_COUNTRIES or country_code.isdigit():
                continue

            country_names[country_code][territory.attrib.get('alt')] = safe_decode(territory.text)

        overrides = LANGUAGE_COUNTRY_OVERRIDES.get(language, {})
        display_names = {}

        for country_code, names in six.iteritems(country_names):
            if country_code in overrides:
                display_names[country_code] = safe_decode(overrides[country_code])
                continue

            default_name = names.get(None)

            if country_code in COUNTRY_USE_SHORT_NAME:
                display_names[country_code] = names.get('short', default_name)
            elif country_code in COUNTRY_USE_VARIANT_NAME:
                display_names[country_code] = names.get('variant', default_name)
            elif default_name is not None:
                display_names[country_code] = default_name

        return display_names

    def localized_name(self, country_code, language=None):
        '''
        Get the display name for a country code in the local language
        e.g. Россия for Russia, España for Spain, etc.

        For most countries there is a single official name. For countries
        with more than one official language, this will return a concatenated
        version separated by a slash e.g. Maroc / المغرب for Morocco.

        Note that all of the exceptions in road_sign_languages.tsv are also
        taken into account here so India for example uses the English name
        rather than concatenating all 27 toponyms.

        This method should be roughly consistent with OSM's display names.

        If language is given, the name of the country in that language is
        returned instead, or None if it is not known. Both country_code
        and language are matched case-insensitively. With no language, an
        unknown country code yields an empty string.

        Usage:
            >>> country_names.localized_name('jp')     # returns '日本'
            >>> country_names.localized_name('be')     # returns 'België / Belgique / Belgien'
        '''

        country_code = country_code.lower()
        if language is None:
            # .get rather than indexing: country_official_names is a
            # defaultdict and indexing would insert an entry for every
            # unknown code that is queried
            official_names = self.country_official_names.get(country_code, {})
            # OrderedDict.fromkeys drops duplicate names while keeping order
            unique_names = OrderedDict.fromkeys(n.replace(six.u('-'), six.u(' '))
                                                for n in official_names.values())
            return six.u(' / ').join(unique_names)
        else:
            # Language keys are stored lowercased
            return self.country_language_names.get(country_code, {}).get(language.lower())

    def alpha3_code(self, alpha2_code):
        '''
        Return the uppercase ISO 3166-1 alpha-3 code for an alpha-2 code
        (case-insensitive), or None if the code is unknown.
        '''
        alpha3 = self.country_alpha3_codes.get(alpha2_code.lower())
        return alpha3.upper() if alpha3 else None

    def iso_3166_name(self, alpha2_code):
        '''
        Return the pycountry name for an alpha-2 code (case-insensitive),
        or None if the code is unknown.
        '''
        return self.iso_3166_names.get(alpha2_code.lower())
186
# Shared module-level instance. Constructing it lists CLDR_MAIN_PATH and
# parses the matching language files, so this work happens at import time.
country_names = CountryNames()
188