# coding=utf-8
"""
Python library for ISO 639 standard

Copyright (c) 2014-2016 Mikael Karlsson (CSC - IT Center for Science Ltd.).
Licensed under AGPLv3.
"""

# Fix for Python 3.0 - 3.2
if not __package__:
    __package__ = __name__.split('.')[0]

# pycountry-style attribute aliases, mapped to the attribute names used by
# this module. Only the *names* are stored so that resolving an alias never
# touches (or lazily computes) attributes that were not asked for.
_COMPAT_ATTRIBUTES = {
    'alpha2': 'part1',
    'bibliographic': 'part2b',
    'terminology': 'part2t',
}


def _compat_getattr(obj, item):
    """
    Resolve a pycountry compatibility alias on ``obj``.

    Raises AttributeError for every name that is not an alias. The membership
    test comes first on purpose: ``__getattr__`` is also invoked on instances
    whose ``__dict__`` is still empty (``copy``/``pickle`` probe for
    ``__setstate__``), and reading ``obj.part1`` there would recurse forever.
    """
    if item not in _COMPAT_ATTRIBUTES:
        raise AttributeError("'{o}' object has no attribute '{a}'".format(o=type(obj).__name__, a=item))
    return getattr(obj, _COMPAT_ATTRIBUTES[item])


def _open_utf8(path):
    """
    Open ``path`` for reading; on Python 3 the text is decoded as UTF-8.

    On Python 2 the plain built-in ``open`` is used, because the ``csv``
    module there works on byte strings.
    """
    import sys

    if sys.version_info[0] >= 3:
        return open(path, encoding='utf-8')
    return open(path)


def _read_tsv(filename):
    """
    Read the tab-separated resource ``filename`` shipped with this package.

    Returns the rows as a list of lists, without the first (header) row.
    """
    import csv
    from pkg_resources import resource_filename

    with _open_utf8(resource_filename(__package__, filename)) as fo:
        return list(csv.reader(fo, delimiter='\t'))[1:]


def _fabtabular():
    """
    This function retrieves the ISO 639 and inverted names datasets as tsv files and returns them as lists.

    Returns a 6-tuple of row lists (header rows removed), in this order:
    iso-639-3.tab, iso-639-3_Name_Index.tab, iso-639-3-macrolanguages.tab,
    iso639-5.tsv, iso639-2.tsv, iso639-1.tsv.
    """
    # NOTE: an earlier revision fetched iso-639-3.tab and
    # iso-639-3_Name_Index.tab from http://www-01.sil.org/iso639-3/ at run
    # time; the tables are now read from the files bundled with the package.
    return tuple(_read_tsv(filename) for filename in (
        'iso-639-3.tab',
        'iso-639-3_Name_Index.tab',
        'iso-639-3-macrolanguages.tab',
        'iso639-5.tsv',
        'iso639-2.tsv',
        'iso639-1.tsv',
    ))


class _Language(object):
    """
    This class represents a language. It provides pycountry language class compatibility.

    Besides the attributes set in ``__init__`` the aliases ``alpha2``
    (``part1``), ``bibliographic`` (``part2b``) and ``terminology``
    (``part2t``) are available.
    """

    def __init__(self, part3, part2b, part2t, part1, name, inverted, macro, names, part5):
        self.part3 = part3
        self.part2b = part2b
        self.part2t = part2t
        self.part1 = part1
        self.name = name
        self.inverted = inverted
        self.macro = macro
        self.names = names
        self.part5 = part5

    def __getattr__(self, item):
        # Only called when normal lookup fails, i.e. for aliases and typos.
        return _compat_getattr(self, item)


class lazy_property(object):
    """
    Implements a lazy property decorator, that overwrites itself/property with value
    """

    def __init__(self, f):
        self.f = f
        self.name = f.__name__

    def __get__(self, instance, owner=None):
        # Accessed on the class: hand back the descriptor itself.
        if instance is None:
            return self
        val = self.f(instance)
        # This is a non-data descriptor (no __set__), so the value stored in
        # the instance __dict__ shadows it on every later access.
        setattr(instance, self.name, val)
        return val


class Iso639(object):
    """
    This class is a close to drop-in replacement for pycountry.languages.
    But unlike pycountry.languages it also supports ISO 639-3.

    It implements the Singleton design pattern for performance reasons.
    It uses lazy properties for faster import time.
    """

    def __new__(cls):
        # Look in the class's own namespace only, so that a subclass gets an
        # instance of its own type instead of inheriting the base instance.
        # super() is anchored on Iso639 (not on cls): super(cls, cls) would
        # resolve back to this very method for a subclass and never return.
        if '__instance' not in cls.__dict__:
            setattr(cls, '__instance', super(Iso639, cls).__new__(cls))
        return cls.__dict__['__instance']

    def __len__(self):
        return len(self.languages)

    def __iter__(self):
        return iter(self.languages)

    def __getattr__(self, item):
        # Resolving by name keeps unknown attributes (e.g. hasattr probes)
        # from triggering the lazy loading of the datasets.
        return _compat_getattr(self, item)

    @lazy_property
    def languages(self):
        """
        List of all _Language objects, built from the bundled tables on first access.
        """
        import collections

        rows3, name_index, macro_rows, rows5, rows2, rows1 = _fabtabular()

        # part3 code -> {print name: inverted name}
        alt = collections.defaultdict(dict)
        for row in name_index:
            alt[row[0]][row[1]] = row[2]
        macros = dict((row[1], row) for row in macro_rows)
        # Part 2 rows by code; entries are removed once they have been emitted.
        pending2 = dict((row[1], row) for row in rows2)
        part1_by_code = dict((row[1], row) for row in rows1)
        part1_by_name = dict((row[2].split('|')[0].strip(), row) for row in rows1)

        def generate():
            # All of part3 and matching part2
            for code3, code2b, code2t, code1, _, _, ref_name, _ in rows3:
                inv = alt[code3].pop(ref_name)
                yield _Language(code3, code2b, code2t,
                                code1 if code1 in part1_by_code else '',  # Fixes 'sh'
                                ref_name, inv,
                                macros.get(code3, [''])[0],
                                list(alt[code3].items()),
                                '')
                pending2.pop(code2b, None)
                pending2.pop(code2t, None)

            # All of part5 and matching part2
            for _, code5, label, _ in rows5:
                code2 = code5 if code5 in pending2 else ''
                # 'names' stays an empty string here, as it always has been.
                yield _Language('', code2, code2,
                                part1_by_name.get(label, ['', ''])[1],
                                label, '', '', '', code5)
                pending2.pop(code5, None)

            # Rest of part2
            pending2.pop('qaa-qtz', None)  # Is not a real code, but a range
            for _, code2, label, _ in pending2.values():
                labels = [x.strip() for x in label.split('|')]
                # list(): a bare zip() is a one-shot iterator on Python 3 and
                # would be empty after the first pass over 'names'.
                yield _Language('', code2, code2,
                                part1_by_name.get(label, ['', ''])[1],
                                labels[0], '', '',
                                list(zip(labels[1:], labels[1:])), '')

        return list(generate())

    @lazy_property
    def part3(self):
        """Dict of part3 code -> language, for languages that have one."""
        return dict((x.part3, x) for x in self.languages if x.part3)

    @lazy_property
    def part2b(self):
        """Dict of part2b code -> language, for languages that have one."""
        return dict((x.part2b, x) for x in self.languages if x.part2b)

    @lazy_property
    def part2t(self):
        """Dict of part2t code -> language, for languages that have one."""
        return dict((x.part2t, x) for x in self.languages if x.part2t)

    @lazy_property
    def part1(self):
        """Dict of part1 code -> language, for languages that have one."""
        return dict((x.part1, x) for x in self.languages if x.part1)

    @lazy_property
    def part5(self):
        """Dict of part5 code -> language, for languages that have one."""
        return dict((x.part5, x) for x in self.languages if x.part5)

    @lazy_property
    def name(self):
        """
        Dict of name -> language, covering the main name and the first item of every ``names`` entry.
        """
        def gen():
            for x in self.languages:
                if x.name:
                    yield x.name, x
                for n in x.names:
                    yield n[0], x

        return dict(gen())

    @lazy_property
    def inverted(self):
        """Dict of inverted name -> language, for languages that have one."""
        return dict((x.inverted, x) for x in self.languages if x.inverted)

    @lazy_property
    def macro(self):
        """Dict of macro value -> list of the languages carrying that value."""
        import collections

        m = collections.defaultdict(list)
        for x in self.languages:
            if x.macro:
                m[x.macro].append(x)
        return dict(m)

    @lazy_property
    def retired(self):
        """
        Function for generating retired languages. Returns a dict('code', (datetime, [language, ...], 'description')).

        The extra 'sh' key is the exception: it maps directly to the language
        object for part3 'hbs' instead of a tuple.
        """

        def gen():
            import re
            from datetime import datetime

            rtd = _read_tsv('iso-639-3_Retirements.tab')
            rc = set(r[0] for r in rtd)  # codes that are retired themselves
            for i, _, _, m, s, d in rtd:
                d = datetime.strptime(d, '%Y-%m-%d')
                if not m:
                    # No replacement given: collect the bracketed
                    # three-letter codes from the description column.
                    m = re.findall(r'\[([a-z]{3})\]', s)
                if m:
                    m = [m] if isinstance(m, str) else m
                    yield i, (d, [self.get(part3=x) for x in m if x not in rc], s)
                else:
                    yield i, (d, [], s)

            yield 'sh', self.get(part3='hbs')  # Add 'sh' as deprecated

        return dict(gen())

    def get(self, **kwargs):
        """
        Simple getter function for languages. Takes 1 keyword/value and returns 1 language object.

        The keyword names one of the lookup attributes of this object (for
        example ``part3``, ``part1`` or ``name``). Raises AttributeError
        unless exactly one keyword is given, and KeyError when the value is
        not present in that lookup.
        """
        if len(kwargs) != 1:
            raise AttributeError('Only one keyword expected')
        key, value = kwargs.popitem()
        return getattr(self, key)[value]