1# -*- coding: utf-8 -*-
2# enzyme - Video metadata parser
3# Copyright 2011-2012 Antoine Bertin <diaoulael@gmail.com>
4# Copyright 2003-2006 Dirk Meyer <dischi@freevo.org>
5#
6# This file is part of enzyme.
7#
8# enzyme is free software; you can redistribute it and/or modify it under
9# the terms of the GNU General Public License as published by
10# the Free Software Foundation; either version 3 of the License, or
11# (at your option) any later version.
12#
13# enzyme is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with enzyme.  If not, see <http://www.gnu.org/licenses/>.
20import re
21
22__all__ = ['resolve']
23
24
try:
    _string_types = basestring  # Python 2: matches both str and unicode
except NameError:
    _string_types = str  # Python 3: basestring was removed


def resolve(code):
    """
    Transform the given (2- or 3-letter) language code to a human readable
    language name.

    The code is lowercased, cut at the first character that is not a-z and
    truncated to 3 letters before the lookup, so values such as 'en-US' or
    'ENG' are looked up as 'en' and 'eng'.

    The return value is a 2-tuple containing the normalized language code
    and the language name.  Codes in the ISO 639-2 range 'qaa'..'qtz'
    resolve to 'Reserved for local use'.  If the language code cannot be
    resolved, name will be 'Unknown (<repr of the normalized code>)'.
    If code is empty or None, (None, None) is returned.

    Raises ValueError if code is non-empty but not a string.
    """
    if not code:
        return None, None
    if not isinstance(code, _string_types):
        raise ValueError('Invalid language code specified by parser')

    # Take up to 3 letters from the code.
    code = re.split(r'[^a-z]', code.lower())[0][:3]

    # The table stores this range under the single key 'qaa-qtz', which an
    # exact match on a 3-letter code can never hit, so test the range here.
    # code only contains a-z at this point, so string ordering is safe.
    if len(code) == 3 and 'qaa' <= code <= 'qtz':
        return code, 'Reserved for local use'

    for spec in codes:
        # Every element but the last is a code; the last is the name.
        if code in spec[:-1]:
            return code, spec[-1]

    return code, 'Unknown (%r)' % code
45
46
47# Parsed from http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
# Each entry is a tuple whose last element is the language name and whose
# preceding elements are the codes that map to it: the 3-letter code first,
# then the 2-letter code when the entry has one.
# The 'qaa-qtz' key denotes a range of codes rather than a single code, so
# an exact match against a 3-letter code will not hit it.
codes = (
    ('aar', 'aa', 'Afar'),
    ('abk', 'ab', 'Abkhazian'),
    ('ace', 'Achinese'),
    ('ach', 'Acoli'),
    ('ada', 'Adangme'),
    ('ady', 'Adyghe'),
    ('afa', 'Afro-Asiatic'),
    ('afh', 'Afrihili'),
    ('afr', 'af', 'Afrikaans'),
    ('ain', 'Ainu'),
    ('aka', 'ak', 'Akan'),
    ('akk', 'Akkadian'),
    ('alb', 'sq', 'Albanian'),
    ('ale', 'Aleut'),
    ('alg', 'Algonquian languages'),
    ('alt', 'Southern Altai'),
    ('amh', 'am', 'Amharic'),
    ('ang', 'English, Old'),
    ('anp', 'Angika'),
    ('apa', 'Apache languages'),
    ('ara', 'ar', 'Arabic'),
    ('arc', 'Official Aramaic'),
    ('arg', 'an', 'Aragonese'),
    ('arm', 'hy', 'Armenian'),
    ('arn', 'Mapudungun'),
    ('arp', 'Arapaho'),
    ('art', 'Artificial'),
    ('arw', 'Arawak'),
    ('asm', 'as', 'Assamese'),
    ('ast', 'Asturian'),
    ('ath', 'Athapascan languages'),
    ('aus', 'Australian languages'),
    ('ava', 'av', 'Avaric'),
    ('ave', 'ae', 'Avestan'),
    ('awa', 'Awadhi'),
    ('aym', 'ay', 'Aymara'),
    ('aze', 'az', 'Azerbaijani'),
    ('bad', 'Banda languages'),
    ('bai', 'Bamileke languages'),
    ('bak', 'ba', 'Bashkir'),
    ('bal', 'Baluchi'),
    ('bam', 'bm', 'Bambara'),
    ('ban', 'Balinese'),
    ('baq', 'eu', 'Basque'),
    ('bas', 'Basa'),
    ('bat', 'Baltic'),
    ('bej', 'Beja'),
    ('bel', 'be', 'Belarusian'),
    ('bem', 'Bemba'),
    ('ben', 'bn', 'Bengali'),
    ('ber', 'Berber'),
    ('bho', 'Bhojpuri'),
    ('bih', 'bh', 'Bihari'),
    ('bik', 'Bikol'),
    ('bin', 'Bini'),
    ('bis', 'bi', 'Bislama'),
    ('bla', 'Siksika'),
    ('bnt', 'Bantu'),
    ('bos', 'bs', 'Bosnian'),
    ('bra', 'Braj'),
    ('bre', 'br', 'Breton'),
    ('btk', 'Batak languages'),
    ('bua', 'Buriat'),
    ('bug', 'Buginese'),
    ('bul', 'bg', 'Bulgarian'),
    ('bur', 'my', 'Burmese'),
    ('byn', 'Blin'),
    ('cad', 'Caddo'),
    ('cai', 'Central American Indian'),
    ('car', 'Galibi Carib'),
    ('cat', 'ca', 'Catalan'),
    ('cau', 'Caucasian'),
    ('ceb', 'Cebuano'),
    ('cel', 'Celtic'),
    ('cha', 'ch', 'Chamorro'),
    ('chb', 'Chibcha'),
    ('che', 'ce', 'Chechen'),
    ('chg', 'Chagatai'),
    ('chi', 'zh', 'Chinese'),
    ('chk', 'Chuukese'),
    ('chm', 'Mari'),
    ('chn', 'Chinook jargon'),
    ('cho', 'Choctaw'),
    ('chp', 'Chipewyan'),
    ('chr', 'Cherokee'),
    ('chu', 'cu', 'Church Slavic'),
    ('chv', 'cv', 'Chuvash'),
    ('chy', 'Cheyenne'),
    ('cmc', 'Chamic languages'),
    ('cop', 'Coptic'),
    ('cor', 'kw', 'Cornish'),
    ('cos', 'co', 'Corsican'),
    ('cpe', 'Creoles and pidgins, English based'),
    ('cpf', 'Creoles and pidgins, French-based'),
    ('cpp', 'Creoles and pidgins, Portuguese-based'),
    ('cre', 'cr', 'Cree'),
    ('crh', 'Crimean Tatar'),
    ('crp', 'Creoles and pidgins'),
    ('csb', 'Kashubian'),
    ('cus', 'Cushitic'),
    ('cze', 'cs', 'Czech'),
    ('dak', 'Dakota'),
    ('dan', 'da', 'Danish'),
    ('dar', 'Dargwa'),
    ('day', 'Land Dayak languages'),
    ('del', 'Delaware'),
    ('den', 'Slave'),
    ('dgr', 'Dogrib'),
    ('din', 'Dinka'),
    ('div', 'dv', 'Divehi'),
    ('doi', 'Dogri'),
    ('dra', 'Dravidian'),
    ('dsb', 'Lower Sorbian'),
    ('dua', 'Duala'),
    ('dum', 'Dutch, Middle'),
    ('dut', 'nl', 'Dutch'),
    ('dyu', 'Dyula'),
    ('dzo', 'dz', 'Dzongkha'),
    ('efi', 'Efik'),
    ('egy', 'Egyptian'),
    ('eka', 'Ekajuk'),
    ('elx', 'Elamite'),
    ('eng', 'en', 'English'),
    ('enm', 'English, Middle'),
    ('epo', 'eo', 'Esperanto'),
    ('est', 'et', 'Estonian'),
    ('ewe', 'ee', 'Ewe'),
    ('ewo', 'Ewondo'),
    ('fan', 'Fang'),
    ('fao', 'fo', 'Faroese'),
    ('fat', 'Fanti'),
    ('fij', 'fj', 'Fijian'),
    ('fil', 'Filipino'),
    ('fin', 'fi', 'Finnish'),
    ('fiu', 'Finno-Ugrian'),
    ('fon', 'Fon'),
    ('fre', 'fr', 'French'),
    ('frm', 'French, Middle'),
    ('fro', 'French, Old'),
    ('frr', 'Northern Frisian'),
    ('frs', 'Eastern Frisian'),
    ('fry', 'fy', 'Western Frisian'),
    ('ful', 'ff', 'Fulah'),
    ('fur', 'Friulian'),
    ('gaa', 'Ga'),
    ('gay', 'Gayo'),
    ('gba', 'Gbaya'),
    ('gem', 'Germanic'),
    ('geo', 'ka', 'Georgian'),
    ('ger', 'de', 'German'),
    ('gez', 'Geez'),
    ('gil', 'Gilbertese'),
    ('gla', 'gd', 'Gaelic'),
    ('gle', 'ga', 'Irish'),
    ('glg', 'gl', 'Galician'),
    ('glv', 'gv', 'Manx'),
    ('gmh', 'German, Middle High'),
    ('goh', 'German, Old High'),
    ('gon', 'Gondi'),
    ('gor', 'Gorontalo'),
    ('got', 'Gothic'),
    ('grb', 'Grebo'),
    ('grc', 'Greek, Ancient'),
    ('gre', 'el', 'Greek, Modern'),
    ('grn', 'gn', 'Guarani'),
    ('gsw', 'Swiss German'),
    ('guj', 'gu', 'Gujarati'),
    ('gwi', "Gwich'in"),
    ('hai', 'Haida'),
    ('hat', 'ht', 'Haitian'),
    ('hau', 'ha', 'Hausa'),
    ('haw', 'Hawaiian'),
    ('heb', 'he', 'Hebrew'),
    ('her', 'hz', 'Herero'),
    ('hil', 'Hiligaynon'),
    ('him', 'Himachali'),
    ('hin', 'hi', 'Hindi'),
    ('hit', 'Hittite'),
    ('hmn', 'Hmong'),
    ('hmo', 'ho', 'Hiri Motu'),
    ('hsb', 'Upper Sorbian'),
    ('hun', 'hu', 'Hungarian'),
    ('hup', 'Hupa'),
    ('iba', 'Iban'),
    ('ibo', 'ig', 'Igbo'),
    ('ice', 'is', 'Icelandic'),
    ('ido', 'io', 'Ido'),
    ('iii', 'ii', 'Sichuan Yi'),
    ('ijo', 'Ijo languages'),
    ('iku', 'iu', 'Inuktitut'),
    ('ile', 'ie', 'Interlingue'),
    ('ilo', 'Iloko'),
    ('ina', 'ia', 'Interlingua'),
    ('inc', 'Indic'),
    ('ind', 'id', 'Indonesian'),
    ('ine', 'Indo-European'),
    ('inh', 'Ingush'),
    ('ipk', 'ik', 'Inupiaq'),
    ('ira', 'Iranian'),
    ('iro', 'Iroquoian languages'),
    ('ita', 'it', 'Italian'),
    ('jav', 'jv', 'Javanese'),
    ('jbo', 'Lojban'),
    ('jpn', 'ja', 'Japanese'),
    ('jpr', 'Judeo-Persian'),
    ('jrb', 'Judeo-Arabic'),
    ('kaa', 'Kara-Kalpak'),
    ('kab', 'Kabyle'),
    ('kac', 'Kachin'),
    ('kal', 'kl', 'Kalaallisut'),
    ('kam', 'Kamba'),
    ('kan', 'kn', 'Kannada'),
    ('kar', 'Karen languages'),
    ('kas', 'ks', 'Kashmiri'),
    ('kau', 'kr', 'Kanuri'),
    ('kaw', 'Kawi'),
    ('kaz', 'kk', 'Kazakh'),
    ('kbd', 'Kabardian'),
    ('kha', 'Khasi'),
    ('khi', 'Khoisan'),
    ('khm', 'km', 'Central Khmer'),
    ('kho', 'Khotanese'),
    ('kik', 'ki', 'Kikuyu'),
    ('kin', 'rw', 'Kinyarwanda'),
    ('kir', 'ky', 'Kirghiz'),
    ('kmb', 'Kimbundu'),
    ('kok', 'Konkani'),
    ('kom', 'kv', 'Komi'),
    ('kon', 'kg', 'Kongo'),
    ('kor', 'ko', 'Korean'),
    ('kos', 'Kosraean'),
    ('kpe', 'Kpelle'),
    ('krc', 'Karachay-Balkar'),
    ('krl', 'Karelian'),
    ('kro', 'Kru languages'),
    ('kru', 'Kurukh'),
    ('kua', 'kj', 'Kuanyama'),
    ('kum', 'Kumyk'),
    ('kur', 'ku', 'Kurdish'),
    ('kut', 'Kutenai'),
    ('lad', 'Ladino'),
    ('lah', 'Lahnda'),
    ('lam', 'Lamba'),
    ('lao', 'lo', 'Lao'),
    ('lat', 'la', 'Latin'),
    ('lav', 'lv', 'Latvian'),
    ('lez', 'Lezghian'),
    ('lim', 'li', 'Limburgan'),
    ('lin', 'ln', 'Lingala'),
    ('lit', 'lt', 'Lithuanian'),
    ('lol', 'Mongo'),
    ('loz', 'Lozi'),
    ('ltz', 'lb', 'Luxembourgish'),
    ('lua', 'Luba-Lulua'),
    ('lub', 'lu', 'Luba-Katanga'),
    ('lug', 'lg', 'Ganda'),
    ('lui', 'Luiseno'),
    ('lun', 'Lunda'),
    ('luo', 'Luo'),
    ('lus', 'Lushai'),
    ('mac', 'mk', 'Macedonian'),
    ('mad', 'Madurese'),
    ('mag', 'Magahi'),
    ('mah', 'mh', 'Marshallese'),
    ('mai', 'Maithili'),
    ('mak', 'Makasar'),
    ('mal', 'ml', 'Malayalam'),
    ('man', 'Mandingo'),
    ('mao', 'mi', 'Maori'),
    ('map', 'Austronesian'),
    ('mar', 'mr', 'Marathi'),
    ('mas', 'Masai'),
    ('may', 'ms', 'Malay'),
    ('mdf', 'Moksha'),
    ('mdr', 'Mandar'),
    ('men', 'Mende'),
    ('mga', 'Irish, Middle'),
    ('mic', "Mi'kmaq"),
    ('min', 'Minangkabau'),
    ('mis', 'Uncoded languages'),
    ('mkh', 'Mon-Khmer'),
    ('mlg', 'mg', 'Malagasy'),
    ('mlt', 'mt', 'Maltese'),
    ('mnc', 'Manchu'),
    ('mni', 'Manipuri'),
    ('mno', 'Manobo languages'),
    ('moh', 'Mohawk'),
    ('mol', 'mo', 'Moldavian'),
    ('mon', 'mn', 'Mongolian'),
    ('mos', 'Mossi'),
    ('mul', 'Multiple languages'),
    ('mun', 'Munda languages'),
    ('mus', 'Creek'),
    ('mwl', 'Mirandese'),
    ('mwr', 'Marwari'),
    ('myn', 'Mayan languages'),
    ('myv', 'Erzya'),
    ('nah', 'Nahuatl languages'),
    ('nai', 'North American Indian'),
    ('nap', 'Neapolitan'),
    ('nau', 'na', 'Nauru'),
    ('nav', 'nv', 'Navajo'),
    ('nbl', 'nr', 'Ndebele, South'),
    ('nde', 'nd', 'Ndebele, North'),
    ('ndo', 'ng', 'Ndonga'),
    ('nds', 'Low German'),
    ('nep', 'ne', 'Nepali'),
    ('new', 'Nepal Bhasa'),
    ('nia', 'Nias'),
    ('nic', 'Niger-Kordofanian'),
    ('niu', 'Niuean'),
    ('nno', 'nn', 'Norwegian Nynorsk'),
    ('nob', 'nb', 'Bokm\xe5l, Norwegian'),
    ('nog', 'Nogai'),
    ('non', 'Norse, Old'),
    ('nor', 'no', 'Norwegian'),
    ('nqo', "N'Ko"),
    ('nso', 'Pedi'),
    ('nub', 'Nubian languages'),
    ('nwc', 'Classical Newari'),
    ('nya', 'ny', 'Chichewa'),
    ('nym', 'Nyamwezi'),
    ('nyn', 'Nyankole'),
    ('nyo', 'Nyoro'),
    ('nzi', 'Nzima'),
    ('oci', 'oc', 'Occitan'),
    ('oji', 'oj', 'Ojibwa'),
    ('ori', 'or', 'Oriya'),
    ('orm', 'om', 'Oromo'),
    ('osa', 'Osage'),
    ('oss', 'os', 'Ossetian'),
    ('ota', 'Turkish, Ottoman'),
    ('oto', 'Otomian languages'),
    ('paa', 'Papuan'),
    ('pag', 'Pangasinan'),
    ('pal', 'Pahlavi'),
    ('pam', 'Pampanga'),
    ('pan', 'pa', 'Panjabi'),
    ('pap', 'Papiamento'),
    ('pau', 'Palauan'),
    ('peo', 'Persian, Old'),
    ('per', 'fa', 'Persian'),
    ('phi', 'Philippine'),
    ('phn', 'Phoenician'),
    ('pli', 'pi', 'Pali'),
    ('pol', 'pl', 'Polish'),
    ('pon', 'Pohnpeian'),
    ('por', 'pt', 'Portuguese'),
    ('pra', 'Prakrit languages'),
    ('pro', 'Proven\xe7al, Old'),
    ('pus', 'ps', 'Pushto'),
    ('qaa-qtz', 'Reserved for local use'),
    ('que', 'qu', 'Quechua'),
    ('raj', 'Rajasthani'),
    ('rap', 'Rapanui'),
    ('rar', 'Rarotongan'),
    ('roa', 'Romance'),
    ('roh', 'rm', 'Romansh'),
    ('rom', 'Romany'),
    ('rum', 'ro', 'Romanian'),
    ('run', 'rn', 'Rundi'),
    ('rup', 'Aromanian'),
    ('rus', 'ru', 'Russian'),
    ('sad', 'Sandawe'),
    ('sag', 'sg', 'Sango'),
    ('sah', 'Yakut'),
    ('sai', 'South American Indian'),
    ('sal', 'Salishan languages'),
    ('sam', 'Samaritan Aramaic'),
    ('san', 'sa', 'Sanskrit'),
    ('sas', 'Sasak'),
    ('sat', 'Santali'),
    ('scc', 'sr', 'Serbian'),
    ('scn', 'Sicilian'),
    ('sco', 'Scots'),
    ('scr', 'hr', 'Croatian'),
    ('sel', 'Selkup'),
    ('sem', 'Semitic'),
    ('sga', 'Irish, Old'),
    ('sgn', 'Sign Languages'),
    ('shn', 'Shan'),
    ('sid', 'Sidamo'),
    ('sin', 'si', 'Sinhala'),
    ('sio', 'Siouan languages'),
    ('sit', 'Sino-Tibetan'),
    ('sla', 'Slavic'),
    ('slo', 'sk', 'Slovak'),
    ('slv', 'sl', 'Slovenian'),
    ('sma', 'Southern Sami'),
    ('sme', 'se', 'Northern Sami'),
    ('smi', 'Sami languages'),
    ('smj', 'Lule Sami'),
    ('smn', 'Inari Sami'),
    ('smo', 'sm', 'Samoan'),
    ('sms', 'Skolt Sami'),
    ('sna', 'sn', 'Shona'),
    ('snd', 'sd', 'Sindhi'),
    ('snk', 'Soninke'),
    ('sog', 'Sogdian'),
    ('som', 'so', 'Somali'),
    ('son', 'Songhai languages'),
    ('sot', 'st', 'Sotho, Southern'),
    ('spa', 'es', 'Spanish'),
    ('srd', 'sc', 'Sardinian'),
    ('srn', 'Sranan Tongo'),
    ('srr', 'Serer'),
    ('ssa', 'Nilo-Saharan'),
    ('ssw', 'ss', 'Swati'),
    ('suk', 'Sukuma'),
    ('sun', 'su', 'Sundanese'),
    ('sus', 'Susu'),
    ('sux', 'Sumerian'),
    ('swa', 'sw', 'Swahili'),
    ('swe', 'sv', 'Swedish'),
    ('syc', 'Classical Syriac'),
    ('syr', 'Syriac'),
    ('tah', 'ty', 'Tahitian'),
    ('tai', 'Tai'),
    ('tam', 'ta', 'Tamil'),
    ('tat', 'tt', 'Tatar'),
    ('tel', 'te', 'Telugu'),
    ('tem', 'Timne'),
    ('ter', 'Tereno'),
    ('tet', 'Tetum'),
    ('tgk', 'tg', 'Tajik'),
    ('tgl', 'tl', 'Tagalog'),
    ('tha', 'th', 'Thai'),
    ('tib', 'bo', 'Tibetan'),
    ('tig', 'Tigre'),
    ('tir', 'ti', 'Tigrinya'),
    ('tiv', 'Tiv'),
    ('tkl', 'Tokelau'),
    ('tlh', 'Klingon'),
    ('tli', 'Tlingit'),
    ('tmh', 'Tamashek'),
    ('tog', 'Tonga'),
    ('ton', 'to', 'Tonga'),
    ('tpi', 'Tok Pisin'),
    ('tsi', 'Tsimshian'),
    ('tsn', 'tn', 'Tswana'),
    ('tso', 'ts', 'Tsonga'),
    ('tuk', 'tk', 'Turkmen'),
    ('tum', 'Tumbuka'),
    ('tup', 'Tupi languages'),
    ('tur', 'tr', 'Turkish'),
    ('tut', 'Altaic'),
    ('tvl', 'Tuvalu'),
    ('twi', 'tw', 'Twi'),
    ('tyv', 'Tuvinian'),
    ('udm', 'Udmurt'),
    ('uga', 'Ugaritic'),
    ('uig', 'ug', 'Uighur'),
    ('ukr', 'uk', 'Ukrainian'),
    ('umb', 'Umbundu'),
    ('und', 'Undetermined'),
    ('urd', 'ur', 'Urdu'),
    ('uzb', 'uz', 'Uzbek'),
    ('vai', 'Vai'),
    ('ven', 've', 'Venda'),
    ('vie', 'vi', 'Vietnamese'),
    ('vol', 'vo', 'Volap\xfck'),
    ('vot', 'Votic'),
    ('wak', 'Wakashan languages'),
    ('wal', 'Walamo'),
    ('war', 'Waray'),
    ('was', 'Washo'),
    ('wel', 'cy', 'Welsh'),
    ('wen', 'Sorbian languages'),
    ('wln', 'wa', 'Walloon'),
    ('wol', 'wo', 'Wolof'),
    ('xal', 'Kalmyk'),
    ('xho', 'xh', 'Xhosa'),
    ('yao', 'Yao'),
    ('yap', 'Yapese'),
    ('yid', 'yi', 'Yiddish'),
    ('yor', 'yo', 'Yoruba'),
    ('ypk', 'Yupik languages'),
    ('zap', 'Zapotec'),
    ('zbl', 'Blissymbols'),
    ('zen', 'Zenaga'),
    ('zha', 'za', 'Zhuang'),
    ('znd', 'Zande languages'),
    ('zul', 'zu', 'Zulu'),
    ('zun', 'Zuni'),
    ('zxx', 'No linguistic content'),
    ('zza', 'Zaza'),
)
536