1 /* GStreamer language codes and names utility functions
2  * Copyright (C) 2009 Tim-Philipp Müller <tim centricular net>
3  *
4  * This library is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Library General Public
6  * License as published by the Free Software Foundation; either
7  * version 2 of the License, or (at your option) any later version.
8  *
9  * This library is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * Library General Public License for more details.
13  *
14  * You should have received a copy of the GNU Library General Public
15  * License along with this library; if not, write to the
16  * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
17  * Boston, MA 02110-1301, USA.
18  */
19 
20 /**
21  * SECTION:gsttaglanguagecodes
22  * @title: ISO-639 lang mappings
23  * @short_description: mappings for ISO-639 language codes and names
24  * @see_also: #GstTagList
25  *
26  * Provides helper functions to convert between the various ISO-639 language
27  * codes, and to map language codes to language names.
28  *
29  */
30 
31 #ifdef HAVE_CONFIG_H
32 #include "config.h"
33 #endif
34 
35 #undef GETTEXT_PACKAGE
36 #define GETTEXT_PACKAGE "iso_639"
37 
38 #define ISO_639_XML_PATH ISO_CODES_PREFIX "/share/xml/iso-codes/iso_639.xml"
39 #define ISO_CODES_LOCALEDIR ISO_CODES_PREFIX "/share/locale"
40 
41 #include <gst/gst-i18n-plugin.h>
42 #include <gst/gst.h>
43 
44 #include <string.h>
45 #include <stdlib.h>
46 
47 #include "tag.h"
48 #include "lang-tables.dat"
49 
50 #ifndef GST_DISABLE_GST_DEBUG
51 
52 #define GST_CAT_DEFAULT ensure_debug_category()
53 
54 static GstDebugCategory *
ensure_debug_category(void)55 ensure_debug_category (void)
56 {
57   static gsize cat_gonce = 0;
58 
59   if (g_once_init_enter (&cat_gonce)) {
60     gsize cat_done;
61 
62     cat_done = (gsize) _gst_debug_category_new ("tag-langcodes", 0,
63         "GstTag language codes and names");
64 
65     g_once_init_leave (&cat_gonce, cat_done);
66   }
67 
68   return (GstDebugCategory *) cat_gonce;
69 }
70 
71 #else
72 
73 #define ensure_debug_category() /* NOOP */
74 
75 #endif /* GST_DISABLE_GST_DEBUG */
76 
77 /* ------------------------------------------------------------------------- */
78 
79 /* Loading and initing */
80 
81 #if defined(HAVE_ISO_CODES)
82 static const gchar *
get_val(const gchar ** names,const gchar ** vals,const gchar * name)83 get_val (const gchar ** names, const gchar ** vals, const gchar * name)
84 {
85   while (names != NULL && *names != NULL) {
86     if (strcmp (*names, name) == 0)
87       return *vals;
88     ++names;
89     ++vals;
90   }
91   return NULL;
92 }
93 
94 static void
parse_start_element(GMarkupParseContext * ctx,const gchar * element_name,const gchar ** attr_names,const gchar ** attr_vals,gpointer user_data,GError ** error)95 parse_start_element (GMarkupParseContext * ctx, const gchar * element_name,
96     const gchar ** attr_names, const gchar ** attr_vals,
97     gpointer user_data, GError ** error)
98 {
99   GHashTable *ht = (GHashTable *) user_data;
100   const gchar *c1, *c2t, *c2b, *name, *tname;
101 
102   if (strcmp (element_name, "iso_639_entry") != 0)
103     return;
104 
105   c1 = get_val (attr_names, attr_vals, "iso_639_1_code");
106 
107   /* only interested in languages with an ISO 639-1 code for now */
108   if (c1 == NULL)
109     return;
110 
111   c2t = get_val (attr_names, attr_vals, "iso_639_2T_code");
112   c2b = get_val (attr_names, attr_vals, "iso_639_2B_code");
113   name = get_val (attr_names, attr_vals, "name");
114 
115   if (c2t == NULL || c2b == NULL || name == NULL) {
116     GST_WARNING ("broken iso_639.xml entry: c2t=%p, c2b=%p, name=%p", c2t,
117         c2b, name);
118     return;
119   }
120 
121   /* translate language name */
122   tname = _(name);
123 
124   /* if no translation was found, it will return the input string, which we
125    * we don't want to put into the hash table because it will be freed again */
126   if (G_UNLIKELY (tname == name))
127     tname = g_intern_string (name);
128 
129   /* now overwrite default/fallback mappings with names in locale language */
130   g_hash_table_replace (ht, (gpointer) g_intern_string (c1), (gpointer) tname);
131   g_hash_table_replace (ht, (gpointer) g_intern_string (c2b), (gpointer) tname);
132   if (strcmp (c2t, c2b) != 0) {
133     g_hash_table_replace (ht, (gpointer) g_intern_string (c2t),
134         (gpointer) tname);
135   }
136 
137   GST_LOG ("%s %s %s : %s - %s", c1, c2t, c2b, name, tname);
138 }
139 
140 static void
gst_tag_load_iso_639_xml(GHashTable * ht)141 gst_tag_load_iso_639_xml (GHashTable * ht)
142 {
143   GMappedFile *f;
144   GError *err = NULL;
145   gchar *xml_data;
146   gsize xml_len;
147 
148 #ifdef ENABLE_NLS
149   GST_DEBUG ("binding text domain %s to locale dir %s", GETTEXT_PACKAGE,
150       ISO_CODES_LOCALEDIR);
151   bindtextdomain (GETTEXT_PACKAGE, ISO_CODES_LOCALEDIR);
152   bind_textdomain_codeset (GETTEXT_PACKAGE, "UTF-8");
153 #endif
154 
155   f = g_mapped_file_new (ISO_639_XML_PATH, FALSE, NULL);
156   if (f != NULL) {
157     xml_data = (gchar *) g_mapped_file_get_contents (f);
158     xml_len = g_mapped_file_get_length (f);
159   } else {
160     if (!g_file_get_contents (ISO_639_XML_PATH, &xml_data, &xml_len, &err)) {
161       GST_WARNING ("Could not read %s: %s", ISO_639_XML_PATH, err->message);
162       g_error_free (err);
163       return;
164     }
165   }
166 
167   if (g_utf8_validate (xml_data, xml_len, NULL)) {
168     GMarkupParser xml_parser = { parse_start_element, NULL, NULL, NULL, NULL };
169     GMarkupParseContext *ctx;
170 
171     ctx = g_markup_parse_context_new (&xml_parser, 0, ht, NULL);
172     if (!g_markup_parse_context_parse (ctx, xml_data, xml_len, &err)) {
173       GST_WARNING ("Parsing iso_639.xml failed: %s", err->message);
174       g_error_free (err);
175     }
176     g_markup_parse_context_free (ctx);
177   } else {
178     GST_WARNING ("iso_639.xml file is not valid UTF-8");
179     GST_MEMDUMP ("iso_639.xml file", (guint8 *) xml_data, xml_len);
180   }
181 
182   /* ... and clean up */
183   if (f != NULL)
184     g_mapped_file_unref (f);
185   else
186     g_free (xml_data);
187 }
188 #endif /* HAVE_ISO_CODES */
189 
190 static GHashTable *
gst_tag_get_iso_639_ht(void)191 gst_tag_get_iso_639_ht (void)
192 {
193   static gsize once_val = 0;
194   int i;
195 
196   if (g_once_init_enter (&once_val)) {
197     GHashTable *ht;
198     gsize done_val;
199 
200     GST_MEMDUMP ("iso 639 language names (internal default/fallback)",
201         (guint8 *) iso_639_names, sizeof (iso_639_names));
202 
203     /* maps code -> language name; all strings are either interned strings
204      * or const static strings from lang-table.c */
205     ht = g_hash_table_new (g_str_hash, g_str_equal);
206 
207     /* set up default/fallback mappings */
208     for (i = 0; i < G_N_ELEMENTS (iso_639_codes); ++i) {
209       GST_LOG ("%3d %s %s %c%c 0x%04x  %s", i, iso_639_codes[i].iso_639_1,
210           iso_639_codes[i].iso_639_2,
211           ((iso_639_codes[i].flags & ISO_639_FLAG_2B)) ? 'B' : '.',
212           ((iso_639_codes[i].flags & ISO_639_FLAG_2T)) ? 'T' : '.',
213           iso_639_codes[i].name_offset,
214           iso_639_names + iso_639_codes[i].name_offset);
215 
216 #ifdef HAVE_ISO_CODES
217       /* intern these in order to minimise allocations when interning strings
218        * read from the xml file later */
219       g_intern_static_string (iso_639_codes[i].iso_639_1);
220       g_intern_static_string (iso_639_codes[i].iso_639_2);
221       g_intern_static_string (iso_639_names + iso_639_codes[i].name_offset);
222 #endif
223 
224       /* and add default mapping (these strings are always valid) */
225       g_hash_table_insert (ht, (gpointer) iso_639_codes[i].iso_639_1,
226           (gpointer) (iso_639_names + iso_639_codes[i].name_offset));
227       g_hash_table_insert (ht, (gpointer) iso_639_codes[i].iso_639_2,
228           (gpointer) (iso_639_names + iso_639_codes[i].name_offset));
229     }
230 
231 #ifdef HAVE_ISO_CODES
232     {
233       GstClockTime ts = gst_util_get_timestamp ();
234 
235       gst_tag_load_iso_639_xml (ht);
236 
237       ts = gst_util_get_timestamp () - ts;
238       GST_INFO ("iso_639.xml loading took %.2gms", (double) ts / GST_MSECOND);
239     }
240 #else
241     GST_INFO ("iso-codes disabled or not available");
242 #endif
243 
244     done_val = (gsize) ht;
245     g_once_init_leave (&once_val, done_val);
246   }
247 
248   return (GHashTable *) once_val;
249 }
250 
251 /* ------------------------------------------------------------------------- */
252 
/* qsort() comparison callback for arrays of C strings: @p1 and @p2 point to
 * the array elements (i.e. to string pointers), which are compared with
 * strcmp(). */
static int
qsort_strcmp_func (const void *p1, const void *p2)
{
  const char *lhs = *(const char *const *) p1;
  const char *rhs = *(const char *const *) p2;

  return strcmp (lhs, rhs);
}
258 
259 /**
260  * gst_tag_get_language_codes:
261  *
262  * Returns a list of known language codes (in form of two-letter ISO-639-1
263  * codes). This is useful for UIs to build a list of available languages for
264  * tagging purposes (e.g. to tag an audio track appropriately in a video or
265  * audio editor).
266  *
267  * Returns: (transfer full): NULL-terminated string array with two-letter
268  *     language codes. Free with g_strfreev() when no longer needed.
269  */
270 gchar **
gst_tag_get_language_codes(void)271 gst_tag_get_language_codes (void)
272 {
273   GHashTableIter iter;
274   GHashTable *ht;
275   gpointer key;
276   gchar **codes;
277   int i;
278 
279   ensure_debug_category ();
280 
281   ht = gst_tag_get_iso_639_ht ();
282 
283   /* we have at least two keys for each language (-1 code and -2 code) */
284   codes = g_new (gchar *, (g_hash_table_size (ht) / 2) + 1);
285 
286   i = 0;
287   g_hash_table_iter_init (&iter, ht);
288   while (g_hash_table_iter_next (&iter, &key, NULL)) {
289     const gchar *lang_code = key;
290 
291     if (strlen (lang_code) == 2) {
292       codes[i] = g_strdup (lang_code);
293       ++i;
294     }
295   }
296   codes[i] = NULL;
297 
298   /* be nice and sort the list */
299   qsort (&codes[0], i, sizeof (gchar *), qsort_strcmp_func);
300 
301   return codes;
302 }
303 
304 /**
305  * gst_tag_get_language_name:
306  * @language_code: two or three-letter ISO-639 language code
307  *
308  * Returns the name of the language given an ISO-639 language code as
309  * found in a GST_TAG_LANGUAGE_CODE tag. The name will be translated
310  * according to the current locale (if the library was built against the
311  * iso-codes package, otherwise the English name will be returned).
312  *
313  * Language codes are case-sensitive and expected to be lower case.
314  *
315  * Returns: language name in UTF-8 format, or NULL if @language_code could
316  *     not be mapped to a language name. The returned string must not be
317  *     modified and does not need to freed; it will stay valid until the
318  *     application is terminated.
319  */
320 const gchar *
gst_tag_get_language_name(const gchar * language_code)321 gst_tag_get_language_name (const gchar * language_code)
322 {
323   const gchar *lang_name;
324   GHashTable *ht;
325 
326   g_return_val_if_fail (language_code != NULL, NULL);
327 
328   ensure_debug_category ();
329 
330   ht = gst_tag_get_iso_639_ht ();
331 
332   lang_name = g_hash_table_lookup (ht, (gpointer) language_code);
333   GST_LOG ("%s -> %s", language_code, GST_STR_NULL (lang_name));
334 
335   return lang_name;
336 }
337 
338 /**
339  * gst_tag_get_language_code_iso_639_1:
340  * @lang_code: ISO-639 language code (e.g. "deu" or "ger" or "de")
341  *
342  * Returns two-letter ISO-639-1 language code given a three-letter ISO-639-2
343  * language code or two-letter ISO-639-1 language code (both are accepted for
344  * convenience).
345  *
346  * Language codes are case-sensitive and expected to be lower case.
347  *
348  * Returns: two-letter ISO-639-1 language code string that maps to @lang_code,
349  *     or NULL if no mapping is known. The returned string must not be
350  *     modified or freed.
351  */
352 const gchar *
gst_tag_get_language_code_iso_639_1(const gchar * lang_code)353 gst_tag_get_language_code_iso_639_1 (const gchar * lang_code)
354 {
355   const gchar *c = NULL;
356   int i;
357 
358   g_return_val_if_fail (lang_code != NULL, NULL);
359 
360   ensure_debug_category ();
361 
362   /* FIXME: we are being a bit inconsistent here in the sense that will only
363    * map the language codes from our static table. Theoretically the iso-codes
364    * XML file might have had additional codes that are now in the hash table.
365    * We keep it simple for now and don't waste memory on additional tables. */
366   for (i = 0; i < G_N_ELEMENTS (iso_639_codes); ++i) {
367     /* we check both codes here, so function can be used in a more versatile
368      * way, to convert a language tag to a two-letter language code and/or
369      * verify an existing code */
370     if (strcmp (lang_code, iso_639_codes[i].iso_639_1) == 0 ||
371         strcmp (lang_code, iso_639_codes[i].iso_639_2) == 0) {
372       c = iso_639_codes[i].iso_639_1;
373       break;
374     }
375   }
376 
377   GST_LOG ("%s -> %s", lang_code, GST_STR_NULL (c));
378 
379   return c;
380 }
381 
382 static const gchar *
gst_tag_get_language_code_iso_639_2X(const gchar * lang_code,guint8 flags)383 gst_tag_get_language_code_iso_639_2X (const gchar * lang_code, guint8 flags)
384 {
385   int i;
386 
387   /* FIXME: we are being a bit inconsistent here in the sense that we will only
388    * map the language codes from our static table. Theoretically the iso-codes
389    * XML file might have had additional codes that are now in the hash table.
390    * We keep it simple for now and don't waste memory on additional tables.
391    * Also, we currently only parse the iso_639.xml file if language names or
392    * a list of all codes is requested, and it'd be nice to keep it like that. */
393   for (i = 0; i < G_N_ELEMENTS (iso_639_codes); ++i) {
394     /* we check both codes here, so function can be used in a more versatile
395      * way, to convert a language tag to a three-letter language code and/or
396      * verify an existing code */
397     if (strcmp (lang_code, iso_639_codes[i].iso_639_1) == 0 ||
398         strcmp (lang_code, iso_639_codes[i].iso_639_2) == 0) {
399       if ((iso_639_codes[i].flags & flags) == flags) {
400         return iso_639_codes[i].iso_639_2;
401       } else if (i > 0 && (iso_639_codes[i - 1].flags & flags) == flags &&
402           iso_639_codes[i].name_offset == iso_639_codes[i - 1].name_offset) {
403         return iso_639_codes[i - 1].iso_639_2;
404       } else if ((i + 1) < G_N_ELEMENTS (iso_639_codes) &&
405           (iso_639_codes[i + 1].flags & flags) == flags &&
406           iso_639_codes[i].name_offset == iso_639_codes[i + 1].name_offset) {
407         return iso_639_codes[i + 1].iso_639_2;
408       }
409     }
410   }
411   return NULL;
412 }
413 
414 /**
415  * gst_tag_get_language_code_iso_639_2T:
416  * @lang_code: ISO-639 language code (e.g. "deu" or "ger" or "de")
417  *
418  * Returns three-letter ISO-639-2 "terminological" language code given a
419  * two-letter ISO-639-1 language code or a three-letter ISO-639-2 language
420  * code (both are accepted for convenience).
421  *
422  * The "terminological" code is derived from the local name of the language
423  * (e.g. "deu" for German instead of "ger"). In most scenarios, the
424  * "terminological" codes are prefered over the "bibliographic" ones.
425  *
426  * Language codes are case-sensitive and expected to be lower case.
427  *
428  * Returns: three-letter ISO-639-2 language code string that maps to @lang_code,
429  *     or NULL if no mapping is known. The returned string must not be
430  *     modified or freed.
431  */
432 const gchar *
gst_tag_get_language_code_iso_639_2T(const gchar * lang_code)433 gst_tag_get_language_code_iso_639_2T (const gchar * lang_code)
434 {
435   const gchar *c;
436 
437   g_return_val_if_fail (lang_code != NULL, NULL);
438 
439   ensure_debug_category ();
440 
441   c = gst_tag_get_language_code_iso_639_2X (lang_code, ISO_639_FLAG_2T);
442 
443   GST_LOG ("%s -> %s", lang_code, GST_STR_NULL (c));
444 
445   return c;
446 }
447 
448 /**
449  * gst_tag_get_language_code_iso_639_2B:
450  * @lang_code: ISO-639 language code (e.g. "deu" or "ger" or "de")
451  *
452  * Returns three-letter ISO-639-2 "bibliographic" language code given a
453  * two-letter ISO-639-1 language code or a three-letter ISO-639-2 language
454  * code (both are accepted for convenience).
455  *
456  * The "bibliographic" code is derived from the English name of the language
457  * (e.g. "ger" for German instead of "de" or "deu"). In most scenarios, the
458  * "terminological" codes are prefered.
459  *
460  * Language codes are case-sensitive and expected to be lower case.
461  *
462  * Returns: three-letter ISO-639-2 language code string that maps to @lang_code,
463  *     or NULL if no mapping is known. The returned string must not be
464  *     modified or freed.
465  */
466 const gchar *
gst_tag_get_language_code_iso_639_2B(const gchar * lang_code)467 gst_tag_get_language_code_iso_639_2B (const gchar * lang_code)
468 {
469   const gchar *c;
470 
471   g_return_val_if_fail (lang_code != NULL, NULL);
472 
473   ensure_debug_category ();
474 
475   c = gst_tag_get_language_code_iso_639_2X (lang_code, ISO_639_FLAG_2B);
476 
477   GST_LOG ("%s -> %s", lang_code, GST_STR_NULL (c));
478 
479   return c;
480 }
481 
482 /**
483  * gst_tag_check_language_code:
484  * @lang_code: ISO-639 language code (e.g. "deu" or "ger" or "de")
485  *
486  * Check if a given string contains a known ISO 639 language code.
487  *
488  * This is useful in situations where it's not clear whether a given
489  * string is a language code (which should be put into a #GST_TAG_LANGUAGE_CODE
490  * tag) or a free-form language name descriptor (which should be put into a
491  * #GST_TAG_LANGUAGE_NAME tag instead).
492  *
493  * Returns: TRUE if the two- or three-letter language code in @lang_code
494  *     is a valid ISO-639 language code.
495  */
496 gboolean
gst_tag_check_language_code(const gchar * lang_code)497 gst_tag_check_language_code (const gchar * lang_code)
498 {
499   return (gst_tag_get_language_code_iso_639_1 (lang_code) != NULL);
500 }
501