1 /* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil; -*- */
2 /* vim:set et sts=4: */
3 /* ibus - The Input Bus
4  * Copyright (C) 2018 Takao Fujiwara <takao.fujiwara1@gmail.com>
5  * Copyright (C) 2018 Red Hat, Inc.
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with this library; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301
20  * USA
21  */
22 
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26 
27 #include <glib.h>
28 #include <stdlib.h>
29 #include <string.h>
30 
31 #ifdef HAVE_LOCALE_H
32 #include <locale.h>
33 #endif
34 
35 #include "ibusunicode.h"
36 
/* Prefix looked for in the second tab-separated field of an "@@@" line of
 * the names list; the text following it is stored as the Unicode version. */
#define NAMES_LIST_SUBJECT "The Unicode Standard"
/* Prefix looked for in '#' comment lines of the blocks file; the text
 * following it is stored as the Unicode version. */
#define BLOCKS_SUBJECT "Blocks-"

/* Number of leading lines of this source file (its license header) that
 * ucd_block_translatable_save() copies into the file it generates. */
#define LICENSE_LINES 21
42 
/* Selects which line parser ucd_parse_file() applies to its input. */
typedef enum
{
    UCD_NAMES_LIST = 0,     /* parse with ucd_names_list_parse_line() */
    UCD_BLOCKS     = 1      /* parse with ucd_blocks_parse_line() */
} UCDType;
48 
49 typedef struct _UnicodeData UnicodeData;
50 typedef struct _UnicodeDataIndex UnicodeDataIndex;
51 
52 struct _UnicodeData{
53     gunichar code;
54     gchar   *name;
55     gchar   *alias;
56     gunichar start;
57     gunichar end;
58     GSList  *list;
59 };
60 
61 struct _UnicodeDataIndex {
62     gchar *index;
63     UnicodeData *data_list;
64 };
65 
66 static gchar *unicode_version;
67 
68 static void
unicode_data_new_object(UnicodeData * data)69 unicode_data_new_object (UnicodeData *data)
70 {
71     g_return_if_fail (data != NULL);
72     if (!data->name) {
73         g_warning ("No name in U+%04X", data->code);
74     }
75     IBusUnicodeData *unicode =
76             ibus_unicode_data_new ("code",
77                                    data->code,
78                                    "name",
79                                    data->name ? g_strdup (data->name)
80                                            : g_strdup (""),
81                                    "alias",
82                                    data->alias ? g_strdup (data->alias)
83                                            : g_strdup (""),
84                                    NULL);
85     data->list = g_slist_append (data->list, unicode);
86 }
87 
88 static void
unicode_block_new_object(UnicodeData * data)89 unicode_block_new_object (UnicodeData *data)
90 {
91     g_return_if_fail (data != NULL);
92     if (!data->name) {
93         g_warning ("No name in U+%04X", data->start);
94     }
95     IBusUnicodeBlock *block =
96             ibus_unicode_block_new ("start",
97                                     data->start,
98                                     "end",
99                                     data->end,
100                                     "name",
101                                     data->name ? g_strdup (data->name)
102                                            : g_strdup (""),
103                                    NULL);
104     data->list = g_slist_append (data->list, block);
105 }
106 
107 static void
unicode_data_reset(UnicodeData * data)108 unicode_data_reset (UnicodeData *data)
109 {
110     g_return_if_fail (data != NULL);
111     data->code = 0;
112     g_clear_pointer (&data->name, g_free);
113     g_clear_pointer (&data->alias, g_free);
114     data->start = 0;
115     data->end = 0;
116 }
117 
118 static gboolean
ucd_names_list_parse_comment(const gchar * line)119 ucd_names_list_parse_comment (const gchar *line)
120 {
121     static gboolean has_version = FALSE;
122 
123     if (has_version)
124         return TRUE;
125     if (strlen (line) > 4 && strncmp (line, "@@@", 3) == 0) {
126         gchar **elements = g_strsplit (line, "\t", -1);
127         if (strncmp (elements[1], NAMES_LIST_SUBJECT,
128             strlen (NAMES_LIST_SUBJECT)) == 0) {
129             unicode_version =
130                     g_strdup (elements[1] + strlen (NAMES_LIST_SUBJECT) + 1);
131             has_version = TRUE;
132         }
133         g_strfreev (elements);
134     }
135     return TRUE;
136 }
137 
138 static gboolean
ucd_names_list_parse_alias(const gchar * line,UnicodeData * data)139 ucd_names_list_parse_alias (const gchar *line,
140                             UnicodeData *data)
141 {
142     g_return_val_if_fail (line != NULL, FALSE);
143     g_return_val_if_fail (data != NULL, FALSE);
144 
145     if (*line == '\0')
146         return FALSE;
147     data->alias = g_strdup (line);
148     return TRUE;
149 }
150 
151 static gboolean
ucd_names_list_parse_indent_line(const gchar * line,UnicodeData * data)152 ucd_names_list_parse_indent_line (const gchar *line,
153                                   UnicodeData *data)
154 {
155     g_return_val_if_fail (line != NULL, FALSE);
156 
157     switch (*line) {
158     case '\0':
159         return FALSE;
160     case '=':
161         line++;
162         while (*line == ' ') line++;
163         return ucd_names_list_parse_alias (line, data);
164     default:;
165     }
166     return TRUE;
167 }
168 
169 static gboolean
ucd_names_list_parse_line(const gchar * line,UnicodeData * data)170 ucd_names_list_parse_line (const gchar *line,
171                            UnicodeData *data)
172 {
173     g_return_val_if_fail (line != NULL, FALSE);
174 
175     switch (*line) {
176     case '\0':
177         return TRUE;
178     case ';':
179         return TRUE;
180     case '@':
181         return ucd_names_list_parse_comment (line);
182     case '\t':
183         return ucd_names_list_parse_indent_line (line + 1, data);
184     default:;
185     }
186     if (g_ascii_isxdigit (*line)) {
187         gchar **elements = g_strsplit (line, "\t", -1);
188         gunichar code;
189         gchar *name;
190 
191         if (g_strv_length (elements) < 2) {
192             g_strfreev (elements);
193             return FALSE;
194         }
195         code = g_ascii_strtoull (elements[0], NULL, 16);
196         name = g_strdup (elements[1]);
197         if (data->name) {
198             unicode_data_new_object (data);
199             unicode_data_reset (data);
200         }
201         data->code = code;
202         data->name = name;
203     }
204     return TRUE;
205 }
206 
207 static gboolean
ucd_blocks_parse_comment(const gchar * line)208 ucd_blocks_parse_comment (const gchar *line)
209 {
210     static gboolean has_version = FALSE;
211 
212     g_return_val_if_fail (line != NULL, FALSE);
213 
214     if (has_version)
215         return TRUE;
216     while (*line == ' ') line++;
217     if (strlen (line) > strlen (BLOCKS_SUBJECT) &&
218         strncmp (line, BLOCKS_SUBJECT, strlen (BLOCKS_SUBJECT)) == 0) {
219             unicode_version = g_strdup (line + strlen (BLOCKS_SUBJECT) + 1);
220             has_version = TRUE;
221     }
222     return TRUE;
223 }
224 
225 static gboolean
ucd_blocks_parse_line(const gchar * line,UnicodeData * data)226 ucd_blocks_parse_line (const gchar *line,
227                        UnicodeData *data)
228 {
229     g_return_val_if_fail (line != NULL, FALSE);
230 
231     switch (*line) {
232     case '\0':
233         return TRUE;
234     case '#':
235         return ucd_blocks_parse_comment (line + 1);
236     default:;
237     }
238     if (g_ascii_isxdigit (*line)) {
239         gchar *endptr = NULL;
240         gunichar start = g_ascii_strtoull (line, &endptr, 16);
241         gunichar end;
242         gchar *name = NULL;
243 
244         if (endptr == NULL || *endptr == '\0')
245             return FALSE;
246         while (*endptr == '.') endptr++;
247         line = endptr;
248         endptr = NULL;
249         end = g_ascii_strtoull (line, &endptr, 16);
250         if (endptr == NULL || *endptr == '\0')
251             return FALSE;
252         while (*endptr == ';') endptr++;
253         while (*endptr == ' ') endptr++;
254         if (*endptr == '\0')
255             return FALSE;
256         name = g_strdup (endptr);
257         if (data->name) {
258             unicode_block_new_object (data);
259             unicode_data_reset (data);
260         }
261         data->start = start;
262         data->end = end;
263         data->name = name;
264     }
265     return TRUE;
266 }
267 
268 static gboolean
ucd_parse_file(const gchar * filename,GSList ** list,UCDType type)269 ucd_parse_file (const gchar *filename,
270                 GSList     **list,
271                 UCDType      type)
272 {
273     UnicodeData data = { 0, };
274     gchar *content = NULL;
275     gsize length = 0;
276     GError *error = NULL;
277     gchar *head, *end, *line;
278     int n = 1;
279 
280     g_return_val_if_fail (filename != NULL, FALSE);
281     g_return_val_if_fail (list != NULL, FALSE);
282 
283     if (!g_file_get_contents (filename, &content, &length, &error)) {
284         g_warning ("Failed to load %s: %s",
285                    filename, error ? error->message : "");
286         goto failed_to_parse_ucd_names_list;
287     }
288     head = end = content;
289     while (*end == '\n' && end - content < length) {
290         end++;
291         n++;
292     }
293     head = end;
294     while (end - content < length) {
295         while (*end != '\n' && end - content < length)
296             end++;
297         if (end - content >= length)
298             break;
299         line = g_strndup (head, end - head);
300         switch (type) {
301         case UCD_NAMES_LIST:
302             if (!ucd_names_list_parse_line (line, &data)) {
303                 g_warning ("parse error #%d in %s version %s: %s",
304                            n, filename,
305                            unicode_version ? unicode_version : "(null)",
306                            line);
307             }
308             break;
309         case UCD_BLOCKS:
310             if (!ucd_blocks_parse_line (line, &data)) {
311                 g_warning ("parse error #%d in %s version %s: %s",
312                            n, filename,
313                            unicode_version ? unicode_version : "(null)",
314                            line);
315             }
316             break;
317         default:
318             abort ();
319         }
320         while (*end == '\n' && end - content < length) {
321             end++;
322             n++;
323         }
324         g_free (line);
325         head = end;
326     }
327     if (data.name != NULL) {
328         switch (type) {
329         case UCD_NAMES_LIST:
330             unicode_data_new_object (&data);
331             break;
332         case UCD_BLOCKS:
333             unicode_block_new_object (&data);
334             break;
335         default:;
336         }
337         unicode_data_reset (&data);
338     }
339     g_free (content);
340     *list = data.list;
341     return TRUE;
342 
343 failed_to_parse_ucd_names_list:
344     if (error)
345         g_error_free (error);
346     g_clear_pointer (&content, g_free);
347     *list = data.list;
348     return FALSE;
349 }
350 
351 static void
block_list_dump(IBusUnicodeBlock * block,GString * buff)352 block_list_dump (IBusUnicodeBlock *block,
353                  GString          *buff)
354 {
355     g_return_if_fail (buff != NULL);
356 
357     g_string_append (buff, "    /* TRANSLATORS: You might refer the "         \
358                            "translations from gucharmap with\n"               \
359                            "                    the following command:\n"     \
360                            "       msgmerge -C gucharmap.po ibus.po "         \
361                            "ibus.pot */\n");
362     gchar *line = g_strdup_printf ("    N_(\"%s\"),\n",
363                                    ibus_unicode_block_get_name (block));
364     g_string_append (buff, line);
365 }
366 
367 static void
ucd_block_translatable_save(const gchar * filename,GSList * blocks_list)368 ucd_block_translatable_save (const gchar *filename,
369                              GSList      *blocks_list)
370 {
371     gchar *content = NULL;
372     gsize length = 0;
373     GError *error = NULL;
374     gchar *p;
375     GString *buff = NULL;
376     int i;
377     GSList *list = blocks_list;
378 
379     g_return_if_fail (filename != NULL);
380     g_return_if_fail (list != NULL);
381 
382     if (!g_file_get_contents (__FILE__, &content, &length, &error)) {
383         g_warning ("Failed to load %s: %s", __FILE__, error->message);
384         g_clear_pointer (&error, g_error_free);
385         return;
386     }
387 
388     buff = g_string_new (NULL);
389     p = content;
390     for (i = 0; i < LICENSE_LINES; i++, p++) {
391         if ((p = strchr (p, '\n')) == NULL)
392             break;
393     }
394     if (p != NULL) {
395         g_string_append (buff, g_strndup (content, p - content));
396         g_string_append_c (buff, '\n');
397     }
398     g_clear_pointer (&content, g_free);
399 
400     g_string_append (buff, g_strdup ("\n"));
401     g_string_append (buff, g_strdup_printf ("/* This file is generated by %s. */", __FILE__));
402     g_string_append (buff, g_strdup ("\n"));
403     g_string_append (buff, g_strdup ("include <glib/gi18n.h>\n"));
404     g_string_append (buff, g_strdup ("\n"));
405     g_string_append (buff, g_strdup ("#ifndef __IBUS_UNICODE_GEN_H_\n"));
406     g_string_append (buff, g_strdup ("#define __IBUS_UNICODE_GEN_H_\n"));
407     g_string_append (buff, g_strdup ("const static char *unicode_blocks[] = {\n"));
408     g_slist_foreach (list, (GFunc)block_list_dump, buff);
409     g_string_append (buff, g_strdup ("};\n"));
410     g_string_append (buff, g_strdup ("#endif\n"));
411 
412     if (!g_file_set_contents (filename, buff->str, -1, &error)) {
413         g_warning ("Failed to save emoji category file %s: %s", filename, error->message);
414         g_error_free (error);
415     }
416 
417     g_string_free (buff, TRUE);
418 }
419 
420 int
main(int argc,char * argv[])421 main (int argc, char *argv[])
422 {
423     gchar *prgname;
424     gchar *input_names_list = NULL;
425     gchar *input_blocks = NULL;
426     gchar *output_names_list = NULL;
427     gchar *output_blocks = NULL;
428     gchar *output_blocks_trans = NULL;
429     GOptionEntry     entries[] = {
430         { "input-names-list", 'n', 0, G_OPTION_ARG_STRING, &input_names_list,
431           "Parse NamesList.txt FILE in unicode.org ",
432           "FILE"
433         },
434         { "input-blocks", 'b', 0, G_OPTION_ARG_STRING, &input_blocks,
435           "Parse Blocks.txt FILE in unicode.org ",
436           "FILE"
437         },
438         { "output-names-list", 'o', 0, G_OPTION_ARG_STRING, &output_names_list,
439           "Save the Unicode data as FILE",
440           "FILE"
441         },
442         { "output-blocks", 'B', 0, G_OPTION_ARG_STRING, &output_blocks,
443           "Save the Unicode block list as FILE",
444           "FILE"
445         },
446         { "output-blocks-trans", 'C', 0, G_OPTION_ARG_STRING,
447           &output_blocks_trans,
448           "Save the translatable Unicode blocks as FILE",
449           "FILE"
450         },
451         { NULL }
452     };
453     GOptionContext *context;
454     GError *error = NULL;
455     GSList *names_list = NULL;
456     GSList *blocks_list = NULL;
457 
458 #ifdef HAVE_LOCALE_H
459     /* To output emoji warnings. */
460     setlocale (LC_ALL, "");
461 #endif
462 
463     prgname = g_path_get_basename (argv[0]);
464     g_set_prgname (prgname);
465     g_free (prgname);
466 
467     context = g_option_context_new (NULL);
468     g_option_context_add_main_entries (context, entries, NULL);
469 
470     if (argc < 3) {
471         g_print ("%s", g_option_context_get_help (context, TRUE, NULL));
472         g_option_context_free (context);
473         return -1;
474     }
475 
476     if (!g_option_context_parse (context, &argc, &argv, &error)) {
477         g_warning ("Failed options: %s", error->message);
478         g_error_free (error);
479         return -1;
480     }
481     g_option_context_free (context);
482 
483     if (input_names_list) {
484         ucd_parse_file (input_names_list, &names_list, UCD_NAMES_LIST);
485         g_free (input_names_list);
486     }
487     if (output_names_list && names_list)
488         ibus_unicode_data_save (output_names_list, names_list);
489     g_free (output_names_list);
490 
491     if (input_blocks) {
492         ucd_parse_file (input_blocks, &blocks_list, UCD_BLOCKS);
493         g_free (input_blocks);
494     }
495     if (output_blocks && blocks_list)
496         ibus_unicode_block_save (output_blocks, blocks_list);
497     if (output_blocks_trans && blocks_list)
498         ucd_block_translatable_save (output_blocks_trans, blocks_list);
499     g_free (output_blocks);
500 
501     g_free (unicode_version);
502     return 0;
503 }
504