ibus-1.5.24/src/unicode-parser.c

/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil; -*- */
/* vim:set et sts=4: */
/* ibus - The Input Bus
 * Copyright (C) 2018 Takao Fujiwara <takao.fujiwara1@gmail.com>
 * Copyright (C) 2018 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301
 * USA
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <glib.h>
#include <stdlib.h>
#include <string.h>

#ifdef HAVE_LOCALE_H
#include <locale.h>
#endif

#include "ibusunicode.h"

/* Text expected after the tab on an "@@@" line of the names list; the
 * Unicode version is taken from whatever follows it on that line. */
#define NAMES_LIST_SUBJECT "The Unicode Standard"
/* Prefix looked for in '#' comment lines of the blocks file; the version
 * is taken from the text after it. */
#define BLOCKS_SUBJECT "Blocks-"

/* Number of leading lines of this source file that
 * ucd_block_translatable_save() copies into the file it generates
 * (the license header). */
#define LICENSE_LINES 21

/* Selects which per-line parser ucd_parse_file() applies to its input. */
typedef enum
{
    /* Lines are handed to ucd_names_list_parse_line(). */
    UCD_NAMES_LIST,
    /* Lines are handed to ucd_blocks_parse_line(). */
    UCD_BLOCKS
} UCDType;

typedef struct _UnicodeData UnicodeData;
typedef struct _UnicodeDataIndex UnicodeDataIndex;

/* Parser state shared by the line parsers: the entry currently being
 * assembled plus the list of objects created so far. */
struct _UnicodeData{
    /* Names-list entry: code point, its name and the most recently seen
     * "=" alias.  name and alias are heap strings owned by this struct. */
    gunichar code;
    gchar   *name;
    gchar   *alias;
    /* Blocks entry: first and last code point of the block; the block name
     * is stored in the name field above. */
    gunichar start;
    gunichar end;
    /* Objects built so far.  unicode_data_reset() leaves this untouched. */
    GSList  *list;
};

/* NOTE(review): not referenced anywhere else in the visible part of this
 * file. */
struct _UnicodeDataIndex {
    gchar *index;
    UnicodeData *data_list;
};

/* Version text extracted from the header of a parsed input file.  It is
 * only read when formatting parse-error warnings and is freed in main(). */
static gchar *unicode_version;

/*
 * Build an IBusUnicodeData from the entry currently held in @data and
 * append it to @data->list.
 *
 * A missing name is reported with a warning and replaced by an empty
 * string; a missing alias is silently replaced by an empty string.
 */
static void
unicode_data_new_object (UnicodeData *data)
{
    IBusUnicodeData *entry;
    gchar *name_copy;
    gchar *alias_copy;

    g_return_if_fail (data != NULL);

    if (data->name == NULL)
        g_warning ("No name in U+%04X", data->code);

    name_copy = g_strdup (data->name != NULL ? data->name : "");
    alias_copy = g_strdup (data->alias != NULL ? data->alias : "");

    entry = ibus_unicode_data_new ("code", data->code,
                                   "name", name_copy,
                                   "alias", alias_copy,
                                   NULL);
    data->list = g_slist_append (data->list, entry);
}

/*
 * Build an IBusUnicodeBlock from the block range and name currently held
 * in @data and append it to @data->list.
 *
 * A missing name is reported with a warning and replaced by an empty
 * string.
 */
static void
unicode_block_new_object (UnicodeData *data)
{
    IBusUnicodeBlock *entry;
    gchar *name_copy;

    g_return_if_fail (data != NULL);

    if (data->name == NULL)
        g_warning ("No name in U+%04X", data->start);

    name_copy = g_strdup (data->name != NULL ? data->name : "");

    entry = ibus_unicode_block_new ("start", data->start,
                                    "end", data->end,
                                    "name", name_copy,
                                    NULL);
    data->list = g_slist_append (data->list, entry);
}

/*
 * Clear the per-entry fields of @data so the next entry can be assembled.
 * The owned strings are freed; the accumulated object list is kept.
 */
static void
unicode_data_reset (UnicodeData *data)
{
    g_return_if_fail (data != NULL);

    g_free (data->name);
    data->name = NULL;
    g_free (data->alias);
    data->alias = NULL;

    data->code = 0;
    data->start = 0;
    data->end = 0;
}

static gboolean
ucd_names_list_parse_comment (const gchar *line)
{
    static gboolean has_version = FALSE;

    if (has_version)
        return TRUE;
    if (strlen (line) > 4 && strncmp (line, "@@@", 3) == 0) {
        gchar **elements = g_strsplit (line, "\t", -1);
        if (strncmp (elements[1], NAMES_LIST_SUBJECT,
            strlen (NAMES_LIST_SUBJECT)) == 0) {
            unicode_version =
                    g_strdup (elements[1] + strlen (NAMES_LIST_SUBJECT) + 1);
            has_version = TRUE;
        }
        g_strfreev (elements);
    }
    return TRUE;
}

/*
 * Store @line as the alias of the entry being assembled in @data.
 *
 * An entry may have several "=" lines; the most recent one replaces any
 * alias stored earlier, and the earlier string is released.
 *
 * Returns: FALSE if @line is empty or an argument is NULL, TRUE otherwise.
 */
static gboolean
ucd_names_list_parse_alias (const gchar *line,
                            UnicodeData *data)
{
    g_return_val_if_fail (line != NULL, FALSE);
    g_return_val_if_fail (data != NULL, FALSE);

    if (*line == '\0')
        return FALSE;
    /* Free the alias of a previous "=" line instead of leaking it. */
    g_free (data->alias);
    data->alias = g_strdup (line);
    return TRUE;
}

/*
 * Handle the remainder of a tab-indented names-list line (the leading tab
 * has already been skipped by the caller).
 *
 * Only "=" lines are interpreted: the text after '=' and any following
 * spaces is passed to ucd_names_list_parse_alias().  Every other kind of
 * indented line is accepted and ignored.
 *
 * Returns: FALSE for an empty remainder or a NULL @line, the result of the
 * alias parser for "=" lines, TRUE otherwise.
 */
static gboolean
ucd_names_list_parse_indent_line (const gchar *line,
                                  UnicodeData *data)
{
    const gchar *text;

    g_return_val_if_fail (line != NULL, FALSE);

    if (*line == '\0')
        return FALSE;
    if (*line != '=')
        return TRUE;

    text = line + 1;
    while (*text == ' ')
        text++;
    return ucd_names_list_parse_alias (text, data);
}

/*
 * Parse one line of the names list.
 *
 *  - empty lines and ';' lines are ignored;
 *  - '@' lines go to ucd_names_list_parse_comment();
 *  - tab-indented lines go to ucd_names_list_parse_indent_line();
 *  - lines starting with a hex digit are "<code><TAB><name>" entries.
 *    The entry previously held in @data (if any) is turned into an object
 *    before @data is refilled with the new code point and name.
 *  - anything else is accepted and ignored.
 *
 * Returns: FALSE when the line is malformed, TRUE otherwise.
 */
static gboolean
ucd_names_list_parse_line (const gchar *line,
                           UnicodeData *data)
{
    gchar **elements;
    gunichar code;
    gchar *name;

    g_return_val_if_fail (line != NULL, FALSE);

    switch (*line) {
    case '\0':
    case ';':
        return TRUE;
    case '@':
        return ucd_names_list_parse_comment (line);
    case '\t':
        return ucd_names_list_parse_indent_line (line + 1, data);
    default:
        break;
    }
    if (!g_ascii_isxdigit (*line))
        return TRUE;

    g_return_val_if_fail (data != NULL, FALSE);

    elements = g_strsplit (line, "\t", -1);
    if (g_strv_length (elements) < 2) {
        g_strfreev (elements);
        return FALSE;
    }
    code = g_ascii_strtoull (elements[0], NULL, 16);
    name = g_strdup (elements[1]);
    /* The split vector is no longer needed once the name is copied. */
    g_strfreev (elements);

    if (data->name) {
        /* Flush the entry collected so far before starting a new one. */
        unicode_data_new_object (data);
        unicode_data_reset (data);
    }
    data->code = code;
    data->name = name;
    return TRUE;
}

static gboolean
ucd_blocks_parse_comment (const gchar *line)
{
    static gboolean has_version = FALSE;

    g_return_val_if_fail (line != NULL, FALSE);

    if (has_version)
        return TRUE;
    while (*line == ' ') line++;
    if (strlen (line) > strlen (BLOCKS_SUBJECT) &&
        strncmp (line, BLOCKS_SUBJECT, strlen (BLOCKS_SUBJECT)) == 0) {
            unicode_version = g_strdup (line + strlen (BLOCKS_SUBJECT) + 1);
            has_version = TRUE;
    }
    return TRUE;
}

/*
 * Parse one line of the blocks file.
 *
 *  - empty lines are ignored;
 *  - '#' lines go to ucd_blocks_parse_comment();
 *  - lines starting with a hex digit are "<start>..<end>; <name>"
 *    entries.  The block previously held in @data (if any) is turned into
 *    an object before @data is refilled with the new range and name.
 *  - anything else is accepted and ignored.
 *
 * Returns: FALSE when the line is malformed, TRUE otherwise.
 */
static gboolean
ucd_blocks_parse_line (const gchar *line,
                       UnicodeData *data)
{
    gchar *endptr = NULL;
    gunichar start;
    gunichar end;

    g_return_val_if_fail (line != NULL, FALSE);

    switch (*line) {
    case '\0':
        return TRUE;
    case '#':
        return ucd_blocks_parse_comment (line + 1);
    default:
        break;
    }
    if (!g_ascii_isxdigit (*line))
        return TRUE;

    g_return_val_if_fail (data != NULL, FALSE);

    start = g_ascii_strtoull (line, &endptr, 16);
    if (endptr == NULL || *endptr == '\0')
        return FALSE;
    while (*endptr == '.')
        endptr++;

    line = endptr;
    endptr = NULL;
    end = g_ascii_strtoull (line, &endptr, 16);
    /* endptr == line means no hex digits followed the dots, i.e. the end
     * of the range is missing. */
    if (endptr == NULL || endptr == line || *endptr == '\0')
        return FALSE;
    while (*endptr == ';')
        endptr++;
    while (*endptr == ' ')
        endptr++;
    if (*endptr == '\0')
        return FALSE;

    if (data->name) {
        /* Flush the block collected so far before starting a new one. */
        unicode_block_new_object (data);
        unicode_data_reset (data);
    }
    data->start = start;
    data->end = end;
    data->name = g_strdup (endptr);
    return TRUE;
}

/*
 * Load @filename and feed it line by line to the parser selected by @type.
 *
 * Blank lines are skipped but still counted, so the line number printed in
 * a parse-error warning matches the input file.  A line that fails to
 * parse only produces a warning; parsing continues with the next line.
 * The last line is parsed even when the file does not end with a newline.
 *
 * @list receives the objects that were built (NULL when the file could not
 * be loaded).
 *
 * Returns: TRUE if the file could be loaded, FALSE otherwise.
 */
static gboolean
ucd_parse_file (const gchar *filename,
                GSList     **list,
                UCDType      type)
{
    UnicodeData data = { 0, };
    gchar *content = NULL;
    gsize length = 0;
    GError *error = NULL;
    const gchar *head, *end, *stop;
    int n = 1;

    g_return_val_if_fail (filename != NULL, FALSE);
    g_return_val_if_fail (list != NULL, FALSE);

    if (!g_file_get_contents (filename, &content, &length, &error)) {
        g_warning ("Failed to load %s: %s",
                   filename, error ? error->message : "");
        if (error)
            g_error_free (error);
        *list = data.list;
        return FALSE;
    }

    stop = content + length;
    head = content;
    /* Skip leading blank lines while keeping the line counter in sync. */
    while (head < stop && *head == '\n') {
        head++;
        n++;
    }

    while (head < stop) {
        gboolean parsed = TRUE;
        gchar *line;

        /* Find the end of the current line; a final line without a
         * terminating newline ends at the end of the buffer. */
        end = head;
        while (end < stop && *end != '\n')
            end++;
        line = g_strndup (head, end - head);

        switch (type) {
        case UCD_NAMES_LIST:
            parsed = ucd_names_list_parse_line (line, &data);
            break;
        case UCD_BLOCKS:
            parsed = ucd_blocks_parse_line (line, &data);
            break;
        default:
            abort ();
        }
        if (!parsed) {
            g_warning ("parse error #%d in %s version %s: %s",
                       n, filename,
                       unicode_version ? unicode_version : "(null)",
                       line);
        }
        g_free (line);

        /* Consume the newline(s) that follow, counting each one. */
        while (end < stop && *end == '\n') {
            end++;
            n++;
        }
        head = end;
    }

    /* The parsers only emit an entry when the next one starts, so the last
     * entry is still pending here. */
    if (data.name != NULL) {
        switch (type) {
        case UCD_NAMES_LIST:
            unicode_data_new_object (&data);
            break;
        case UCD_BLOCKS:
            unicode_block_new_object (&data);
            break;
        default:
            break;
        }
        unicode_data_reset (&data);
    }
    g_free (content);
    *list = data.list;
    return TRUE;
}

/*
 * GFunc callback: append one translatable array element for @block to
 * @buff, preceded by a TRANSLATORS comment.
 *
 * The element is formatted straight into @buff so that no temporary
 * string has to be allocated and freed.
 */
static void
block_list_dump (IBusUnicodeBlock *block,
                 GString          *buff)
{
    g_return_if_fail (buff != NULL);

    g_string_append (buff,
                     "    /* TRANSLATORS: You might refer the "
                     "translations from gucharmap with\n"
                     "                    the following command:\n"
                     "       msgmerge -C gucharmap.po ibus.po "
                     "ibus.pot */\n");
    g_string_append_printf (buff, "    N_(\"%s\"),\n",
                            ibus_unicode_block_get_name (block));
}

/*
 * Write a C header to @filename that lists the name of every block in
 * @blocks_list wrapped in N_().
 *
 * The first LICENSE_LINES lines of this source file (located through
 * __FILE__ at run time) are copied to the top of the output.  If the
 * source file cannot be read nothing is written; if it has fewer lines
 * than LICENSE_LINES the header is omitted.
 */
static void
ucd_block_translatable_save (const gchar *filename,
                             GSList      *blocks_list)
{
    gchar *content = NULL;
    GError *error = NULL;
    const gchar *p;
    GString *buff;
    int i;

    g_return_if_fail (filename != NULL);
    g_return_if_fail (blocks_list != NULL);

    if (!g_file_get_contents (__FILE__, &content, NULL, &error)) {
        g_warning ("Failed to load %s: %s", __FILE__, error->message);
        g_clear_pointer (&error, g_error_free);
        return;
    }

    buff = g_string_new (NULL);

    /* Advance p just past the LICENSE_LINES-th newline; p becomes NULL if
     * the file is shorter than that. */
    p = content;
    for (i = 0; i < LICENSE_LINES; i++) {
        p = strchr (p, '\n');
        if (p == NULL)
            break;
        p++;
    }
    if (p != NULL) {
        g_string_append_len (buff, content, p - content);
        g_string_append_c (buff, '\n');
    }
    g_free (content);

    /* String literals are appended directly; GString copies them, so no
     * temporary duplicates are needed. */
    g_string_append (buff, "\n");
    g_string_append_printf (buff, "/* This file is generated by %s. */",
                            __FILE__);
    g_string_append (buff, "\n");
    /* The directive needs its leading '#' to be valid C. */
    g_string_append (buff, "#include <glib/gi18n.h>\n");
    g_string_append (buff, "\n");
    g_string_append (buff, "#ifndef __IBUS_UNICODE_GEN_H_\n");
    g_string_append (buff, "#define __IBUS_UNICODE_GEN_H_\n");
    g_string_append (buff, "const static char *unicode_blocks[] = {\n");
    g_slist_foreach (blocks_list, (GFunc)block_list_dump, buff);
    g_string_append (buff, "};\n");
    g_string_append (buff, "#endif\n");

    if (!g_file_set_contents (filename, buff->str, -1, &error)) {
        g_warning ("Failed to save Unicode block translation file %s: %s",
                   filename, error->message);
        g_error_free (error);
    }

    g_string_free (buff, TRUE);
}

/*
 * Command line entry point.
 *
 * Parses the names list and/or the blocks file given on the command line
 * and writes the requested output files.  An output file is only written
 * when the corresponding input produced at least one object.
 *
 * Returns: -1 when fewer than two arguments are given or option parsing
 * fails, 0 otherwise.
 */
int
main (int argc, char *argv[])
{
    gchar *prgname;
    gchar *input_names_list = NULL;
    gchar *input_blocks = NULL;
    gchar *output_names_list = NULL;
    gchar *output_blocks = NULL;
    gchar *output_blocks_trans = NULL;
    GOptionEntry     entries[] = {
        { "input-names-list", 'n', 0, G_OPTION_ARG_STRING, &input_names_list,
          "Parse NamesList.txt FILE in unicode.org ",
          "FILE"
        },
        { "input-blocks", 'b', 0, G_OPTION_ARG_STRING, &input_blocks,
          "Parse Blocks.txt FILE in unicode.org ",
          "FILE"
        },
        { "output-names-list", 'o', 0, G_OPTION_ARG_STRING, &output_names_list,
          "Save the Unicode data as FILE",
          "FILE"
        },
        { "output-blocks", 'B', 0, G_OPTION_ARG_STRING, &output_blocks,
          "Save the Unicode block list as FILE",
          "FILE"
        },
        { "output-blocks-trans", 'C', 0, G_OPTION_ARG_STRING,
          &output_blocks_trans,
          "Save the translatable Unicode blocks as FILE",
          "FILE"
        },
        { NULL }
    };
    GOptionContext *context;
    GError *error = NULL;
    GSList *names_list = NULL;
    GSList *blocks_list = NULL;

#ifdef HAVE_LOCALE_H
    /* Use the locale from the environment; presumably so that non-ASCII
     * text in warnings is printed correctly. */
    setlocale (LC_ALL, "");
#endif

    prgname = g_path_get_basename (argv[0]);
    g_set_prgname (prgname);
    g_free (prgname);

    context = g_option_context_new (NULL);
    g_option_context_add_main_entries (context, entries, NULL);

    if (argc < 3) {
        /* The help text is newly allocated and must be freed. */
        gchar *help = g_option_context_get_help (context, TRUE, NULL);

        g_print ("%s", help);
        g_free (help);
        g_option_context_free (context);
        return -1;
    }

    if (!g_option_context_parse (context, &argc, &argv, &error)) {
        g_warning ("Failed options: %s", error->message);
        g_error_free (error);
        g_option_context_free (context);
        return -1;
    }
    g_option_context_free (context);

    if (input_names_list) {
        ucd_parse_file (input_names_list, &names_list, UCD_NAMES_LIST);
        g_free (input_names_list);
    }
    if (output_names_list && names_list)
        ibus_unicode_data_save (output_names_list, names_list);
    g_free (output_names_list);

    if (input_blocks) {
        ucd_parse_file (input_blocks, &blocks_list, UCD_BLOCKS);
        g_free (input_blocks);
    }
    if (output_blocks && blocks_list)
        ibus_unicode_block_save (output_blocks, blocks_list);
    if (output_blocks_trans && blocks_list)
        ucd_block_translatable_save (output_blocks_trans, blocks_list);
    g_free (output_blocks);
    g_free (output_blocks_trans);

    g_free (unicode_version);
    return 0;
}