libtranslate-0.99/src/translate-util.c

/*
 * Copyright (C) 2004, 2005 Jean-Yves Lefort
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of Jean-Yves Lefort nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include <string.h>
#include <errno.h>
#include <time.h>
#include <stdlib.h>
#include <limits.h>
#include <glib.h>
#include <glib/gi18n-lib.h>
#include "translate.h"

#include "translate-sgml-entities-private.h"

/*
 * Forward declaration; the definition appears further down in this
 * file. Maps the text of one SGML reference to a character, or to 0
 * if the reference is not recognized.
 */
static gunichar translate_sgml_ref_get_unichar (const char *ref);

/**
 * translate_ascii_strcase_equal:
 * @s1: a nul-terminated string.
 * @s2: a nul-terminated string.
 *
 * Tests two strings for equality without regard to the case of their
 * ASCII characters. The signature matches the @key_equal_func
 * parameter of g_hash_table_new(), so the function can be used for
 * #GHashTable instances keyed by case-insensitive strings.
 *
 * Return value: %TRUE if the two strings match, %FALSE otherwise.
 **/
gboolean
translate_ascii_strcase_equal (gconstpointer s1, gconstpointer s2)
{
  const char *left = s1;
  const char *right = s2;

  if (g_ascii_strcasecmp(left, right) != 0)
    return FALSE;

  return TRUE;
}

/**
 * translate_ascii_strcase_hash:
 * @key: a string key.
 *
 * Computes a hash value for a string in which the case of ASCII
 * characters does not matter: strings differing only by ASCII case
 * hash to the same value. The signature matches the @hash_func
 * parameter of g_hash_table_new(), so the function can be used for
 * #GHashTable instances keyed by case-insensitive strings.
 *
 * Return value: a hash value corresponding to the key.
 **/
unsigned int
translate_ascii_strcase_hash (gconstpointer key)
{
  const char *cursor;
  unsigned int hash = 0;

  /* multiply-by-31 accumulation over the ASCII-lowercased bytes */
  for (cursor = key; *cursor != 0; cursor++)
    hash = hash * 31 + g_ascii_tolower(*cursor);

  return hash;
}

/**
 * translate_ascii_strcasestr:
 * @big: a nul-terminated string, which may not be encoded in UTF-8.
 * @little: the nul-terminated string to search for, which may not be
 * encoded in UTF-8.
 *
 * Searches @big for the first occurrence of @little, treating ASCII
 * letters of both strings as case-insensitive. This is the
 * unlimited-length form of translate_ascii_strcasestr_len().
 *
 * Return value: if @little is an empty string, @big is returned; if
 * @little occurs nowhere in @big, %NULL is returned; otherwise a
 * pointer to the first character of the first occurrence of @little
 * in @big is returned.
 **/
char *
translate_ascii_strcasestr (const char *big, const char *little)
{
  char *found;

  g_return_val_if_fail(big != NULL, NULL);
  g_return_val_if_fail(little != NULL, NULL);

  /* -1 tells the length-limited variant to scan up to the nul byte */
  found = translate_ascii_strcasestr_len(big, -1, little);

  return found;
}

/**
 * translate_ascii_strcasestr_len:
 * @big: a nul-terminated string, which may not be encoded in UTF-8.
 * @big_len: length of @big in bytes, or -1 if @big is nul-terminated.
 * @little: the nul-terminated string to search for, which may not be
 * encoded in UTF-8.
 *
 * Searches at most @big_len bytes of @big for the first occurrence
 * of @little, treating ASCII letters of both strings as
 * case-insensitive.
 *
 * Return value: if @little is an empty string, @big is returned; if
 * @little occurs nowhere in @big, %NULL is returned; otherwise a
 * pointer to the first character of the first occurrence of @little
 * in @big is returned.
 **/
char *
translate_ascii_strcasestr_len (const char *big,
				unsigned int big_len,
				const char *little)
{
  char *haystack;
  char *needle;
  const char *hit;
  char *result = NULL;

  g_return_val_if_fail(big != NULL, NULL);
  g_return_val_if_fail(little != NULL, NULL);

  /* search lowercased copies, then map the hit back onto @big by offset */
  haystack = g_ascii_strdown(big, (int) big_len);
  needle = g_ascii_strdown(little, -1);

  hit = strstr(haystack, needle);
  if (hit != NULL)
    result = (char *) big + (hit - haystack);

  g_free(needle);
  g_free(haystack);

  return result;
}

/**
 * translate_ascii_strcasecoll:
 * @s1: a nul-terminated string, which may not be encoded in UTF-8.
 * @s2: a nul-terminated string, which may not be encoded in UTF-8.
 *
 * Compares two strings for ordering using the linguistically correct
 * rules for the current locale, ignoring the case of ASCII characters
 * of both strings.
 *
 * Return value: an integer greater than, equal to, or less than 0,
 * according as @s1 is greater than, equal to, or less than @s2. If
 * either argument is %NULL, a critical warning is logged and 0 is
 * returned.
 **/
int
translate_ascii_strcasecoll (const char *s1, const char *s2)
{
  char *lower_s1;
  char *lower_s2;
  int coll;

  /*
   * Same precondition checks as the other comparison helpers of this
   * file; without them a NULL argument would reach strcoll().
   */
  g_return_val_if_fail(s1 != NULL, 0);
  g_return_val_if_fail(s2 != NULL, 0);

  lower_s1 = g_ascii_strdown(s1, -1);
  lower_s2 = g_ascii_strdown(s2, -1);

  coll = strcoll(lower_s1, lower_s2);

  g_free(lower_s1);
  g_free(lower_s2);

  return coll;
}

/**
 * translate_utf8_strcasecoll:
 * @s1: a nul-terminated string.
 * @s2: a nul-terminated string.
 *
 * Orders two UTF-8 strings according to the collation rules of the
 * current locale, without regard to the case of either string. Both
 * strings are case-folded before being collated.
 *
 * Return value: an integer greater than, equal to, or less than 0,
 * according as @s1 is greater than, equal to, or less than @s2.
 **/
int
translate_utf8_strcasecoll (const char *s1, const char *s2)
{
  char *key1;
  char *key2;
  int result;

  g_return_val_if_fail(s1 != NULL, 0);
  g_return_val_if_fail(s2 != NULL, 0);

  key1 = g_utf8_casefold(s1, -1);
  key2 = g_utf8_casefold(s2, -1);
  result = g_utf8_collate(key1, key2);

  g_free(key2);
  g_free(key1);

  return result;
}

/**
 * translate_utf8_strcmp:
 * @s1: a nul-terminated string.
 * @s2: a nul-terminated string.
 *
 * Compares two UTF-8 strings for ordering. Each string is normalized
 * (%G_NORMALIZE_ALL) before the comparison; a string which cannot be
 * normalized because it is not valid UTF-8 is compared using its raw
 * bytes instead.
 *
 * Return value: an integer greater than, equal to, or less than 0,
 * according as @s1 is greater than, equal to, or less than @s2.
 **/
int
translate_utf8_strcmp (const char *s1, const char *s2)
{
  char *normalized_s1;
  char *normalized_s2;
  int cmp;

  g_return_val_if_fail(s1 != NULL, 0);
  g_return_val_if_fail(s2 != NULL, 0);

  normalized_s1 = g_utf8_normalize(s1, -1, G_NORMALIZE_ALL);
  normalized_s2 = g_utf8_normalize(s2, -1, G_NORMALIZE_ALL);

  /*
   * g_utf8_normalize() returns NULL for invalid UTF-8; never hand
   * that NULL to strcmp(), fall back to the unmodified input.
   */
  cmp = strcmp(normalized_s1 != NULL ? normalized_s1 : s1,
	       normalized_s2 != NULL ? normalized_s2 : s2);

  g_free(normalized_s1);	/* g_free(NULL) is a no-op */
  g_free(normalized_s2);

  return cmp;
}

/**
 * translate_utf8_strcasecmp:
 * @s1: a nul-terminated string.
 * @s2: a nul-terminated string.
 *
 * Compares two UTF-8 strings for ordering, ignoring the case of both
 * strings. Each string is normalized (%G_NORMALIZE_ALL) and then
 * case-folded before the comparison. A string which cannot be
 * normalized because it is not valid UTF-8 is compared using an
 * ASCII-lowercased copy of its raw bytes instead.
 *
 * Return value: an integer greater than, equal to, or less than 0,
 * according as @s1 is greater than, equal to, or less than @s2.
 **/
int
translate_utf8_strcasecmp (const char *s1, const char *s2)
{
  char *normalized_s1;
  char *normalized_s2;
  char *case_normalized_s1;
  char *case_normalized_s2;
  int cmp;

  g_return_val_if_fail(s1 != NULL, 0);
  g_return_val_if_fail(s2 != NULL, 0);

  normalized_s1 = g_utf8_normalize(s1, -1, G_NORMALIZE_ALL);
  normalized_s2 = g_utf8_normalize(s2, -1, G_NORMALIZE_ALL);

  /*
   * g_utf8_normalize() returns NULL for invalid UTF-8. Such input
   * must reach neither g_utf8_casefold() nor strcmp() as NULL, so
   * only its ASCII letters are folded.
   */
  case_normalized_s1 = normalized_s1 != NULL
    ? g_utf8_casefold(normalized_s1, -1)
    : g_ascii_strdown(s1, -1);
  case_normalized_s2 = normalized_s2 != NULL
    ? g_utf8_casefold(normalized_s2, -1)
    : g_ascii_strdown(s2, -1);

  cmp = strcmp(case_normalized_s1, case_normalized_s2);

  g_free(normalized_s1);	/* g_free(NULL) is a no-op */
  g_free(normalized_s2);
  g_free(case_normalized_s1);
  g_free(case_normalized_s2);

  return cmp;
}

/**
 * translate_time:
 *
 * Returns the current time, issuing a warning if an error occurs.
 *
 * Return value: the number of seconds since 0 hours, 0 minutes, 0
 * seconds, January 1, 1970, Coordinated Universal Time, or 0 if an
 * error has occurred.
 **/
time_t
translate_time (void)
{
  time_t now;

  now = time(NULL);

  /*
   * time() reports failure with the single value (time_t) -1. A
   * "< 0" test never fires when time_t is unsigned, and would treat
   * every pre-epoch clock value as an error.
   */
  if (now == (time_t) -1)
    {
      int saved_errno = errno;	/* capture before any other call */

      g_warning(_("cannot get current time: %s"), g_strerror(saved_errno));
      now = 0;
    }

  return now;
}

static gunichar
translate_sgml_ref_get_unichar (const char *ref)
{
  g_return_val_if_fail(ref != NULL, 0);

  if (*ref == '#')
    {				/* numeric reference */
      const char *nptr;
      int base;

      if (*(ref + 1) == 'x' || *(ref + 1) == 'X')
	{			/* hexadecimal number */
	  nptr = ref + 2;
	  base = 16;
	}
      else
	{			/* decimal number */
	  nptr = ref + 1;
	  base = 10;
	}

      if (*nptr)
	{
	  char *end;
	  unsigned long code;

	  code = strtoul(nptr, &end, base);
	  if (*end == 0)	/* could convert */
	    return code;
	}
    }
  else
    {				/* entity reference */
      int i;

      for (i = 0; i < G_N_ELEMENTS(entities); i++)
	if (! strcmp(ref, entities[i].name))
	  return entities[i].character;
    }

  return 0;			/* invalid reference */
}

/**
 * translate_sgml_ref_expand:
 * @str: a nul-terminated string.
 *
 * Parses @str, expanding its SGML character references and XHTML
 * character entities into their Unicode character value.
 *
 * Numerical SGML character references as well as XHTML entities are
 * supported. A reference starts at '&' and extends up to the next
 * ';', space, tab, newline or '&', or up to the end of the string; a
 * terminating ';' is consumed together with the reference.
 * Unsupported entities will be inserted verbatim into the result.
 *
 * Return value: the expansion of str. The returned string should be
 * freed when no longer needed.
 **/
char *
translate_sgml_ref_expand (const char *str)
{
  GString *unescaped;
  const char *start;

  g_return_val_if_fail(str != NULL, NULL);

  unescaped = g_string_new(NULL);

  while ((start = strchr(str, '&')))
    {
      const char *end;
      char *ref;
      gunichar c;

      end = strpbrk(start + 1, "; &\t\n");
      if (! end)		/* unterminated: runs to the end of string */
	end = start + 1 + strlen(start + 1);

      /*
       * The reference text has a length controlled by the input, so
       * it is copied to the heap rather than to a variable-length
       * array on the stack.
       */
      ref = g_strndup(start + 1, end - start - 1);
      c = translate_sgml_ref_get_unichar(ref);
      g_free(ref);

      if (*end == ';')		/* semicolon is part of entity, skip it */
	end++;

      /* copy the text preceding the reference, then the reference */
      g_string_append_len(unescaped, str, start - str);
      if (c)
	g_string_append_unichar(unescaped, c);
      else			/* invalid reference, append it raw */
	g_string_append_len(unescaped, start, end - start);

      str = end;
    }

  /* whatever follows the last reference */
  g_string_append(unescaped, str);

  return g_string_free(unescaped, FALSE);
}

/**
 * translate_utf8_strpbrk:
 * @p: a nul-terminated string.
 * @len: length of @p in bytes, or -1 if @p is nul-terminated.
 * @charset: the set of characters to search for.
 *
 * Locates in @p the first occurrence of any character in the string
 * @charset.
 *
 * Return value: the first occurrence of any character of @charset in
 * @p, or %NULL if no characters from @charset occur anywhere in @p.
 **/
char *
translate_utf8_strpbrk (const char *p, gssize len, const char *charset)
{
  char *earliest = NULL;

  g_return_val_if_fail(p != NULL, NULL);
  g_return_val_if_fail(charset != NULL, NULL);

  /*
   * Every character of @charset must be tried: the one listed first
   * is not necessarily the one occurring first in @p.
   */
  for (; *charset; charset = g_utf8_next_char(charset))
    {
      char *match;

      match = g_utf8_strchr(p, len, g_utf8_get_char(charset));
      if (match)
	{
	  if (match == p)
	    return match;	/* cannot get any earlier */

	  /*
	   * Only occurrences located before this one are still of
	   * interest, so restrict the remaining searches to the bytes
	   * preceding it.
	   */
	  earliest = match;
	  len = match - p;
	}
    }

  return earliest;
}

/**
 * translate_utf8_strrpbrk:
 * @p: a nul-terminated string.
 * @len: length of @p in bytes, or -1 if @p is nul-terminated.
 * @charset: the set of characters to search for.
 *
 * Locates in @p the last occurrence of any character in the string
 * @charset.
 *
 * Return value: the last occurrence of any character of @charset in
 * @p, or %NULL if no characters from @charset occur anywhere in @p.
 **/
char *
translate_utf8_strrpbrk (const char *p, gssize len, const char *charset)
{
  char *latest = NULL;

  g_return_val_if_fail(p != NULL, NULL);
  g_return_val_if_fail(charset != NULL, NULL);

  /*
   * Every character of @charset must be tried: the one listed first
   * is not necessarily the one occurring last in @p. Keep the match
   * located furthest into @p.
   */
  for (; *charset; charset = g_utf8_next_char(charset))
    {
      char *match;

      match = g_utf8_strrchr(p, len, g_utf8_get_char(charset));
      if (match && (! latest || match > latest))
	latest = match;
    }

  return latest;
}