gettext-tools/src/read-properties.c

/* Reading Java .properties files.
   Copyright (C) 2003, 2005-2007, 2009, 2018, 2020 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2003.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

/* Specification.  */
#include "read-properties.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "error.h"
#include "error-progname.h"
#include "message.h"
#include "read-catalog-abstract.h"
#include "xalloc.h"
#include "xvasprintf.h"
#include "po-xerror.h"
#include "msgl-ascii.h"
#include "read-file.h"
#include "unistr.h"
#include "gettext.h"

#define _(str) gettext (str)

/* For compiling this file in C++ mode.  */
#ifdef __cplusplus
# define this thiss
#endif


/* The format of the Java .properties files is documented in the JDK
   documentation for class java.util.Properties.  In the case of .properties
   files for PropertyResourceBundle, each non-comment line contains a
   key/value pair in the form "key = value" or "key : value" or "key value",
   where the key is the msgid and the value is the msgstr.  Messages with
   plurals are not supported in this format.

   The encoding of Java .properties files is:
     - ASCII with Java \uxxxx escape sequences,
     - ISO-8859-1 if non-ASCII bytes are encountered,
     - UTF-8 if non-ASCII bytes are encountered and the entire file is
       valid UTF-8 (in Java 9 or newer), see
       https://docs.oracle.com/javase/9/intl/internationalization-enhancements-jdk-9.htm */

/* Handling of comments: We copy all comments from the .properties file to
   the PO file. This is not really needed; it's a service for translators
   who don't like PO files and prefer to maintain the .properties file.  */

/* Real filename, used in error messages about the input file.
   Set at the start of properties_parse and reset to NULL at its end.  */
static const char *real_file_name;

/* File name and line number of the current read position.  The line number
   is kept up to date by phase 2, which counts newlines as they are read and
   un-counts them when they are pushed back.  */
extern lex_pos_ty gram_pos;

/* The contents of the input file, read into memory as a whole, and its
   length in bytes.  */
static char *contents;
static size_t contents_length;

/* True if the input file is assumed to be in UTF-8 encoding.
   False if it is assumed to be in ISO-8859-1 encoding.  */
static bool assume_utf8;

/* Current position in contents: index of the next byte that phase 1 will
   return.  */
static size_t position;

/* Phase 1: Read an input byte.
   Max. 1 pushback byte.  */

/* Returns the next byte of the in-memory file contents as a value in the
   range 0..255 and advances the read position.  Returns EOF once all
   contents_length bytes have been consumed.  */
static int
phase1_getc ()
{
  if (position != contents_length)
    {
      unsigned char byte = (unsigned char) contents[position];

      position++;
      return byte;
    }

  return EOF;
}

/* Undoes the most recent phase1_getc call by stepping the read position back
   by one byte.  Pushing back EOF does nothing, because reading EOF did not
   advance the position.  */
static inline void
phase1_ungetc (int c)
{
  if (c == EOF)
    return;

  position -= 1;
}


/* Phase 2: Read an input byte, treating CR/LF like a single LF.
   Max. 2 pushback bytes.  */

/* Bytes pushed back into phase 2, used as a stack: the byte pushed last sits
   at index phase2_pushback_length - 1 and is returned first.  */
static unsigned char phase2_pushback[2];
/* Number of bytes currently stored in phase2_pushback.  */
static int phase2_pushback_length;

/* Returns the next byte for phase 2, or EOF.  Pushed-back bytes are served
   first; otherwise a byte is taken from phase 1 and a CR/LF pair is
   collapsed into a single LF.  Every LF that is returned bumps the current
   line number.  */
static int
phase2_getc ()
{
  int ch;

  if (phase2_pushback_length != 0)
    {
      /* Pushed-back bytes are handed out in last-in, first-out order.  */
      phase2_pushback_length--;
      ch = phase2_pushback[phase2_pushback_length];
    }
  else
    {
      ch = phase1_getc ();
      if (ch == '\r')
        {
          /* A CR that is not followed by LF is returned unchanged.  */
          int following = phase1_getc ();

          if (following != '\n')
            phase1_ungetc (following);
          else
            ch = following;
        }
    }

  if (ch == '\n')
    gram_pos.line_number++;

  return ch;
}

/* Pushes C back so that the next phase2_getc call returns it again.
   Pushing back EOF does nothing.  Pushing back a newline also takes back
   the line number increment that reading it caused.  */
static void
phase2_ungetc (int c)
{
  if (c == EOF)
    return;

  if (c == '\n')
    gram_pos.line_number -= 1;

  phase2_pushback[phase2_pushback_length] = c;
  phase2_pushback_length++;
}


/* Phase 3: Read an input byte, treating CR/LF like a single LF,
   with handling of continuation lines.
   Max. 1 pushback character.  */

/* Returns the next byte of the current logical line, or EOF.
   A backslash immediately followed by a newline is a line continuation:
   the pair is dropped together with the whitespace that starts the next
   physical line.  A backslash followed by anything else is returned as is,
   and the byte after it is left for the next call.  */
static int
phase3_getc ()
{
  int ch = phase2_getc ();

  while (ch == '\\')
    {
      int next = phase2_getc ();

      if (next != '\n')
        {
          /* Not a continuation: hand out the backslash itself.  */
          phase2_ungetc (next);
          return '\\';
        }

      /* Continuation: skip the leading whitespace of the following line.
         The first non-whitespace byte may itself be a backslash, hence
         the enclosing loop.  */
      for (;;)
        {
          ch = phase2_getc ();
          if (!(ch == ' ' || ch == '\t' || ch == '\r' || ch == '\f'))
            break;
        }
    }

  return ch;
}

/* Pushes back a character obtained from phase3_getc.  Phase 3 keeps no
   pushback storage of its own; the character is handed to phase 2.  */
static inline void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}


/* Converts a NUL-terminated string from ISO-8859-1 encoding to UTF-8
   encoding.
   If STRING consists of ASCII characters only, STRING itself is returned.
   Otherwise the result is a freshly allocated string and STRING is left
   untouched.  */
static char *
conv_from_iso_8859_1 (char *string)
{
  size_t input_len;
  size_t output_len;
  size_t i;
  unsigned char *result;

  if (is_ascii_string (string))
    return string;

  input_len = strlen (string);
  /* An ISO-8859-1 character occupies at most 2 bytes in UTF-8.  */
  result = XNMALLOC (2 * input_len + 1, unsigned char);
  output_len = 0;
  for (i = 0; i < input_len; i++)
    {
      unsigned int uc = (unsigned char) string[i];
      int count = u8_uctomb (result + output_len, uc, 6);

      assert (count > 0);
      output_len += count;
    }
  result[output_len] = '\0';
  assert (output_len <= 2 * input_len);

  return (char *) result;
}


/* Parses exactly four hexadecimal digits starting at S.
   On success, stores their value in *VALUEP and returns true.
   Returns false if one of the four bytes is not a hexadecimal digit.
   Reading stops at the first byte that is not a digit, so S only needs to
   be NUL-terminated, not four bytes long.  */
static bool
conv_from_java_hex4 (const char *s, unsigned int *valuep)
{
  unsigned int value = 0;
  int i;

  for (i = 0; i < 4; i++)
    {
      int c1 = (unsigned char) s[i];

      if (c1 >= '0' && c1 <= '9')
        value = (value << 4) + (c1 - '0');
      else if (c1 >= 'A' && c1 <= 'F')
        value = (value << 4) + (c1 - 'A' + 10);
      else if (c1 >= 'a' && c1 <= 'f')
        value = (value << 4) + (c1 - 'a' + 10);
      else
        return false;
    }

  *valuep = value;
  return true;
}

/* Converts a string from JAVA encoding (with \uxxxx sequences) to UTF-8
   encoding.  Destructively modifies the argument string and returns it.
   A \uxxxx sequence is replaced by the UTF-8 encoding of the character;
   a high surrogate followed by a low surrogate (\uD8xx\uDCxx) is combined
   into a single character.  Sequences that cannot be converted - malformed
   hexadecimal digits, or a surrogate without its partner - are copied
   literally.  */
static char *
conv_from_java (char *string)
{
  /* This conversion can only shrink the string, never increase its size.
     So there is no need to xmalloc the result freshly.  The write pointer Q
     never overtakes the read pointer P.  */
  const char *p = string;
  unsigned char *q = (unsigned char *) string;

  while (*p != '\0')
    {
      if (p[0] == '\\' && p[1] == 'u')
        {
          unsigned int n;
          unsigned int uc;
          size_t consumed;
          int len;

          if (!conv_from_java_hex4 (p + 2, &n))
            goto just_one_byte;

          if (n >= 0xd800 && n < 0xdc00)
            {
              /* High surrogate: only meaningful together with a directly
                 following low surrogate.  p[6] is at worst the terminating
                 NUL, and p[7] is only read if p[6] is a backslash.  */
              unsigned int m;

              if (!(p[6] == '\\' && p[7] == 'u'))
                goto just_one_byte;
              if (!conv_from_java_hex4 (p + 8, &m))
                goto just_one_byte;
              if (!(m >= 0xdc00 && m < 0xe000))
                goto just_one_byte;

              /* Combine two UTF-16 words to a character.  */
              uc = 0x10000 + ((n - 0xd800) << 10) + (m - 0xdc00);
              consumed = 12;
            }
          else
            {
              uc = n;
              consumed = 6;
            }

          /* u8_uctomb fails with a negative result for code points that
             have no UTF-8 encoding, in particular for a lone low surrogate
             U+DC00..U+DFFF.  Adding that result to Q would move the write
             pointer backwards, so such an escape is copied literally.  */
          len = u8_uctomb (q, uc, 6);
          if (len < 0)
            goto just_one_byte;

          q += len;
          p += consumed;
          continue;
        }
      just_one_byte:
        *q++ = (unsigned char) *p++;
    }
  *q = '\0';
  return string;
}


/* Phase 4: Read the next single byte or UTF-16 code point,
   treating CR/LF like a single LF, with handling of continuation lines
   and of \uxxxx sequences.  */

/* Return value of phase 4 when EOF is reached.  The value lies above the
   range of a single byte and below the range produced by UNICODE.  */
#define P4_EOF 0xffff

/* Convert a UTF-16 code unit to a return value that can be distinguished
   from a single-byte return value.  */
#define UNICODE(code) (0x10000 + (code))

/* Test whether a return value of phase 4 designates a UTF-16 code unit,
   that is, whether it was produced by UNICODE.  */
#define IS_UNICODE(p4_result) ((p4_result) >= 0x10000)

/* Extract the UTF-16 code unit of a return value that satisfies
   IS_UNICODE.  */
#define UTF16_VALUE(p4_result) ((p4_result) - 0x10000)

/* Returns the next item of the current logical line after escape
   processing:
     - P4_EOF at the end of the input,
     - a single byte (0..255) for an ordinary byte, for the escapes \t, \n,
       \r, \f, and for any other escaped byte (which stands for itself),
     - UNICODE (code) for a \uxxxx escape with four hexadecimal digits.
   A malformed \uxxxx escape is reported through po_xerror and yields the
   plain letter 'u'.  */
static int
phase4_getuc ()
{
  int c = phase3_getc ();

  if (c == EOF)
    return P4_EOF;
  if (c == '\\')
    {
      int c2 = phase3_getc ();

      if (c2 == EOF)
        /* A backslash as the very last byte of the input escapes nothing.
           Report the end of the input instead of handing the EOF sentinel
           (-1) to the caller as if it were a byte.  */
        return P4_EOF;
      if (c2 == 't')
        return '\t';
      if (c2 == 'n')
        return '\n';
      if (c2 == 'r')
        return '\r';
      if (c2 == 'f')
        return '\f';
      if (c2 == 'u')
        {
          unsigned int n = 0;
          int i;

          for (i = 0; i < 4; i++)
            {
              int c1 = phase3_getc ();

              if (c1 >= '0' && c1 <= '9')
                n = (n << 4) + (c1 - '0');
              else if (c1 >= 'A' && c1 <= 'F')
                n = (n << 4) + (c1 - 'A' + 10);
              else if (c1 >= 'a' && c1 <= 'f')
                n = (n << 4) + (c1 - 'a' + 10);
              else
                {
                  /* Leave the offending character for the caller; the
                     digits consumed so far are dropped.  */
                  phase3_ungetc (c1);
                  po_xerror (PO_SEVERITY_ERROR, NULL,
                             real_file_name, gram_pos.line_number, (size_t)(-1),
                             false, _("warning: invalid \\uxxxx syntax for Unicode character"));
                  return 'u';
                }
            }
          return UNICODE (n);
        }

      /* Any other escaped character stands for itself.  */
      return c2;
    }
  else
    return c;
}


/* Reads a key or value string.
   IN_KEY is true when a key is being read: the string then ends at the
   first '=', ':' or whitespace character that is not escaped with a
   backslash.  When IN_KEY is false, the string extends to the end of the
   logical line.
   Returns the string in UTF-8 encoding, freshly allocated, or NULL if the
   end of the logical line is reached before any non-whitespace character.
   Single bytes are copied as they are if the file is assumed to be UTF-8,
   and converted from ISO-8859-1 otherwise.  \uxxxx escapes are converted
   from UTF-16, combining surrogate pairs; a surrogate without its partner
   is reported and replaced with U+FFFD.
   Parsing ends:
     - when returning NULL, after the end of the logical line,
     - otherwise, if in_key is true, after the whitespace and possibly the
       separator that follows after the string,
     - otherwise, if in_key is false, after the end of the logical line. */

static char *
read_escaped_string (bool in_key)
{
  /* The part of the string that has already been converted to UTF-8.
     The buffer is static so that its storage is reused across calls.  */
  static unsigned char *utf8_buffer;
  static size_t utf8_buflen;
  static size_t utf8_allocated;
  /* The first half of an UTF-16 surrogate character, or 0 if none is
     pending.  */
  unsigned short utf16_surr;
  /* Line in which this surrogate character occurred.  */
  size_t utf16_surr_line;

  /* Ensures utf8_buffer has room for N bytes.  N must be <= 10.  */
  #define utf8_buffer_ensure_available(n)  \
    do                                                                        \
      {                                                                       \
        if (utf8_buflen + (n) > utf8_allocated)                               \
          {                                                                   \
            utf8_allocated = 2 * utf8_allocated + 10;                         \
            utf8_buffer =                                                     \
              (unsigned char *) xrealloc (utf8_buffer, utf8_allocated);       \
          }                                                                   \
      }                                                                       \
    while (0)

  /* Appends a lone surrogate to utf8_buffer.  */
  /* Note: A half surrogate is invalid in UTF-8:
     - RFC 3629 says
         "The definition of UTF-8 prohibits encoding character
          numbers between U+D800 and U+DFFF".
     - Unicode 4.0 chapter 3
       <https://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
       section 3.9, p.77, says
         "Because surrogate code points are not Unicode scalar
          values, any UTF-8 byte sequence that would otherwise
          map to code points D800..DFFF is ill-formed."
       and in table 3-6, p. 78, does not mention D800..DFFF.
     - The unicode.org FAQ question "How do I convert an unpaired
       UTF-16 surrogate to UTF-8?" has the answer
         "By representing such an unpaired surrogate on its own
          as a 3-byte sequence, the resulting UTF-8 data stream
          would become ill-formed."
     So use U+FFFD instead.
     The warning text is allocated by xasprintf and released again once it
     has been reported.  */
  #define utf8_buffer_append_lone_surrogate(uc, line) \
    do                                                                        \
      {                                                                       \
        char *lone_surrogate_msg =                                            \
          xasprintf (_("warning: lone surrogate U+%04X"),                     \
                     (unsigned int) (uc));                                    \
        error_with_progname = false;                                          \
        po_xerror (PO_SEVERITY_ERROR, NULL,                                   \
                   real_file_name, (line), (size_t)(-1), false,               \
                   lone_surrogate_msg);                                       \
        error_with_progname = true;                                           \
        free (lone_surrogate_msg);                                            \
        utf8_buffer_ensure_available (3);                                     \
        utf8_buffer[utf8_buflen++] = 0xef;                                    \
        utf8_buffer[utf8_buflen++] = 0xbf;                                    \
        utf8_buffer[utf8_buflen++] = 0xbd;                                    \
      }                                                                       \
    while (0)

  int c;

  /* Skip whitespace before the string.  */
  do
    c = phase3_getc ();
  while (c == ' ' || c == '\t' || c == '\r' || c == '\f');

  if (c == EOF || c == '\n')
    /* Empty string.  */
    return NULL;

  /* Start accumulating the string.  */
  utf8_buflen = 0;
  utf16_surr = 0;
  utf16_surr_line = 0;
  for (;;)
    {
      /* Here C is a lookahead byte from phase 3, before escape processing.
         An escaped separator therefore shows up as a backslash and does not
         end the key.  */
      if (in_key && (c == '=' || c == ':'
                     || c == ' ' || c == '\t' || c == '\r' || c == '\f'))
        {
          /* Skip whitespace after the string.  */
          while (c == ' ' || c == '\t' || c == '\r' || c == '\f')
            c = phase3_getc ();
          /* Skip '=' or ':' separator.  */
          if (!(c == '=' || c == ':'))
            phase3_ungetc (c);
          break;
        }

      /* Return the lookahead so that phase 4 sees it, including a possible
         escape sequence that it starts.  */
      phase3_ungetc (c);

      /* Read the next byte or UTF-16 code point.  */
      c = phase4_getuc ();
      if (c == P4_EOF)
        break;

      /* Append it to the buffer.  */
      if (IS_UNICODE (c))
        {
          /* Append an UTF-16 code point.  */
          /* Test whether this character and the previous one form a Unicode
             surrogate pair.  */
          if (utf16_surr != 0
              && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
            {
              unsigned short utf16buf[2];
              ucs4_t uc;
              int len;

              utf16buf[0] = utf16_surr;
              utf16buf[1] = UTF16_VALUE (c);
              if (u16_mbtouc (&uc, utf16buf, 2) != 2)
                abort ();

              utf8_buffer_ensure_available (6);
              len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 6);
              if (len < 0)
                {
                  error_with_progname = false;
                  po_xerror (PO_SEVERITY_ERROR, NULL,
                             real_file_name, gram_pos.line_number, (size_t)(-1),
                             false, _("warning: invalid Unicode character"));
                  error_with_progname = true;
                }
              else
                utf8_buflen += len;

              utf16_surr = 0;
            }
          else
            {
              /* A pending high surrogate that is not followed by a low
                 surrogate is a lone surrogate.  */
              if (utf16_surr != 0)
                {
                  utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
                  utf16_surr = 0;
                }

              if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
                {
                  /* High surrogate: remember it until the next item is
                     known.  */
                  utf16_surr = UTF16_VALUE (c);
                  utf16_surr_line = gram_pos.line_number;
                }
              else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
                utf8_buffer_append_lone_surrogate (UTF16_VALUE (c), gram_pos.line_number);
              else
                {
                  ucs4_t uc = UTF16_VALUE (c);
                  int len;

                  utf8_buffer_ensure_available (3);
                  len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 3);
                  if (len < 0)
                    {
                      error_with_progname = false;
                      po_xerror (PO_SEVERITY_ERROR, NULL,
                                 real_file_name, gram_pos.line_number, (size_t)(-1),
                                 false, _("warning: invalid Unicode character"));
                      error_with_progname = true;
                    }
                  else
                    utf8_buflen += len;
                }
            }
        }
      else
        {
          /* Append a single byte.  */
          if (utf16_surr != 0)
            {
              utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);
              utf16_surr = 0;
            }

          if (assume_utf8)
            {
              /* No conversion needed.  */
              utf8_buffer_ensure_available (1);
              utf8_buffer[utf8_buflen++] = c;
            }
          else
            {
              /* Convert the byte from ISO-8859-1 to UTF-8 on the fly.  */
              ucs4_t uc = c;
              int len;

              utf8_buffer_ensure_available (2);
              len = u8_uctomb (utf8_buffer + utf8_buflen, uc, 2);
              if (len < 0)
                abort ();
              utf8_buflen += len;
            }
        }

      /* Fetch the next lookahead byte.  At the end of a key's line the
         newline is left in the input, so that the subsequent read of the
         value sees an empty line.  */
      c = phase3_getc ();
      if (c == EOF || c == '\n')
        {
          if (in_key)
            phase3_ungetc (c);
          break;
        }
    }
  if (utf16_surr != 0)
    utf8_buffer_append_lone_surrogate (utf16_surr, utf16_surr_line);

  /* Return the result as a copy, since utf8_buffer is reused by the next
     call.  */
  {
    unsigned char *utf8_string = XNMALLOC (utf8_buflen + 1, unsigned char);
    if (utf8_buflen > 0)
      memcpy (utf8_string, utf8_buffer, utf8_buflen);
    utf8_string[utf8_buflen] = '\0';

    return (char *) utf8_string;
  }
  #undef utf8_buffer_append_lone_surrogate
  #undef utf8_buffer_ensure_available
}


/* Read a .properties file from a stream, and dispatch to the various
   abstract_catalog_reader_class_ty methods.
   FILE is read into memory as a whole.  REAL_FILENAME is used in error
   messages and as the file name of the reported positions.  THIS and
   LOGICAL_FILENAME are not used by this function.
   Each comment line is passed to po_callback_comment_dispatcher, and each
   key/value pair is passed to po_callback_message with the key as msgid and
   the value as msgstr.  A failure to read the file is reported as a fatal
   error.  */
static void
properties_parse (abstract_catalog_reader_ty *this, FILE *file,
                  const char *real_filename, const char *logical_filename)
{
  /* Read the file into memory.  */
  contents = fread_file (file, 0, &contents_length);
  if (contents == NULL)
    {
      /* Fetch the description before further calls can change errno.  */
      const char *errno_description = strerror (errno);
      po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
                 xasprintf ("%s: %s",
                            xasprintf (_("error while reading \"%s\""),
                                       real_filename),
                            errno_description));
      return;
    }

  /* Test whether it's valid UTF-8.  */
  assume_utf8 = (u8_check ((uint8_t *) contents, contents_length) == NULL);

  /* Start reading at the beginning, with no pushed-back bytes.  */
  position = 0;
  phase2_pushback_length = 0;
  real_file_name = real_filename;
  gram_pos.file_name = xstrdup (real_file_name);
  gram_pos.line_number = 1;

  for (;;)
    {
      int c;
      bool comment;
      bool hidden;

      /* Look at the first byte of the line to classify it.  */
      c = phase2_getc ();

      if (c == EOF)
        break;

      comment = false;
      hidden = false;
      if (c == '#')
        comment = true;
      else if (c == '!')
        {
          /* For compatibility with write-properties.c, we treat '!' not
             followed by space as a fuzzy or untranslated message.  */
          int c2 = phase2_getc ();
          if (c2 == ' ' || c2 == '\n' || c2 == EOF)
            comment = true;
          else
            hidden = true;
          phase2_ungetc (c2);
        }
      else
        phase2_ungetc (c);

      if (comment)
        {
          /* A comment line.  The text after the '#' or '!' is collected in
             a buffer that is reused for all comment lines.  */
          static char *buffer;
          static size_t bufmax;
          static size_t buflen;

          buflen = 0;
          for (;;)
            {
              c = phase2_getc ();

              /* Grow before testing for the end of the line, so that there
                 is always room for the terminating NUL.  */
              if (buflen >= bufmax)
                {
                  bufmax += 100;
                  buffer = xrealloc (buffer, bufmax);
                }

              if (c == EOF || c == '\n')
                break;

              buffer[buflen++] = c;
            }
          buffer[buflen] = '\0';

          {
            /* conv_from_iso_8859_1 returns BUFFER itself for pure ASCII
               input and a freshly allocated copy otherwise.
               conv_from_java converts in place and returns its argument.  */
            char *text =
              assume_utf8 ? buffer : conv_from_iso_8859_1 (buffer);

            po_callback_comment_dispatcher (conv_from_java (text));

            /* The dispatcher is also handed BUFFER, which is overwritten by
               the next comment line, so it cannot rely on the string staying
               alive.  Release the converted copy instead of leaking one
               allocation per non-ASCII comment line.  */
            if (text != buffer)
              free (text);
          }
        }
      else
        {
          /* A key/value pair.  */
          char *msgid;
          lex_pos_ty msgid_pos;

          msgid_pos = gram_pos;
          msgid = read_escaped_string (true);
          if (msgid == NULL)
            /* Skip blank line.  */
            ;
          else
            {
              char *msgstr;
              lex_pos_ty msgstr_pos;
              bool force_fuzzy;

              msgstr_pos = gram_pos;
              msgstr = read_escaped_string (false);
              if (msgstr == NULL)
                msgstr = xstrdup ("");

              /* Be sure to make the message fuzzy if it was commented out
                 and if it is not already header/fuzzy/untranslated.  */
              force_fuzzy = (hidden && msgid[0] != '\0' && msgstr[0] != '\0');

              po_callback_message (NULL, msgid, &msgid_pos, NULL,
                                   msgstr, strlen (msgstr) + 1, &msgstr_pos,
                                   NULL, NULL, NULL,
                                   force_fuzzy, false);
            }
        }
    }

  /* Release the file contents and reset the reader state.  */
  free (contents);
  contents = NULL;
  real_file_name = NULL;
  gram_pos.line_number = 0;
}

/* Describes the Java .properties input format: its parse function, and the
   fact that the strings it produces are in UTF-8 encoding.  */
const struct catalog_input_format input_format_properties =
{
  properties_parse,                     /* parse */
  true                                  /* produces_utf8 */
};