gettext-tools/src/x-ycp.c

/* xgettext YCP backend.
   Copyright (C) 2001-2003, 2005-2009, 2011, 2018-2020 Free Software Foundation, Inc.

   This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

/* Specification.  */
#include "x-ycp.h"

#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#include "message.h"
#include "rc-str-list.h"
#include "xgettext.h"
#include "xg-pos.h"
#include "xg-arglist-context.h"
#include "xg-message.h"
#include "error.h"
#include "xalloc.h"
#include "gettext.h"

#define _(s) gettext(s)

#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))


/* The YCP syntax is defined in libycp/doc/syntax.html.
   See also libycp/src/scanner.ll.
   Both are part of the yast2-core package in SuSE Linux distributions.  */


void
init_flag_table_ycp ()
{
  xgettext_record_flag ("sformat:1:ycp-format");
  xgettext_record_flag ("y2debug:1:ycp-format");
  xgettext_record_flag ("y2milestone:1:ycp-format");
  xgettext_record_flag ("y2warning:1:ycp-format");
  xgettext_record_flag ("y2error:1:ycp-format");
  xgettext_record_flag ("y2security:1:ycp-format");
  xgettext_record_flag ("y2internal:1:ycp-format");
}


/* ======================== Reading of characters.  ======================== */

/* Position in the current line.  */
static int char_in_line;

/* The input file stream.  */
static FILE *fp;

/* These are for tracking whether comments count as immediately before
   keyword.  */
static int last_comment_line;
static int last_non_comment_line;


/* 1. line_number handling.  */

static int
phase1_getc ()
{
  int c = getc (fp);

  if (c == EOF)
    {
      if (ferror (fp))
        error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
               real_file_name);
      return EOF;
    }

  if (c == '\n')
    {
      line_number++;
      char_in_line = 0;
    }
  else
    char_in_line++;

  return c;
}

/* Supports only one pushback character.  */
static void
phase1_ungetc (int c)
{
  if (c != EOF)
    {
      if (c == '\n')
        {
          --line_number;
          char_in_line = INT_MAX;
        }
      else
        --char_in_line;

      ungetc (c, fp);
    }
}


/* 2. Replace each comment that is not inside a character constant or
   string literal with a space character.  We need to remember the
   comment for later, because it may be attached to a keyword string.
   YCP comments can be in C comment syntax, C++ comment syntax or sh
   comment syntax.  */

static unsigned char phase2_pushback[1];
static int phase2_pushback_length;

static int
phase2_getc ()
{
  static char *buffer;
  static size_t bufmax;
  size_t buflen;
  int lineno;
  int c;
  bool last_was_star;

  if (phase2_pushback_length)
    return phase2_pushback[--phase2_pushback_length];

  if (char_in_line == 0)
    {
      /* Eat whitespace, to recognize ^[\t ]*# pattern.  */
      do
        c = phase1_getc ();
      while (c == '\t' || c == ' ');

      if (c == '#')
        {
          /* sh comment.  */
          buflen = 0;
          lineno = line_number;
          for (;;)
            {
              c = phase1_getc ();
              if (c == '\n' || c == EOF)
                break;
              /* We skip all leading white space, but not EOLs.  */
              if (!(buflen == 0 && (c == ' ' || c == '\t')))
                {
                  if (buflen >= bufmax)
                    {
                      bufmax = 2 * bufmax + 10;
                      buffer = xrealloc (buffer, bufmax);
                    }
                  buffer[buflen++] = c;
                }
            }
          if (buflen >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax);
            }
          buffer[buflen] = '\0';
          savable_comment_add (buffer);
          last_comment_line = lineno;
          return '\n';
        }
    }
  else
    c = phase1_getc ();

  if (c == '/')
    {
      c = phase1_getc ();

      switch (c)
        {
        default:
          phase1_ungetc (c);
          return '/';

        case '*':
          /* C comment.  */
          buflen = 0;
          lineno = line_number;
          last_was_star = false;
          for (;;)
            {
              c = phase1_getc ();
              if (c == EOF)
                break;
              /* We skip all leading white space, but not EOLs.  */
              if (buflen == 0 && (c == ' ' || c == '\t'))
                continue;
              if (buflen >= bufmax)
                {
                  bufmax = 2 * bufmax + 10;
                  buffer = xrealloc (buffer, bufmax);
                }
              buffer[buflen++] = c;
              switch (c)
                {
                case '\n':
                  --buflen;
                  while (buflen >= 1
                         && (buffer[buflen - 1] == ' '
                             || buffer[buflen - 1] == '\t'))
                    --buflen;
                  buffer[buflen] = '\0';
                  savable_comment_add (buffer);
                  buflen = 0;
                  lineno = line_number;
                  last_was_star = false;
                  continue;

                case '*':
                  last_was_star = true;
                  continue;

                case '/':
                  if (last_was_star)
                    {
                      buflen -= 2;
                      while (buflen >= 1
                             && (buffer[buflen - 1] == ' '
                                 || buffer[buflen - 1] == '\t'))
                        --buflen;
                      buffer[buflen] = '\0';
                      savable_comment_add (buffer);
                      break;
                    }
                  /* FALLTHROUGH */

                default:
                  last_was_star = false;
                  continue;
                }
              break;
            }
          last_comment_line = lineno;
          return ' ';

        case '/':
          /* C++ comment.  */
          buflen = 0;
          lineno = line_number;
          for (;;)
            {
              c = phase1_getc ();
              if (c == '\n' || c == EOF)
                break;
              /* We skip all leading white space, but not EOLs.  */
              if (!(buflen == 0 && (c == ' ' || c == '\t')))
                {
                  if (buflen >= bufmax)
                    {
                      bufmax = 2 * bufmax + 10;
                      buffer = xrealloc (buffer, bufmax);
                    }
                  buffer[buflen++] = c;
                }
            }
          if (buflen >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax);
            }
          buffer[buflen] = '\0';
          savable_comment_add (buffer);
          last_comment_line = lineno;
          return '\n';
        }
    }
  else
    return c;
}

/* Supports only one pushback character.  */
static void
phase2_ungetc (int c)
{
  if (c != EOF)
    {
      if (phase2_pushback_length == SIZEOF (phase2_pushback))
        abort ();
      phase2_pushback[phase2_pushback_length++] = c;
    }
}


/* ========================== Reading of tokens.  ========================== */


enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_i18n,              /* _( */
  token_type_string_literal,    /* "abc" */
  token_type_symbol,            /* symbol, number */
  token_type_other              /* misc. operator */
};
typedef enum token_type_ty token_type_ty;

typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;         /* for token_type_string_literal, token_type_symbol */
  refcounted_string_list_ty *comment;   /* for token_type_string_literal */
  int line_number;
};


/* 7. Replace escape sequences within character strings with their
   single character equivalents.  */

#define P7_QUOTES (1000 + '"')

static int
phase7_getc ()
{
  int c;

  for (;;)
    {
      /* Use phase 1, because phase 2 elides comments.  */
      c = phase1_getc ();

      if (c == '"')
        return P7_QUOTES;
      if (c != '\\')
        return c;
      c = phase1_getc ();
      if (c != '\n')
        switch (c)
          {
          case 'b':
            return '\b';
          case 'f':
            return '\f';
          case 'n':
            return '\n';
          case 'r':
            return '\r';
          case 't':
            return '\t';

          /* FIXME: What is the octal escape syntax?
             syntax.html says: [0] [0-7]+
             scanner.ll says:  [0-7] [0-7] [0-7]
           */
#if 0
          case '0': case '1': case '2': case '3':
          case '4': case '5': case '6': case '7':
            {
              int n, j;

              n = 0;
              for (j = 0; j < 3; ++j)
                {
                  n = n * 8 + c - '0';
                  c = phase1_getc ();
                  switch (c)
                    {
                    default:
                      break;

                    case '0': case '1': case '2': case '3':
                    case '4': case '5': case '6': case '7':
                      continue;
                    }
                  break;
                }
              phase1_ungetc (c);
              return n;
            }
#endif

          default:
            return c;
          }
    }
}


/* Free the memory pointed to by a 'struct token_ty'.  */
static inline void
free_token (token_ty *tp)
{
  if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
    free (tp->string);
  if (tp->type == token_type_string_literal)
    drop_reference (tp->comment);
}


/* Combine characters into tokens.  Discard whitespace.  */

static token_ty phase5_pushback[1];
static int phase5_pushback_length;

static void
phase5_get (token_ty *tp)
{
  static char *buffer;
  static int bufmax;
  int bufpos;
  int c;

  if (phase5_pushback_length)
    {
      *tp = phase5_pushback[--phase5_pushback_length];
      return;
    }
  for (;;)
    {
      tp->line_number = line_number;
      c = phase2_getc ();

      switch (c)
        {
        case EOF:
          tp->type = token_type_eof;
          return;

        case '\n':
          if (last_non_comment_line > last_comment_line)
            savable_comment_reset ();
          /* FALLTHROUGH */
        case '\r':
        case '\t':
        case ' ':
          /* Ignore whitespace and comments.  */
          continue;
        }

      last_non_comment_line = tp->line_number;

      switch (c)
        {
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
        case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
        case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
        case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
        case 'Y': case 'Z':
        case '_':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
        case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
        case 's': case 't': case 'u': case 'v': case 'w': case 'x':
        case 'y': case 'z':
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
          /* Symbol, or part of a number.  */
          bufpos = 0;
          for (;;)
            {
              if (bufpos >= bufmax)
                {
                  bufmax = 2 * bufmax + 10;
                  buffer = xrealloc (buffer, bufmax);
                }
              buffer[bufpos++] = c;
              c = phase2_getc ();
              switch (c)
                {
                case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
                case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
                case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
                case 'Y': case 'Z':
                case '_':
                case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
                case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
                case 's': case 't': case 'u': case 'v': case 'w': case 'x':
                case 'y': case 'z':
                case '0': case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
                  continue;
                default:
                  if (bufpos == 1 && buffer[0] == '_' && c == '(')
                    {
                      tp->type = token_type_i18n;
                      return;
                    }
                  phase2_ungetc (c);
                  break;
                }
              break;
            }
          if (bufpos >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax);
            }
          buffer[bufpos] = '\0';
          tp->string = xstrdup (buffer);
          tp->type = token_type_symbol;
          return;

        case '"':
          bufpos = 0;
          for (;;)
            {
              c = phase7_getc ();
              if (c == EOF || c == P7_QUOTES)
                break;
              if (bufpos >= bufmax)
                {
                  bufmax = 2 * bufmax + 10;
                  buffer = xrealloc (buffer, bufmax);
                }
              buffer[bufpos++] = c;
            }
          if (bufpos >= bufmax)
            {
              bufmax = 2 * bufmax + 10;
              buffer = xrealloc (buffer, bufmax);
            }
          buffer[bufpos] = '\0';
          tp->string = xstrdup (buffer);
          tp->type = token_type_string_literal;
          tp->comment = add_reference (savable_comment);
          return;

        case '(':
          tp->type = token_type_lparen;
          return;

        case ')':
          tp->type = token_type_rparen;
          return;

        case ',':
          tp->type = token_type_comma;
          return;

        default:
          /* We could carefully recognize each of the 2 and 3 character
             operators, but it is not necessary, as we only need to recognize
             gettext invocations.  Don't bother.  */
          tp->type = token_type_other;
          return;
        }
    }
}

/* Supports only one pushback token.  */
static void
phase5_unget (token_ty *tp)
{
  if (tp->type != token_type_eof)
    {
      if (phase5_pushback_length == SIZEOF (phase5_pushback))
        abort ();
      phase5_pushback[phase5_pushback_length++] = *tp;
    }
}


/* Concatenate adjacent string literals to form single string literals.
   (See libycp/src/parser.yy, rule 'string' vs. terminal 'STRING'.)  */

static token_ty phase8_pushback[1];
static int phase8_pushback_length;

static void
phase8_get (token_ty *tp)
{
  if (phase8_pushback_length)
    {
      *tp = phase8_pushback[--phase8_pushback_length];
      return;
    }
  phase5_get (tp);
  if (tp->type != token_type_string_literal)
    return;
  for (;;)
    {
      token_ty tmp;
      size_t len;

      phase5_get (&tmp);
      if (tmp.type != token_type_string_literal)
        {
          phase5_unget (&tmp);
          return;
        }
      len = strlen (tp->string);
      tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
      strcpy (tp->string + len, tmp.string);
      free_token (&tmp);
    }
}

/* Supports only one pushback token.  */
static void
phase8_unget (token_ty *tp)
{
  if (tp->type != token_type_eof)
    {
      if (phase8_pushback_length == SIZEOF (phase8_pushback))
        abort ();
      phase8_pushback[phase8_pushback_length++] = *tp;
    }
}


/* ========================= Extracting strings.  ========================== */


/* Context lookup table.  */
static flag_context_list_table_ty *flag_context_list_table;


/* The file is broken into tokens.

     Normal handling: Look for
       [A] _( [B] msgid ... )
     Plural handling: Look for
       [A] _( [B] msgid [C] , [D] msgid_plural ... )
     At point [A]: state == 0.
     At point [B]: state == 1, plural_mp == NULL.
     At point [C]: state == 2, plural_mp != NULL.
     At point [D]: state == 1, plural_mp != NULL.

   We use recursion because we have to set the context according to the given
   flags.  */


/* Extract messages until the next balanced closing parenthesis.
   Extracted messages are added to MLP.
   Return true upon eof, false upon closing parenthesis.  */
static bool
extract_parenthesized (message_list_ty *mlp,
                       flag_context_ty outer_context,
                       flag_context_list_iterator_ty context_iter,
                       bool in_i18n)
{
  int state; /* 1 or 2 inside _( ... ), otherwise 0 */
  int plural_state = 0; /* defined only when in states 1 and 2 */
  message_ty *plural_mp = NULL; /* defined only when in states 1 and 2 */
  /* Context iterator that will be used if the next token is a '('.  */
  flag_context_list_iterator_ty next_context_iter =
    passthrough_context_list_iterator;
  /* Current context.  */
  flag_context_ty inner_context =
    inherited_context (outer_context,
                       flag_context_list_iterator_advance (&context_iter));

  /* Start state is 0 or 1.  */
  state = (in_i18n ? 1 : 0);

  for (;;)
    {
      token_ty token;

      if (in_i18n)
        phase8_get (&token);
      else
        phase5_get (&token);

      switch (token.type)
        {
        case token_type_i18n:
          if (extract_parenthesized (mlp, inner_context, next_context_iter,
                                     true))
            return true;
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_string_literal:
          if (state == 1)
            {
              lex_pos_ty pos;
              pos.file_name = logical_file_name;
              pos.line_number = token.line_number;

              if (plural_state == 0)
                {
                  /* Seen an msgid.  */
                  token_ty token2;

                  if (in_i18n)
                    phase8_get (&token2);
                  else
                    phase5_get (&token2);

                  plural_mp =
                    remember_a_message (mlp, NULL, token.string, false,
                                        token2.type == token_type_comma,
                                        inner_context, &pos,
                                        NULL, token.comment, false);

                  if (in_i18n)
                    phase8_unget (&token2);
                  else
                    phase5_unget (&token2);

                  plural_state = 1;
                  state = 2;
                }
              else
                {
                  /* Seen an msgid_plural.  */
                  if (plural_mp != NULL)
                    remember_a_message_plural (plural_mp, token.string, false,
                                               inner_context, &pos,
                                               token.comment, false);
                  state = 0;
                }
              drop_reference (token.comment);
            }
          else
            {
              free_token (&token);
              state = 0;
            }
          next_context_iter = null_context_list_iterator;
          continue;

        case token_type_symbol:
          next_context_iter =
            flag_context_list_iterator (
              flag_context_list_table_lookup (
                flag_context_list_table,
                token.string, strlen (token.string)));
          free_token (&token);
          state = 0;
          continue;

        case token_type_lparen:
          if (extract_parenthesized (mlp, inner_context, next_context_iter,
                                     false))
            return true;
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_rparen:
          return false;

        case token_type_comma:
          if (state == 2)
            state = 1;
          else
            state = 0;
          inner_context =
            inherited_context (outer_context,
                               flag_context_list_iterator_advance (
                                 &context_iter));
          next_context_iter = passthrough_context_list_iterator;
          continue;

        case token_type_other:
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;

        case token_type_eof:
          return true;

        default:
          abort ();
        }
    }
}


void
extract_ycp (FILE *f,
             const char *real_filename, const char *logical_filename,
             flag_context_list_table_ty *flag_table,
             msgdomain_list_ty *mdlp)
{
  message_list_ty *mlp = mdlp->item[0]->messages;

  fp = f;
  real_file_name = real_filename;
  logical_file_name = xstrdup (logical_filename);
  line_number = 1;
  char_in_line = 0;

  last_comment_line = -1;
  last_non_comment_line = -1;

  phase2_pushback_length = 0;
  phase5_pushback_length = 0;
  phase8_pushback_length = 0;

  flag_context_list_table = flag_table;

  /* Eat tokens until eof is seen.  When extract_parenthesized returns
     due to an unbalanced closing parenthesis, just restart it.  */
  while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
                                 false))
    ;

  fp = NULL;
  real_file_name = NULL;
  logical_file_name = NULL;
  line_number = 0;
  char_in_line = 0;
}