usr.bin/localedef/scanner.c

/*
 * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2015 John Marino <draco@marino.st>
 *
 * This source code is derived from the illumos localedef command, and
 * provided under BSD-style license terms by Nexenta Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This file contains the "scanner", which tokenizes the input files
 * for localedef for processing by the higher level grammar processor.
 */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <limits.h>
#include <string.h>
#include <wchar.h>
#include <sys/types.h>
#include <assert.h>
#include "localedef.h"
#include "parser.h"

/*
 * Scanner-wide state.  The non-static variables have external linkage;
 * the static ones are private to this file.
 */

/* Character that starts a comment; reset to '#' by reset_scanner(). */
int			com_char = '#';
/* Character that introduces an escape; reset to '\\' by reset_scanner(). */
int			esc_char = '\\';
/* Not referenced in this file; presumably set by the grammar -- confirm. */
int			mb_cur_min = 1;
/* Upper bound on the number of bytes get_wide() gathers per character. */
int			mb_cur_max = 1;
/* Line number printed in diagnostics; scanc() copies it from nextline. */
int			lineno = 1;
/* Number of warnings issued so far; incremented by warn(). */
int			warnings = 0;
/* Nonzero while characters are read from stdin rather than from input. */
static int		is_stdin = 1;
/* File opened by reset_scanner(); only consulted while is_stdin is zero. */
static FILE		*input;
/* Line number of the next character to be read; bumped on each newline. */
static int		nextline;
/* Name of the current input source, printed in diagnostics. */
static const char	*filename = "<stdin>";
/* Nonzero while yylex() is inside a double-quoted string. */
static int		instring = 0;
/* Nonzero when the previously scanned character was esc_char. */
static int		escaped = 0;

/*
 * Token space ... grows on demand.
 *
 * token is the NUL-terminated buffer filled by add_tok(), tokidx is the
 * index of the next free slot in it, and toksz is its allocated size in
 * bytes.  hadtok is nonzero once something other than whitespace has been
 * seen on the current line; yylex() only reports T_NL for such lines.
 */
static char *token = NULL;
static int tokidx;
static int toksz = 0;
static int hadtok = 0;

/*
 * Wide string space ... grows on demand.
 *
 * widestr is the zero-terminated buffer filled by add_wcs(), wideidx is
 * the index of the next free slot, and widesz is the allocated size in
 * wchar_t elements.  get_wcs() hands the buffer to its caller and resets
 * all three.
 */
static wchar_t *widestr = NULL;
static int wideidx = 0;
static int widesz = 0;

/*
 * The last keyword seen.  This is useful to trigger the special lexer rules
 * for "copy" and also collating symbols and elements.
 *
 * category is the id of the top-level category currently being scanned;
 * it is T_END while the scanner is outside any category.
 */
int	last_kw = 0;
static int	category = T_END;

/*
 * Reserved words recognized by consume_token(), mapping each spelling to
 * its token id.  The table is read-only and is terminated by an entry
 * whose name is NULL.
 */
static const struct token {
	int id;
	const char *name;
} keywords[] = {
	{ T_COM_CHAR,		"comment_char" },
	{ T_ESC_CHAR,		"escape_char" },
	{ T_END,		"END" },
	{ T_COPY,		"copy" },
	{ T_MESSAGES,		"LC_MESSAGES" },
	{ T_YESSTR,		"yesstr" },
	{ T_YESEXPR,		"yesexpr" },
	{ T_NOSTR,		"nostr" },
	{ T_NOEXPR,		"noexpr" },
	{ T_MONETARY,		"LC_MONETARY" },
	{ T_INT_CURR_SYMBOL,	"int_curr_symbol" },
	{ T_CURRENCY_SYMBOL,	"currency_symbol" },
	{ T_MON_DECIMAL_POINT,	"mon_decimal_point" },
	{ T_MON_THOUSANDS_SEP,	"mon_thousands_sep" },
	{ T_POSITIVE_SIGN,	"positive_sign" },
	{ T_NEGATIVE_SIGN,	"negative_sign" },
	{ T_MON_GROUPING,	"mon_grouping" },
	{ T_INT_FRAC_DIGITS,	"int_frac_digits" },
	{ T_FRAC_DIGITS,	"frac_digits" },
	{ T_P_CS_PRECEDES,	"p_cs_precedes" },
	{ T_P_SEP_BY_SPACE,	"p_sep_by_space" },
	{ T_N_CS_PRECEDES,	"n_cs_precedes" },
	{ T_N_SEP_BY_SPACE,	"n_sep_by_space" },
	{ T_P_SIGN_POSN,	"p_sign_posn" },
	{ T_N_SIGN_POSN,	"n_sign_posn" },
	{ T_INT_P_CS_PRECEDES,	"int_p_cs_precedes" },
	{ T_INT_N_CS_PRECEDES,	"int_n_cs_precedes" },
	{ T_INT_P_SEP_BY_SPACE,	"int_p_sep_by_space" },
	{ T_INT_N_SEP_BY_SPACE,	"int_n_sep_by_space" },
	{ T_INT_P_SIGN_POSN,	"int_p_sign_posn" },
	{ T_INT_N_SIGN_POSN,	"int_n_sign_posn" },
	{ T_COLLATE,		"LC_COLLATE" },
	{ T_COLLATING_SYMBOL,	"collating-symbol" },
	{ T_COLLATING_ELEMENT,	"collating-element" },
	{ T_FROM,		"from" },
	{ T_ORDER_START,	"order_start" },
	{ T_ORDER_END,		"order_end" },
	{ T_FORWARD,		"forward" },
	{ T_BACKWARD,		"backward" },
	{ T_POSITION,		"position" },
	{ T_IGNORE,		"IGNORE" },
	{ T_UNDEFINED,		"UNDEFINED" },
	{ T_NUMERIC,		"LC_NUMERIC" },
	{ T_DECIMAL_POINT,	"decimal_point" },
	{ T_THOUSANDS_SEP,	"thousands_sep" },
	{ T_GROUPING,		"grouping" },
	{ T_TIME,		"LC_TIME" },
	{ T_ABDAY,		"abday" },
	{ T_DAY,		"day" },
	{ T_ABMON,		"abmon" },
	{ T_MON,		"mon" },
	{ T_D_T_FMT,		"d_t_fmt" },
	{ T_D_FMT,		"d_fmt" },
	{ T_T_FMT,		"t_fmt" },
	{ T_AM_PM,		"am_pm" },
	{ T_T_FMT_AMPM,		"t_fmt_ampm" },
	{ T_ERA,		"era" },
	{ T_ERA_D_FMT,		"era_d_fmt" },
	{ T_ERA_T_FMT,		"era_t_fmt" },
	{ T_ERA_D_T_FMT,	"era_d_t_fmt" },
	{ T_ALT_DIGITS,		"alt_digits" },
	{ T_CTYPE,		"LC_CTYPE" },
	{ T_ISUPPER,		"upper" },
	{ T_ISLOWER,		"lower" },
	{ T_ISALPHA,		"alpha" },
	{ T_ISDIGIT,		"digit" },
	{ T_ISPUNCT,		"punct" },
	{ T_ISXDIGIT,		"xdigit" },
	{ T_ISSPACE,		"space" },
	{ T_ISPRINT,		"print" },
	{ T_ISGRAPH,		"graph" },
	{ T_ISBLANK,		"blank" },
	{ T_ISCNTRL,		"cntrl" },
	/*
	 * These entries are local additions, and not specified by
	 * TOG.  Note that they are not guaranteed to be accurate for
	 * all locales, and so applications should not depend on them.
	 */
	{ T_ISSPECIAL,		"special" },
	{ T_ISENGLISH,		"english" },
	{ T_ISPHONOGRAM,	"phonogram" },
	{ T_ISIDEOGRAM,		"ideogram" },
	{ T_ISNUMBER,		"number" },
	/*
	 * We have to support this in the grammar, but it would be a
	 * syntax error to define a character as one of these without
	 * also defining it as an alpha or digit.  We ignore it in our
	 * parsing.
	 */
	{ T_ISALNUM,		"alnum" },
	{ T_TOUPPER,		"toupper" },
	{ T_TOLOWER,		"tolower" },

	/*
	 * These are keywords used in the charmap file.  Note that
	 * Solaris originally used angle brackets to wrap some of them,
	 * but we removed that to simplify our parser.  The first of these
	 * items are "global items."
	 */
	{ T_CHARMAP,		"CHARMAP" },
	{ T_WIDTH,		"WIDTH" },

	{ -1, NULL },
};

/*
 * These special words are only used in a charmap file, enclosed in <>.
 */
/*
 * Read-only table consulted by get_symbol() while no category is active
 * (category == T_END).  Terminated by an entry whose name is NULL.
 */
static const struct token symwords[] = {
	{ T_COM_CHAR,		"comment_char" },
	{ T_ESC_CHAR,		"escape_char" },
	{ T_CODE_SET,		"code_set_name" },
	{ T_MB_CUR_MAX,		"mb_cur_max" },
	{ T_MB_CUR_MIN,		"mb_cur_min" },
	{ -1, NULL },
};

/*
 * Keyword ids that open a top-level category.  consume_token() switches
 * the current category when it sees one of these.  The list is read-only
 * and terminated by 0.
 */
static const int categories[] = {
	T_CHARMAP,
	T_CTYPE,
	T_COLLATE,
	T_MESSAGES,
	T_MONETARY,
	T_NUMERIC,
	T_TIME,
	T_WIDTH,
	0
};

/*
 * Point the scanner at a new input source and reset the lexer state.
 *
 * fname is the path of the file to scan, or NULL to scan standard input.
 * The pointer is kept for use in diagnostics; the string is not copied.
 *
 * A file opened by an earlier call is closed first, whichever source is
 * selected next, so switching back to standard input does not leak the
 * handle.  If the file cannot be opened a diagnostic is printed with
 * perror() and the program exits with status 4.
 */
void
reset_scanner(const char *fname)
{
	/* Release the previous file before choosing the new source. */
	if (!is_stdin && input != NULL) {
		(void) fclose(input);
	}
	input = NULL;

	if (fname == NULL) {
		filename = "<stdin>";
		is_stdin = 1;
	} else {
		if ((input = fopen(fname, "r")) == NULL) {
			perror("fopen");
			exit(4);
		}
		is_stdin = 0;
		filename = fname;
	}

	/* Restore the default comment/escape characters and lexer state. */
	com_char = '#';
	esc_char = '\\';
	instring = 0;
	escaped = 0;
	lineno = 1;
	nextline = 1;
	tokidx = 0;
	wideidx = 0;
}

/*
 * hex(x): numeric value of the hexadecimal digit character x.  The result
 * is only meaningful when isxdigit(x) holds.
 * isodigit(x): nonzero when x is an octal digit character ('0'..'7').
 *
 * Every use of the argument is parenthesized so that expression arguments
 * expand correctly.  Both macros still evaluate x more than once, so do
 * not pass an argument with side effects.
 */
#define	hex(x)	\
	(isdigit(x) ? ((x) - '0') : \
	    ((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))

/*
 * Read one character from the current input source (stdin or the file
 * opened by reset_scanner()) and keep the line counters up to date.
 *
 * lineno is set to the line the returned character belongs to; nextline
 * advances when that character is a newline.  Returns the character, or
 * EOF as reported by getc().
 */
static int
scanc(void)
{
	FILE	*src = is_stdin ? stdin : input;
	int	ch = getc(src);

	lineno = nextline;
	if (ch == '\n')
		nextline++;

	return (ch);
}

/*
 * Push the character c back onto the current input source so that the
 * next scanc() returns it again.
 *
 * Pushing back a newline also undoes the line-count increment scanc()
 * applied when it read that newline.  Pushing back EOF is a no-op:
 * ungetc() always fails for EOF and leaves the stream unchanged, so
 * passing it through would turn an ordinary end of input into a
 * spurious "ungetc failed" error.  Any other failure is reported through
 * yyerror().
 */
static void
unscanc(int c)
{
	if (c == EOF) {
		/* nothing was consumed, so there is nothing to put back */
		return;
	}
	if (c == '\n') {
		nextline--;
	}
	if (ungetc(c, is_stdin ? stdin : input) == EOF) {
		yyerror("ungetc failed");
	}
}

/*
 * Read exactly two hexadecimal digits from the input and return the
 * value they encode.  A non-hex character in either position is reported
 * through yyerror().
 */
static int
scan_hex_byte(void)
{
	int	digit[2];
	int	i;

	for (i = 0; i < 2; i++) {
		digit[i] = scanc();
		if (!isxdigit(digit[i])) {
			yyerror("malformed hex digit");
			return (0);
		}
	}

	/* high nibble first, then low nibble */
	return ((hex(digit[0]) << 4) | hex(digit[1]));
}

/*
 * Read a decimal byte constant of two or three digits from the input.
 *
 * The first two digits are mandatory.  A third digit is consumed when
 * present; otherwise the character that followed is pushed back.
 * Returns the decoded value, which is always in the range 0..UCHAR_MAX.
 * A missing digit, or a value too large for a byte (three digits can
 * encode up to 999), is reported through yyerror().
 */
static int
scan_dec_byte(void)
{
	int	c;
	int	b = 0;
	int	i;

	for (i = 0; i < 2; i++) {
		c = scanc();
		if (!isdigit(c)) {
			yyerror("malformed decimal digit");
			return (0);
		}
		b = (b * 10) + (c - '0');
	}

	/* optional third digit */
	c = scanc();
	if (isdigit(c)) {
		b = (b * 10) + (c - '0');
	} else {
		unscanc(c);
	}

	/* the caller stores the result in a single byte */
	if (b > UCHAR_MAX) {
		yyerror("decimal byte value out of range");
		return (0);
	}
	return (b);
}

/*
 * Read an octal byte constant of two or three digits from the input.
 *
 * The first two digits are mandatory.  A third digit is consumed when
 * present; otherwise the character that followed is pushed back.
 * Returns the decoded value, which is always in the range 0..UCHAR_MAX.
 * A missing digit, or a value too large for a byte (three digits can
 * encode up to 0777), is reported through yyerror().
 */
static int
scan_oct_byte(void)
{
	int	c;
	int	b = 0;
	int	i;

	for (i = 0; i < 2; i++) {
		c = scanc();
		if (!isodigit(c)) {
			yyerror("malformed octal digit");
			return (0);
		}
		b = (b * 8) + (c - '0');
	}

	/* optional third digit */
	c = scanc();
	if (isodigit(c)) {
		b = (b * 8) + (c - '0');
	} else {
		unscanc(c);
	}

	/* the caller stores the result in a single byte */
	if (b > UCHAR_MAX) {
		yyerror("octal byte value out of range");
		return (0);
	}
	return (b);
}

/*
 * Append the character c to the token under construction, keeping the
 * buffer NUL-terminated.  The buffer grows in 64-byte steps whenever the
 * new character plus its terminator would not fit.  Allocation failure
 * is reported through yyerror().
 */
void
add_tok(int c)
{
	if (toksz <= (tokidx + 1)) {
		toksz += 64;
		token = realloc(token, toksz);
		if (token == NULL) {
			yyerror("out of memory");
			tokidx = 0;
			toksz = 0;
			return;
		}
	}

	token[tokidx] = (char)c;
	tokidx++;
	token[tokidx] = 0;
}
/*
 * Append the wide character c to the wide string under construction,
 * keeping it zero-terminated.  The buffer grows in steps of 64 elements
 * whenever the new character plus its terminator would not fit.
 * Allocation failure is reported through yyerror().
 */
void
add_wcs(wchar_t c)
{
	if (widesz <= (wideidx + 1)) {
		widesz += 64;
		widestr = realloc(widestr, (widesz * sizeof (wchar_t)));
		if (widestr == NULL) {
			yyerror("out of memory");
			wideidx = 0;
			widesz = 0;
			return;
		}
	}

	widestr[wideidx] = c;
	wideidx++;
	widestr[wideidx] = 0;
}

/*
 * Hand the accumulated wide string to the caller and start a fresh one.
 *
 * The scanner forgets the buffer, so the next add_wcs() allocates anew.
 * When nothing has been accumulated a newly allocated empty string is
 * returned instead; failure to allocate it is reported through yyerror().
 */
wchar_t *
get_wcs(void)
{
	wchar_t *result;

	result = widestr;
	widestr = NULL;
	widesz = 0;
	wideidx = 0;

	if (result != NULL)
		return (result);

	result = wcsdup(L"");
	if (result == NULL) {
		yyerror("out of memory");
	}
	return (result);
}

/*
 * Read one numerically escaped byte from the input: the escape character
 * followed by 'd'/'D' and decimal digits, 'x'/'X' and hex digits, or
 * octal digits directly.
 *
 * Returns the decoded value.  When the input does not start with such an
 * escape, everything read is pushed back and EOF is returned.
 */
static int
get_byte(void)
{
	int	c;

	c = scanc();
	if (c != esc_char) {
		unscanc(c);
		return (EOF);
	}

	c = scanc();
	if ((c == 'd') || (c == 'D'))
		return (scan_dec_byte());
	if ((c == 'x') || (c == 'X'))
		return (scan_hex_byte());
	if (isodigit(c)) {
		/* the digit is part of the number: let the reader see it */
		unscanc(c);
		return (scan_oct_byte());
	}

	/* not a numeric escape: restore both characters */
	unscanc(c);
	unscanc(esc_char);
	return (EOF);
}

/*
 * Translate the character that follows an escape character.
 *
 * The letters n, r, t, f, v, b and a map to the control characters they
 * name in C; every other value is returned unchanged.
 */
int
get_escaped(int c)
{
	static const char letters[] = "nrtfvba";
	static const char controls[] = "\n\r\t\f\v\b\a";
	int i;

	for (i = 0; letters[i] != '\0'; i++) {
		if (c == letters[i])
			return (controls[i]);
	}
	return (c);
}

/*
 * Read a sequence of numerically escaped bytes and decode it into one
 * wide character.
 *
 * Bytes are gathered one at a time with get_byte() until to_wide()
 * reports a non-negative result.  Running out of escaped bytes, or
 * reaching mb_cur_max bytes without a successful decode, is an error.
 * Outside the CHARMAP and WIDTH categories the decoded character must
 * also pass check_charmap().
 *
 * On success stores the character in yylval.wc and returns T_CHAR;
 * errors are reported through yyerror().
 */
int
get_wide(void)
{
	static char mbs[MB_LEN_MAX + 1] = "";
	static int mbi = 0;
	int byte;
	wchar_t	wc;

	if (mb_cur_max >= (int)sizeof (mbs)) {
		yyerror("max multibyte character size too big");
		mbi = 0;
		return (T_NULL);
	}

	do {
		if ((mbi == mb_cur_max) || ((byte = get_byte()) == EOF)) {
			/*
			 * The byte sequence ended (or hit the length
			 * limit) before anything decoded.
			 */
			mbi = 0;
			yyerror("not a valid character encoding");
			return (T_NULL);
		}
		mbs[mbi++] = byte;
		mbs[mbi] = 0;
	} while (to_wide(&wc, mbs) < 0);

	mbi = 0;

	if ((category != T_CHARMAP) && (category != T_WIDTH) &&
	    (check_charmap(wc) < 0)) {
		yyerror("no symbolic name for character");
		return (T_NULL);
	}

	yylval.wc = wc;
	return (T_CHAR);
}

/*
 * Scan a symbolic name.  Called after the opening '<' has been consumed;
 * reads up to and including the closing '>'.
 *
 * Inside the name an escaped newline is dropped and any other escaped
 * character is translated with get_escaped().  The completed name is
 * classified in this order:
 *   - outside any category, one of the symwords keywords (its id is
 *     returned and recorded in last_kw);
 *   - outside CHARMAP, a name known to lookup_charmap(): T_CHAR, with
 *     the value stored through &yylval.wc;
 *   - a name known to lookup_collsym(): T_COLLSYM;
 *   - a name known to lookup_collelem(): T_COLLELEM;
 *   - anything else: T_SYMBOL, with yylval.token set to a newly
 *     allocated copy of the name.
 *
 * An empty name, an unescaped newline or end of input inside the name is
 * reported through yyerror().
 */
int
get_symbol(void)
{
	int	c;

	while ((c = scanc()) != EOF) {
		if (escaped) {
			escaped = 0;
			if (c == '\n')
				continue;
			add_tok(get_escaped(c));
			continue;
		}
		if (c == esc_char) {
			escaped = 1;
			continue;
		}
		if (c == '\n') {	/* well that's strange! */
			yyerror("unterminated symbolic name");
			continue;
		}
		if (c != '>') {
			add_tok(c);
			continue;
		}

		/*
		 * End of symbol.  The token buffer is reused between
		 * tokens, so a non-NULL buffer may still hold the text of
		 * an earlier token; only tokidx says whether this symbol
		 * contributed any characters.
		 */
		if ((token == NULL) || (tokidx == 0)) {
			yyerror("missing symbolic name");
			return (T_NULL);
		}

		/*
		 * This restarts the token from the beginning the next
		 * time we scan a character.  (This token is complete.)
		 */
		tokidx = 0;

		/*
		 * A few symbols are handled as keywords outside
		 * of the normal categories.
		 */
		if (category == T_END) {
			int i;
			for (i = 0; symwords[i].name != NULL; i++) {
				if (strcmp(token, symwords[i].name) == 0) {
					last_kw = symwords[i].id;
					return (last_kw);
				}
			}
		}
		/*
		 * Contextual rule: Only literal characters are
		 * permitted in CHARMAP.  Anywhere else the symbolic
		 * forms are fine.
		 */
		if ((category != T_CHARMAP) &&
		    (lookup_charmap(token, &yylval.wc)) != -1) {
			return (T_CHAR);
		}
		if ((yylval.collsym = lookup_collsym(token)) != NULL) {
			return (T_COLLSYM);
		}
		if ((yylval.collelem = lookup_collelem(token)) != NULL) {
			return (T_COLLELEM);
		}

		/* it's an undefined symbol: give the parser its own copy */
		yylval.token = strdup(token);
		if (yylval.token == NULL) {
			yyerror("out of memory");
			return (T_NULL);
		}
		/* release the scratch buffer rather than dropping it */
		free(token);
		token = NULL;
		toksz = 0;
		tokidx = 0;
		return (T_SYMBOL);
	}

	yyerror("unterminated symbolic name");
	return (EOF);
}

/*
 * Report the id of the top-level category the scanner is currently in;
 * this is T_END while no category is open.
 */
int
get_category(void)
{
	const int	current = category;

	return (current);
}

/*
 * Classify the token accumulated by add_tok() and restart the token
 * buffer.
 *
 * Returns, in order of precedence:
 *   - T_NULL when no token buffer exists;
 *   - T_ELLIPSIS for "..." (last_kw is left untouched);
 *   - the keyword id for a reserved word, updating last_kw and, for END
 *     or a category keyword, the current category;
 *   - T_NUMBER for a decimal constant, with the value in yylval.num;
 *   - T_CHAR for a single-character token, with the byte value in
 *     yylval.wc;
 *   - T_NAME otherwise, with yylval.token set to a newly allocated copy
 *     of the text.
 */
static int
consume_token(void)
{
	int	len = tokidx;
	int	i;

	tokidx = 0;
	if (token == NULL)
		return (T_NULL);

	/*
	 * this one is special, because we don't want it to alter the
	 * last_kw field.
	 */
	if (strcmp(token, "...") == 0) {
		return (T_ELLIPSIS);
	}

	/* search for reserved words first */
	for (i = 0; keywords[i].name != NULL; i++) {
		int j;

		if (strcmp(keywords[i].name, token) != 0) {
			continue;
		}

		last_kw = keywords[i].id;

		/* clear the top level category if we're done with it */
		if (last_kw == T_END) {
			category = T_END;
		}

		/* set the top level category if we're changing */
		for (j = 0; categories[j] != 0; j++) {
			if (categories[j] == last_kw) {
				category = last_kw;
				break;
			}
		}

		return (keywords[i].id);
	}

	/*
	 * maybe it's a numeric constant?  The <ctype.h> functions are
	 * undefined for negative char values, hence the casts.
	 */
	if (isdigit((unsigned char)token[0]) ||
	    ((token[0] == '-') && isdigit((unsigned char)token[1]))) {
		char *eptr;

		yylval.num = strtol(token, &eptr, 10);
		if (*eptr != 0)
			yyerror("malformed number");
		return (T_NUMBER);
	}

	/*
	 * A single lone character is treated as a character literal.
	 * Convert through unsigned char so that a byte with the high bit
	 * set is not sign-extended, matching what yylex() stores for
	 * characters inside a quoted string.
	 */
	if (len == 1) {
		yylval.wc = (unsigned char)token[0];
		return (T_CHAR);
	}

	/* anything else is treated as a symbolic name */
	yylval.token = strdup(token);
	if (yylval.token == NULL) {
		yyerror("out of memory");
		return (T_NULL);
	}
	/* release the scratch buffer rather than dropping it */
	free(token);
	token = NULL;
	toksz = 0;
	tokidx = 0;
	return (T_NAME);
}

/*
 * Discard input up to and including the next newline.  Reaching end of
 * input before a newline is reported through errf().
 */
void
scan_to_eol(void)
{
	int	c;

	for (;;) {
		c = scanc();
		if (c == '\n')
			break;
		if (c == EOF) {
			/* the last line was never terminated */
			errf("missing newline");
			return;
		}
	}
	assert(c == '\n');
}

/*
 * The lexer entry point: return the code of the next token, or EOF when
 * the input is exhausted.
 *
 * Characters are read one at a time with scanc().  Punctuation and
 * quotes are returned as tokens immediately; other characters are
 * accumulated with add_tok() until a delimiter is seen, at which point
 * the delimiter is pushed back and the word is classified by
 * consume_token().  While inside a quoted string (instring) each
 * character is returned individually as T_CHAR with its value in
 * yylval.wc.  Lines on which nothing but whitespace or a comment was
 * seen produce no T_NL token.
 */
int
yylex(void)
{
	int		c;

	while ((c = scanc()) != EOF) {

		/* special handling for quoted string */
		if (instring) {
			if (escaped) {
				escaped = 0;

				/* if newline, just eat and forget it */
				if (c == '\n')
					continue;

				/*
				 * A numeric escape: put both characters
				 * back and let get_wide() decode the whole
				 * byte sequence.
				 */
				if (strchr("xXd01234567", c)) {
					unscanc(c);
					unscanc(esc_char);
					return (get_wide());
				}
				yylval.wc = get_escaped(c);
				return (T_CHAR);
			}
			if (c == esc_char) {
				escaped = 1;
				continue;
			}
			switch (c) {
			case '<':
				return (get_symbol());
			case '>':
				/* oops! should generate syntax error  */
				return (T_GT);
			case '"':
				/* closing quote ends the string */
				instring = 0;
				return (T_QUOTE);
			default:
				yylval.wc = c;
				return (T_CHAR);
			}
		}

		/* escaped characters first */
		if (escaped) {
			escaped = 0;
			if (c == '\n') {
				/* eat the newline */
				continue;
			}
			hadtok = 1;
			if (tokidx) {
				/* an escape mid-token is nonsense */
				return (T_NULL);
			}

			/* numeric escapes are treated as wide characters */
			if (strchr("xXd01234567", c)) {
				unscanc(c);
				unscanc(esc_char);
				return (get_wide());
			}

			add_tok(get_escaped(c));
			continue;
		}

		/* if it is the escape character itself, note it */
		if (c == esc_char) {
			escaped = 1;
			continue;
		}

		/* remove from the comment char to end of line */
		if (c == com_char) {
			while (c != '\n') {
				if ((c = scanc()) == EOF) {
					/* end of file without newline! */
					return (EOF);
				}
			}
			assert(c == '\n');
			if (!hadtok) {
				/*
				 * If there were no tokens on this line,
				 * then just pretend it didn't exist at all.
				 */
				continue;
			}
			hadtok = 0;
			return (T_NL);
		}

		if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
			/*
			 * These are all token delimiters.  If there
			 * is a token already in progress, we need to
			 * process it.  The delimiter is pushed back so
			 * that the next call handles it on its own.
			 */
			unscanc(c);
			return (consume_token());
		}

		switch (c) {
		case '\n':
			if (!hadtok) {
				/*
				 * If the line was completely devoid of tokens,
				 * then just ignore it.
				 */
				continue;
			}
			/* we're starting a new line, reset the token state */
			hadtok = 0;
			return (T_NL);
		case ',':
			hadtok = 1;
			return (T_COMMA);
		case ';':
			hadtok = 1;
			return (T_SEMI);
		case '(':
			hadtok = 1;
			return (T_LPAREN);
		case ')':
			hadtok = 1;
			return (T_RPAREN);
		case '>':
			hadtok = 1;
			return (T_GT);
		case '<':
			/* symbol start! */
			hadtok = 1;
			return (get_symbol());
		case ' ':
		case '\t':
			/* whitespace, just ignore it */
			continue;
		case '"':
			/* opening quote: switch to string mode */
			hadtok = 1;
			instring = 1;
			return (T_QUOTE);
		default:
			/* ordinary character: extend the current token */
			hadtok = 1;
			add_tok(c);
			continue;
		}
	}
	return (EOF);
}

__dead2 void
yyerror(const char *msg)
{
	(void) fprintf(stderr, "%s: %d: error: %s\n",
	    filename, lineno, msg);
	exit(4);
}

/*
 * Report a fatal, printf-style formatted error at the current input
 * position and terminate.
 *
 * Writes "<file>: <line>: error: <message>" to stderr and exits with
 * status 4.  If the message cannot be formatted (vasprintf() fails) the
 * raw format string is printed instead, because the buffer pointer is
 * not usable in that case.
 */
__printflike(1, 2) void
errf(const char *fmt, ...)
{
	char	*msg;
	va_list	va;

	va_start(va, fmt);
	if (vasprintf(&msg, fmt, va) < 0) {
		msg = NULL;
	}
	va_end(va);

	(void) fprintf(stderr, "%s: %d: error: %s\n",
	    filename, lineno, (msg != NULL) ? msg : fmt);
	free(msg);
	exit(4);
}

/*
 * Report a printf-style formatted warning at the current input position.
 *
 * Writes "<file>: <line>: warning: <message>" to stderr and increments
 * the warnings counter.  Unless warnok is set, the program then exits
 * with status 4.  If the message cannot be formatted (vasprintf() fails)
 * the raw format string is printed instead, because the buffer pointer
 * is not usable in that case.
 */
__printflike(1, 2) void
warn(const char *fmt, ...)
{
	char	*msg;
	va_list	va;

	va_start(va, fmt);
	if (vasprintf(&msg, fmt, va) < 0) {
		msg = NULL;
	}
	va_end(va);

	(void) fprintf(stderr, "%s: %d: warning: %s\n",
	    filename, lineno, (msg != NULL) ? msg : fmt);
	free(msg);
	warnings++;
	if (!warnok)
		exit(4);
}