xref: /freebsd/usr.bin/localedef/scanner.c (revision b0b1dbdd)
1 /*-
2  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright 2015 John Marino <draco@marino.st>
4  *
5  * This source code is derived from the illumos localedef command, and
6  * provided under BSD-style license terms by Nexenta Systems, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * This file contains the "scanner", which tokenizes the input files
33  * for localedef for processing by the higher level grammar processor.
34  */
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <ctype.h>
41 #include <limits.h>
42 #include <string.h>
43 #include <wchar.h>
44 #include <sys/types.h>
45 #include <assert.h>
46 #include "localedef.h"
47 #include "parser.h"
48 
49 int			com_char = '#';
50 int			esc_char = '\\';
51 int			mb_cur_min = 1;
52 int			mb_cur_max = 1;
53 int			lineno = 1;
54 int			warnings = 0;
55 int			is_stdin = 1;
56 FILE			*input;
57 static int		nextline;
58 //static FILE		*input = stdin;
59 static const char	*filename = "<stdin>";
60 static int		instring = 0;
61 static int		escaped = 0;
62 
63 /*
64  * Token space ... grows on demand.
65  */
66 static char *token = NULL;
67 static int tokidx;
68 static int toksz = 0;
69 static int hadtok = 0;
70 
71 /*
72  * Wide string space ... grows on demand.
73  */
74 static wchar_t *widestr = NULL;
75 static int wideidx = 0;
76 static int widesz = 0;
77 
78 /*
79  * The last keyword seen.  This is useful to trigger the special lexer rules
80  * for "copy" and also collating symbols and elements.
81  */
82 int	last_kw = 0;
83 static int	category = T_END;
84 
85 static struct token {
86 	int id;
87 	const char *name;
88 } keywords[] = {
89 	{ T_COM_CHAR,		"comment_char" },
90 	{ T_ESC_CHAR,		"escape_char" },
91 	{ T_END,		"END" },
92 	{ T_COPY,		"copy" },
93 	{ T_MESSAGES,		"LC_MESSAGES" },
94 	{ T_YESSTR,		"yesstr" },
95 	{ T_YESEXPR,		"yesexpr" },
96 	{ T_NOSTR,		"nostr" },
97 	{ T_NOEXPR,		"noexpr" },
98 	{ T_MONETARY,		"LC_MONETARY" },
99 	{ T_INT_CURR_SYMBOL,	"int_curr_symbol" },
100 	{ T_CURRENCY_SYMBOL,	"currency_symbol" },
101 	{ T_MON_DECIMAL_POINT,	"mon_decimal_point" },
102 	{ T_MON_THOUSANDS_SEP,	"mon_thousands_sep" },
103 	{ T_POSITIVE_SIGN,	"positive_sign" },
104 	{ T_NEGATIVE_SIGN,	"negative_sign" },
105 	{ T_MON_GROUPING,	"mon_grouping" },
106 	{ T_INT_FRAC_DIGITS,	"int_frac_digits" },
107 	{ T_FRAC_DIGITS,	"frac_digits" },
108 	{ T_P_CS_PRECEDES,	"p_cs_precedes" },
109 	{ T_P_SEP_BY_SPACE,	"p_sep_by_space" },
110 	{ T_N_CS_PRECEDES,	"n_cs_precedes" },
111 	{ T_N_SEP_BY_SPACE,	"n_sep_by_space" },
112 	{ T_P_SIGN_POSN,	"p_sign_posn" },
113 	{ T_N_SIGN_POSN,	"n_sign_posn" },
114 	{ T_INT_P_CS_PRECEDES,	"int_p_cs_precedes" },
115 	{ T_INT_N_CS_PRECEDES,	"int_n_cs_precedes" },
116 	{ T_INT_P_SEP_BY_SPACE,	"int_p_sep_by_space" },
117 	{ T_INT_N_SEP_BY_SPACE,	"int_n_sep_by_space" },
118 	{ T_INT_P_SIGN_POSN,	"int_p_sign_posn" },
119 	{ T_INT_N_SIGN_POSN,	"int_n_sign_posn" },
120 	{ T_COLLATE,		"LC_COLLATE" },
121 	{ T_COLLATING_SYMBOL,	"collating-symbol" },
122 	{ T_COLLATING_ELEMENT,	"collating-element" },
123 	{ T_FROM,		"from" },
124 	{ T_ORDER_START,	"order_start" },
125 	{ T_ORDER_END,		"order_end" },
126 	{ T_FORWARD,		"forward" },
127 	{ T_BACKWARD,		"backward" },
128 	{ T_POSITION,		"position" },
129 	{ T_IGNORE,		"IGNORE" },
130 	{ T_UNDEFINED,		"UNDEFINED" },
131 	{ T_NUMERIC,		"LC_NUMERIC" },
132 	{ T_DECIMAL_POINT,	"decimal_point" },
133 	{ T_THOUSANDS_SEP,	"thousands_sep" },
134 	{ T_GROUPING,		"grouping" },
135 	{ T_TIME,		"LC_TIME" },
136 	{ T_ABDAY,		"abday" },
137 	{ T_DAY,		"day" },
138 	{ T_ABMON,		"abmon" },
139 	{ T_MON,		"mon" },
140 	{ T_D_T_FMT,		"d_t_fmt" },
141 	{ T_D_FMT,		"d_fmt" },
142 	{ T_T_FMT,		"t_fmt" },
143 	{ T_AM_PM,		"am_pm" },
144 	{ T_T_FMT_AMPM,		"t_fmt_ampm" },
145 	{ T_ERA,		"era" },
146 	{ T_ERA_D_FMT,		"era_d_fmt" },
147 	{ T_ERA_T_FMT,		"era_t_fmt" },
148 	{ T_ERA_D_T_FMT,	"era_d_t_fmt" },
149 	{ T_ALT_DIGITS,		"alt_digits" },
150 	{ T_CTYPE,		"LC_CTYPE" },
151 	{ T_ISUPPER,		"upper" },
152 	{ T_ISLOWER,		"lower" },
153 	{ T_ISALPHA,		"alpha" },
154 	{ T_ISDIGIT,		"digit" },
155 	{ T_ISPUNCT,		"punct" },
156 	{ T_ISXDIGIT,		"xdigit" },
157 	{ T_ISSPACE,		"space" },
158 	{ T_ISPRINT,		"print" },
159 	{ T_ISGRAPH,		"graph" },
160 	{ T_ISBLANK,		"blank" },
161 	{ T_ISCNTRL,		"cntrl" },
162 	/*
163 	 * These entries are local additions, and not specified by
164 	 * TOG.  Note that they are not guaranteed to be accurate for
165 	 * all locales, and so applications should not depend on them.
166 	 */
167 	{ T_ISSPECIAL,		"special" },
168 	{ T_ISENGLISH,		"english" },
169 	{ T_ISPHONOGRAM,	"phonogram" },
170 	{ T_ISIDEOGRAM,		"ideogram" },
171 	{ T_ISNUMBER,		"number" },
172 	/*
173 	 * We have to support this in the grammar, but it would be a
174 	 * syntax error to define a character as one of these without
175 	 * also defining it as an alpha or digit.  We ignore it in our
176 	 * parsing.
177 	 */
178 	{ T_ISALNUM,		"alnum" },
179 	{ T_TOUPPER,		"toupper" },
180 	{ T_TOLOWER,		"tolower" },
181 
182 	/*
183 	 * These are keywords used in the charmap file.  Note that
184 	 * Solaris originally used angle brackets to wrap some of them,
185 	 * but we removed that to simplify our parser.  The first of these
186 	 * items are "global items."
187 	 */
188 	{ T_CHARMAP,		"CHARMAP" },
189 	{ T_WIDTH,		"WIDTH" },
190 
191 	{ -1, NULL },
192 };
193 
194 /*
195  * These special words are only used in a charmap file, enclosed in <>.
196  */
197 static struct token symwords[] = {
198 	{ T_COM_CHAR,		"comment_char" },
199 	{ T_ESC_CHAR,		"escape_char" },
200 	{ T_CODE_SET,		"code_set_name" },
201 	{ T_MB_CUR_MAX,		"mb_cur_max" },
202 	{ T_MB_CUR_MIN,		"mb_cur_min" },
203 	{ -1, NULL },
204 };
205 
206 static int categories[] = {
207 	T_CHARMAP,
208 	T_CTYPE,
209 	T_COLLATE,
210 	T_MESSAGES,
211 	T_MONETARY,
212 	T_NUMERIC,
213 	T_TIME,
214 	T_WIDTH,
215 	0
216 };
217 
218 void
219 reset_scanner(const char *fname)
220 {
221 	if (fname == NULL) {
222 		filename = "<stdin>";
223 		is_stdin = 1;
224 	} else {
225 		if (!is_stdin)
226 			(void) fclose(input);
227 		if ((input = fopen(fname, "r")) == NULL) {
228 			perror("fopen");
229 			exit(4);
230 		} else {
231 			is_stdin = 0;
232 		}
233 		filename = fname;
234 	}
235 	com_char = '#';
236 	esc_char = '\\';
237 	instring = 0;
238 	escaped = 0;
239 	lineno = 1;
240 	nextline = 1;
241 	tokidx = 0;
242 	wideidx = 0;
243 }
244 
/*
 * hex(x):      value (0-15) of the hexadecimal digit character x.  The
 *              caller must already have checked x with isxdigit().
 * isodigit(x): nonzero if x is an octal digit character ('0'..'7').
 *
 * Every use of the argument is parenthesized so that expression arguments
 * (for example a conditional expression) expand correctly.  Both macros
 * evaluate x more than once; do not pass expressions with side effects.
 */
#define	hex(x)	\
	(isdigit(x) ? ((x) - '0') : \
	    ((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))
248 
249 static int
250 scanc(void)
251 {
252 	int	c;
253 
254 	if (is_stdin)
255 		c = getc(stdin);
256 	else
257 		c = getc(input);
258 	lineno = nextline;
259 	if (c == '\n') {
260 		nextline++;
261 	}
262 	return (c);
263 }
264 
265 static void
266 unscanc(int c)
267 {
268 	if (c == '\n') {
269 		nextline--;
270 	}
271 	if (ungetc(c, is_stdin ? stdin : input) < 0) {
272 		yyerror("ungetc failed");
273 	}
274 }
275 
/*
 * Read exactly two hexadecimal digits from the input and return the value
 * they encode.  A non-hex character is reported through yyerror(), and 0
 * is returned if that call comes back.
 */
static int
scan_hex_byte(void)
{
	int	digit;
	int	value = 0;
	int	n;

	for (n = 0; n < 2; n++) {
		digit = scanc();
		if (!isxdigit(digit)) {
			yyerror("malformed hex digit");
			return (0);
		}
		/* shift in the next nibble, high one first */
		value = (value << 4) | hex(digit);
	}
	return (value);
}
295 
/*
 * Read a decimal byte constant of two or three digits from the input and
 * return its value.
 *
 * The first two characters must be decimal digits; a third digit is
 * consumed if present, otherwise the character is pushed back.  Values
 * above UCHAR_MAX cannot be a byte and are rejected instead of being
 * silently truncated when stored later.  Errors go through yyerror(), and
 * 0 is returned if that call comes back.
 */
static int
scan_dec_byte(void)
{
	int	digit;
	int	value;

	digit = scanc();
	if (!isdigit(digit)) {
		yyerror("malformed decimal digit");
		return (0);
	}
	value = digit - '0';

	digit = scanc();
	if (!isdigit(digit)) {
		yyerror("malformed decimal digit");
		return (0);
	}
	value = value * 10 + (digit - '0');

	/* the third digit is optional */
	digit = scanc();
	if (isdigit(digit)) {
		value = value * 10 + (digit - '0');
	} else {
		unscanc(digit);
	}

	if (value > UCHAR_MAX) {
		yyerror("decimal byte value out of range");
		return (0);
	}
	return (value);
}
324 
/*
 * Read an octal byte constant of two or three digits from the input and
 * return its value.
 *
 * The first two characters must be octal digits; a third digit is consumed
 * if present, otherwise the character is pushed back.  Three octal digits
 * can encode up to 0777, so values above UCHAR_MAX are rejected instead of
 * being silently truncated when stored later.  Errors go through
 * yyerror(), and 0 is returned if that call comes back.
 */
static int
scan_oct_byte(void)
{
	int	digit;
	int	value;

	digit = scanc();
	if (digit < '0' || digit > '7') {
		yyerror("malformed octal digit");
		return (0);
	}
	value = digit - '0';

	digit = scanc();
	if (digit < '0' || digit > '7') {
		yyerror("malformed octal digit");
		return (0);
	}
	value = value * 8 + (digit - '0');

	/* the third digit is optional */
	digit = scanc();
	if (digit >= '0' && digit <= '7') {
		value = value * 8 + (digit - '0');
	} else {
		unscanc(digit);
	}

	if (value > UCHAR_MAX) {
		yyerror("octal byte value out of range");
		return (0);
	}
	return (value);
}
355 
356 void
357 add_tok(int c)
358 {
359 	if ((tokidx + 1) >= toksz) {
360 		toksz += 64;
361 		if ((token = realloc(token, toksz)) == NULL) {
362 			yyerror("out of memory");
363 			tokidx = 0;
364 			toksz = 0;
365 			return;
366 		}
367 	}
368 
369 	token[tokidx++] = (char)c;
370 	token[tokidx] = 0;
371 }
372 void
373 add_wcs(wchar_t c)
374 {
375 	if ((wideidx + 1) >= widesz) {
376 		widesz += 64;
377 		widestr = realloc(widestr, (widesz * sizeof (wchar_t)));
378 		if (widestr == NULL) {
379 			yyerror("out of memory");
380 			wideidx = 0;
381 			widesz = 0;
382 			return;
383 		}
384 	}
385 
386 	widestr[wideidx++] = c;
387 	widestr[wideidx] = 0;
388 }
389 
390 wchar_t *
391 get_wcs(void)
392 {
393 	wchar_t *ws = widestr;
394 	wideidx = 0;
395 	widestr = NULL;
396 	widesz = 0;
397 	if (ws == NULL) {
398 		if ((ws = wcsdup(L"")) == NULL) {
399 			yyerror("out of memory");
400 		}
401 	}
402 	return (ws);
403 }
404 
405 static int
406 get_byte(void)
407 {
408 	int	c;
409 
410 	if ((c = scanc()) != esc_char) {
411 		unscanc(c);
412 		return (EOF);
413 	}
414 	c = scanc();
415 
416 	switch (c) {
417 	case 'd':
418 	case 'D':
419 		return (scan_dec_byte());
420 	case 'x':
421 	case 'X':
422 		return (scan_hex_byte());
423 	case '0':
424 	case '1':
425 	case '2':
426 	case '3':
427 	case '4':
428 	case '5':
429 	case '6':
430 	case '7':
431 		/* put the character back so we can get it */
432 		unscanc(c);
433 		return (scan_oct_byte());
434 	default:
435 		unscanc(c);
436 		unscanc(esc_char);
437 		return (EOF);
438 	}
439 }
440 
/*
 * Translate the character that followed the escape character into the
 * control character it names: n, r, t, f, v, b and a map to newline,
 * carriage return, tab, form feed, vertical tab, backspace and alert.
 * Any other value is returned unchanged.
 */
int
get_escaped(int c)
{
	static const struct {
		int	name;
		int	value;
	} escapes[] = {
		{ 'n', '\n' },
		{ 'r', '\r' },
		{ 't', '\t' },
		{ 'f', '\f' },
		{ 'v', '\v' },
		{ 'b', '\b' },
		{ 'a', '\a' },
	};
	unsigned int	i;

	for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++) {
		if (escapes[i].name == c)
			return (escapes[i].value);
	}
	return (c);
}
463 
464 int
465 get_wide(void)
466 {
467 	static char mbs[MB_LEN_MAX + 1] = "";
468 	static int mbi = 0;
469 	int c;
470 	wchar_t	wc;
471 
472 	if (mb_cur_max >= (int)sizeof (mbs)) {
473 		yyerror("max multibyte character size too big");
474 		mbi = 0;
475 		return (T_NULL);
476 	}
477 	for (;;) {
478 		if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) {
479 			/*
480 			 * end of the byte sequence reached, but no
481 			 * valid wide decoding.  fatal error.
482 			 */
483 			mbi = 0;
484 			yyerror("not a valid character encoding");
485 			return (T_NULL);
486 		}
487 		mbs[mbi++] = c;
488 		mbs[mbi] = 0;
489 
490 		/* does it decode? */
491 		if (to_wide(&wc, mbs) >= 0) {
492 			break;
493 		}
494 	}
495 
496 	mbi = 0;
497 	if ((category != T_CHARMAP) && (category != T_WIDTH)) {
498 		if (check_charmap(wc) < 0) {
499 			yyerror("no symbolic name for character");
500 			return (T_NULL);
501 		}
502 	}
503 
504 	yylval.wc = wc;
505 	return (T_CHAR);
506 }
507 
508 int
509 get_symbol(void)
510 {
511 	int	c;
512 
513 	while ((c = scanc()) != EOF) {
514 		if (escaped) {
515 			escaped = 0;
516 			if (c == '\n')
517 				continue;
518 			add_tok(get_escaped(c));
519 			continue;
520 		}
521 		if (c == esc_char) {
522 			escaped = 1;
523 			continue;
524 		}
525 		if (c == '\n') {	/* well that's strange! */
526 			yyerror("unterminated symbolic name");
527 			continue;
528 		}
529 		if (c == '>') {		/* end of symbol */
530 
531 			/*
532 			 * This restarts the token from the beginning
533 			 * the next time we scan a character.  (This
534 			 * token is complete.)
535 			 */
536 
537 			if (token == NULL) {
538 				yyerror("missing symbolic name");
539 				return (T_NULL);
540 			}
541 			tokidx = 0;
542 
543 			/*
544 			 * A few symbols are handled as keywords outside
545 			 * of the normal categories.
546 			 */
547 			if (category == T_END) {
548 				int i;
549 				for (i = 0; symwords[i].name != 0; i++) {
550 					if (strcmp(token, symwords[i].name) ==
551 					    0) {
552 						last_kw = symwords[i].id;
553 						return (last_kw);
554 					}
555 				}
556 			}
557 			/*
558 			 * Contextual rule: Only literal characters are
559 			 * permitted in CHARMAP.  Anywhere else the symbolic
560 			 * forms are fine.
561 			 */
562 			if ((category != T_CHARMAP) &&
563 			    (lookup_charmap(token, &yylval.wc)) != -1) {
564 				return (T_CHAR);
565 			}
566 			if ((yylval.collsym = lookup_collsym(token)) != NULL) {
567 				return (T_COLLSYM);
568 			}
569 			if ((yylval.collelem = lookup_collelem(token)) !=
570 			    NULL) {
571 				return (T_COLLELEM);
572 			}
573 			/* its an undefined symbol */
574 			yylval.token = strdup(token);
575 			token = NULL;
576 			toksz = 0;
577 			tokidx = 0;
578 			return (T_SYMBOL);
579 		}
580 		add_tok(c);
581 	}
582 
583 	yyerror("unterminated symbolic name");
584 	return (EOF);
585 }
586 
587 int
588 get_category(void)
589 {
590 	return (category);
591 }
592 
593 static int
594 consume_token(void)
595 {
596 	int	len = tokidx;
597 	int	i;
598 
599 	tokidx = 0;
600 	if (token == NULL)
601 		return (T_NULL);
602 
603 	/*
604 	 * this one is special, because we don't want it to alter the
605 	 * last_kw field.
606 	 */
607 	if (strcmp(token, "...") == 0) {
608 		return (T_ELLIPSIS);
609 	}
610 
611 	/* search for reserved words first */
612 	for (i = 0; keywords[i].name; i++) {
613 		int j;
614 		if (strcmp(keywords[i].name, token) != 0) {
615 			continue;
616 		}
617 
618 		last_kw = keywords[i].id;
619 
620 		/* clear the top level category if we're done with it */
621 		if (last_kw == T_END) {
622 			category = T_END;
623 		}
624 
625 		/* set the top level category if we're changing */
626 		for (j = 0; categories[j]; j++) {
627 			if (categories[j] != last_kw)
628 				continue;
629 			category = last_kw;
630 		}
631 
632 		return (keywords[i].id);
633 	}
634 
635 	/* maybe its a numeric constant? */
636 	if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
637 		char *eptr;
638 		yylval.num = strtol(token, &eptr, 10);
639 		if (*eptr != 0)
640 			yyerror("malformed number");
641 		return (T_NUMBER);
642 	}
643 
644 	/*
645 	 * A single lone character is treated as a character literal.
646 	 * To avoid duplication of effort, we stick in the charmap.
647 	 */
648 	if (len == 1) {
649 		yylval.wc = token[0];
650 		return (T_CHAR);
651 	}
652 
653 	/* anything else is treated as a symbolic name */
654 	yylval.token = strdup(token);
655 	token = NULL;
656 	toksz = 0;
657 	tokidx = 0;
658 	return (T_NAME);
659 }
660 
/*
 * Discard input up to and including the next newline.  Running into end
 * of file first is reported through errf().
 */
void
scan_to_eol(void)
{
	int	ch;

	do {
		ch = scanc();
		if (ch == EOF) {
			/* the file ended before the line did */
			errf("missing newline");
			return;
		}
	} while (ch != '\n');

	assert(ch == '\n');
}
674 
675 int
676 yylex(void)
677 {
678 	int		c;
679 
680 	while ((c = scanc()) != EOF) {
681 
682 		/* special handling for quoted string */
683 		if (instring) {
684 			if (escaped) {
685 				escaped = 0;
686 
687 				/* if newline, just eat and forget it */
688 				if (c == '\n')
689 					continue;
690 
691 				if (strchr("xXd01234567", c)) {
692 					unscanc(c);
693 					unscanc(esc_char);
694 					return (get_wide());
695 				}
696 				yylval.wc = get_escaped(c);
697 				return (T_CHAR);
698 			}
699 			if (c == esc_char) {
700 				escaped = 1;
701 				continue;
702 			}
703 			switch (c) {
704 			case '<':
705 				return (get_symbol());
706 			case '>':
707 				/* oops! should generate syntax error  */
708 				return (T_GT);
709 			case '"':
710 				instring = 0;
711 				return (T_QUOTE);
712 			default:
713 				yylval.wc = c;
714 				return (T_CHAR);
715 			}
716 		}
717 
718 		/* escaped characters first */
719 		if (escaped) {
720 			escaped = 0;
721 			if (c == '\n') {
722 				/* eat the newline */
723 				continue;
724 			}
725 			hadtok = 1;
726 			if (tokidx) {
727 				/* an escape mid-token is nonsense */
728 				return (T_NULL);
729 			}
730 
731 			/* numeric escapes are treated as wide characters */
732 			if (strchr("xXd01234567", c)) {
733 				unscanc(c);
734 				unscanc(esc_char);
735 				return (get_wide());
736 			}
737 
738 			add_tok(get_escaped(c));
739 			continue;
740 		}
741 
742 		/* if it is the escape charter itself note it */
743 		if (c == esc_char) {
744 			escaped = 1;
745 			continue;
746 		}
747 
748 		/* remove from the comment char to end of line */
749 		if (c == com_char) {
750 			while (c != '\n') {
751 				if ((c = scanc()) == EOF) {
752 					/* end of file without newline! */
753 					return (EOF);
754 				}
755 			}
756 			assert(c == '\n');
757 			if (!hadtok) {
758 				/*
759 				 * If there were no tokens on this line,
760 				 * then just pretend it didn't exist at all.
761 				 */
762 				continue;
763 			}
764 			hadtok = 0;
765 			return (T_NL);
766 		}
767 
768 		if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
769 			/*
770 			 * These are all token delimiters.  If there
771 			 * is a token already in progress, we need to
772 			 * process it.
773 			 */
774 			unscanc(c);
775 			return (consume_token());
776 		}
777 
778 		switch (c) {
779 		case '\n':
780 			if (!hadtok) {
781 				/*
782 				 * If the line was completely devoid of tokens,
783 				 * then just ignore it.
784 				 */
785 				continue;
786 			}
787 			/* we're starting a new line, reset the token state */
788 			hadtok = 0;
789 			return (T_NL);
790 		case ',':
791 			hadtok = 1;
792 			return (T_COMMA);
793 		case ';':
794 			hadtok = 1;
795 			return (T_SEMI);
796 		case '(':
797 			hadtok = 1;
798 			return (T_LPAREN);
799 		case ')':
800 			hadtok = 1;
801 			return (T_RPAREN);
802 		case '>':
803 			hadtok = 1;
804 			return (T_GT);
805 		case '<':
806 			/* symbol start! */
807 			hadtok = 1;
808 			return (get_symbol());
809 		case ' ':
810 		case '\t':
811 			/* whitespace, just ignore it */
812 			continue;
813 		case '"':
814 			hadtok = 1;
815 			instring = 1;
816 			return (T_QUOTE);
817 		default:
818 			hadtok = 1;
819 			add_tok(c);
820 			continue;
821 		}
822 	}
823 	return (EOF);
824 }
825 
826 void
827 yyerror(const char *msg)
828 {
829 	(void) fprintf(stderr, "%s: %d: error: %s\n",
830 	    filename, lineno, msg);
831 	exit(4);
832 }
833 
834 void
835 errf(const char *fmt, ...)
836 {
837 	char	*msg;
838 
839 	va_list	va;
840 	va_start(va, fmt);
841 	(void) vasprintf(&msg, fmt, va);
842 	va_end(va);
843 
844 	(void) fprintf(stderr, "%s: %d: error: %s\n",
845 	    filename, lineno, msg);
846 	free(msg);
847 	exit(4);
848 }
849 
850 void
851 warn(const char *fmt, ...)
852 {
853 	char	*msg;
854 
855 	va_list	va;
856 	va_start(va, fmt);
857 	(void) vasprintf(&msg, fmt, va);
858 	va_end(va);
859 
860 	(void) fprintf(stderr, "%s: %d: warning: %s\n",
861 	    filename, lineno, msg);
862 	free(msg);
863 	warnings++;
864 	if (!warnok)
865 		exit(4);
866 }
867