xref: /illumos-gate/usr/src/tools/ndrgen/ndr_lex.c (revision 24fe0b3b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <errno.h>
30 #include <stdarg.h>
31 #include "ndrgen.h"
32 #include "y.tab.h"
33 
34 /*
35  * C-like lexical analysis.
36  *
37  * 1. Define a "struct node"
38  * 2. Define a "struct symbol" that encapsulates a struct node.
39  * 3. Define a "struct integer" that encapsulates a struct node.
40  * 4. Set the YACC stack type in the grammar:
41  *		%{
42  *		#define YYSTYPE struct node *
43  *		%}
 * 5. Define %token's in the grammar for IDENTIFIER, STRING and INTEGER.
 *    Use "_KW" as a suffix for keyword tokens, i.e. "struct" is
 *    "%token STRUCT_KW":
47  *	// atomic values
48  *	%token INTEGER STRING IDENTIFIER
49  *	// keywords
50  *	%token STRUCT_KW CASE_KW
51  *	// operators
52  *	%token PLUS MINUS ASSIGN ARROW
53  *	// overloaded tokens (++ --, < > <= >=, == !=, += -= *= ...)
54  *	%token INCOP RELOP EQUOP ASSOP
 * 6. It's easiest to use the yacc(1) generated token numbers for node
 *    labels.  For node labels that are not actually part of the grammar,
 *    use a %token with an L_ prefix:
58  *	// node labels (can't be generated by lex)
59  *	%token L_LT L_LTE L_GT L_GTE L_EQU L_NEQ
60  * 7. Call set_lex_input() before parsing.
61  */
62 
/*
 * Character classification helpers.
 *
 * These are function-like macros: every use of the argument is
 * parenthesized so that expression arguments parse as intended, but the
 * argument may still be evaluated more than once, so it must not have
 * side effects.
 */

/* Quote characters recognized by str_to_sv(). */
#define	SQ	'\''
#define	DQ	'"'

#define	isquote(c) ((c) == SQ || (c) == DQ)

/* Word separators for str_to_sv(); newline counts as white space here. */
#define	iswhite(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\f')

#define	is_between(c, l, u)  ((l) <= (c) && (c) <= (u))

/*
 * White space as skipped by yylex().  Newline is deliberately absent:
 * yylex() handles it separately in order to count lines.
 */
#define	is_white(c)	\
	((c) == ' ' || (c) == '\r' || (c) == '\t' || (c) == '\f')
#define	is_lower(c)	is_between((c), 'a', 'z')
#define	is_upper(c)	is_between((c), 'A', 'Z')
#define	is_alpha(c)	(is_lower(c) || is_upper(c))
#define	is_digit(c)	is_between((c), '0', '9')

/* First character of a symbol, and any subsequent character of one. */
#define	is_sstart(c)	(is_alpha(c) || (c) == '_')
#define	is_sfollow(c)	(is_sstart(c) || is_digit(c))
#define	is_xdigit(c)	\
	(is_digit(c) || is_between((c), 'A', 'F') || is_between((c), 'a', 'f'))
79 
80 ndr_symbol_t		*symbol_list;
81 static ndr_integer_t	*integer_list;
82 static FILE		*lex_infp;
83 static ndr_symbol_t	*file_name;
84 int			line_number;
85 int			n_compile_error;
86 
87 static int		lex_at_bol;
88 
89 /* In yacc(1) generated parser */
90 extern struct node	*yylval;
91 
92 /*
93  * The keywtab[] and optable[] could be external to this lex
94  * and it would all still work.
95  */
96 static ndr_keyword_t keywtable[] = {
97 	{ "struct",	STRUCT_KW,	0 },
98 	{ "union",	UNION_KW,	0 },
99 	{ "typedef",	TYPEDEF_KW,	0 },
100 
101 	{ "interface",	INTERFACE_KW,	0 },
102 	{ "uuid",	UUID_KW,	0 },
103 	{ "_no_reorder", _NO_REORDER_KW, 0 },
104 	{ "extern",	EXTERN_KW,	0 },
105 	{ "reference",	REFERENCE_KW,	0 },
106 
107 	{ "align",	ALIGN_KW,	0 },
108 	{ "operation",	OPERATION_KW,	0 },
109 	{ "in",		IN_KW,		0 },
110 	{ "out",	OUT_KW,		0 },
111 
112 	{ "string",	STRING_KW,	0 },
113 	{ "size_is",	SIZE_IS_KW,	0 },
114 	{ "length_is",	LENGTH_IS_KW,	0 },
115 
116 	{ "switch_is",	SWITCH_IS_KW,	0 },
117 	{ "case",	CASE_KW,	0 },
118 	{ "default",	DEFAULT_KW,	0 },
119 
120 	{ "transmit_as", TRANSMIT_AS_KW, 0 },
121 	{ "arg_is",	ARG_IS_KW,	0 },
122 
123 	{ "char",	BASIC_TYPE,	1 },
124 	{ "uchar",	BASIC_TYPE,	1 },
125 	{ "wchar",	BASIC_TYPE,	2 },
126 	{ "short",	BASIC_TYPE,	2 },
127 	{ "ushort",	BASIC_TYPE,	2 },
128 	{ "long",	BASIC_TYPE,	4 },
129 	{ "ulong",	BASIC_TYPE,	4 },
130 	{0}
131 };
132 
133 static ndr_keyword_t optable[] = {
134 	{ "{",		LC,		0 },
135 	{ "}",		RC,		0 },
136 	{ "(",		LP,		0 },
137 	{ ")",		RP,		0 },
138 	{ "[",		LB,		0 },
139 	{ "]",		RB,		0 },
140 	{ "*",		STAR,		0 },
141 	{ ";",		SEMI,		0 },
142 	{0}
143 };
144 
145 static int getch(FILE *fp);
146 static ndr_integer_t *int_enter(long);
147 static ndr_symbol_t *sym_find(char *);
148 static int str_to_sv(char *, char *sv[]);
149 
150 /*
151  * Enter the symbols for keyword.
152  */
153 static void
154 keyw_tab_init(ndr_keyword_t kwtable[])
155 {
156 	int			i;
157 	ndr_keyword_t		*kw;
158 	ndr_symbol_t		*sym;
159 
160 	for (i = 0; kwtable[i].name; i++) {
161 		kw = &kwtable[i];
162 
163 		sym = sym_enter(kw->name);
164 		sym->kw = kw;
165 	}
166 }
167 
168 void
169 set_lex_input(FILE *fp, char *name)
170 {
171 	keyw_tab_init(keywtable);
172 	keyw_tab_init(optable);
173 
174 	lex_infp = fp;
175 	file_name = sym_enter(name);
176 	line_number = 1;
177 	lex_at_bol = 1;
178 }
179 
/*
 * Fetch the next character from fp, or EOF when getc(3C) reports it.
 * All lexer input goes through here.
 */
static int
getch(FILE *fp)
{
	int	ch;

	ch = getc(fp);

	return (ch);
}
185 
186 int
187 yylex(void)
188 {
189 	char		lexeme[512];
190 	char		*p = lexeme;
191 	FILE		*fp = lex_infp;
192 	int		c, xc;
193 	ndr_symbol_t	*sym;
194 	ndr_integer_t	*intg;
195 
196 top:
197 	p = lexeme;
198 
199 	c = getch(fp);
200 	if (c == EOF)
201 		return (EOF);
202 
203 	if (c == '\n') {
204 		line_number++;
205 		lex_at_bol = 1;
206 		goto top;
207 	}
208 
209 	/*
210 	 * Handle preprocessor lines. This just notes
211 	 * which file we're processing.
212 	 */
213 	if (c == '#' && lex_at_bol) {
214 		char		*sv[10];
215 		int		sc;
216 
217 		while ((c = getch(fp)) != EOF && c != '\n')
218 			*p++ = c;
219 
220 		*p = 0;
221 		/* note: no ungetc() of newline, we don't want to count it */
222 
223 		if (*lexeme != ' ') {
224 			/* not a line we know */
225 			goto top;
226 		}
227 
228 		sc = str_to_sv(lexeme, sv);
229 		if (sc < 2)
230 			goto top;
231 
232 		file_name = sym_enter(sv[1]);
233 		line_number = atoi(sv[0]);	/* for next input line */
234 		lex_at_bol = 1;
235 		goto top;
236 	}
237 
238 	lex_at_bol = 0;
239 
240 	/*
241 	 * Skip white space
242 	 */
243 	if (is_white(c))
244 		goto top;
245 
246 	/*
247 	 * Symbol? Might be a keyword or just an identifier
248 	 */
249 	if (is_sstart(c)) {
250 		/* we got a symbol */
251 		do {
252 			*p++ = c;
253 			c = getch(fp);
254 		} while (is_sfollow(c));
255 		(void) ungetc(c, fp);
256 		*p = 0;
257 
258 		sym = sym_enter(lexeme);
259 
260 		yylval = &sym->s_node;
261 
262 		if (sym->kw) {
263 			return (sym->kw->token);
264 		} else {
265 			return (IDENTIFIER);
266 		}
267 	}
268 
269 	/*
270 	 * Integer constant?
271 	 */
272 	if (is_digit(c)) {
273 		/* we got a number */
274 		*p++ = c;
275 		if (c == '0') {
276 			c = getch(fp);
277 			if (c == 'x' || c == 'X') {
278 				/* handle hex specially */
279 				do {
280 					*p++ = c;
281 					c = getch(fp);
282 				} while (is_xdigit(c));
283 				goto convert_icon;
284 			} else if (c == 'b' || c == 'B' ||
285 			    c == 'd' || c == 'D' ||
286 			    c == 'o' || c == 'O') {
287 				do {
288 					*p++ = c;
289 					c = getch(fp);
290 				} while (is_digit(c));
291 				goto convert_icon;
292 			}
293 			(void) ungetc(c, fp);
294 		}
295 		/* could be anything */
296 		c = getch(fp);
297 		while (is_digit(c)) {
298 			*p++ = c;
299 			c = getch(fp);
300 		}
301 
302 convert_icon:
303 		*p = 0;
304 		(void) ungetc(c, fp);
305 
306 		intg = int_enter(strtol(lexeme, 0, 0));
307 		yylval = &intg->s_node;
308 
309 		return (INTEGER);
310 	}
311 
312 	/* Could handle strings. We don't seem to need them yet */
313 
314 	yylval = 0;		/* operator tokens have no value */
315 	xc = getch(fp);		/* get look-ahead for two-char lexemes */
316 
317 	lexeme[0] = c;
318 	lexeme[1] = xc;
319 	lexeme[2] = 0;
320 
321 	/*
322 	 * Look for to-end-of-line comment
323 	 */
324 	if (c == '/' && xc == '/') {
325 		/* eat the comment */
326 		while ((c = getch(fp)) != EOF && c != '\n')
327 			;
328 		(void) ungetc(c, fp);		/* put back newline */
329 		goto top;
330 	}
331 
332 	/*
333 	 * Look for multi-line comment
334 	 */
335 	if (c == '/' && xc == '*') {
336 		/* eat the comment */
337 		xc = -1;
338 		while ((c = getch(fp)) != EOF) {
339 			if (xc == '*' && c == '/') {
340 				/* that's it */
341 				break;
342 			}
343 			xc = c;
344 			if (c == '\n')
345 				line_number++;
346 		}
347 		goto top;
348 	}
349 
350 	/*
351 	 * Use symbol table lookup for two-character and
352 	 * one character operator tokens.
353 	 */
354 	sym = sym_find(lexeme);
355 	if (sym) {
356 		/* there better be a keyword attached */
357 		yylval = &sym->s_node;
358 		return (sym->kw->token);
359 	}
360 
361 	/* Try a one-character form */
362 	(void) ungetc(xc, fp);
363 	lexeme[1] = 0;
364 	sym = sym_find(lexeme);
365 	if (sym) {
366 		/* there better be a keyword attached */
367 		yylval = &sym->s_node;
368 		return (sym->kw->token);
369 	}
370 
371 	compile_error("unrecognized character 0x%02x", c);
372 	goto top;
373 }
374 
375 static ndr_symbol_t *
376 sym_find(char *name)
377 {
378 	ndr_symbol_t		**pp;
379 	ndr_symbol_t		*p;
380 
381 	for (pp = &symbol_list; (p = *pp) != 0; pp = &p->next) {
382 		if (strcmp(p->name, name) == 0)
383 			return (p);
384 	}
385 
386 	return (0);
387 }
388 
389 ndr_symbol_t *
390 sym_enter(char *name)
391 {
392 	ndr_symbol_t		**pp;
393 	ndr_symbol_t		*p;
394 
395 	for (pp = &symbol_list; (p = *pp) != 0; pp = &p->next) {
396 		if (strcmp(p->name, name) == 0)
397 			return (p);
398 	}
399 
400 	p = ndr_alloc(1, sizeof (ndr_symbol_t));
401 
402 	if ((p->name = strdup(name)) == NULL)
403 		fatal_error("%s", strerror(ENOMEM));
404 
405 	p->s_node.label = IDENTIFIER;
406 	p->s_node.n_sym = p;
407 
408 	*pp = p;
409 
410 	return (p);
411 }
412 
413 static ndr_integer_t *
414 int_enter(long value)
415 {
416 	ndr_integer_t		**pp;
417 	ndr_integer_t		*p;
418 
419 	for (pp = &integer_list; (p = *pp) != 0; pp = &p->next) {
420 		if (p->value == value)
421 			return (p);
422 	}
423 
424 	p = ndr_alloc(1, sizeof (ndr_integer_t));
425 
426 	p->value = value;
427 	p->s_node.label = INTEGER;
428 	p->s_node.n_int = value;
429 
430 	*pp = p;
431 
432 	return (p);
433 }
434 
/*
 * calloc(3C) wrapper: returns zero-filled memory for nelem elements of
 * elsize bytes each, and reports a fatal error if calloc() returns a
 * null pointer.
 */
void *
ndr_alloc(size_t nelem, size_t elsize)
{
	void *mem = calloc(nelem, elsize);

	if (mem != NULL)
		return (mem);

	fatal_error("%s", strerror(ENOMEM));
	/* NOTREACHED */
	return (mem);
}
447 
448 /*
449  * The input context (filename, line number) is maintained by the
450  * lexical analysis, and we generally want such info reported for
451  * errors in a consistent manner.
452  */
453 void
454 compile_error(const char *fmt, ...)
455 {
456 	char	buf[NDLBUFSZ];
457 	va_list ap;
458 
459 	va_start(ap, fmt);
460 	(void) vsnprintf(buf, NDLBUFSZ, fmt, ap);
461 	va_end(ap);
462 
463 	(void) fprintf(stderr, "ndrgen: compile error: %s:%d: %s\n",
464 	    file_name->name, line_number, buf);
465 
466 	n_compile_error++;
467 }
468 
469 void
470 fatal_error(const char *fmt, ...)
471 {
472 	char	buf[NDLBUFSZ];
473 	va_list ap;
474 
475 	va_start(ap, fmt);
476 	(void) vsnprintf(buf, NDLBUFSZ, fmt, ap);
477 	va_end(ap);
478 
479 	(void) fprintf(stderr, "ndrgen: fatal error: %s\n", buf);
480 	exit(1);
481 }
482 
483 /*
484  * Setup nodes for the lexical analyzer.
485  */
486 struct node *
487 n_cons(int label, ...)
488 {
489 	ndr_node_t		*np;
490 	va_list ap;
491 
492 	np = ndr_alloc(1, sizeof (ndr_node_t));
493 
494 	va_start(ap, label);
495 	np->label = label;
496 	np->n_arg[0] = va_arg(ap, void *);
497 	np->n_arg[1] = va_arg(ap, void *);
498 	np->n_arg[2] = va_arg(ap, void *);
499 	va_end(ap);
500 
501 	np->line_number = line_number;
502 	np->file_name = file_name;
503 
504 	return (np);
505 }
506 
507 /*
508  *	list:	item
509  *	|	list item	={ n_splice($1, $2); }
510  *	;
511  */
512 void
513 n_splice(struct node *np1, struct node *np2)
514 {
515 	while (np1->n_next)
516 		np1 = np1->n_next;
517 
518 	np1->n_next = np2;
519 }
520 
/*
 * Split buf into words, in place.
 *
 * Words are separated by white space.  A quote character (single or
 * double) starts a quoted section that runs to the matching quote; the
 * quotes themselves are dropped and any white space between them is
 * kept as part of the word.  The words are written back into buf, each
 * NUL terminated, and sv[] is filled with pointers to them followed by
 * a null pointer.  sv[] must have room for every word plus that
 * terminator; no limit is checked here.
 *
 * Returns the number of words.
 */
static int
str_to_sv(char *buf, char *sv[])
{
	char		*rd = buf;	/* next character to examine */
	char		*wr = buf;	/* next position to store into */
	char		**slot = sv;	/* next free entry in sv[] */
	int		inside = 0;	/* nonzero while within a word */
	int		ch;

	while ((ch = *rd++) != 0) {
		if (!inside) {
			if (iswhite(ch))
				continue;

			/* first character of a new word */
			*slot++ = wr;
			inside = 1;
		}

		if (iswhite(ch)) {
			/* end of word */
			*wr++ = 0;
			inside = 0;
			continue;
		}

		if (!isquote(ch)) {
			/* still inside word */
			*wr++ = ch;
			continue;
		}

		/* quoted section: copy up to the matching quote */
		{
			int		delim = ch;

			for (ch = *rd++; ch != 0 && ch != delim; ch = *rd++)
				*wr++ = ch;
		}

		/* unterminated quote: the input is exhausted */
		if (ch == 0)
			break;
	}

	if (inside)
		*wr++ = 0;

	*slot = (char *)0;
	return (slot - sv);
}
570