xref: /netbsd/usr.bin/indent/lexi.c (revision 6550d01e)
1 /*	$NetBSD: lexi.c,v 1.13 2009/04/12 11:09:49 lukem Exp $	*/
2 
3 /*
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
34  * Copyright (c) 1985 Sun Microsystems, Inc.
35  * All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. All advertising materials mentioning features or use of this software
46  *    must display the following acknowledgement:
47  *	This product includes software developed by the University of
48  *	California, Berkeley and its contributors.
49  * 4. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  */
65 
66 #include <sys/cdefs.h>
67 #ifndef lint
68 #if 0
69 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
70 #else
71 __RCSID("$NetBSD: lexi.c,v 1.13 2009/04/12 11:09:49 lukem Exp $");
72 #endif
73 #endif				/* not lint */
74 
75 /*
76  * Here we have the token scanner for indent.  It scans off one token and puts
77  * it in the global variable "token".  It returns a code, indicating the type
78  * of token scanned.
79  */
80 
81 #include <stdio.h>
82 #include <ctype.h>
83 #include <stdlib.h>
84 #include <string.h>
85 #include "indent_globs.h"
86 #include "indent_codes.h"
87 
88 #define alphanum 1
89 #define opchar 3
90 
/*
 * One entry of the reserved-word table searched by lexi().
 */
struct templ {
	const char	*rwd;		/* keyword spelling; a null pointer
					 * terminates the table */
	int		rwcode;		/* keyword class, see below */
};

/*
 * Table of reserved words.  lexi() walks it until it reaches the entry
 * whose rwd is null, and addkey() appends to it at run time, which is why
 * it is sized well beyond the built-in entries.
 *
 * The rwcode values are the ones lexi() switches on:
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords (types and storage classes)
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 *	0	(or anything else) treated like an ordinary identifier
 */
struct templ specials[1000] =
{
	{"switch", 1},
	{"case", 2},
	{"break", 0},
	{"struct", 3},
	{"union", 3},
	{"enum", 3},
	{"default", 2},
	{"int", 4},
	{"char", 4},
	{"float", 4},
	{"double", 4},
	{"long", 4},
	{"short", 4},
	{"typedef", 4},		/* was misspelled "typdef", so the real
				 * keyword never matched this table */
	{"unsigned", 4},
	{"register", 4},
	{"static", 4},
	{"global", 4},
	{"extern", 4},
	{"void", 4},
	{"goto", 0},
	{"return", 0},
	{"if", 5},
	{"while", 5},
	{"for", 5},
	{"else", 6},
	{"do", 6},
	{"sizeof", 7},
	{0, 0}
};
128 
/*
 * Character classification table, indexed by 7-bit ASCII code.  Each entry
 * is 1 for characters that may appear in an identifier or number
 * (letters, digits, '_' and '$'), 3 for characters that may appear in an
 * operator, and 0 for everything else.
 */
char    chartype[128] =
{
	/* 0x00 - 0x0f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 - 0x1f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
	0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
	/* 0x30 - 0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
	/* 0x40 - 0x4f: @ A B C D E F G H I J K L M N O */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 - 0x5f: P Q R S T U V W X Y Z [ \ ] ^ _ */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
	/* 0x60 - 0x6f: ` a b c d e f g h i j k l m n o */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 - 0x7f: p q r s t u v w x y z { | } ~ DEL */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
150 
151 
152 
153 
154 int
155 lexi(void)
156 {
157 	int     unary_delim;	/* this is set to 1 if the current token
158 				 *
159 				 * forces a following operator to be unary */
160 	static int last_code;	/* the last token type returned */
161 	static int l_struct;	/* set to 1 if the last token was 'struct' */
162 	int     code;		/* internal code to be returned */
163 	char    qchar;		/* the delimiter character for a string */
164 
165 	e_token = s_token;	/* point to start of place to save token */
166 	unary_delim = false;
167 	ps.col_1 = ps.last_nl;	/* tell world that this token started in
168 				 * column 1 iff the last thing scanned was nl */
169 	ps.last_nl = false;
170 
171 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
172 		ps.col_1 = false;	/* leading blanks imply token is not
173 					 * in column 1 */
174 		if (++buf_ptr >= buf_end)
175 			fill_buffer();
176 	}
177 
178 	/* Scan an alphanumeric token */
179 	if (chartype[(int) *buf_ptr] == alphanum ||
180 	    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
181 		/*
182 		 * we have a character or number
183 		 */
184 		const char *j;	/* used for searching thru list of
185 				 * reserved words */
186 		struct templ *p;
187 
188 		if (isdigit((unsigned char)*buf_ptr) ||
189 		    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
190 			int     seendot = 0, seenexp = 0, seensfx = 0;
191 			if (*buf_ptr == '0' &&
192 			    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
193 				*e_token++ = *buf_ptr++;
194 				*e_token++ = *buf_ptr++;
195 				while (isxdigit((unsigned char)*buf_ptr)) {
196 					CHECK_SIZE_TOKEN;
197 					*e_token++ = *buf_ptr++;
198 				}
199 			} else {
200 				while (1) {
201 					if (*buf_ptr == '.') {
202 						if (seendot)
203 							break;
204 						else
205 							seendot++;
206 					}
207 					CHECK_SIZE_TOKEN;
208 					*e_token++ = *buf_ptr++;
209 					if (!isdigit((unsigned char)*buf_ptr)
210 					&& *buf_ptr != '.') {
211 						if ((*buf_ptr != 'E'
212 						&& *buf_ptr != 'e') || seenexp)
213 							break;
214 						else {
215 							seenexp++;
216 							seendot++;
217 							CHECK_SIZE_TOKEN;
218 							*e_token++ = *buf_ptr++;
219 							if (*buf_ptr == '+' || *buf_ptr == '-')
220 								*e_token++ = *buf_ptr++;
221 						}
222 					}
223 				}
224 			}
225 			if (*buf_ptr == 'F' || *buf_ptr == 'f') {
226 				/* float constant */
227 				*e_token++ = *buf_ptr++;
228 			} else {
229 				/* integer constant */
230 				while (1) {
231 					if (!(seensfx & 1) &&
232 					    (*buf_ptr == 'U' ||
233 					     *buf_ptr == 'u')) {
234 						CHECK_SIZE_TOKEN;
235 						*e_token++ = *buf_ptr++;
236 						seensfx |= 1;
237 						continue;
238 					}
239 					if (!(seensfx & 2) &&
240 					    (*buf_ptr == 'L' ||
241 					     *buf_ptr == 'l')) {
242 						CHECK_SIZE_TOKEN;
243 						if (buf_ptr[1] == buf_ptr[0])
244 							*e_token++ = *buf_ptr++;
245 						*e_token++ = *buf_ptr++;
246 						seensfx |= 2;
247 						continue;
248 					}
249 					break;
250 				}
251 			}
252 		} else
253 			while (chartype[(int) *buf_ptr] == alphanum) {	/* copy it over */
254 				CHECK_SIZE_TOKEN;
255 				*e_token++ = *buf_ptr++;
256 				if (buf_ptr >= buf_end)
257 					fill_buffer();
258 			}
259 		*e_token++ = '\0';
260 		while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
261 			if (++buf_ptr >= buf_end)
262 				fill_buffer();
263 		}
264 		ps.its_a_keyword = false;
265 		ps.sizeof_keyword = false;
266 		if (l_struct) {	/* if last token was 'struct', then this token
267 				 * should be treated as a declaration */
268 			l_struct = false;
269 			last_code = ident;
270 			ps.last_u_d = true;
271 			return (decl);
272 		}
273 		ps.last_u_d = false;	/* Operator after indentifier is
274 					 * binary */
275 		last_code = ident;	/* Remember that this is the code we
276 					 * will return */
277 
278 		/*
279 		 * This loop will check if the token is a keyword.
280 		 */
281 		for (p = specials; (j = p->rwd) != 0; p++) {
282 			char   *pt = s_token;	/* point at scanned token */
283 			if (*j++ != *pt++ || *j++ != *pt++)
284 				continue;	/* This test depends on the
285 						 * fact that identifiers are
286 						 * always at least 1 character
287 						 * long (ie. the first two
288 						 * bytes of the identifier are
289 						 * always meaningful) */
290 			if (pt[-1] == 0)
291 				break;	/* If its a one-character identifier */
292 			while (*pt++ == *j)
293 				if (*j++ == 0)
294 					goto found_keyword;	/* I wish that C had a
295 								 * multi-level break... */
296 		}
297 		if (p->rwd) {	/* we have a keyword */
298 	found_keyword:
299 			ps.its_a_keyword = true;
300 			ps.last_u_d = true;
301 			switch (p->rwcode) {
302 			case 1:/* it is a switch */
303 				return (swstmt);
304 			case 2:/* a case or default */
305 				return (casestmt);
306 
307 			case 3:/* a "struct" */
308 				if (ps.p_l_follow)
309 					break;	/* inside parens: cast */
310 				l_struct = true;
311 
312 				/*
313 				 * Next time around, we will want to know that we have had a
314 				 * 'struct'
315 				 */
316 			case 4:/* one of the declaration keywords */
317 				if (ps.p_l_follow) {
318 					ps.cast_mask |= 1 << ps.p_l_follow;
319 					break;	/* inside parens: cast */
320 				}
321 				last_code = decl;
322 				return (decl);
323 
324 			case 5:/* if, while, for */
325 				return (sp_paren);
326 
327 			case 6:/* do, else */
328 				return (sp_nparen);
329 
330 			case 7:
331 				ps.sizeof_keyword = true;
332 			default:	/* all others are treated like any
333 					 * other identifier */
334 				return (ident);
335 			}	/* end of switch */
336 		}		/* end of if (found_it) */
337 		if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
338 			char   *tp = buf_ptr;
339 			while (tp < buf_end)
340 				if (*tp++ == ')' && (*tp == ';' || *tp == ','))
341 					goto not_proc;
342 			strncpy(ps.procname, token, sizeof ps.procname - 1);
343 			ps.in_parameter_declaration = 1;
344 			rparen_count = 1;
345 	not_proc:	;
346 		}
347 		/*
348 		 * The following hack attempts to guess whether or not the current
349 		 * token is in fact a declaration keyword -- one that has been
350 		 * typedefd
351 		 */
352 		if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
353 		    isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
354 		    && !ps.p_l_follow
355 		    && !ps.block_init
356 		    && (ps.last_token == rparen || ps.last_token == semicolon ||
357 			ps.last_token == decl ||
358 			ps.last_token == lbrace || ps.last_token == rbrace)) {
359 			ps.its_a_keyword = true;
360 			ps.last_u_d = true;
361 			last_code = decl;
362 			return decl;
363 		}
364 		if (last_code == decl)	/* if this is a declared variable,
365 					 * then following sign is unary */
366 			ps.last_u_d = true;	/* will make "int a -1" work */
367 		last_code = ident;
368 		return (ident);	/* the ident is not in the list */
369 	}			/* end of procesing for alpanum character */
370 	/* Scan a non-alphanumeric token */
371 	*e_token++ = *buf_ptr;	/* if it is only a one-character token, it is
372 				 * moved here */
373 	*e_token = '\0';
374 	if (++buf_ptr >= buf_end)
375 		fill_buffer();
376 
377 	switch (*token) {
378 	case '\n':
379 		unary_delim = ps.last_u_d;
380 		ps.last_nl = true;	/* remember that we just had a newline */
381 		code = (had_eof ? 0 : newline);
382 
383 		/*
384 		 * if data has been exausted, the newline is a dummy, and we should
385 		 * return code to stop
386 		 */
387 		break;
388 
389 	case '\'':		/* start of quoted character */
390 	case '"':		/* start of string */
391 		qchar = *token;
392 		if (troff) {
393 			e_token[-1] = '`';
394 			if (qchar == '"')
395 				*e_token++ = '`';
396 			e_token = chfont(&bodyf, &stringf, e_token);
397 		}
398 		do {		/* copy the string */
399 			while (1) {	/* move one character or
400 					 * [/<char>]<char> */
401 				if (*buf_ptr == '\n') {
402 					printf("%d: Unterminated literal\n", line_no);
403 					goto stop_lit;
404 				}
405 				CHECK_SIZE_TOKEN;	/* Only have to do this
406 							 * once in this loop,
407 							 * since CHECK_SIZE
408 							 * guarantees that there
409 							 * are at least 5
410 							 * entries left */
411 				*e_token = *buf_ptr++;
412 				if (buf_ptr >= buf_end)
413 					fill_buffer();
414 				if (*e_token == BACKSLASH) {	/* if escape, copy extra
415 								 * char */
416 					if (*buf_ptr == '\n')	/* check for escaped
417 								 * newline */
418 						++line_no;
419 					if (troff) {
420 						*++e_token = BACKSLASH;
421 						if (*buf_ptr == BACKSLASH)
422 							*++e_token = BACKSLASH;
423 					}
424 					*++e_token = *buf_ptr++;
425 					++e_token;	/* we must increment
426 							 * this again because we
427 							 * copied two chars */
428 					if (buf_ptr >= buf_end)
429 						fill_buffer();
430 				} else
431 					break;	/* we copied one character */
432 			}	/* end of while (1) */
433 		} while (*e_token++ != qchar);
434 		if (troff) {
435 			e_token = chfont(&stringf, &bodyf, e_token - 1);
436 			if (qchar == '"')
437 				*e_token++ = '\'';
438 		}
439 stop_lit:
440 		code = ident;
441 		break;
442 
443 	case ('('):
444 	case ('['):
445 		unary_delim = true;
446 		code = lparen;
447 		break;
448 
449 	case (')'):
450 	case (']'):
451 		code = rparen;
452 		break;
453 
454 	case '#':
455 		unary_delim = ps.last_u_d;
456 		code = preesc;
457 		break;
458 
459 	case '?':
460 		unary_delim = true;
461 		code = question;
462 		break;
463 
464 	case (':'):
465 		code = colon;
466 		unary_delim = true;
467 		break;
468 
469 	case (';'):
470 		unary_delim = true;
471 		code = semicolon;
472 		break;
473 
474 	case ('{'):
475 		unary_delim = true;
476 
477 		/*
478 		 * if (ps.in_or_st) ps.block_init = 1;
479 		 */
480 		/* ?	code = ps.block_init ? lparen : lbrace; */
481 		code = lbrace;
482 		break;
483 
484 	case ('}'):
485 		unary_delim = true;
486 		/* ?	code = ps.block_init ? rparen : rbrace; */
487 		code = rbrace;
488 		break;
489 
490 	case 014:		/* a form feed */
491 		unary_delim = ps.last_u_d;
492 		ps.last_nl = true;	/* remember this so we can set
493 					 * 'ps.col_1' right */
494 		code = form_feed;
495 		break;
496 
497 	case (','):
498 		unary_delim = true;
499 		code = comma;
500 		break;
501 
502 	case '.':
503 		unary_delim = false;
504 		code = period;
505 		break;
506 
507 	case '-':
508 	case '+':		/* check for -, +, --, ++ */
509 		code = (ps.last_u_d ? unary_op : binary_op);
510 		unary_delim = true;
511 
512 		if (*buf_ptr == token[0]) {
513 			/* check for doubled character */
514 			*e_token++ = *buf_ptr++;
515 			/* buffer overflow will be checked at end of loop */
516 			if (last_code == ident || last_code == rparen) {
517 				code = (ps.last_u_d ? unary_op : postop);
518 				/* check for following ++ or -- */
519 				unary_delim = false;
520 			}
521 		} else
522 			if (*buf_ptr == '=')
523 				/* check for operator += */
524 				*e_token++ = *buf_ptr++;
525 			else
526 				if (*buf_ptr == '>') {
527 					/* check for operator -> */
528 					*e_token++ = *buf_ptr++;
529 					if (!pointer_as_binop) {
530 						unary_delim = false;
531 						code = unary_op;
532 						ps.want_blank = false;
533 					}
534 				}
535 		break;		/* buffer overflow will be checked at end of
536 				 * switch */
537 
538 	case '=':
539 		if (ps.in_or_st)
540 			ps.block_init = 1;
541 #ifdef undef
542 		if (chartype[*buf_ptr] == opchar) {	/* we have two char
543 							 * assignment */
544 			e_token[-1] = *buf_ptr++;
545 			if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
546 				*e_token++ = *buf_ptr++;
547 			*e_token++ = '=';	/* Flip =+ to += */
548 			*e_token = 0;
549 		}
550 #else
551 		if (*buf_ptr == '=') {	/* == */
552 			*e_token++ = '=';	/* Flip =+ to += */
553 			buf_ptr++;
554 			*e_token = 0;
555 		}
556 #endif
557 		code = binary_op;
558 		unary_delim = true;
559 		break;
560 		/* can drop thru!!! */
561 
562 	case '>':
563 	case '<':
564 	case '!':		/* ops like <, <<, <=, !=, etc */
565 		if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
566 			*e_token++ = *buf_ptr;
567 			if (++buf_ptr >= buf_end)
568 				fill_buffer();
569 		}
570 		if (*buf_ptr == '=')
571 			*e_token++ = *buf_ptr++;
572 		code = (ps.last_u_d ? unary_op : binary_op);
573 		unary_delim = true;
574 		break;
575 
576 	default:
577 		if (token[0] == '/' && *buf_ptr == '*') {
578 			/* it is start of comment */
579 			*e_token++ = '*';
580 
581 			if (++buf_ptr >= buf_end)
582 				fill_buffer();
583 
584 			code = comment;
585 			unary_delim = ps.last_u_d;
586 			break;
587 		}
588 		while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
589 			/*
590 		         * handle ||, &&, etc, and also things as in int *****i
591 		         */
592 			*e_token++ = *buf_ptr;
593 			if (++buf_ptr >= buf_end)
594 				fill_buffer();
595 		}
596 		code = (ps.last_u_d ? unary_op : binary_op);
597 		unary_delim = true;
598 
599 
600 	}			/* end of switch */
601 	if (code != newline) {
602 		l_struct = false;
603 		last_code = code;
604 	}
605 	if (buf_ptr >= buf_end)	/* check for input buffer empty */
606 		fill_buffer();
607 	ps.last_u_d = unary_delim;
608 	*e_token = '\0';	/* null terminate the token */
609 	return (code);
610 }
611 /*
612  * Add the given keyword to the keyword table, using val as the keyword type
613  */
614 void
615 addkey(char *key, int val)
616 {
617 	struct templ *p = specials;
618 	while (p->rwd)
619 		if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
620 			return;
621 		else
622 			p++;
623 	if (p >= specials + sizeof specials / sizeof specials[0])
624 		return;		/* For now, table overflows are silently
625 				 * ignored */
626 	p->rwd = key;
627 	p->rwcode = val;
628 	p[1].rwd = 0;
629 	p[1].rwcode = 0;
630 }
631