xref: /netbsd/usr.bin/indent/lexi.c (revision bf9ec67e)
1 /*	$NetBSD: lexi.c,v 1.11 2002/05/26 22:53:38 wiz Exp $	*/
2 
3 /*
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
7  * Copyright (c) 1985 Sun Microsystems, Inc.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #include <sys/cdefs.h>
40 #ifndef lint
41 #if 0
42 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
43 #else
44 __RCSID("$NetBSD: lexi.c,v 1.11 2002/05/26 22:53:38 wiz Exp $");
45 #endif
46 #endif				/* not lint */
47 
48 /*
49  * Here we have the token scanner for indent.  It scans off one token and puts
50  * it in the global variable "token".  It returns a code, indicating the type
51  * of token scanned.
52  */
53 
54 #include <stdio.h>
55 #include <ctype.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include "indent_globs.h"
59 #include "indent_codes.h"
60 
/*
 * Character classes stored in chartype[]; every other character has class 0.
 */
#define alphanum 1		/* letters, digits, '_' and '$' */
#define opchar 3		/* characters that can appear in operators */
63 
/*
 * One entry of the keyword table: the keyword's spelling and the class code
 * that lexi() switches on when the scanned token matches it.
 */
struct templ {
	char   *rwd;		/* keyword text; a null pointer ends the table */
	int     rwcode;		/* keyword class, see the list above specials[] */
};

/*
 * Keyword table searched by lexi() and extended at run time by addkey().
 * The list ends at the first entry whose rwd is null; the array is
 * oversized so that addkey() has room to append.
 *
 * rwcode values as handled in lexi():
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 *	other	treated like an ordinary identifier
 */
struct templ specials[1000] =
{
	{"switch", 1},
	{"case", 2},
	{"break", 0},
	{"struct", 3},
	{"union", 3},
	{"enum", 3},
	{"default", 2},
	{"int", 4},
	{"char", 4},
	{"float", 4},
	{"double", 4},
	{"long", 4},
	{"short", 4},
	{"typedef", 4},		/* was misspelled "typdef" and never matched */
	{"unsigned", 4},
	{"register", 4},
	{"static", 4},
	{"global", 4},
	{"extern", 4},
	{"void", 4},
	{"goto", 0},
	{"return", 0},
	{"if", 5},
	{"while", 5},
	{"for", 5},
	{"else", 6},
	{"do", 6},
	{"sizeof", 7},
	{0, 0}
};
101 
/*
 * Classification of each 7-bit ASCII character, indexed by character code:
 * 1 for characters that may appear in identifiers and numbers, 3 for
 * operator characters, 0 for everything else.
 */
char    chartype[128] =
{
	/* 0x00 - 0x0f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 - 0x1f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
	0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
	/* 0x30 - 0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
	/* 0x40 - 0x4f: @ A B C D E F G H I J K L M N O */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 - 0x5f: P Q R S T U V W X Y Z [ \ ] ^ _ */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
	/* 0x60 - 0x6f: ` a b c d e f g h i j k l m n o */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 - 0x7f: p q r s t u v w x y z { | } ~ DEL */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
123 
124 
125 
126 
127 int
128 lexi(void)
129 {
130 	int     unary_delim;	/* this is set to 1 if the current token
131 				 *
132 				 * forces a following operator to be unary */
133 	static int last_code;	/* the last token type returned */
134 	static int l_struct;	/* set to 1 if the last token was 'struct' */
135 	int     code;		/* internal code to be returned */
136 	char    qchar;		/* the delimiter character for a string */
137 
138 	e_token = s_token;	/* point to start of place to save token */
139 	unary_delim = false;
140 	ps.col_1 = ps.last_nl;	/* tell world that this token started in
141 				 * column 1 iff the last thing scanned was nl */
142 	ps.last_nl = false;
143 
144 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
145 		ps.col_1 = false;	/* leading blanks imply token is not
146 					 * in column 1 */
147 		if (++buf_ptr >= buf_end)
148 			fill_buffer();
149 	}
150 
151 	/* Scan an alphanumeric token */
152 	if (chartype[(int) *buf_ptr] == alphanum ||
153 	    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
154 		/*
155 		 * we have a character or number
156 		 */
157 		char   *j;	/* used for searching thru list of
158 				 *
159 				 * reserved words */
160 		struct templ *p;
161 
162 		if (isdigit((unsigned char)*buf_ptr) ||
163 		    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
164 			int     seendot = 0, seenexp = 0, seensfx = 0;
165 			if (*buf_ptr == '0' &&
166 			    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
167 				*e_token++ = *buf_ptr++;
168 				*e_token++ = *buf_ptr++;
169 				while (isxdigit((unsigned char)*buf_ptr)) {
170 					CHECK_SIZE_TOKEN;
171 					*e_token++ = *buf_ptr++;
172 				}
173 			} else {
174 				while (1) {
175 					if (*buf_ptr == '.') {
176 						if (seendot)
177 							break;
178 						else
179 							seendot++;
180 					}
181 					CHECK_SIZE_TOKEN;
182 					*e_token++ = *buf_ptr++;
183 					if (!isdigit((unsigned char)*buf_ptr)
184 					&& *buf_ptr != '.') {
185 						if ((*buf_ptr != 'E'
186 						&& *buf_ptr != 'e') || seenexp)
187 							break;
188 						else {
189 							seenexp++;
190 							seendot++;
191 							CHECK_SIZE_TOKEN;
192 							*e_token++ = *buf_ptr++;
193 							if (*buf_ptr == '+' || *buf_ptr == '-')
194 								*e_token++ = *buf_ptr++;
195 						}
196 					}
197 				}
198 			}
199 			if (*buf_ptr == 'F' || *buf_ptr == 'f') {
200 				/* float constant */
201 				*e_token++ = *buf_ptr++;
202 			} else {
203 				/* integer constant */
204 				while (1) {
205 					if (!(seensfx & 1) &&
206 					    (*buf_ptr == 'U' ||
207 					     *buf_ptr == 'u')) {
208 						CHECK_SIZE_TOKEN;
209 						*e_token++ = *buf_ptr++;
210 						seensfx |= 1;
211 						continue;
212 					}
213 					if (!(seensfx & 2) &&
214 					    (*buf_ptr == 'L' ||
215 					     *buf_ptr == 'l')) {
216 						CHECK_SIZE_TOKEN;
217 						if (buf_ptr[1] == buf_ptr[0])
218 							*e_token++ = *buf_ptr++;
219 						*e_token++ = *buf_ptr++;
220 						seensfx |= 2;
221 						continue;
222 					}
223 					break;
224 				}
225 			}
226 		} else
227 			while (chartype[(int) *buf_ptr] == alphanum) {	/* copy it over */
228 				CHECK_SIZE_TOKEN;
229 				*e_token++ = *buf_ptr++;
230 				if (buf_ptr >= buf_end)
231 					fill_buffer();
232 			}
233 		*e_token++ = '\0';
234 		while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
235 			if (++buf_ptr >= buf_end)
236 				fill_buffer();
237 		}
238 		ps.its_a_keyword = false;
239 		ps.sizeof_keyword = false;
240 		if (l_struct) {	/* if last token was 'struct', then this token
241 				 * should be treated as a declaration */
242 			l_struct = false;
243 			last_code = ident;
244 			ps.last_u_d = true;
245 			return (decl);
246 		}
247 		ps.last_u_d = false;	/* Operator after indentifier is
248 					 * binary */
249 		last_code = ident;	/* Remember that this is the code we
250 					 * will return */
251 
252 		/*
253 		 * This loop will check if the token is a keyword.
254 		 */
255 		for (p = specials; (j = p->rwd) != 0; p++) {
256 			char   *p = s_token;	/* point at scanned token */
257 			if (*j++ != *p++ || *j++ != *p++)
258 				continue;	/* This test depends on the
259 						 * fact that identifiers are
260 						 * always at least 1 character
261 						 * long (ie. the first two
262 						 * bytes of the identifier are
263 						 * always meaningful) */
264 			if (p[-1] == 0)
265 				break;	/* If its a one-character identifier */
266 			while (*p++ == *j)
267 				if (*j++ == 0)
268 					goto found_keyword;	/* I wish that C had a
269 								 * multi-level break... */
270 		}
271 		if (p->rwd) {	/* we have a keyword */
272 	found_keyword:
273 			ps.its_a_keyword = true;
274 			ps.last_u_d = true;
275 			switch (p->rwcode) {
276 			case 1:/* it is a switch */
277 				return (swstmt);
278 			case 2:/* a case or default */
279 				return (casestmt);
280 
281 			case 3:/* a "struct" */
282 				if (ps.p_l_follow)
283 					break;	/* inside parens: cast */
284 				l_struct = true;
285 
286 				/*
287 				 * Next time around, we will want to know that we have had a
288 				 * 'struct'
289 				 */
290 			case 4:/* one of the declaration keywords */
291 				if (ps.p_l_follow) {
292 					ps.cast_mask |= 1 << ps.p_l_follow;
293 					break;	/* inside parens: cast */
294 				}
295 				last_code = decl;
296 				return (decl);
297 
298 			case 5:/* if, while, for */
299 				return (sp_paren);
300 
301 			case 6:/* do, else */
302 				return (sp_nparen);
303 
304 			case 7:
305 				ps.sizeof_keyword = true;
306 			default:	/* all others are treated like any
307 					 * other identifier */
308 				return (ident);
309 			}	/* end of switch */
310 		}		/* end of if (found_it) */
311 		if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
312 			char   *tp = buf_ptr;
313 			while (tp < buf_end)
314 				if (*tp++ == ')' && (*tp == ';' || *tp == ','))
315 					goto not_proc;
316 			strncpy(ps.procname, token, sizeof ps.procname - 1);
317 			ps.in_parameter_declaration = 1;
318 			rparen_count = 1;
319 	not_proc:	;
320 		}
321 		/*
322 		 * The following hack attempts to guess whether or not the current
323 		 * token is in fact a declaration keyword -- one that has been
324 		 * typedefd
325 		 */
326 		if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
327 		    isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
328 		    && !ps.p_l_follow
329 		    && !ps.block_init
330 		    && (ps.last_token == rparen || ps.last_token == semicolon ||
331 			ps.last_token == decl ||
332 			ps.last_token == lbrace || ps.last_token == rbrace)) {
333 			ps.its_a_keyword = true;
334 			ps.last_u_d = true;
335 			last_code = decl;
336 			return decl;
337 		}
338 		if (last_code == decl)	/* if this is a declared variable,
339 					 * then following sign is unary */
340 			ps.last_u_d = true;	/* will make "int a -1" work */
341 		last_code = ident;
342 		return (ident);	/* the ident is not in the list */
343 	}			/* end of procesing for alpanum character */
344 	/* Scan a non-alphanumeric token */
345 	*e_token++ = *buf_ptr;	/* if it is only a one-character token, it is
346 				 * moved here */
347 	*e_token = '\0';
348 	if (++buf_ptr >= buf_end)
349 		fill_buffer();
350 
351 	switch (*token) {
352 	case '\n':
353 		unary_delim = ps.last_u_d;
354 		ps.last_nl = true;	/* remember that we just had a newline */
355 		code = (had_eof ? 0 : newline);
356 
357 		/*
358 		 * if data has been exausted, the newline is a dummy, and we should
359 		 * return code to stop
360 		 */
361 		break;
362 
363 	case '\'':		/* start of quoted character */
364 	case '"':		/* start of string */
365 		qchar = *token;
366 		if (troff) {
367 			e_token[-1] = '`';
368 			if (qchar == '"')
369 				*e_token++ = '`';
370 			e_token = chfont(&bodyf, &stringf, e_token);
371 		}
372 		do {		/* copy the string */
373 			while (1) {	/* move one character or
374 					 * [/<char>]<char> */
375 				if (*buf_ptr == '\n') {
376 					printf("%d: Unterminated literal\n", line_no);
377 					goto stop_lit;
378 				}
379 				CHECK_SIZE_TOKEN;	/* Only have to do this
380 							 * once in this loop,
381 							 * since CHECK_SIZE
382 							 * guarantees that there
383 							 * are at least 5
384 							 * entries left */
385 				*e_token = *buf_ptr++;
386 				if (buf_ptr >= buf_end)
387 					fill_buffer();
388 				if (*e_token == BACKSLASH) {	/* if escape, copy extra
389 								 * char */
390 					if (*buf_ptr == '\n')	/* check for escaped
391 								 * newline */
392 						++line_no;
393 					if (troff) {
394 						*++e_token = BACKSLASH;
395 						if (*buf_ptr == BACKSLASH)
396 							*++e_token = BACKSLASH;
397 					}
398 					*++e_token = *buf_ptr++;
399 					++e_token;	/* we must increment
400 							 * this again because we
401 							 * copied two chars */
402 					if (buf_ptr >= buf_end)
403 						fill_buffer();
404 				} else
405 					break;	/* we copied one character */
406 			}	/* end of while (1) */
407 		} while (*e_token++ != qchar);
408 		if (troff) {
409 			e_token = chfont(&stringf, &bodyf, e_token - 1);
410 			if (qchar == '"')
411 				*e_token++ = '\'';
412 		}
413 stop_lit:
414 		code = ident;
415 		break;
416 
417 	case ('('):
418 	case ('['):
419 		unary_delim = true;
420 		code = lparen;
421 		break;
422 
423 	case (')'):
424 	case (']'):
425 		code = rparen;
426 		break;
427 
428 	case '#':
429 		unary_delim = ps.last_u_d;
430 		code = preesc;
431 		break;
432 
433 	case '?':
434 		unary_delim = true;
435 		code = question;
436 		break;
437 
438 	case (':'):
439 		code = colon;
440 		unary_delim = true;
441 		break;
442 
443 	case (';'):
444 		unary_delim = true;
445 		code = semicolon;
446 		break;
447 
448 	case ('{'):
449 		unary_delim = true;
450 
451 		/*
452 		 * if (ps.in_or_st) ps.block_init = 1;
453 		 */
454 		/* ?	code = ps.block_init ? lparen : lbrace; */
455 		code = lbrace;
456 		break;
457 
458 	case ('}'):
459 		unary_delim = true;
460 		/* ?	code = ps.block_init ? rparen : rbrace; */
461 		code = rbrace;
462 		break;
463 
464 	case 014:		/* a form feed */
465 		unary_delim = ps.last_u_d;
466 		ps.last_nl = true;	/* remember this so we can set
467 					 * 'ps.col_1' right */
468 		code = form_feed;
469 		break;
470 
471 	case (','):
472 		unary_delim = true;
473 		code = comma;
474 		break;
475 
476 	case '.':
477 		unary_delim = false;
478 		code = period;
479 		break;
480 
481 	case '-':
482 	case '+':		/* check for -, +, --, ++ */
483 		code = (ps.last_u_d ? unary_op : binary_op);
484 		unary_delim = true;
485 
486 		if (*buf_ptr == token[0]) {
487 			/* check for doubled character */
488 			*e_token++ = *buf_ptr++;
489 			/* buffer overflow will be checked at end of loop */
490 			if (last_code == ident || last_code == rparen) {
491 				code = (ps.last_u_d ? unary_op : postop);
492 				/* check for following ++ or -- */
493 				unary_delim = false;
494 			}
495 		} else
496 			if (*buf_ptr == '=')
497 				/* check for operator += */
498 				*e_token++ = *buf_ptr++;
499 			else
500 				if (*buf_ptr == '>') {
501 					/* check for operator -> */
502 					*e_token++ = *buf_ptr++;
503 					if (!pointer_as_binop) {
504 						unary_delim = false;
505 						code = unary_op;
506 						ps.want_blank = false;
507 					}
508 				}
509 		break;		/* buffer overflow will be checked at end of
510 				 * switch */
511 
512 	case '=':
513 		if (ps.in_or_st)
514 			ps.block_init = 1;
515 #ifdef undef
516 		if (chartype[*buf_ptr] == opchar) {	/* we have two char
517 							 * assignment */
518 			e_token[-1] = *buf_ptr++;
519 			if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
520 				*e_token++ = *buf_ptr++;
521 			*e_token++ = '=';	/* Flip =+ to += */
522 			*e_token = 0;
523 		}
524 #else
525 		if (*buf_ptr == '=') {	/* == */
526 			*e_token++ = '=';	/* Flip =+ to += */
527 			buf_ptr++;
528 			*e_token = 0;
529 		}
530 #endif
531 		code = binary_op;
532 		unary_delim = true;
533 		break;
534 		/* can drop thru!!! */
535 
536 	case '>':
537 	case '<':
538 	case '!':		/* ops like <, <<, <=, !=, etc */
539 		if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
540 			*e_token++ = *buf_ptr;
541 			if (++buf_ptr >= buf_end)
542 				fill_buffer();
543 		}
544 		if (*buf_ptr == '=')
545 			*e_token++ = *buf_ptr++;
546 		code = (ps.last_u_d ? unary_op : binary_op);
547 		unary_delim = true;
548 		break;
549 
550 	default:
551 		if (token[0] == '/' && *buf_ptr == '*') {
552 			/* it is start of comment */
553 			*e_token++ = '*';
554 
555 			if (++buf_ptr >= buf_end)
556 				fill_buffer();
557 
558 			code = comment;
559 			unary_delim = ps.last_u_d;
560 			break;
561 		}
562 		while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
563 			/*
564 		         * handle ||, &&, etc, and also things as in int *****i
565 		         */
566 			*e_token++ = *buf_ptr;
567 			if (++buf_ptr >= buf_end)
568 				fill_buffer();
569 		}
570 		code = (ps.last_u_d ? unary_op : binary_op);
571 		unary_delim = true;
572 
573 
574 	}			/* end of switch */
575 	if (code != newline) {
576 		l_struct = false;
577 		last_code = code;
578 	}
579 	if (buf_ptr >= buf_end)	/* check for input buffer empty */
580 		fill_buffer();
581 	ps.last_u_d = unary_delim;
582 	*e_token = '\0';	/* null terminate the token */
583 	return (code);
584 }
585 /*
586  * Add the given keyword to the keyword table, using val as the keyword type
587  */
588 void
589 addkey(char *key, int val)
590 {
591 	struct templ *p = specials;
592 	while (p->rwd)
593 		if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
594 			return;
595 		else
596 			p++;
597 	if (p >= specials + sizeof specials / sizeof specials[0])
598 		return;		/* For now, table overflows are silently
599 				 * ignored */
600 	p->rwd = key;
601 	p->rwcode = val;
602 	p[1].rwd = 0;
603 	p[1].rwcode = 0;
604 }
605