xref: /freebsd/usr.bin/indent/lexi.c (revision 076ad2f8)
1 /*-
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #if 0
37 #ifndef lint
38 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
39 #endif /* not lint */
40 #endif
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 /*
45  * Here we have the token scanner for indent.  It scans off one token and puts
46  * it in the global variable "token".  It returns a code, indicating the type
47  * of token scanned.
48  */
49 
50 #include <err.h>
51 #include <stdio.h>
52 #include <ctype.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include "indent_globs.h"
56 #include "indent_codes.h"
57 #include "indent.h"
58 
/*
 * Character classes stored in chartype[].  lexi() tests for `alphanum' to
 * recognise the characters that may start or continue an identifier or a
 * number.
 */
#define alphanum 1
#ifdef undef
/* Operator class; referenced only by code that is compiled out. */
#define opchar 3
#endif
63 
/*
 * One reserved word together with the code lexi() stores in ps.keyword and
 * dispatches on.
 */
struct templ {
    const char *rwd;		/* spelling of the reserved word */
    int         rwcode;		/* class code of the reserved word */
};

/*
 * Reserved words known to the scanner.
 *
 * The entries must stay in strcmp() order because lexi() looks words up with
 * bsearch().  The comparison callback reads the key string through a pointer
 * to the element, so `rwd' has to remain the first member of struct templ.
 */
struct templ specials[] =
{
    {"auto", 10},		/* storage class specifier */
    {"break", 9},		/* returned as a plain identifier */
    {"case", 8},		/* case label */
    {"char", 4},		/* declaration keyword */
    {"const", 4},		/* declaration keyword */
    {"default", 8},		/* case label */
    {"do", 6},			/* keyword not followed by parentheses */
    {"double", 4},		/* declaration keyword */
    {"else", 6},		/* keyword not followed by parentheses */
    {"enum", 3},		/* struct-like keyword */
    {"extern", 10},		/* storage class specifier */
    {"float", 4},		/* declaration keyword */
    {"for", 5},			/* keyword followed by parentheses */
    {"global", 4},		/* declaration keyword */
    {"goto", 9},		/* returned as a plain identifier */
    {"if", 5},			/* keyword followed by parentheses */
    {"int", 4},			/* declaration keyword */
    {"long", 4},		/* declaration keyword */
    {"offsetof", 1},		/* returned as a plain identifier */
    {"register", 10},		/* storage class specifier */
    {"return", 9},		/* returned as a plain identifier */
    {"short", 4},		/* declaration keyword */
    {"sizeof", 2},		/* returned as a plain identifier */
    {"static", 10},		/* storage class specifier */
    {"struct", 3},		/* struct-like keyword */
    {"switch", 7},		/* switch statement */
    {"typedef", 10},		/* storage class specifier */
    {"union", 3},		/* struct-like keyword */
    {"unsigned", 4},		/* declaration keyword */
    {"void", 4},		/* declaration keyword */
    {"volatile", 4},		/* declaration keyword */
    {"while", 5}		/* keyword followed by parentheses */
};
108 
/*
 * Table of additional type names, kept sorted by add_typename() so that
 * lexi() can search it with bsearch().
 */
const char **typenames;		/* the table itself */
int         typename_count;	/* number of slots allocated in typenames */
int         typename_top = -1;	/* index of the last slot in use, -1 when
				 * the table is empty */
112 
/*
 * Character class table indexed by 7-bit character code: 1 marks characters
 * that may appear in identifiers and numbers (alphanum), 3 marks operator
 * characters, 0 is everything else.  The per-row comments list the
 * characters in ASCII order.
 */
char        chartype[128] =
{
    /* 0x00-0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10-0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20-0x2f: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30-0x3f: 0-9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40-0x4f: @ A-O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50-0x5f: P-Z [ backslash ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60-0x6f: ` a-o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70-0x7f: p-z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
134 
/*
 * bsearch() comparison callback.
 *
 * e1 is the search key, a plain C string.  e2 points at a table element
 * whose first bytes hold a `const char *' (an entry of typenames[] or the
 * leading `rwd' member of a struct templ).  Returns the strcmp() ordering of
 * the key against that string.
 */
static int
strcmp_type(const void *e1, const void *e2)
{
    const char *key = e1;
    const char *const *entry = e2;

    return (strcmp(key, *entry));
}
140 
141 int
142 lexi(void)
143 {
144     int         unary_delim;	/* this is set to 1 if the current token
145 				 * forces a following operator to be unary */
146     static int  last_code;	/* the last token type returned */
147     static int  l_struct;	/* set to 1 if the last token was 'struct' */
148     int         code;		/* internal code to be returned */
149     char        qchar;		/* the delimiter character for a string */
150 
151     e_token = s_token;		/* point to start of place to save token */
152     unary_delim = false;
153     ps.col_1 = ps.last_nl;	/* tell world that this token started in
154 				 * column 1 iff the last thing scanned was nl */
155     ps.last_nl = false;
156 
157     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
158 	ps.col_1 = false;	/* leading blanks imply token is not in column
159 				 * 1 */
160 	if (++buf_ptr >= buf_end)
161 	    fill_buffer();
162     }
163 
164     /* Scan an alphanumeric token */
165     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
166 	/*
167 	 * we have a character or number
168 	 */
169 	struct templ *p;
170 
171 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
172 	    int         seendot = 0,
173 	                seenexp = 0,
174 			seensfx = 0;
175 	    if (*buf_ptr == '0' &&
176 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
177 		*e_token++ = *buf_ptr++;
178 		*e_token++ = *buf_ptr++;
179 		while (isxdigit(*buf_ptr)) {
180 		    CHECK_SIZE_TOKEN;
181 		    *e_token++ = *buf_ptr++;
182 		}
183 	    }
184 	    else
185 		while (1) {
186 		    if (*buf_ptr == '.') {
187 			if (seendot)
188 			    break;
189 			else
190 			    seendot++;
191 		    }
192 		    CHECK_SIZE_TOKEN;
193 		    *e_token++ = *buf_ptr++;
194 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
195 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
196 			    break;
197 			else {
198 			    seenexp++;
199 			    seendot++;
200 			    CHECK_SIZE_TOKEN;
201 			    *e_token++ = *buf_ptr++;
202 			    if (*buf_ptr == '+' || *buf_ptr == '-')
203 				*e_token++ = *buf_ptr++;
204 			}
205 		    }
206 		}
207 	    while (1) {
208 		if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
209 		    CHECK_SIZE_TOKEN;
210 		    *e_token++ = *buf_ptr++;
211 		    seensfx |= 1;
212 		    continue;
213 		}
214 		if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
215 		    CHECK_SIZE_TOKEN;
216 		    if (buf_ptr[1] == buf_ptr[0])
217 		        *e_token++ = *buf_ptr++;
218 		    *e_token++ = *buf_ptr++;
219 		    seensfx |= 2;
220 		    continue;
221 		}
222 		break;
223 	    }
224 	}
225 	else
226 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
227 		/* fill_buffer() terminates buffer with newline */
228 		if (*buf_ptr == BACKSLASH) {
229 		    if (*(buf_ptr + 1) == '\n') {
230 			buf_ptr += 2;
231 			if (buf_ptr >= buf_end)
232 			    fill_buffer();
233 			} else
234 			    break;
235 		}
236 		CHECK_SIZE_TOKEN;
237 		/* copy it over */
238 		*e_token++ = *buf_ptr++;
239 		if (buf_ptr >= buf_end)
240 		    fill_buffer();
241 	    }
242 	*e_token++ = '\0';
243 
244 	if (s_token[0] == 'L' && s_token[1] == '\0' &&
245 	      (*buf_ptr == '"' || *buf_ptr == '\''))
246 	    return (strpfx);
247 
248 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
249 	    if (++buf_ptr >= buf_end)
250 		fill_buffer();
251 	}
252 	ps.keyword = 0;
253 	if (l_struct && !ps.p_l_follow) {
254 				/* if last token was 'struct' and we're not
255 				 * in parentheses, then this token
256 				 * should be treated as a declaration */
257 	    l_struct = false;
258 	    last_code = ident;
259 	    ps.last_u_d = true;
260 	    return (decl);
261 	}
262 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
263 				 * unless last token was 'struct' */
264 	l_struct = false;
265 	last_code = ident;	/* Remember that this is the code we will
266 				 * return */
267 
268 	p = bsearch(s_token,
269 	    specials,
270 	    sizeof(specials) / sizeof(specials[0]),
271 	    sizeof(specials[0]),
272 	    strcmp_type);
273 	if (p == NULL) {	/* not a special keyword... */
274 	    char *u;
275 
276 	    /* ... so maybe a type_t or a typedef */
277 	    if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
278 	        strcmp(u, "_t") == 0) || (typename_top >= 0 &&
279 		  bsearch(s_token, typenames, typename_top + 1,
280 		    sizeof(typenames[0]), strcmp_type))) {
281 		ps.keyword = 4;	/* a type name */
282 		ps.last_u_d = true;
283 	        goto found_typename;
284 	    }
285 	} else {			/* we have a keyword */
286 	    ps.keyword = p->rwcode;
287 	    ps.last_u_d = true;
288 	    switch (p->rwcode) {
289 	    case 7:		/* it is a switch */
290 		return (swstmt);
291 	    case 8:		/* a case or default */
292 		return (casestmt);
293 
294 	    case 3:		/* a "struct" */
295 		/*
296 		 * Next time around, we will want to know that we have had a
297 		 * 'struct'
298 		 */
299 		l_struct = true;
300 		/* FALLTHROUGH */
301 
302 	    case 4:		/* one of the declaration keywords */
303 	    found_typename:
304 		if (ps.p_l_follow) {
305 		    /* inside parens: cast, param list, offsetof or sizeof */
306 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
307 		    break;
308 		}
309 		last_code = decl;
310 		return (decl);
311 
312 	    case 5:		/* if, while, for */
313 		return (sp_paren);
314 
315 	    case 6:		/* do, else */
316 		return (sp_nparen);
317 
318 	    case 10:		/* storage class specifier */
319 		return (storage);
320 
321 	    default:		/* all others are treated like any other
322 				 * identifier */
323 		return (ident);
324 	    }			/* end of switch */
325 	}			/* end of if (found_it) */
326 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
327 	    char *tp = buf_ptr;
328 	    while (tp < buf_end)
329 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
330 		    goto not_proc;
331 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
332 	    if (ps.in_decl)
333 		ps.in_parameter_declaration = 1;
334 	    rparen_count = 1;
335     not_proc:;
336 	}
337 	/*
338 	 * The following hack attempts to guess whether or not the current
339 	 * token is in fact a declaration keyword -- one that has been
340 	 * typedefd
341 	 */
342 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
343 		&& !ps.p_l_follow
344 	        && !ps.block_init
345 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
346 		    ps.last_token == decl ||
347 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
348 	    ps.keyword = 4;	/* a type name */
349 	    ps.last_u_d = true;
350 	    last_code = decl;
351 	    return decl;
352 	}
353 	if (last_code == decl)	/* if this is a declared variable, then
354 				 * following sign is unary */
355 	    ps.last_u_d = true;	/* will make "int a -1" work */
356 	last_code = ident;
357 	return (ident);		/* the ident is not in the list */
358     }				/* end of procesing for alpanum character */
359 
360     /* Scan a non-alphanumeric token */
361 
362     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
363 				 * moved here */
364     *e_token = '\0';
365     if (++buf_ptr >= buf_end)
366 	fill_buffer();
367 
368     switch (*token) {
369     case '\n':
370 	unary_delim = ps.last_u_d;
371 	ps.last_nl = true;	/* remember that we just had a newline */
372 	code = (had_eof ? 0 : newline);
373 
374 	/*
375 	 * if data has been exhausted, the newline is a dummy, and we should
376 	 * return code to stop
377 	 */
378 	break;
379 
380     case '\'':			/* start of quoted character */
381     case '"':			/* start of string */
382 	qchar = *token;
383 	if (troff) {
384 	    e_token[-1] = '`';
385 	    if (qchar == '"')
386 		*e_token++ = '`';
387 	    e_token = chfont(&bodyf, &stringf, e_token);
388 	}
389 	do {			/* copy the string */
390 	    while (1) {		/* move one character or [/<char>]<char> */
391 		if (*buf_ptr == '\n') {
392 		    diag2(1, "Unterminated literal");
393 		    goto stop_lit;
394 		}
395 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
396 					 * since CHECK_SIZE guarantees that there
397 					 * are at least 5 entries left */
398 		*e_token = *buf_ptr++;
399 		if (buf_ptr >= buf_end)
400 		    fill_buffer();
401 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
402 		    if (*buf_ptr == '\n')	/* check for escaped newline */
403 			++line_no;
404 		    if (troff) {
405 			*++e_token = BACKSLASH;
406 			if (*buf_ptr == BACKSLASH)
407 			    *++e_token = BACKSLASH;
408 		    }
409 		    *++e_token = *buf_ptr++;
410 		    ++e_token;	/* we must increment this again because we
411 				 * copied two chars */
412 		    if (buf_ptr >= buf_end)
413 			fill_buffer();
414 		}
415 		else
416 		    break;	/* we copied one character */
417 	    }			/* end of while (1) */
418 	} while (*e_token++ != qchar);
419 	if (troff) {
420 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
421 	    if (qchar == '"')
422 		*e_token++ = '\'';
423 	}
424 stop_lit:
425 	code = ident;
426 	break;
427 
428     case ('('):
429     case ('['):
430 	unary_delim = true;
431 	code = lparen;
432 	break;
433 
434     case (')'):
435     case (']'):
436 	code = rparen;
437 	break;
438 
439     case '#':
440 	unary_delim = ps.last_u_d;
441 	code = preesc;
442 	break;
443 
444     case '?':
445 	unary_delim = true;
446 	code = question;
447 	break;
448 
449     case (':'):
450 	code = colon;
451 	unary_delim = true;
452 	break;
453 
454     case (';'):
455 	unary_delim = true;
456 	code = semicolon;
457 	break;
458 
459     case ('{'):
460 	unary_delim = true;
461 
462 	/*
463 	 * if (ps.in_or_st) ps.block_init = 1;
464 	 */
465 	/* ?	code = ps.block_init ? lparen : lbrace; */
466 	code = lbrace;
467 	break;
468 
469     case ('}'):
470 	unary_delim = true;
471 	/* ?	code = ps.block_init ? rparen : rbrace; */
472 	code = rbrace;
473 	break;
474 
475     case 014:			/* a form feed */
476 	unary_delim = ps.last_u_d;
477 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
478 				 * right */
479 	code = form_feed;
480 	break;
481 
482     case (','):
483 	unary_delim = true;
484 	code = comma;
485 	break;
486 
487     case '.':
488 	unary_delim = false;
489 	code = period;
490 	break;
491 
492     case '-':
493     case '+':			/* check for -, +, --, ++ */
494 	code = (ps.last_u_d ? unary_op : binary_op);
495 	unary_delim = true;
496 
497 	if (*buf_ptr == token[0]) {
498 	    /* check for doubled character */
499 	    *e_token++ = *buf_ptr++;
500 	    /* buffer overflow will be checked at end of loop */
501 	    if (last_code == ident || last_code == rparen) {
502 		code = (ps.last_u_d ? unary_op : postop);
503 		/* check for following ++ or -- */
504 		unary_delim = false;
505 	    }
506 	}
507 	else if (*buf_ptr == '=')
508 	    /* check for operator += */
509 	    *e_token++ = *buf_ptr++;
510 	else if (*buf_ptr == '>') {
511 	    /* check for operator -> */
512 	    *e_token++ = *buf_ptr++;
513 	    if (!pointer_as_binop) {
514 		unary_delim = false;
515 		code = unary_op;
516 		ps.want_blank = false;
517 	    }
518 	}
519 	break;			/* buffer overflow will be checked at end of
520 				 * switch */
521 
522     case '=':
523 	if (ps.in_or_st)
524 	    ps.block_init = 1;
525 #ifdef undef
526 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
527 	    e_token[-1] = *buf_ptr++;
528 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
529 		*e_token++ = *buf_ptr++;
530 	    *e_token++ = '=';	/* Flip =+ to += */
531 	    *e_token = 0;
532 	}
533 #else
534 	if (*buf_ptr == '=') {/* == */
535 	    *e_token++ = '=';	/* Flip =+ to += */
536 	    buf_ptr++;
537 	    *e_token = 0;
538 	}
539 #endif
540 	code = binary_op;
541 	unary_delim = true;
542 	break;
543 	/* can drop thru!!! */
544 
545     case '>':
546     case '<':
547     case '!':			/* ops like <, <<, <=, !=, etc */
548 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
549 	    *e_token++ = *buf_ptr;
550 	    if (++buf_ptr >= buf_end)
551 		fill_buffer();
552 	}
553 	if (*buf_ptr == '=')
554 	    *e_token++ = *buf_ptr++;
555 	code = (ps.last_u_d ? unary_op : binary_op);
556 	unary_delim = true;
557 	break;
558 
559     default:
560 	if (token[0] == '/' && *buf_ptr == '*') {
561 	    /* it is start of comment */
562 	    *e_token++ = '*';
563 
564 	    if (++buf_ptr >= buf_end)
565 		fill_buffer();
566 
567 	    code = comment;
568 	    unary_delim = ps.last_u_d;
569 	    break;
570 	}
571 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
572 	    /*
573 	     * handle ||, &&, etc, and also things as in int *****i
574 	     */
575 	    *e_token++ = *buf_ptr;
576 	    if (++buf_ptr >= buf_end)
577 		fill_buffer();
578 	}
579 	code = (ps.last_u_d ? unary_op : binary_op);
580 	unary_delim = true;
581 
582 
583     }				/* end of switch */
584     if (code != newline) {
585 	l_struct = false;
586 	last_code = code;
587     }
588     if (buf_ptr >= buf_end)	/* check for input buffer empty */
589 	fill_buffer();
590     ps.last_u_d = unary_delim;
591     *e_token = '\0';		/* null terminate the token */
592     return (code);
593 }
594 
595 void
596 alloc_typenames(void)
597 {
598 
599     typenames = (const char **)malloc(sizeof(typenames[0]) *
600         (typename_count = 16));
601     if (typenames == NULL)
602 	err(1, NULL);
603 }
604 
605 void
606 add_typename(const char *key)
607 {
608     int comparison;
609     const char *copy;
610 
611     if (typename_top + 1 >= typename_count) {
612 	typenames = realloc((void *)typenames,
613 	    sizeof(typenames[0]) * (typename_count *= 2));
614 	if (typenames == NULL)
615 	    err(1, NULL);
616     }
617     if (typename_top == -1)
618 	typenames[++typename_top] = copy = strdup(key);
619     else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
620 	/* take advantage of sorted input */
621 	if (comparison == 0)	/* remove duplicates */
622 	    return;
623 	typenames[++typename_top] = copy = strdup(key);
624     }
625     else {
626 	int p;
627 
628 	for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
629 	    /* find place for the new key */;
630 	if (comparison == 0)	/* remove duplicates */
631 	    return;
632 	memmove(&typenames[p + 1], &typenames[p],
633 	    sizeof(typenames[0]) * (++typename_top - p));
634 	typenames[p] = copy = strdup(key);
635     }
636 
637     if (copy == NULL)
638 	err(1, NULL);
639 }
640