xref: /freebsd/usr.bin/indent/lexi.c (revision 780fb4a2)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 1985 Sun Microsystems, Inc.
5  * Copyright (c) 1980, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed by the University of
20  *	California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 #if 0
39 #ifndef lint
40 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
41 #endif /* not lint */
42 #endif
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 /*
47  * Here we have the token scanner for indent.  It scans off one token and puts
48  * it in the global variable "token".  It returns a code, indicating the type
49  * of token scanned.
50  */
51 
52 #include <err.h>
53 #include <stdio.h>
54 #include <ctype.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #include "indent_globs.h"
58 #include "indent_codes.h"
59 #include "indent.h"
60 
61 #define alphanum 1
62 #ifdef undef
63 #define opchar 3
64 #endif
65 
/*
 * One reserved word known to lexi(), paired with the class code that lexi()
 * stores in the parser state's "keyword" field and switches on.
 */
struct templ {
    const char *rwd;		/* spelling; must remain the first member */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved-word table.  lexi() searches it with bsearch(3), so the entries
 * have to stay in strcmp(3) order.  The comparison callback reads each
 * element as a pointer to a string, which is why rwd has to be the first
 * member of struct templ.
 *
 * Class codes used in this table:
 *	 1  offsetof			 2  sizeof
 *	 3  struct, union, enum		 4  type names and qualifiers
 *	 5  if, for, while		 6  do, else
 *	 7  switch			 8  case, default
 *	 9  break, goto, return		10  storage class specifiers
 *	11  typedef			12  continue, inline, restrict
 */
struct templ specials[] = {
    /* _ */
    {"_Bool", 4}, {"_Complex", 4}, {"_Imaginary", 4},
    /* a - c */
    {"auto", 10},
    {"bool", 4}, {"break", 9},
    {"case", 8}, {"char", 4}, {"complex", 4}, {"const", 4}, {"continue", 12},
    /* d - g */
    {"default", 8}, {"do", 6}, {"double", 4},
    {"else", 6}, {"enum", 3}, {"extern", 10},
    {"float", 4}, {"for", 5},
    {"global", 4}, {"goto", 9},
    /* i - o */
    {"if", 5}, {"imaginary", 4}, {"inline", 12}, {"int", 4},
    {"long", 4},
    {"offsetof", 1},
    /* r - s */
    {"register", 10}, {"restrict", 12}, {"return", 9},
    {"short", 4}, {"signed", 4}, {"sizeof", 2},
    {"static", 10}, {"struct", 3}, {"switch", 7},
    /* t - w */
    {"typedef", 11},
    {"union", 3}, {"unsigned", 4},
    {"void", 4}, {"volatile", 4},
    {"while", 5}
};
120 
/*
 * Table of additional type names, kept in strcmp(3) order so that lexi() can
 * search it with bsearch(3).  Filled in by add_typename().
 */
const char **typenames;		/* the table itself */
int         typename_count;	/* number of slots allocated in typenames */
int         typename_top = -1;	/* index of the last used slot, -1 if none */
124 
/*
 * Character classes for the 128 ASCII codes, indexed by character value:
 *	0  anything not listed below
 *	1  alphanumeric: letters, digits, '_' and '$'
 *	3  operator character
 * lexi() masks the input character with 127 before indexing this table.
 */
char        chartype[128] = {
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x08 - 0x0f */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x18 - 0x1f */
    0, 3, 0, 0, 1, 3, 3, 0,	/* sp ! " # $ % & ' */
    0, 0, 3, 3, 0, 3, 0, 3,	/* ( ) * + , - . / */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0 1 2 3 4 5 6 7 */
    1, 1, 0, 0, 3, 3, 3, 3,	/* 8 9 : ; < = > ? */
    0, 1, 1, 1, 1, 1, 1, 1,	/* @ A B C D E F G */
    1, 1, 1, 1, 1, 1, 1, 1,	/* H I J K L M N O */
    1, 1, 1, 1, 1, 1, 1, 1,	/* P Q R S T U V W */
    1, 1, 1, 0, 0, 0, 3, 1,	/* X Y Z [ \ ] ^ _ */
    0, 1, 1, 1, 1, 1, 1, 1,	/* ` a b c d e f g */
    1, 1, 1, 1, 1, 1, 1, 1,	/* h i j k l m n o */
    1, 1, 1, 1, 1, 1, 1, 1,	/* p q r s t u v w */
    1, 1, 1, 0, 3, 0, 3, 0	/* x y z { | } ~ DEL */
};
146 
/*
 * bsearch(3) comparison callback: e1 is the key, a NUL-terminated string;
 * e2 points at a table element whose first bytes are a pointer to the string
 * to compare against.  Returns the strcmp(3) result of key versus that
 * string.
 */
static int
strcmp_type(const void *e1, const void *e2)
{
    const char *key = e1;
    const char *const *entry = e2;

    return (strcmp(key, *entry));
}
152 
153 int
154 lexi(struct parser_state *state)
155 {
156     int         unary_delim;	/* this is set to 1 if the current token
157 				 * forces a following operator to be unary */
158     int         code;		/* internal code to be returned */
159     char        qchar;		/* the delimiter character for a string */
160 
161     e_token = s_token;		/* point to start of place to save token */
162     unary_delim = false;
163     state->col_1 = state->last_nl;	/* tell world that this token started
164 					 * in column 1 iff the last thing
165 					 * scanned was a newline */
166     state->last_nl = false;
167 
168     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
169 	state->col_1 = false;	/* leading blanks imply token is not in column
170 				 * 1 */
171 	if (++buf_ptr >= buf_end)
172 	    fill_buffer();
173     }
174 
175     /* Scan an alphanumeric token */
176     if (chartype[*buf_ptr & 127] == alphanum ||
177 	(buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
178 	/*
179 	 * we have a character or number
180 	 */
181 	struct templ *p;
182 
183 	if (isdigit((unsigned char)*buf_ptr) ||
184 	    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
185 	    int         seendot = 0,
186 	                seenexp = 0,
187 			seensfx = 0;
188 
189 	    /*
190 	     * base 2, base 8, base 16:
191 	     */
192 	    if (buf_ptr[0] == '0' && buf_ptr[1] != '.') {
193 		int len;
194 
195 		if (buf_ptr[1] == 'b' || buf_ptr[1] == 'B')
196 		    len = strspn(buf_ptr + 2, "01") + 2;
197 		else if (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')
198 		    len = strspn(buf_ptr + 2, "0123456789ABCDEFabcdef") + 2;
199 		else
200 		    len = strspn(buf_ptr + 1, "012345678") + 1;
201 		if (len > 0) {
202 		    CHECK_SIZE_TOKEN(len);
203 		    memcpy(e_token, buf_ptr, len);
204 		    e_token += len;
205 		    buf_ptr += len;
206 		}
207 		else
208 		    diag2(1, "Unterminated literal");
209 	    }
210 	    else		/* base 10: */
211 		while (1) {
212 		    if (*buf_ptr == '.') {
213 			if (seendot)
214 			    break;
215 			else
216 			    seendot++;
217 		    }
218 		    CHECK_SIZE_TOKEN(3);
219 		    *e_token++ = *buf_ptr++;
220 		    if (!isdigit((unsigned char)*buf_ptr) && *buf_ptr != '.') {
221 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
222 			    break;
223 			else {
224 			    seenexp++;
225 			    seendot++;
226 			    *e_token++ = *buf_ptr++;
227 			    if (*buf_ptr == '+' || *buf_ptr == '-')
228 				*e_token++ = *buf_ptr++;
229 			}
230 		    }
231 		}
232 
233 	    while (1) {
234 		CHECK_SIZE_TOKEN(2);
235 		if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
236 		    *e_token++ = *buf_ptr++;
237 		    seensfx |= 1;
238 		    continue;
239 		}
240 		if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
241 		    if (buf_ptr[1] == buf_ptr[0])
242 		        *e_token++ = *buf_ptr++;
243 		    *e_token++ = *buf_ptr++;
244 		    seensfx |= 2;
245 		    continue;
246 		}
247 		break;
248 	    }
249 	}
250 	else
251 	    while (chartype[*buf_ptr & 127] == alphanum || *buf_ptr == BACKSLASH) {
252 		/* fill_buffer() terminates buffer with newline */
253 		if (*buf_ptr == BACKSLASH) {
254 		    if (*(buf_ptr + 1) == '\n') {
255 			buf_ptr += 2;
256 			if (buf_ptr >= buf_end)
257 			    fill_buffer();
258 			} else
259 			    break;
260 		}
261 		CHECK_SIZE_TOKEN(1);
262 		/* copy it over */
263 		*e_token++ = *buf_ptr++;
264 		if (buf_ptr >= buf_end)
265 		    fill_buffer();
266 	    }
267 	*e_token = '\0';
268 
269 	if (s_token[0] == 'L' && s_token[1] == '\0' &&
270 	      (*buf_ptr == '"' || *buf_ptr == '\''))
271 	    return (strpfx);
272 
273 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
274 	    if (++buf_ptr >= buf_end)
275 		fill_buffer();
276 	}
277 	state->keyword = 0;
278 	if (state->last_token == structure && !state->p_l_follow) {
279 				/* if last token was 'struct' and we're not
280 				 * in parentheses, then this token
281 				 * should be treated as a declaration */
282 	    state->last_u_d = true;
283 	    return (decl);
284 	}
285 	/*
286 	 * Operator after identifier is binary unless last token was 'struct'
287 	 */
288 	state->last_u_d = (state->last_token == structure);
289 
290 	p = bsearch(s_token,
291 	    specials,
292 	    sizeof(specials) / sizeof(specials[0]),
293 	    sizeof(specials[0]),
294 	    strcmp_type);
295 	if (p == NULL) {	/* not a special keyword... */
296 	    char *u;
297 
298 	    /* ... so maybe a type_t or a typedef */
299 	    if ((opt.auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
300 	        strcmp(u, "_t") == 0) || (typename_top >= 0 &&
301 		  bsearch(s_token, typenames, typename_top + 1,
302 		    sizeof(typenames[0]), strcmp_type))) {
303 		state->keyword = 4;	/* a type name */
304 		state->last_u_d = true;
305 	        goto found_typename;
306 	    }
307 	} else {			/* we have a keyword */
308 	    state->keyword = p->rwcode;
309 	    state->last_u_d = true;
310 	    switch (p->rwcode) {
311 	    case 7:		/* it is a switch */
312 		return (swstmt);
313 	    case 8:		/* a case or default */
314 		return (casestmt);
315 
316 	    case 3:		/* a "struct" */
317 		/* FALLTHROUGH */
318 	    case 4:		/* one of the declaration keywords */
319 	    found_typename:
320 		if (state->p_l_follow) {
321 		    /* inside parens: cast, param list, offsetof or sizeof */
322 		    state->cast_mask |= (1 << state->p_l_follow) & ~state->not_cast_mask;
323 		}
324 		if (state->last_token == period || state->last_token == unary_op) {
325 		    state->keyword = 0;
326 		    break;
327 		}
328 		if (p != NULL && p->rwcode == 3)
329 		    return (structure);
330 		if (state->p_l_follow)
331 		    break;
332 		return (decl);
333 
334 	    case 5:		/* if, while, for */
335 		return (sp_paren);
336 
337 	    case 6:		/* do, else */
338 		return (sp_nparen);
339 
340 	    case 10:		/* storage class specifier */
341 		return (storage);
342 
343 	    case 11:		/* typedef */
344 		return (type_def);
345 
346 	    default:		/* all others are treated like any other
347 				 * identifier */
348 		return (ident);
349 	    }			/* end of switch */
350 	}			/* end of if (found_it) */
351 	if (*buf_ptr == '(' && state->tos <= 1 && state->ind_level == 0 &&
352 	    state->in_parameter_declaration == 0 && state->block_init == 0) {
353 	    char *tp = buf_ptr;
354 	    while (tp < buf_end)
355 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
356 		    goto not_proc;
357 	    strncpy(state->procname, token, sizeof state->procname - 1);
358 	    if (state->in_decl)
359 		state->in_parameter_declaration = 1;
360 	    return (funcname);
361     not_proc:;
362 	}
363 	/*
364 	 * The following hack attempts to guess whether or not the current
365 	 * token is in fact a declaration keyword -- one that has been
366 	 * typedefd
367 	 */
368 	else if (!state->p_l_follow && !state->block_init &&
369 	    !state->in_stmt &&
370 	    ((*buf_ptr == '*' && buf_ptr[1] != '=') ||
371 		isalpha((unsigned char)*buf_ptr)) &&
372 	    (state->last_token == semicolon || state->last_token == lbrace ||
373 		state->last_token == rbrace)) {
374 	    state->keyword = 4;	/* a type name */
375 	    state->last_u_d = true;
376 	    return decl;
377 	}
378 	if (state->last_token == decl)	/* if this is a declared variable,
379 					 * then following sign is unary */
380 	    state->last_u_d = true;	/* will make "int a -1" work */
381 	return (ident);		/* the ident is not in the list */
382     }				/* end of procesing for alpanum character */
383 
384     /* Scan a non-alphanumeric token */
385 
386     CHECK_SIZE_TOKEN(3);		/* things like "<<=" */
387     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
388 				 * moved here */
389     *e_token = '\0';
390     if (++buf_ptr >= buf_end)
391 	fill_buffer();
392 
393     switch (*token) {
394     case '\n':
395 	unary_delim = state->last_u_d;
396 	state->last_nl = true;	/* remember that we just had a newline */
397 	code = (had_eof ? 0 : newline);
398 
399 	/*
400 	 * if data has been exhausted, the newline is a dummy, and we should
401 	 * return code to stop
402 	 */
403 	break;
404 
405     case '\'':			/* start of quoted character */
406     case '"':			/* start of string */
407 	qchar = *token;
408 	do {			/* copy the string */
409 	    while (1) {		/* move one character or [/<char>]<char> */
410 		if (*buf_ptr == '\n') {
411 		    diag2(1, "Unterminated literal");
412 		    goto stop_lit;
413 		}
414 		CHECK_SIZE_TOKEN(2);
415 		*e_token = *buf_ptr++;
416 		if (buf_ptr >= buf_end)
417 		    fill_buffer();
418 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
419 		    if (*buf_ptr == '\n')	/* check for escaped newline */
420 			++line_no;
421 		    *++e_token = *buf_ptr++;
422 		    ++e_token;	/* we must increment this again because we
423 				 * copied two chars */
424 		    if (buf_ptr >= buf_end)
425 			fill_buffer();
426 		}
427 		else
428 		    break;	/* we copied one character */
429 	    }			/* end of while (1) */
430 	} while (*e_token++ != qchar);
431 stop_lit:
432 	code = ident;
433 	break;
434 
435     case ('('):
436     case ('['):
437 	unary_delim = true;
438 	code = lparen;
439 	break;
440 
441     case (')'):
442     case (']'):
443 	code = rparen;
444 	break;
445 
446     case '#':
447 	unary_delim = state->last_u_d;
448 	code = preesc;
449 	break;
450 
451     case '?':
452 	unary_delim = true;
453 	code = question;
454 	break;
455 
456     case (':'):
457 	code = colon;
458 	unary_delim = true;
459 	break;
460 
461     case (';'):
462 	unary_delim = true;
463 	code = semicolon;
464 	break;
465 
466     case ('{'):
467 	unary_delim = true;
468 
469 	/*
470 	 * if (state->in_or_st) state->block_init = 1;
471 	 */
472 	/* ?	code = state->block_init ? lparen : lbrace; */
473 	code = lbrace;
474 	break;
475 
476     case ('}'):
477 	unary_delim = true;
478 	/* ?	code = state->block_init ? rparen : rbrace; */
479 	code = rbrace;
480 	break;
481 
482     case 014:			/* a form feed */
483 	unary_delim = state->last_u_d;
484 	state->last_nl = true;	/* remember this so we can set 'state->col_1'
485 				 * right */
486 	code = form_feed;
487 	break;
488 
489     case (','):
490 	unary_delim = true;
491 	code = comma;
492 	break;
493 
494     case '.':
495 	unary_delim = false;
496 	code = period;
497 	break;
498 
499     case '-':
500     case '+':			/* check for -, +, --, ++ */
501 	code = (state->last_u_d ? unary_op : binary_op);
502 	unary_delim = true;
503 
504 	if (*buf_ptr == token[0]) {
505 	    /* check for doubled character */
506 	    *e_token++ = *buf_ptr++;
507 	    /* buffer overflow will be checked at end of loop */
508 	    if (state->last_token == ident || state->last_token == rparen) {
509 		code = (state->last_u_d ? unary_op : postop);
510 		/* check for following ++ or -- */
511 		unary_delim = false;
512 	    }
513 	}
514 	else if (*buf_ptr == '=')
515 	    /* check for operator += */
516 	    *e_token++ = *buf_ptr++;
517 	else if (*buf_ptr == '>') {
518 	    /* check for operator -> */
519 	    *e_token++ = *buf_ptr++;
520 	    unary_delim = false;
521 	    code = unary_op;
522 	    state->want_blank = false;
523 	}
524 	break;			/* buffer overflow will be checked at end of
525 				 * switch */
526 
527     case '=':
528 	if (state->in_or_st)
529 	    state->block_init = 1;
530 #ifdef undef
531 	if (chartype[*buf_ptr & 127] == opchar) {	/* we have two char assignment */
532 	    e_token[-1] = *buf_ptr++;
533 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
534 		*e_token++ = *buf_ptr++;
535 	    *e_token++ = '=';	/* Flip =+ to += */
536 	    *e_token = 0;
537 	}
538 #else
539 	if (*buf_ptr == '=') {/* == */
540 	    *e_token++ = '=';	/* Flip =+ to += */
541 	    buf_ptr++;
542 	    *e_token = 0;
543 	}
544 #endif
545 	code = binary_op;
546 	unary_delim = true;
547 	break;
548 	/* can drop thru!!! */
549 
550     case '>':
551     case '<':
552     case '!':			/* ops like <, <<, <=, !=, etc */
553 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
554 	    *e_token++ = *buf_ptr;
555 	    if (++buf_ptr >= buf_end)
556 		fill_buffer();
557 	}
558 	if (*buf_ptr == '=')
559 	    *e_token++ = *buf_ptr++;
560 	code = (state->last_u_d ? unary_op : binary_op);
561 	unary_delim = true;
562 	break;
563 
564     case '*':
565 	unary_delim = true;
566 	if (!state->last_u_d) {
567 	    if (*buf_ptr == '=')
568 		*e_token++ = *buf_ptr++;
569 	    code = binary_op;
570 	    break;
571 	}
572 	while (*buf_ptr == '*' || isspace((unsigned char)*buf_ptr)) {
573 	    if (*buf_ptr == '*') {
574 		CHECK_SIZE_TOKEN(1);
575 		*e_token++ = *buf_ptr;
576 	    }
577 	    if (++buf_ptr >= buf_end)
578 		fill_buffer();
579 	}
580 	if (ps.in_decl) {
581 	    char *tp = buf_ptr;
582 
583 	    while (isalpha((unsigned char)*tp) ||
584 		   isspace((unsigned char)*tp)) {
585 		if (++tp >= buf_end)
586 		    fill_buffer();
587 	    }
588 	    if (*tp == '(')
589 		ps.procname[0] = ' ';
590 	}
591 	code = unary_op;
592 	break;
593 
594     default:
595 	if (token[0] == '/' && *buf_ptr == '*') {
596 	    /* it is start of comment */
597 	    *e_token++ = '*';
598 
599 	    if (++buf_ptr >= buf_end)
600 		fill_buffer();
601 
602 	    code = comment;
603 	    unary_delim = state->last_u_d;
604 	    break;
605 	}
606 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
607 	    /*
608 	     * handle ||, &&, etc, and also things as in int *****i
609 	     */
610 	    CHECK_SIZE_TOKEN(1);
611 	    *e_token++ = *buf_ptr;
612 	    if (++buf_ptr >= buf_end)
613 		fill_buffer();
614 	}
615 	code = (state->last_u_d ? unary_op : binary_op);
616 	unary_delim = true;
617 
618 
619     }				/* end of switch */
620     if (buf_ptr >= buf_end)	/* check for input buffer empty */
621 	fill_buffer();
622     state->last_u_d = unary_delim;
623     CHECK_SIZE_TOKEN(1);
624     *e_token = '\0';		/* null terminate the token */
625     return (code);
626 }
627 
628 void
629 alloc_typenames(void)
630 {
631 
632     typenames = (const char **)malloc(sizeof(typenames[0]) *
633         (typename_count = 16));
634     if (typenames == NULL)
635 	err(1, NULL);
636 }
637 
638 void
639 add_typename(const char *key)
640 {
641     int comparison;
642     const char *copy;
643 
644     if (typename_top + 1 >= typename_count) {
645 	typenames = realloc((void *)typenames,
646 	    sizeof(typenames[0]) * (typename_count *= 2));
647 	if (typenames == NULL)
648 	    err(1, NULL);
649     }
650     if (typename_top == -1)
651 	typenames[++typename_top] = copy = strdup(key);
652     else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
653 	/* take advantage of sorted input */
654 	if (comparison == 0)	/* remove duplicates */
655 	    return;
656 	typenames[++typename_top] = copy = strdup(key);
657     }
658     else {
659 	int p;
660 
661 	for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
662 	    /* find place for the new key */;
663 	if (comparison == 0)	/* remove duplicates */
664 	    return;
665 	memmove(&typenames[p + 1], &typenames[p],
666 	    sizeof(typenames[0]) * (++typename_top - p));
667 	typenames[p] = copy = strdup(key);
668     }
669 
670     if (copy == NULL)
671 	err(1, NULL);
672 }
673