xref: /openbsd/usr.bin/indent/lexi.c (revision 4bdff4be)
1 /*	$OpenBSD: lexi.c,v 1.21 2022/12/26 19:16:01 jmc Exp $	*/
2 
3 /*
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.
6  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
7  * Copyright (c) 1985 Sun Microsystems, Inc.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 /*
36  * Here we have the token scanner for indent.  It scans off one token and puts
37  * it in the global variable "token".  It returns a code, indicating the type
38  * of token scanned.
39  */
40 
41 #include <stdio.h>
42 #include <ctype.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <err.h>
46 #include "indent_globs.h"
47 #include "indent_codes.h"
48 
/* Character classes stored in the chartype[] table. */
#define alphanum 1
#define opchar 3

/*
 * One entry of the keyword table: the spelling of a reserved word and the
 * class code that lexi() switches on when it recognizes that word.
 */
struct templ {
    char       *rwd;		/* keyword text */
    int         rwcode;		/* keyword class, see below */
};

/*
 * Built-in keywords.  The rwcode values are interpreted by lexi():
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords (types, storage classes, typedef)
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 *	0	no special handling, returned as a plain identifier
 */
struct templ specialsinit[] = {
	{ .rwd = "switch",	.rwcode = 1 },
	{ .rwd = "case",	.rwcode = 2 },
	{ .rwd = "break",	.rwcode = 0 },
	{ .rwd = "struct",	.rwcode = 3 },
	{ .rwd = "union",	.rwcode = 3 },
	{ .rwd = "enum",	.rwcode = 3 },
	{ .rwd = "default",	.rwcode = 2 },
	{ .rwd = "int",		.rwcode = 4 },
	{ .rwd = "char",	.rwcode = 4 },
	{ .rwd = "float",	.rwcode = 4 },
	{ .rwd = "double",	.rwcode = 4 },
	{ .rwd = "long",	.rwcode = 4 },
	{ .rwd = "short",	.rwcode = 4 },
	{ .rwd = "typedef",	.rwcode = 4 },
	{ .rwd = "unsigned",	.rwcode = 4 },
	{ .rwd = "register",	.rwcode = 4 },
	{ .rwd = "static",	.rwcode = 4 },
	{ .rwd = "global",	.rwcode = 4 },
	{ .rwd = "extern",	.rwcode = 4 },
	{ .rwd = "void",	.rwcode = 4 },
	{ .rwd = "goto",	.rwcode = 0 },
	{ .rwd = "return",	.rwcode = 0 },
	{ .rwd = "if",		.rwcode = 5 },
	{ .rwd = "while",	.rwcode = 5 },
	{ .rwd = "for",		.rwcode = 5 },
	{ .rwd = "else",	.rwcode = 6 },
	{ .rwd = "do",		.rwcode = 6 },
	{ .rwd = "sizeof",	.rwcode = 7 },
};

/*
 * The live keyword table.  It starts out aliasing the built-in table and is
 * replaced by a heap copy the first time addkey() has to append an entry.
 */
struct templ *specials = specialsinit;
/* number of entries currently in use in specials */
int	nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]);
/* allocated capacity of specials; only set once addkey() has reallocated */
int	maxspecials;
91 
/*
 * Classification of the 128 ASCII codes, indexed by character value:
 * 1 marks characters that may appear in an identifier or number (letters,
 * digits, '_' and '$'), 3 marks operator characters, and 0 is everything
 * else.  Each row below covers 16 consecutive codes.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: 0-9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: @ A-O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P-Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: ` a-o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p-z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
113 
114 
115 
116 
117 int
118 lexi(void)
119 {
120     int         unary_delim;	/* this is set to 1 if the current token
121 				 * forces a following operator to be unary */
122     static int  last_code;	/* the last token type returned */
123     static int  l_struct;	/* set to 1 if the last token was 'struct' */
124     int         code;		/* internal code to be returned */
125     char        qchar;		/* the delimiter character for a string */
126     int		i;
127 
128     e_token = s_token;		/* point to start of place to save token */
129     unary_delim = false;
130     ps.col_1 = ps.last_nl;	/* tell world that this token started in
131 				 * column 1 iff the last thing scanned was nl */
132     ps.last_nl = false;
133 
134     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
135 	ps.col_1 = false;	/* leading blanks imply token is not in column
136 				 * 1 */
137 	if (++buf_ptr >= buf_end)
138 	    fill_buffer();
139     }
140 
141     /* Scan an alphanumeric token */
142     if (chartype[(int)*buf_ptr] == alphanum ||
143 	(buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
144 	/*
145 	 * we have a character or number
146 	 */
147 	char *j;	/* used for searching thru list of
148 			 * reserved words */
149 	if (isdigit((unsigned char)*buf_ptr) ||
150 	    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
151 	    int         seendot = 0,
152 	                seenexp = 0,
153 			seensfx = 0;
154 	    if (*buf_ptr == '0' &&
155 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
156 		*e_token++ = *buf_ptr++;
157 		*e_token++ = *buf_ptr++;
158 		while (isxdigit(*buf_ptr)) {
159 		    CHECK_SIZE_TOKEN;
160 		    *e_token++ = *buf_ptr++;
161 		}
162 	    }
163 	    else
164 		while (1) {
165 		    if (*buf_ptr == '.') {
166 			if (seendot)
167 			    break;
168 			else
169 			    seendot++;
170 		    }
171 		    CHECK_SIZE_TOKEN;
172 		    *e_token++ = *buf_ptr++;
173 		    if (!isdigit((unsigned char)*buf_ptr) && *buf_ptr != '.') {
174 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
175 			    break;
176 			else {
177 			    seenexp++;
178 			    seendot++;
179 			    CHECK_SIZE_TOKEN;
180 			    *e_token++ = *buf_ptr++;
181 			    if (*buf_ptr == '+' || *buf_ptr == '-')
182 				*e_token++ = *buf_ptr++;
183 			}
184 		    }
185 		}
186 	    while (1) {
187 		if (!(seensfx & 1) &&
188 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
189 		    CHECK_SIZE_TOKEN;
190 		    *e_token++ = *buf_ptr++;
191 		    seensfx |= 1;
192 		    continue;
193 		}
194         	if (!(seensfx & 2) &&
195 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
196 		    CHECK_SIZE_TOKEN;
197 		    if (buf_ptr[1] == buf_ptr[0])
198 		        *e_token++ = *buf_ptr++;
199 		    *e_token++ = *buf_ptr++;
200 		    seensfx |= 2;
201 		    continue;
202 		}
203 		break;
204 	    }
205 	    if (!(seensfx & 1) &&
206 	        (*buf_ptr == 'F' || *buf_ptr == 'f')) {
207 		CHECK_SIZE_TOKEN;
208 		*e_token++ = *buf_ptr++;
209 		seensfx |= 1;
210 	    }
211 	}
212 	else
213 	    while (chartype[(int)*buf_ptr] == alphanum) {	/* copy it over */
214 		CHECK_SIZE_TOKEN;
215 		*e_token++ = *buf_ptr++;
216 		if (buf_ptr >= buf_end)
217 		    fill_buffer();
218 	    }
219 	*e_token++ = '\0';
220 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
221 	    if (++buf_ptr >= buf_end)
222 		fill_buffer();
223 	}
224 	ps.its_a_keyword = false;
225 	ps.sizeof_keyword = false;
226 	if (l_struct) {		/* if last token was 'struct', then this token
227 				 * should be treated as a declaration */
228 	    l_struct = false;
229 	    last_code = ident;
230 	    ps.last_u_d = true;
231 	    return (decl);
232 	}
233 	ps.last_u_d = false;	/* Operator after identifier is binary */
234 	last_code = ident;	/* Remember that this is the code we will
235 				 * return */
236 
237 	/*
238 	 * This loop will check if the token is a keyword.
239 	 */
240 	for (i = 0; i < nspecials; i++) {
241 	    char *p = s_token;	/* point at scanned token */
242 	    j = specials[i].rwd;
243 	    if (*j++ != *p++ || *j++ != *p++)
244 		continue;	/* This test depends on the fact that
245 				 * identifiers are always at least 1 character
246 				 * long (ie. the first two bytes of the
247 				 * identifier are always meaningful) */
248 	    if (p[-1] == 0)
249 		break;		/* If its a one-character identifier */
250 	    while (*p++ == *j)
251 		if (*j++ == 0)
252 		    goto found_keyword;	/* I wish that C had a multi-level
253 					 * break... */
254 	}
255 	if (i < nspecials) {		/* we have a keyword */
256     found_keyword:
257 	    ps.its_a_keyword = true;
258 	    ps.last_u_d = true;
259 	    switch (specials[i].rwcode) {
260 	    case 1:		/* it is a switch */
261 		return (swstmt);
262 	    case 2:		/* a case or default */
263 		return (casestmt);
264 
265 	    case 3:		/* a "struct" */
266 		if (ps.p_l_follow)
267 		    break;	/* inside parens: cast */
268 		l_struct = true;
269 
270 		/*
271 		 * Next time around, we will want to know that we have had a
272 		 * 'struct'
273 		 */
274 	    case 4:		/* one of the declaration keywords */
275 		if (ps.p_l_follow) {
276 		    ps.cast_mask |= 1 << ps.p_l_follow;
277 		    break;	/* inside parens: cast */
278 		}
279 		last_code = decl;
280 		return (decl);
281 
282 	    case 5:		/* if, while, for */
283 		return (sp_paren);
284 
285 	    case 6:		/* do, else */
286 		return (sp_nparen);
287 
288 	    case 7:
289 		ps.sizeof_keyword = true;
290 	    default:		/* all others are treated like any other
291 				 * identifier */
292 		return (ident);
293 	    }			/* end of switch */
294 	}			/* end of if (found_it) */
295 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
296 	    char *tp = buf_ptr;
297 	    while (tp < buf_end)
298 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
299 		    goto not_proc;
300 	    strlcpy(ps.procname, token, sizeof ps.procname);
301 	    ps.in_parameter_declaration = 1;
302 	    rparen_count = 1;
303     not_proc:;
304 	}
305 	/*
306 	 * The following hack attempts to guess whether or not the current
307 	 * token is in fact a declaration keyword -- one that has been
308 	 * typedefd
309 	 */
310 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
311 	    isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
312 		&& !ps.p_l_follow
313 	        && !ps.block_init
314 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
315 		    ps.last_token == decl ||
316 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
317 	    ps.its_a_keyword = true;
318 	    ps.last_u_d = true;
319 	    last_code = decl;
320 	    return decl;
321 	}
322 	if (last_code == decl)	/* if this is a declared variable, then
323 				 * following sign is unary */
324 	    ps.last_u_d = true;	/* will make "int a -1" work */
325 	last_code = ident;
326 	return (ident);		/* the ident is not in the list */
327     }				/* end of processing for alpanum character */
328 
329     /* Scan a non-alphanumeric token */
330 
331     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
332 				 * moved here */
333     *e_token = '\0';
334     if (++buf_ptr >= buf_end)
335 	fill_buffer();
336 
337     switch (*token) {
338     case '\n':
339 	unary_delim = ps.last_u_d;
340 	ps.last_nl = true;	/* remember that we just had a newline */
341 	code = (had_eof ? 0 : newline);
342 
343 	/*
344 	 * if data has been exhausted, the newline is a dummy, and we should
345 	 * return code to stop
346 	 */
347 	break;
348 
349     case '\'':			/* start of quoted character */
350     case '"':			/* start of string */
351 	qchar = *token;
352 	if (troff) {
353 	    e_token[-1] = '`';
354 	    if (qchar == '"')
355 		*e_token++ = '`';
356 	    e_token = chfont(&bodyf, &stringf, e_token);
357 	}
358 	do {			/* copy the string */
359 	    while (1) {		/* move one character or [/<char>]<char> */
360 		if (*buf_ptr == '\n') {
361 		    printf("%d: Unterminated literal\n", line_no);
362 		    goto stop_lit;
363 		}
364 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
365 					 * since CHECK_SIZE guarantees that there
366 					 * are at least 5 entries left */
367 		*e_token = *buf_ptr++;
368 		if (buf_ptr >= buf_end)
369 		    fill_buffer();
370 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
371 		    if (*buf_ptr == '\n')	/* check for escaped newline */
372 			++line_no;
373 		    if (troff) {
374 			*++e_token = BACKSLASH;
375 			if (*buf_ptr == BACKSLASH)
376 			    *++e_token = BACKSLASH;
377 		    }
378 		    *++e_token = *buf_ptr++;
379 		    ++e_token;	/* we must increment this again because we
380 				 * copied two chars */
381 		    if (buf_ptr >= buf_end)
382 			fill_buffer();
383 		}
384 		else
385 		    break;	/* we copied one character */
386 	    }			/* end of while (1) */
387 	} while (*e_token++ != qchar);
388 	if (troff) {
389 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
390 	    if (qchar == '"')
391 		*e_token++ = '\'';
392 	}
393 stop_lit:
394 	code = ident;
395 	break;
396 
397     case ('('):
398     case ('['):
399 	unary_delim = true;
400 	code = lparen;
401 	break;
402 
403     case (')'):
404     case (']'):
405 	code = rparen;
406 	break;
407 
408     case '#':
409 	unary_delim = ps.last_u_d;
410 	code = preesc;
411 	break;
412 
413     case '?':
414 	unary_delim = true;
415 	code = question;
416 	break;
417 
418     case (':'):
419 	code = colon;
420 	unary_delim = true;
421 	break;
422 
423     case (';'):
424 	unary_delim = true;
425 	code = semicolon;
426 	break;
427 
428     case ('{'):
429 	unary_delim = true;
430 
431 	/*
432 	 * if (ps.in_or_st) ps.block_init = 1;
433 	 */
434 	/* ?	code = ps.block_init ? lparen : lbrace; */
435 	code = lbrace;
436 	break;
437 
438     case ('}'):
439 	unary_delim = true;
440 	/* ?	code = ps.block_init ? rparen : rbrace; */
441 	code = rbrace;
442 	break;
443 
444     case 014:			/* a form feed */
445 	unary_delim = ps.last_u_d;
446 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
447 				 * right */
448 	code = form_feed;
449 	break;
450 
451     case (','):
452 	unary_delim = true;
453 	code = comma;
454 	break;
455 
456     case '.':
457 	unary_delim = false;
458 	code = period;
459 	break;
460 
461     case '-':
462     case '+':			/* check for -, +, --, ++ */
463 	code = (ps.last_u_d ? unary_op : binary_op);
464 	unary_delim = true;
465 
466 	if (*buf_ptr == token[0]) {
467 	    /* check for doubled character */
468 	    *e_token++ = *buf_ptr++;
469 	    /* buffer overflow will be checked at end of loop */
470 	    if (last_code == ident || last_code == rparen) {
471 		code = (ps.last_u_d ? unary_op : postop);
472 		/* check for following ++ or -- */
473 		unary_delim = false;
474 	    }
475 	}
476 	else if (*buf_ptr == '=')
477 	    /* check for operator += */
478 	    *e_token++ = *buf_ptr++;
479 	else if (*buf_ptr == '>') {
480 	    /* check for operator -> */
481 	    *e_token++ = *buf_ptr++;
482 	    if (!pointer_as_binop) {
483 		unary_delim = false;
484 		code = unary_op;
485 		ps.want_blank = false;
486 	    }
487 	}
488 	break;			/* buffer overflow will be checked at end of
489 				 * switch */
490 
491     case '=':
492 	if (ps.in_or_st)
493 	    ps.block_init = 1;
494 #ifdef undef
495 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
496 	    e_token[-1] = *buf_ptr++;
497 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
498 		*e_token++ = *buf_ptr++;
499 	    *e_token++ = '=';	/* Flip =+ to += */
500 	    *e_token = 0;
501 	}
502 #else
503 	if (*buf_ptr == '=') {/* == */
504 	    *e_token++ = '=';	/* Flip =+ to += */
505 	    buf_ptr++;
506 	    *e_token = 0;
507 	}
508 #endif
509 	code = binary_op;
510 	unary_delim = true;
511 	break;
512 	/* can drop thru!!! */
513 
514     case '>':
515     case '<':
516     case '!':			/* ops like <, <<, <=, !=, etc */
517 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
518 	    *e_token++ = *buf_ptr;
519 	    if (++buf_ptr >= buf_end)
520 		fill_buffer();
521 	}
522 	if (*buf_ptr == '=')
523 	    *e_token++ = *buf_ptr++;
524 	code = (ps.last_u_d ? unary_op : binary_op);
525 	unary_delim = true;
526 	break;
527 
528     default:
529 	if (token[0] == '/' && *buf_ptr == '*') {
530 	    /* it is start of comment */
531 	    *e_token++ = '*';
532 
533 	    if (++buf_ptr >= buf_end)
534 		fill_buffer();
535 
536 	    code = comment;
537 	    unary_delim = ps.last_u_d;
538 	    break;
539 	}
540 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
541 	    /*
542 	     * handle ||, &&, etc, and also things as in int *****i
543 	     */
544 	    *e_token++ = *buf_ptr;
545 	    if (++buf_ptr >= buf_end)
546 		fill_buffer();
547 	}
548 	code = (ps.last_u_d ? unary_op : binary_op);
549 	unary_delim = true;
550 
551 
552     }				/* end of switch */
553     if (code != newline) {
554 	l_struct = false;
555 	last_code = code;
556     }
557     if (buf_ptr >= buf_end)	/* check for input buffer empty */
558 	fill_buffer();
559     ps.last_u_d = unary_delim;
560     *e_token = '\0';		/* null terminate the token */
561     return (code);
562 }
563 
564 /*
565  * Add the given keyword to the keyword table, using val as the keyword type
566  */
567 void
568 addkey(char *key, int val)
569 {
570     struct templ *p;
571     int i;
572 
573     for (i = 0; i < nspecials; i++) {
574 	p = &specials[i];
575 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
576 	    return;
577     }
578 
579     if (specials == specialsinit) {
580 	/*
581 	 * Whoa. Must reallocate special table.
582 	 */
583 	nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]);
584 	maxspecials = nspecials + (nspecials >> 2);
585 	specials = calloc(maxspecials, sizeof specials[0]);
586 	if (specials == NULL)
587 	    err(1, NULL);
588 	memcpy(specials, specialsinit, sizeof specialsinit);
589     } else if (nspecials >= maxspecials) {
590 	int newspecials = maxspecials + (maxspecials >> 2);
591 	struct templ *specials2;
592 
593 	specials2 = reallocarray(specials, newspecials, sizeof(specials[0]));
594 	if (specials2 == NULL)
595 	    err(1, NULL);
596 	specials = specials2;
597 	maxspecials = newspecials;
598     }
599 
600     p = &specials[nspecials];
601     p->rwd = key;
602     p->rwcode = val;
603     nspecials++;
604     return;
605 }
606