xref: /openbsd/usr.bin/indent/lexi.c (revision 891d7ab6)
1 /*	$OpenBSD: lexi.c,v 1.15 2009/10/27 23:59:39 deraadt Exp $	*/
2 
3 /*
4  * Copyright (c) 1980, 1993
5  *	The Regents of the University of California.
6  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
7  * Copyright (c) 1985 Sun Microsystems, Inc.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 /*
36  * Here we have the token scanner for indent.  It scans off one token and puts
37  * it in the global variable "token".  It returns a code, indicating the type
38  * of token scanned.
39  */
40 
41 #include <stdio.h>
42 #include <ctype.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <err.h>
46 #include "indent_globs.h"
47 #include "indent_codes.h"
48 
/* Class values stored in the chartype[] table. */
#define alphanum 1	/* character can be part of an identifier or number */
#define opchar 3	/* character is an operator character */
51 
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * keyword class that lexi() switches on when the word is scanned.
 */
struct templ {
    char       *rwd;		/* spelling of the reserved word */
    int         rwcode;		/* keyword class, see below */
};

/*
 * Built-in reserved words.  Keyword classes as interpreted by lexi():
 *   0  no special handling (returned as a plain identifier)
 *   1  switch
 *   2  case, default
 *   3  struct, union, enum
 *   4  declaration keywords (types, storage classes, typedef)
 *   5  if, while, for
 *   6  else, do
 *   7  sizeof
 */
struct templ specialsinit[] = {
	{ "switch", 1 },
	{ "case", 2 },
	{ "break", 0 },
	{ "struct", 3 },
	{ "union", 3 },
	{ "enum", 3 },
	{ "default", 2 },
	{ "int", 4 },
	{ "char", 4 },
	{ "float", 4 },
	{ "double", 4 },
	{ "long", 4 },
	{ "short", 4 },
	{ "typedef", 4 },
	{ "unsigned", 4 },
	{ "register", 4 },
	{ "static", 4 },
	{ "global", 4 },
	{ "extern", 4 },
	{ "void", 4 },
	{ "goto", 0 },
	{ "return", 0 },
	{ "if", 5 },
	{ "while", 5 },
	{ "for", 5 },
	{ "else", 6 },
	{ "do", 6 },
	{ "sizeof", 7 },
};

/*
 * The active keyword table.  It starts out aliasing the built-in table;
 * addkey() switches it to a heap copy the first time a keyword is added.
 */
struct templ *specials = specialsinit;
/* number of entries currently in use in specials[] */
int	nspecials = sizeof(specialsinit) / sizeof(specialsinit[0]);
/* allocated capacity of specials[]; only meaningful once addkey() has
 * replaced the built-in table with a heap copy */
int	maxspecials;
91 
/*
 * Classification of each 7-bit ASCII character, indexed by character code:
 *   0  neither of the classes below
 *   1  alphanumeric: letters, digits, '_' and '$'
 *   3  operator character
 */
char        chartype[128] =
{
    /* 0x00 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* sp  !  "  #  $  %  &  ' */
    0, 3, 0, 0, 1, 3, 3, 0,
    /* (  )  *  +  ,  -  .  / */
    0, 0, 3, 3, 0, 3, 0, 3,
    /* 0  1  2  3  4  5  6  7 */
    1, 1, 1, 1, 1, 1, 1, 1,
    /* 8  9  :  ;  <  =  >  ? */
    1, 1, 0, 0, 3, 3, 3, 3,
    /* @  A  B  C  D  E  F  G */
    0, 1, 1, 1, 1, 1, 1, 1,
    /* H  I  J  K  L  M  N  O */
    1, 1, 1, 1, 1, 1, 1, 1,
    /* P  Q  R  S  T  U  V  W */
    1, 1, 1, 1, 1, 1, 1, 1,
    /* X  Y  Z  [  \  ]  ^  _ */
    1, 1, 1, 0, 0, 0, 3, 1,
    /* `  a  b  c  d  e  f  g */
    0, 1, 1, 1, 1, 1, 1, 1,
    /* h  i  j  k  l  m  n  o */
    1, 1, 1, 1, 1, 1, 1, 1,
    /* p  q  r  s  t  u  v  w */
    1, 1, 1, 1, 1, 1, 1, 1,
    /* x  y  z  {  |  }  ~  DEL */
    1, 1, 1, 0, 3, 0, 3, 0
};
113 
114 
115 
116 
117 int
118 lexi(void)
119 {
120     int         unary_delim;	/* this is set to 1 if the current token
121 				 * forces a following operator to be unary */
122     static int  last_code;	/* the last token type returned */
123     static int  l_struct;	/* set to 1 if the last token was 'struct' */
124     int         code;		/* internal code to be returned */
125     char        qchar;		/* the delimiter character for a string */
126     int		i;
127 
128     e_token = s_token;		/* point to start of place to save token */
129     unary_delim = false;
130     ps.col_1 = ps.last_nl;	/* tell world that this token started in
131 				 * column 1 iff the last thing scanned was nl */
132     ps.last_nl = false;
133 
134     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
135 	ps.col_1 = false;	/* leading blanks imply token is not in column
136 				 * 1 */
137 	if (++buf_ptr >= buf_end)
138 	    fill_buffer();
139     }
140 
141     /* Scan an alphanumeric token */
142     if (chartype[(int)*buf_ptr] == alphanum ||
143 	(buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
144 	/*
145 	 * we have a character or number
146 	 */
147 	char *j;	/* used for searching thru list of
148 			 * reserved words */
149 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
150 	    int         seendot = 0,
151 	                seenexp = 0,
152 			seensfx = 0;
153 	    if (*buf_ptr == '0' &&
154 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
155 		*e_token++ = *buf_ptr++;
156 		*e_token++ = *buf_ptr++;
157 		while (isxdigit(*buf_ptr)) {
158 		    CHECK_SIZE_TOKEN;
159 		    *e_token++ = *buf_ptr++;
160 		}
161 	    }
162 	    else
163 		while (1) {
164 		    if (*buf_ptr == '.') {
165 			if (seendot)
166 			    break;
167 			else
168 			    seendot++;
169 		    }
170 		    CHECK_SIZE_TOKEN;
171 		    *e_token++ = *buf_ptr++;
172 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
173 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
174 			    break;
175 			else {
176 			    seenexp++;
177 			    seendot++;
178 			    CHECK_SIZE_TOKEN;
179 			    *e_token++ = *buf_ptr++;
180 			    if (*buf_ptr == '+' || *buf_ptr == '-')
181 				*e_token++ = *buf_ptr++;
182 			}
183 		    }
184 		}
185 	    while (1) {
186 		if (!(seensfx & 1) &&
187 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
188 		    CHECK_SIZE_TOKEN;
189 		    *e_token++ = *buf_ptr++;
190 		    seensfx |= 1;
191 		    continue;
192 		}
193         	if (!(seensfx & 2) &&
194 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
195 		    CHECK_SIZE_TOKEN;
196 		    if (buf_ptr[1] == buf_ptr[0])
197 		        *e_token++ = *buf_ptr++;
198 		    *e_token++ = *buf_ptr++;
199 		    seensfx |= 2;
200 		    continue;
201 		}
202 		break;
203 	    }
204 	}
205 	else
206 	    while (chartype[(int)*buf_ptr] == alphanum) {	/* copy it over */
207 		CHECK_SIZE_TOKEN;
208 		*e_token++ = *buf_ptr++;
209 		if (buf_ptr >= buf_end)
210 		    fill_buffer();
211 	    }
212 	*e_token++ = '\0';
213 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
214 	    if (++buf_ptr >= buf_end)
215 		fill_buffer();
216 	}
217 	ps.its_a_keyword = false;
218 	ps.sizeof_keyword = false;
219 	if (l_struct) {		/* if last token was 'struct', then this token
220 				 * should be treated as a declaration */
221 	    l_struct = false;
222 	    last_code = ident;
223 	    ps.last_u_d = true;
224 	    return (decl);
225 	}
226 	ps.last_u_d = false;	/* Operator after identifier is binary */
227 	last_code = ident;	/* Remember that this is the code we will
228 				 * return */
229 
230 	/*
231 	 * This loop will check if the token is a keyword.
232 	 */
233 	for (i = 0; i < nspecials; i++) {
234 	    char *p = s_token;	/* point at scanned token */
235 	    j = specials[i].rwd;
236 	    if (*j++ != *p++ || *j++ != *p++)
237 		continue;	/* This test depends on the fact that
238 				 * identifiers are always at least 1 character
239 				 * long (ie. the first two bytes of the
240 				 * identifier are always meaningful) */
241 	    if (p[-1] == 0)
242 		break;		/* If its a one-character identifier */
243 	    while (*p++ == *j)
244 		if (*j++ == 0)
245 		    goto found_keyword;	/* I wish that C had a multi-level
246 					 * break... */
247 	}
248 	if (i < nspecials) {		/* we have a keyword */
249     found_keyword:
250 	    ps.its_a_keyword = true;
251 	    ps.last_u_d = true;
252 	    switch (specials[i].rwcode) {
253 	    case 1:		/* it is a switch */
254 		return (swstmt);
255 	    case 2:		/* a case or default */
256 		return (casestmt);
257 
258 	    case 3:		/* a "struct" */
259 		if (ps.p_l_follow)
260 		    break;	/* inside parens: cast */
261 		l_struct = true;
262 
263 		/*
264 		 * Next time around, we will want to know that we have had a
265 		 * 'struct'
266 		 */
267 	    case 4:		/* one of the declaration keywords */
268 		if (ps.p_l_follow) {
269 		    ps.cast_mask |= 1 << ps.p_l_follow;
270 		    break;	/* inside parens: cast */
271 		}
272 		last_code = decl;
273 		return (decl);
274 
275 	    case 5:		/* if, while, for */
276 		return (sp_paren);
277 
278 	    case 6:		/* do, else */
279 		return (sp_nparen);
280 
281 	    case 7:
282 		ps.sizeof_keyword = true;
283 	    default:		/* all others are treated like any other
284 				 * identifier */
285 		return (ident);
286 	    }			/* end of switch */
287 	}			/* end of if (found_it) */
288 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
289 	    char *tp = buf_ptr;
290 	    while (tp < buf_end)
291 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
292 		    goto not_proc;
293 	    strlcpy(ps.procname, token, sizeof ps.procname);
294 	    ps.in_parameter_declaration = 1;
295 	    rparen_count = 1;
296     not_proc:;
297 	}
298 	/*
299 	 * The following hack attempts to guess whether or not the current
300 	 * token is in fact a declaration keyword -- one that has been
301 	 * typedefd
302 	 */
303 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
304 		&& !ps.p_l_follow
305 	        && !ps.block_init
306 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
307 		    ps.last_token == decl ||
308 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
309 	    ps.its_a_keyword = true;
310 	    ps.last_u_d = true;
311 	    last_code = decl;
312 	    return decl;
313 	}
314 	if (last_code == decl)	/* if this is a declared variable, then
315 				 * following sign is unary */
316 	    ps.last_u_d = true;	/* will make "int a -1" work */
317 	last_code = ident;
318 	return (ident);		/* the ident is not in the list */
319     }				/* end of procesing for alpanum character */
320 
321     /* Scan a non-alphanumeric token */
322 
323     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
324 				 * moved here */
325     *e_token = '\0';
326     if (++buf_ptr >= buf_end)
327 	fill_buffer();
328 
329     switch (*token) {
330     case '\n':
331 	unary_delim = ps.last_u_d;
332 	ps.last_nl = true;	/* remember that we just had a newline */
333 	code = (had_eof ? 0 : newline);
334 
335 	/*
336 	 * if data has been exausted, the newline is a dummy, and we should
337 	 * return code to stop
338 	 */
339 	break;
340 
341     case '\'':			/* start of quoted character */
342     case '"':			/* start of string */
343 	qchar = *token;
344 	if (troff) {
345 	    e_token[-1] = '`';
346 	    if (qchar == '"')
347 		*e_token++ = '`';
348 	    e_token = chfont(&bodyf, &stringf, e_token);
349 	}
350 	do {			/* copy the string */
351 	    while (1) {		/* move one character or [/<char>]<char> */
352 		if (*buf_ptr == '\n') {
353 		    printf("%d: Unterminated literal\n", line_no);
354 		    goto stop_lit;
355 		}
356 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
357 					 * since CHECK_SIZE guarantees that there
358 					 * are at least 5 entries left */
359 		*e_token = *buf_ptr++;
360 		if (buf_ptr >= buf_end)
361 		    fill_buffer();
362 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
363 		    if (*buf_ptr == '\n')	/* check for escaped newline */
364 			++line_no;
365 		    if (troff) {
366 			*++e_token = BACKSLASH;
367 			if (*buf_ptr == BACKSLASH)
368 			    *++e_token = BACKSLASH;
369 		    }
370 		    *++e_token = *buf_ptr++;
371 		    ++e_token;	/* we must increment this again because we
372 				 * copied two chars */
373 		    if (buf_ptr >= buf_end)
374 			fill_buffer();
375 		}
376 		else
377 		    break;	/* we copied one character */
378 	    }			/* end of while (1) */
379 	} while (*e_token++ != qchar);
380 	if (troff) {
381 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
382 	    if (qchar == '"')
383 		*e_token++ = '\'';
384 	}
385 stop_lit:
386 	code = ident;
387 	break;
388 
389     case ('('):
390     case ('['):
391 	unary_delim = true;
392 	code = lparen;
393 	break;
394 
395     case (')'):
396     case (']'):
397 	code = rparen;
398 	break;
399 
400     case '#':
401 	unary_delim = ps.last_u_d;
402 	code = preesc;
403 	break;
404 
405     case '?':
406 	unary_delim = true;
407 	code = question;
408 	break;
409 
410     case (':'):
411 	code = colon;
412 	unary_delim = true;
413 	break;
414 
415     case (';'):
416 	unary_delim = true;
417 	code = semicolon;
418 	break;
419 
420     case ('{'):
421 	unary_delim = true;
422 
423 	/*
424 	 * if (ps.in_or_st) ps.block_init = 1;
425 	 */
426 	/* ?	code = ps.block_init ? lparen : lbrace; */
427 	code = lbrace;
428 	break;
429 
430     case ('}'):
431 	unary_delim = true;
432 	/* ?	code = ps.block_init ? rparen : rbrace; */
433 	code = rbrace;
434 	break;
435 
436     case 014:			/* a form feed */
437 	unary_delim = ps.last_u_d;
438 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
439 				 * right */
440 	code = form_feed;
441 	break;
442 
443     case (','):
444 	unary_delim = true;
445 	code = comma;
446 	break;
447 
448     case '.':
449 	unary_delim = false;
450 	code = period;
451 	break;
452 
453     case '-':
454     case '+':			/* check for -, +, --, ++ */
455 	code = (ps.last_u_d ? unary_op : binary_op);
456 	unary_delim = true;
457 
458 	if (*buf_ptr == token[0]) {
459 	    /* check for doubled character */
460 	    *e_token++ = *buf_ptr++;
461 	    /* buffer overflow will be checked at end of loop */
462 	    if (last_code == ident || last_code == rparen) {
463 		code = (ps.last_u_d ? unary_op : postop);
464 		/* check for following ++ or -- */
465 		unary_delim = false;
466 	    }
467 	}
468 	else if (*buf_ptr == '=')
469 	    /* check for operator += */
470 	    *e_token++ = *buf_ptr++;
471 	else if (*buf_ptr == '>') {
472 	    /* check for operator -> */
473 	    *e_token++ = *buf_ptr++;
474 	    if (!pointer_as_binop) {
475 		unary_delim = false;
476 		code = unary_op;
477 		ps.want_blank = false;
478 	    }
479 	}
480 	break;			/* buffer overflow will be checked at end of
481 				 * switch */
482 
483     case '=':
484 	if (ps.in_or_st)
485 	    ps.block_init = 1;
486 #ifdef undef
487 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
488 	    e_token[-1] = *buf_ptr++;
489 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
490 		*e_token++ = *buf_ptr++;
491 	    *e_token++ = '=';	/* Flip =+ to += */
492 	    *e_token = 0;
493 	}
494 #else
495 	if (*buf_ptr == '=') {/* == */
496 	    *e_token++ = '=';	/* Flip =+ to += */
497 	    buf_ptr++;
498 	    *e_token = 0;
499 	}
500 #endif
501 	code = binary_op;
502 	unary_delim = true;
503 	break;
504 	/* can drop thru!!! */
505 
506     case '>':
507     case '<':
508     case '!':			/* ops like <, <<, <=, !=, etc */
509 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
510 	    *e_token++ = *buf_ptr;
511 	    if (++buf_ptr >= buf_end)
512 		fill_buffer();
513 	}
514 	if (*buf_ptr == '=')
515 	    *e_token++ = *buf_ptr++;
516 	code = (ps.last_u_d ? unary_op : binary_op);
517 	unary_delim = true;
518 	break;
519 
520     default:
521 	if (token[0] == '/' && *buf_ptr == '*') {
522 	    /* it is start of comment */
523 	    *e_token++ = '*';
524 
525 	    if (++buf_ptr >= buf_end)
526 		fill_buffer();
527 
528 	    code = comment;
529 	    unary_delim = ps.last_u_d;
530 	    break;
531 	}
532 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
533 	    /*
534 	     * handle ||, &&, etc, and also things as in int *****i
535 	     */
536 	    *e_token++ = *buf_ptr;
537 	    if (++buf_ptr >= buf_end)
538 		fill_buffer();
539 	}
540 	code = (ps.last_u_d ? unary_op : binary_op);
541 	unary_delim = true;
542 
543 
544     }				/* end of switch */
545     if (code != newline) {
546 	l_struct = false;
547 	last_code = code;
548     }
549     if (buf_ptr >= buf_end)	/* check for input buffer empty */
550 	fill_buffer();
551     ps.last_u_d = unary_delim;
552     *e_token = '\0';		/* null terminate the token */
553     return (code);
554 }
555 
556 /*
557  * Add the given keyword to the keyword table, using val as the keyword type
558  */
559 void
560 addkey(char *key, int val)
561 {
562     struct templ *p;
563     int i;
564 
565     for (i = 0; i < nspecials; i++) {
566 	p = &specials[i];
567 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
568 	    return;
569     }
570 
571     if (specials == specialsinit) {
572 	/*
573 	 * Whoa. Must reallocate special table.
574 	 */
575 	nspecials = sizeof (specialsinit) / sizeof (specialsinit[0]);
576 	maxspecials = nspecials + (nspecials >> 2);
577 	specials = (struct templ *)calloc(maxspecials, sizeof specials[0]);
578 	if (specials == NULL)
579 	    err(1, NULL);
580 	memcpy(specials, specialsinit, sizeof specialsinit);
581     } else if (nspecials >= maxspecials) {
582 	int newspecials = maxspecials + (maxspecials >> 2);
583 	struct templ *specials2;
584 
585 	specials2 = realloc(specials, newspecials * sizeof specials[0]);
586 	if (specials2 == NULL)
587 	    err(1, NULL);
588 	specials = specials2;
589 	maxspecials = newspecials;
590     }
591 
592     p = &specials[nspecials];
593     p->rwd = key;
594     p->rwcode = val;
595     nspecials++;
596     return;
597 }
598