xref: /dragonfly/usr.bin/indent/lexi.c (revision 73610d44)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  * @(#)lexi.c	8.1 (Berkeley) 6/6/93
32  * $FreeBSD: src/usr.bin/indent/lexi.c,v 1.21 2010/04/15 21:41:07 avg Exp $
33  */
34 
35 /*
36  * Here we have the token scanner for indent.  It scans off one token and puts
37  * it in the global variable "token".  It returns a code, indicating the type
38  * of token scanned.
39  */
40 
41 #include <err.h>
42 #include <stdio.h>
43 #include <ctype.h>
44 #include <stdlib.h>
45 #include <string.h>
46 #include "indent_globs.h"
47 #include "indent_codes.h"
48 #include "indent.h"
49 
/*
 * Character classes stored in the chartype table.  A table entry of 0 means
 * the character belongs to neither class.
 */
enum {
    alphanum = 1,		/* character can be part of an identifier or
				 * number */
    opchar = 3			/* character can be part of an operator */
};
52 
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() switches on when it recognizes the word.
 */
struct templ {
    const char *rwd;		/* the reserved word; NULL marks the end of
				 * the table */
    int         rwcode;		/* class of the word, see below */
};

/*
 * Table of reserved words, terminated by an entry whose rwd is NULL.  The
 * table is oversized so that addkey() can append more words at run time.
 *
 * The rwcode values are interpreted by lexi() as follows:
 *	0	no special treatment (returned as a plain identifier)
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords (types, storage classes, qualifiers)
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so the real
				 * keyword never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"const", 4},
    {"volatile", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {NULL, 0}
};
92 
/*
 * Per-character classification for the 128 ASCII codes, indexed by character
 * value: 1 marks characters that may appear in identifiers and numbers
 * (alphanum), 3 marks characters that may appear in operators (opchar), and
 * 0 marks everything else.
 */
char        chartype[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00 - 0x07: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x08 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10 - 0x17: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x18 - 0x1f: control characters */
    0, 3, 0, 0, 1, 3, 3, 0,	/* 0x20 - 0x27: space ! " # $ % & ' */
    0, 0, 3, 3, 0, 3, 0, 3,	/* 0x28 - 0x2f: ( ) * + , - . / */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x30 - 0x37: 0 1 2 3 4 5 6 7 */
    1, 1, 0, 0, 3, 3, 3, 3,	/* 0x38 - 0x3f: 8 9 : ; < = > ? */
    0, 1, 1, 1, 1, 1, 1, 1,	/* 0x40 - 0x47: @ A B C D E F G */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x48 - 0x4f: H I J K L M N O */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x50 - 0x57: P Q R S T U V W */
    1, 1, 1, 0, 0, 0, 3, 1,	/* 0x58 - 0x5f: X Y Z [ \ ] ^ _ */
    0, 1, 1, 1, 1, 1, 1, 1,	/* 0x60 - 0x67: ` a b c d e f g */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x68 - 0x6f: h i j k l m n o */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x70 - 0x77: p q r s t u v w */
    1, 1, 1, 0, 3, 0, 3, 0	/* 0x78 - 0x7f: x y z { | } ~ DEL */
};
114 
115 int
116 lexi(void)
117 {
118     int         unary_delim;	/* this is set to 1 if the current token
119 				 * forces a following operator to be unary */
120     static int  last_code;	/* the last token type returned */
121     static int  l_struct;	/* set to 1 if the last token was 'struct' */
122     int         code;		/* internal code to be returned */
123     char        qchar;		/* the delimiter character for a string */
124 
125     e_token = s_token;		/* point to start of place to save token */
126     unary_delim = false;
127     ps.col_1 = ps.last_nl;	/* tell world that this token started in
128 				 * column 1 iff the last thing scanned was nl */
129     ps.last_nl = false;
130 
131     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
132 	ps.col_1 = false;	/* leading blanks imply token is not in column
133 				 * 1 */
134 	if (++buf_ptr >= buf_end)
135 	    fill_buffer();
136     }
137 
138     /* Scan an alphanumeric token */
139     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
140 	/*
141 	 * we have a character or number
142 	 */
143 	const char *j;		/* used for searching thru list of
144 				 *
145 				 * reserved words */
146 	struct templ *p;
147 
148 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
149 	    int         seendot = 0,
150 	                seenexp = 0,
151 			seensfx = 0;
152 	    if (*buf_ptr == '0' &&
153 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
154 		*e_token++ = *buf_ptr++;
155 		*e_token++ = *buf_ptr++;
156 		while (isxdigit(*buf_ptr)) {
157 		    CHECK_SIZE_TOKEN;
158 		    *e_token++ = *buf_ptr++;
159 		}
160 	    }
161 	    else
162 		while (1) {
163 		    if (*buf_ptr == '.') {
164 			if (seendot)
165 			    break;
166 			else
167 			    seendot++;
168 		    }
169 		    CHECK_SIZE_TOKEN;
170 		    *e_token++ = *buf_ptr++;
171 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
172 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
173 			    break;
174 			else {
175 			    seenexp++;
176 			    seendot++;
177 			    CHECK_SIZE_TOKEN;
178 			    *e_token++ = *buf_ptr++;
179 			    if (*buf_ptr == '+' || *buf_ptr == '-')
180 				*e_token++ = *buf_ptr++;
181 			}
182 		    }
183 		}
184 	    while (1) {
185 		if (!(seensfx & 1) &&
186 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
187 		    CHECK_SIZE_TOKEN;
188 		    *e_token++ = *buf_ptr++;
189 		    seensfx |= 1;
190 		    continue;
191 		}
192         	if (!(seensfx & 2) &&
193 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
194 		    CHECK_SIZE_TOKEN;
195 		    if (buf_ptr[1] == buf_ptr[0])
196 		        *e_token++ = *buf_ptr++;
197 		    *e_token++ = *buf_ptr++;
198 		    seensfx |= 2;
199 		    continue;
200 		}
201 		break;
202 	    }
203 	}
204 	else
205 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
206 		/* fill_buffer() terminates buffer with newline */
207 		if (*buf_ptr == BACKSLASH) {
208 		    if (*(buf_ptr + 1) == '\n') {
209 			buf_ptr += 2;
210 			if (buf_ptr >= buf_end)
211 			    fill_buffer();
212 			} else
213 			    break;
214 		}
215 		CHECK_SIZE_TOKEN;
216 		/* copy it over */
217 		*e_token++ = *buf_ptr++;
218 		if (buf_ptr >= buf_end)
219 		    fill_buffer();
220 	    }
221 	*e_token++ = '\0';
222 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
223 	    if (++buf_ptr >= buf_end)
224 		fill_buffer();
225 	}
226 	ps.its_a_keyword = false;
227 	ps.sizeof_keyword = false;
228 	if (l_struct && !ps.p_l_follow) {
229 				/* if last token was 'struct' and we're not
230 				 * in parentheses, then this token
231 				 * should be treated as a declaration */
232 	    l_struct = false;
233 	    last_code = ident;
234 	    ps.last_u_d = true;
235 	    return (decl);
236 	}
237 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
238 				 * unless last token was 'struct' */
239 	l_struct = false;
240 	last_code = ident;	/* Remember that this is the code we will
241 				 * return */
242 
243 	if (auto_typedefs) {
244 	    const char *q = s_token;
245 	    size_t q_len = strlen(q);
246 	    /* Check if we have an "_t" in the end */
247 	    if (q_len > 2 &&
248 	        (strcmp(q + q_len - 2, "_t") == 0)) {
249 	        ps.its_a_keyword = true;
250 		ps.last_u_d = true;
251 	        goto found_auto_typedef;
252 	    }
253 	}
254 
255 	/*
256 	 * This loop will check if the token is a keyword.
257 	 */
258 	for (p = specials; (j = p->rwd) != NULL; p++) {
259 	    const char *q = s_token;	/* point at scanned token */
260 	    if (*j++ != *q++ || *j++ != *q++)
261 		continue;	/* This test depends on the fact that
262 				 * identifiers are always at least 1 character
263 				 * long (ie. the first two bytes of the
264 				 * identifier are always meaningful) */
265 	    if (q[-1] == 0)
266 		break;		/* If its a one-character identifier */
267 	    while (*q++ == *j)
268 		if (*j++ == 0)
269 		    goto found_keyword;	/* I wish that C had a multi-level
270 					 * break... */
271 	}
272 	if (p->rwd) {		/* we have a keyword */
273     found_keyword:
274 	    ps.its_a_keyword = true;
275 	    ps.last_u_d = true;
276 	    switch (p->rwcode) {
277 	    case 1:		/* it is a switch */
278 		return (swstmt);
279 	    case 2:		/* a case or default */
280 		return (casestmt);
281 
282 	    case 3:		/* a "struct" */
283 		/*
284 		 * Next time around, we will want to know that we have had a
285 		 * 'struct'
286 		 */
287 		l_struct = true;
288 		/* FALLTHROUGH */
289 
290 	    case 4:		/* one of the declaration keywords */
291 	    found_auto_typedef:
292 		if (ps.p_l_follow) {
293 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
294 		    break;	/* inside parens: cast, param list or sizeof */
295 		}
296 		last_code = decl;
297 		return (decl);
298 
299 	    case 5:		/* if, while, for */
300 		return (sp_paren);
301 
302 	    case 6:		/* do, else */
303 		return (sp_nparen);
304 
305 	    case 7:
306 		ps.sizeof_keyword = true;
307 	    default:		/* all others are treated like any other
308 				 * identifier */
309 		return (ident);
310 	    }			/* end of switch */
311 	}			/* end of if (found_it) */
312 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
313 	    char *tp = buf_ptr;
314 	    while (tp < buf_end)
315 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
316 		    goto not_proc;
317 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
318 	    ps.in_parameter_declaration = 1;
319 	    rparen_count = 1;
320     not_proc:;
321 	}
322 	/*
323 	 * The following hack attempts to guess whether or not the current
324 	 * token is in fact a declaration keyword -- one that has been
325 	 * typedefd
326 	 */
327 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
328 		&& !ps.p_l_follow
329 	        && !ps.block_init
330 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
331 		    ps.last_token == decl ||
332 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
333 	    ps.its_a_keyword = true;
334 	    ps.last_u_d = true;
335 	    last_code = decl;
336 	    return decl;
337 	}
338 	if (last_code == decl)	/* if this is a declared variable, then
339 				 * following sign is unary */
340 	    ps.last_u_d = true;	/* will make "int a -1" work */
341 	last_code = ident;
342 	return (ident);		/* the ident is not in the list */
343     }				/* end of procesing for alpanum character */
344 
345     /* Scan a non-alphanumeric token */
346 
347     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
348 				 * moved here */
349     *e_token = '\0';
350     if (++buf_ptr >= buf_end)
351 	fill_buffer();
352 
353     switch (*token) {
354     case '\n':
355 	unary_delim = ps.last_u_d;
356 	ps.last_nl = true;	/* remember that we just had a newline */
357 	code = (had_eof ? 0 : newline);
358 
359 	/*
360 	 * if data has been exhausted, the newline is a dummy, and we should
361 	 * return code to stop
362 	 */
363 	break;
364 
365     case '\'':			/* start of quoted character */
366     case '"':			/* start of string */
367 	qchar = *token;
368 	if (troff) {
369 	    e_token[-1] = '`';
370 	    if (qchar == '"')
371 		*e_token++ = '`';
372 	    e_token = chfont(&bodyf, &stringf, e_token);
373 	}
374 	do {			/* copy the string */
375 	    while (1) {		/* move one character or [/<char>]<char> */
376 		if (*buf_ptr == '\n') {
377 		    diag2(1, "Unterminated literal");
378 		    goto stop_lit;
379 		}
380 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
381 					 * since CHECK_SIZE guarantees that there
382 					 * are at least 5 entries left */
383 		*e_token = *buf_ptr++;
384 		if (buf_ptr >= buf_end)
385 		    fill_buffer();
386 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
387 		    if (*buf_ptr == '\n')	/* check for escaped newline */
388 			++line_no;
389 		    if (troff) {
390 			*++e_token = BACKSLASH;
391 			if (*buf_ptr == BACKSLASH)
392 			    *++e_token = BACKSLASH;
393 		    }
394 		    *++e_token = *buf_ptr++;
395 		    ++e_token;	/* we must increment this again because we
396 				 * copied two chars */
397 		    if (buf_ptr >= buf_end)
398 			fill_buffer();
399 		}
400 		else
401 		    break;	/* we copied one character */
402 	    }			/* end of while (1) */
403 	} while (*e_token++ != qchar);
404 	if (troff) {
405 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
406 	    if (qchar == '"')
407 		*e_token++ = '\'';
408 	}
409 stop_lit:
410 	code = ident;
411 	break;
412 
413     case ('('):
414     case ('['):
415 	unary_delim = true;
416 	code = lparen;
417 	break;
418 
419     case (')'):
420     case (']'):
421 	code = rparen;
422 	break;
423 
424     case '#':
425 	unary_delim = ps.last_u_d;
426 	code = preesc;
427 	break;
428 
429     case '?':
430 	unary_delim = true;
431 	code = question;
432 	break;
433 
434     case (':'):
435 	code = colon;
436 	unary_delim = true;
437 	break;
438 
439     case (';'):
440 	unary_delim = true;
441 	code = semicolon;
442 	break;
443 
444     case ('{'):
445 	unary_delim = true;
446 
447 	/*
448 	 * if (ps.in_or_st) ps.block_init = 1;
449 	 */
450 	/* ?	code = ps.block_init ? lparen : lbrace; */
451 	code = lbrace;
452 	break;
453 
454     case ('}'):
455 	unary_delim = true;
456 	/* ?	code = ps.block_init ? rparen : rbrace; */
457 	code = rbrace;
458 	break;
459 
460     case 014:			/* a form feed */
461 	unary_delim = ps.last_u_d;
462 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
463 				 * right */
464 	code = form_feed;
465 	break;
466 
467     case (','):
468 	unary_delim = true;
469 	code = comma;
470 	break;
471 
472     case '.':
473 	unary_delim = false;
474 	code = period;
475 	break;
476 
477     case '-':
478     case '+':			/* check for -, +, --, ++ */
479 	code = (ps.last_u_d ? unary_op : binary_op);
480 	unary_delim = true;
481 
482 	if (*buf_ptr == token[0]) {
483 	    /* check for doubled character */
484 	    *e_token++ = *buf_ptr++;
485 	    /* buffer overflow will be checked at end of loop */
486 	    if (last_code == ident || last_code == rparen) {
487 		code = (ps.last_u_d ? unary_op : postop);
488 		/* check for following ++ or -- */
489 		unary_delim = false;
490 	    }
491 	}
492 	else if (*buf_ptr == '=')
493 	    /* check for operator += */
494 	    *e_token++ = *buf_ptr++;
495 	else if (*buf_ptr == '>') {
496 	    /* check for operator -> */
497 	    *e_token++ = *buf_ptr++;
498 	    if (!pointer_as_binop) {
499 		unary_delim = false;
500 		code = unary_op;
501 		ps.want_blank = false;
502 	    }
503 	}
504 	break;			/* buffer overflow will be checked at end of
505 				 * switch */
506 
507     case '=':
508 	if (ps.in_or_st)
509 	    ps.block_init = 1;
510 #ifdef undef
511 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
512 	    e_token[-1] = *buf_ptr++;
513 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
514 		*e_token++ = *buf_ptr++;
515 	    *e_token++ = '=';	/* Flip =+ to += */
516 	    *e_token = 0;
517 	}
518 #else
519 	if (*buf_ptr == '=') {/* == */
520 	    *e_token++ = '=';	/* Flip =+ to += */
521 	    buf_ptr++;
522 	    *e_token = 0;
523 	}
524 #endif
525 	code = binary_op;
526 	unary_delim = true;
527 	break;
528 	/* can drop thru!!! */
529 
530     case '>':
531     case '<':
532     case '!':			/* ops like <, <<, <=, !=, etc */
533 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
534 	    *e_token++ = *buf_ptr;
535 	    if (++buf_ptr >= buf_end)
536 		fill_buffer();
537 	}
538 	if (*buf_ptr == '=')
539 	    *e_token++ = *buf_ptr++;
540 	code = (ps.last_u_d ? unary_op : binary_op);
541 	unary_delim = true;
542 	break;
543 
544     default:
545 	if (token[0] == '/' && *buf_ptr == '*') {
546 	    /* it is start of comment */
547 	    *e_token++ = '*';
548 
549 	    if (++buf_ptr >= buf_end)
550 		fill_buffer();
551 
552 	    code = comment;
553 	    unary_delim = ps.last_u_d;
554 	    break;
555 	}
556 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
557 	    /*
558 	     * handle ||, &&, etc, and also things as in int *****i
559 	     */
560 	    *e_token++ = *buf_ptr;
561 	    if (++buf_ptr >= buf_end)
562 		fill_buffer();
563 	}
564 	code = (ps.last_u_d ? unary_op : binary_op);
565 	unary_delim = true;
566 
567 
568     }				/* end of switch */
569     if (code != newline) {
570 	l_struct = false;
571 	last_code = code;
572     }
573     if (buf_ptr >= buf_end)	/* check for input buffer empty */
574 	fill_buffer();
575     ps.last_u_d = unary_delim;
576     *e_token = '\0';		/* null terminate the token */
577     return (code);
578 }
579 
580 /*
581  * Add the given keyword to the keyword table, using val as the keyword type
582  */
583 void
584 addkey(char *key, int val)
585 {
586     struct templ *p = specials;
587     while (p->rwd)
588 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
589 	    return;
590 	else
591 	    p++;
592     if (p >= specials + sizeof specials / sizeof specials[0])
593 	return;			/* For now, table overflows are silently
594 				 * ignored */
595     p->rwd = key;
596     p->rwcode = val;
597     p[1].rwd = NULL;
598     p[1].rwcode = 0;
599 }
600