xref: /dragonfly/usr.bin/indent/lexi.c (revision 52f9f0d9)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  * @(#)lexi.c	8.1 (Berkeley) 6/6/93
36  * $FreeBSD: src/usr.bin/indent/lexi.c,v 1.21 2010/04/15 21:41:07 avg Exp $
37  */
38 
39 /*
40  * Here we have the token scanner for indent.  It scans off one token and puts
41  * it in the global variable "token".  It returns a code, indicating the type
42  * of token scanned.
43  */
44 
45 #include <err.h>
46 #include <stdio.h>
47 #include <ctype.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include "indent_globs.h"
51 #include "indent_codes.h"
52 #include "indent.h"
53 
/*
 * Character classes stored in chartype[].  An entry of 0 in that table
 * means the character belongs to neither class.
 */
enum {
    alphanum = 1,		/* letter, digit, '_' or '$' */
    opchar = 3			/* character that can be part of an operator */
};
56 
/*
 * One entry of the reserved-word table: the spelling of a word together with
 * the class code that lexi() switches on once the word has been recognised.
 */
struct templ {
    const char *rwd;		/* keyword text; NULL terminates the table */
    int         rwcode;		/* keyword class (see the list below) */
};

/*
 * Reserved-word table.  lexi() searches it linearly up to the first entry
 * whose rwd is NULL; addkey() appends to it at run time, which is why it is
 * sized well beyond the built-in entries.
 *
 * Class codes as handled by lexi():
 *   0  no special handling (returned as an ordinary identifier)
 *   1  switch
 *   2  case, default
 *   3  struct, union, enum (a declaration keyword that also arms l_struct)
 *   4  declaration keyword
 *   5  if, while, for
 *   6  else, do
 *   7  sizeof
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"const", 4},
    {"volatile", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
96 
/*
 * Lexical class of every 7-bit ASCII character, indexed by character code:
 * 1 for characters that may appear in identifiers and numbers, 3 for
 * characters that may appear in operators, 0 for everything else.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* space ! " # $ % & ' ( ) * + , - . slash */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* @ A B C D E F G H I J K L M N O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* P Q R S T U V W X Y Z [ backslash ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* ` a b c d e f g h i j k l m n o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* p q r s t u v w x y z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
118 
119 int
120 lexi(void)
121 {
122     int         unary_delim;	/* this is set to 1 if the current token
123 				 * forces a following operator to be unary */
124     static int  last_code;	/* the last token type returned */
125     static int  l_struct;	/* set to 1 if the last token was 'struct' */
126     int         code;		/* internal code to be returned */
127     char        qchar;		/* the delimiter character for a string */
128 
129     e_token = s_token;		/* point to start of place to save token */
130     unary_delim = false;
131     ps.col_1 = ps.last_nl;	/* tell world that this token started in
132 				 * column 1 iff the last thing scanned was nl */
133     ps.last_nl = false;
134 
135     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
136 	ps.col_1 = false;	/* leading blanks imply token is not in column
137 				 * 1 */
138 	if (++buf_ptr >= buf_end)
139 	    fill_buffer();
140     }
141 
142     /* Scan an alphanumeric token */
143     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
144 	/*
145 	 * we have a character or number
146 	 */
147 	const char *j;		/* used for searching thru list of
148 				 *
149 				 * reserved words */
150 	struct templ *p;
151 
152 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
153 	    int         seendot = 0,
154 	                seenexp = 0,
155 			seensfx = 0;
156 	    if (*buf_ptr == '0' &&
157 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
158 		*e_token++ = *buf_ptr++;
159 		*e_token++ = *buf_ptr++;
160 		while (isxdigit(*buf_ptr)) {
161 		    CHECK_SIZE_TOKEN;
162 		    *e_token++ = *buf_ptr++;
163 		}
164 	    }
165 	    else
166 		while (1) {
167 		    if (*buf_ptr == '.') {
168 			if (seendot)
169 			    break;
170 			else
171 			    seendot++;
172 		    }
173 		    CHECK_SIZE_TOKEN;
174 		    *e_token++ = *buf_ptr++;
175 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
176 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
177 			    break;
178 			else {
179 			    seenexp++;
180 			    seendot++;
181 			    CHECK_SIZE_TOKEN;
182 			    *e_token++ = *buf_ptr++;
183 			    if (*buf_ptr == '+' || *buf_ptr == '-')
184 				*e_token++ = *buf_ptr++;
185 			}
186 		    }
187 		}
188 	    while (1) {
189 		if (!(seensfx & 1) &&
190 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
191 		    CHECK_SIZE_TOKEN;
192 		    *e_token++ = *buf_ptr++;
193 		    seensfx |= 1;
194 		    continue;
195 		}
196         	if (!(seensfx & 2) &&
197 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
198 		    CHECK_SIZE_TOKEN;
199 		    if (buf_ptr[1] == buf_ptr[0])
200 		        *e_token++ = *buf_ptr++;
201 		    *e_token++ = *buf_ptr++;
202 		    seensfx |= 2;
203 		    continue;
204 		}
205 		break;
206 	    }
207 	}
208 	else
209 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
210 		/* fill_buffer() terminates buffer with newline */
211 		if (*buf_ptr == BACKSLASH) {
212 		    if (*(buf_ptr + 1) == '\n') {
213 			buf_ptr += 2;
214 			if (buf_ptr >= buf_end)
215 			    fill_buffer();
216 			} else
217 			    break;
218 		}
219 		CHECK_SIZE_TOKEN;
220 		/* copy it over */
221 		*e_token++ = *buf_ptr++;
222 		if (buf_ptr >= buf_end)
223 		    fill_buffer();
224 	    }
225 	*e_token++ = '\0';
226 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
227 	    if (++buf_ptr >= buf_end)
228 		fill_buffer();
229 	}
230 	ps.its_a_keyword = false;
231 	ps.sizeof_keyword = false;
232 	if (l_struct && !ps.p_l_follow) {
233 				/* if last token was 'struct' and we're not
234 				 * in parentheses, then this token
235 				 * should be treated as a declaration */
236 	    l_struct = false;
237 	    last_code = ident;
238 	    ps.last_u_d = true;
239 	    return (decl);
240 	}
241 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
242 				 * unless last token was 'struct' */
243 	l_struct = false;
244 	last_code = ident;	/* Remember that this is the code we will
245 				 * return */
246 
247 	if (auto_typedefs) {
248 	    const char *q = s_token;
249 	    size_t q_len = strlen(q);
250 	    /* Check if we have an "_t" in the end */
251 	    if (q_len > 2 &&
252 	        (strcmp(q + q_len - 2, "_t") == 0)) {
253 	        ps.its_a_keyword = true;
254 		ps.last_u_d = true;
255 	        goto found_auto_typedef;
256 	    }
257 	}
258 
259 	/*
260 	 * This loop will check if the token is a keyword.
261 	 */
262 	for (p = specials; (j = p->rwd) != NULL; p++) {
263 	    const char *q = s_token;	/* point at scanned token */
264 	    if (*j++ != *q++ || *j++ != *q++)
265 		continue;	/* This test depends on the fact that
266 				 * identifiers are always at least 1 character
267 				 * long (ie. the first two bytes of the
268 				 * identifier are always meaningful) */
269 	    if (q[-1] == 0)
270 		break;		/* If its a one-character identifier */
271 	    while (*q++ == *j)
272 		if (*j++ == 0)
273 		    goto found_keyword;	/* I wish that C had a multi-level
274 					 * break... */
275 	}
276 	if (p->rwd) {		/* we have a keyword */
277     found_keyword:
278 	    ps.its_a_keyword = true;
279 	    ps.last_u_d = true;
280 	    switch (p->rwcode) {
281 	    case 1:		/* it is a switch */
282 		return (swstmt);
283 	    case 2:		/* a case or default */
284 		return (casestmt);
285 
286 	    case 3:		/* a "struct" */
287 		/*
288 		 * Next time around, we will want to know that we have had a
289 		 * 'struct'
290 		 */
291 		l_struct = true;
292 		/* FALLTHROUGH */
293 
294 	    case 4:		/* one of the declaration keywords */
295 	    found_auto_typedef:
296 		if (ps.p_l_follow) {
297 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
298 		    break;	/* inside parens: cast, param list or sizeof */
299 		}
300 		last_code = decl;
301 		return (decl);
302 
303 	    case 5:		/* if, while, for */
304 		return (sp_paren);
305 
306 	    case 6:		/* do, else */
307 		return (sp_nparen);
308 
309 	    case 7:
310 		ps.sizeof_keyword = true;
311 	    default:		/* all others are treated like any other
312 				 * identifier */
313 		return (ident);
314 	    }			/* end of switch */
315 	}			/* end of if (found_it) */
316 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
317 	    char *tp = buf_ptr;
318 	    while (tp < buf_end)
319 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
320 		    goto not_proc;
321 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
322 	    ps.in_parameter_declaration = 1;
323 	    rparen_count = 1;
324     not_proc:;
325 	}
326 	/*
327 	 * The following hack attempts to guess whether or not the current
328 	 * token is in fact a declaration keyword -- one that has been
329 	 * typedefd
330 	 */
331 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
332 		&& !ps.p_l_follow
333 	        && !ps.block_init
334 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
335 		    ps.last_token == decl ||
336 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
337 	    ps.its_a_keyword = true;
338 	    ps.last_u_d = true;
339 	    last_code = decl;
340 	    return decl;
341 	}
342 	if (last_code == decl)	/* if this is a declared variable, then
343 				 * following sign is unary */
344 	    ps.last_u_d = true;	/* will make "int a -1" work */
345 	last_code = ident;
346 	return (ident);		/* the ident is not in the list */
347     }				/* end of procesing for alpanum character */
348 
349     /* Scan a non-alphanumeric token */
350 
351     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
352 				 * moved here */
353     *e_token = '\0';
354     if (++buf_ptr >= buf_end)
355 	fill_buffer();
356 
357     switch (*token) {
358     case '\n':
359 	unary_delim = ps.last_u_d;
360 	ps.last_nl = true;	/* remember that we just had a newline */
361 	code = (had_eof ? 0 : newline);
362 
363 	/*
364 	 * if data has been exhausted, the newline is a dummy, and we should
365 	 * return code to stop
366 	 */
367 	break;
368 
369     case '\'':			/* start of quoted character */
370     case '"':			/* start of string */
371 	qchar = *token;
372 	if (troff) {
373 	    e_token[-1] = '`';
374 	    if (qchar == '"')
375 		*e_token++ = '`';
376 	    e_token = chfont(&bodyf, &stringf, e_token);
377 	}
378 	do {			/* copy the string */
379 	    while (1) {		/* move one character or [/<char>]<char> */
380 		if (*buf_ptr == '\n') {
381 		    diag2(1, "Unterminated literal");
382 		    goto stop_lit;
383 		}
384 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
385 					 * since CHECK_SIZE guarantees that there
386 					 * are at least 5 entries left */
387 		*e_token = *buf_ptr++;
388 		if (buf_ptr >= buf_end)
389 		    fill_buffer();
390 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
391 		    if (*buf_ptr == '\n')	/* check for escaped newline */
392 			++line_no;
393 		    if (troff) {
394 			*++e_token = BACKSLASH;
395 			if (*buf_ptr == BACKSLASH)
396 			    *++e_token = BACKSLASH;
397 		    }
398 		    *++e_token = *buf_ptr++;
399 		    ++e_token;	/* we must increment this again because we
400 				 * copied two chars */
401 		    if (buf_ptr >= buf_end)
402 			fill_buffer();
403 		}
404 		else
405 		    break;	/* we copied one character */
406 	    }			/* end of while (1) */
407 	} while (*e_token++ != qchar);
408 	if (troff) {
409 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
410 	    if (qchar == '"')
411 		*e_token++ = '\'';
412 	}
413 stop_lit:
414 	code = ident;
415 	break;
416 
417     case ('('):
418     case ('['):
419 	unary_delim = true;
420 	code = lparen;
421 	break;
422 
423     case (')'):
424     case (']'):
425 	code = rparen;
426 	break;
427 
428     case '#':
429 	unary_delim = ps.last_u_d;
430 	code = preesc;
431 	break;
432 
433     case '?':
434 	unary_delim = true;
435 	code = question;
436 	break;
437 
438     case (':'):
439 	code = colon;
440 	unary_delim = true;
441 	break;
442 
443     case (';'):
444 	unary_delim = true;
445 	code = semicolon;
446 	break;
447 
448     case ('{'):
449 	unary_delim = true;
450 
451 	/*
452 	 * if (ps.in_or_st) ps.block_init = 1;
453 	 */
454 	/* ?	code = ps.block_init ? lparen : lbrace; */
455 	code = lbrace;
456 	break;
457 
458     case ('}'):
459 	unary_delim = true;
460 	/* ?	code = ps.block_init ? rparen : rbrace; */
461 	code = rbrace;
462 	break;
463 
464     case 014:			/* a form feed */
465 	unary_delim = ps.last_u_d;
466 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
467 				 * right */
468 	code = form_feed;
469 	break;
470 
471     case (','):
472 	unary_delim = true;
473 	code = comma;
474 	break;
475 
476     case '.':
477 	unary_delim = false;
478 	code = period;
479 	break;
480 
481     case '-':
482     case '+':			/* check for -, +, --, ++ */
483 	code = (ps.last_u_d ? unary_op : binary_op);
484 	unary_delim = true;
485 
486 	if (*buf_ptr == token[0]) {
487 	    /* check for doubled character */
488 	    *e_token++ = *buf_ptr++;
489 	    /* buffer overflow will be checked at end of loop */
490 	    if (last_code == ident || last_code == rparen) {
491 		code = (ps.last_u_d ? unary_op : postop);
492 		/* check for following ++ or -- */
493 		unary_delim = false;
494 	    }
495 	}
496 	else if (*buf_ptr == '=')
497 	    /* check for operator += */
498 	    *e_token++ = *buf_ptr++;
499 	else if (*buf_ptr == '>') {
500 	    /* check for operator -> */
501 	    *e_token++ = *buf_ptr++;
502 	    if (!pointer_as_binop) {
503 		unary_delim = false;
504 		code = unary_op;
505 		ps.want_blank = false;
506 	    }
507 	}
508 	break;			/* buffer overflow will be checked at end of
509 				 * switch */
510 
511     case '=':
512 	if (ps.in_or_st)
513 	    ps.block_init = 1;
514 #ifdef undef
515 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
516 	    e_token[-1] = *buf_ptr++;
517 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
518 		*e_token++ = *buf_ptr++;
519 	    *e_token++ = '=';	/* Flip =+ to += */
520 	    *e_token = 0;
521 	}
522 #else
523 	if (*buf_ptr == '=') {/* == */
524 	    *e_token++ = '=';	/* Flip =+ to += */
525 	    buf_ptr++;
526 	    *e_token = 0;
527 	}
528 #endif
529 	code = binary_op;
530 	unary_delim = true;
531 	break;
532 	/* can drop thru!!! */
533 
534     case '>':
535     case '<':
536     case '!':			/* ops like <, <<, <=, !=, etc */
537 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
538 	    *e_token++ = *buf_ptr;
539 	    if (++buf_ptr >= buf_end)
540 		fill_buffer();
541 	}
542 	if (*buf_ptr == '=')
543 	    *e_token++ = *buf_ptr++;
544 	code = (ps.last_u_d ? unary_op : binary_op);
545 	unary_delim = true;
546 	break;
547 
548     default:
549 	if (token[0] == '/' && *buf_ptr == '*') {
550 	    /* it is start of comment */
551 	    *e_token++ = '*';
552 
553 	    if (++buf_ptr >= buf_end)
554 		fill_buffer();
555 
556 	    code = comment;
557 	    unary_delim = ps.last_u_d;
558 	    break;
559 	}
560 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
561 	    /*
562 	     * handle ||, &&, etc, and also things as in int *****i
563 	     */
564 	    *e_token++ = *buf_ptr;
565 	    if (++buf_ptr >= buf_end)
566 		fill_buffer();
567 	}
568 	code = (ps.last_u_d ? unary_op : binary_op);
569 	unary_delim = true;
570 
571 
572     }				/* end of switch */
573     if (code != newline) {
574 	l_struct = false;
575 	last_code = code;
576     }
577     if (buf_ptr >= buf_end)	/* check for input buffer empty */
578 	fill_buffer();
579     ps.last_u_d = unary_delim;
580     *e_token = '\0';		/* null terminate the token */
581     return (code);
582 }
583 
584 /*
585  * Add the given keyword to the keyword table, using val as the keyword type
586  */
587 void
588 addkey(char *key, int val)
589 {
590     struct templ *p = specials;
591     while (p->rwd)
592 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
593 	    return;
594 	else
595 	    p++;
596     if (p >= specials + sizeof specials / sizeof specials[0])
597 	return;			/* For now, table overflows are silently
598 				 * ignored */
599     p->rwd = key;
600     p->rwcode = val;
601     p[1].rwd = NULL;
602     p[1].rwcode = 0;
603 }
604