xref: /dragonfly/usr.bin/indent/lexi.c (revision 7bcb6caf)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  * @(#)lexi.c	8.1 (Berkeley) 6/6/93
32  * $FreeBSD: src/usr.bin/indent/lexi.c,v 1.21 2010/04/15 21:41:07 avg Exp $
33  */
34 
35 /*
36  * Here we have the token scanner for indent.  It scans off one token and puts
37  * it in the global variable "token".  It returns a code, indicating the type
38  * of token scanned.
39  */
40 
41 #include <err.h>
42 #include <stdio.h>
43 #include <ctype.h>
44 #include <stdlib.h>
45 #include <string.h>
46 #include "indent_globs.h"
47 #include "indent_codes.h"
48 #include "indent.h"
49 
/*
 * Character classes stored in chartype[] and tested by the scanner:
 * alphanum marks characters that can be part of an identifier or number,
 * opchar marks operator characters.  Any other character has class 0.
 */
enum {
    alphanum = 1,
    opchar = 3
};
52 
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() dispatches on once the word has been recognised.
 */
struct templ {
    const char *rwd;		/* spelling; NULL marks the end of the table */
    int         rwcode;		/* keyword class, see below */
};

/*
 * Reserved-word table searched by lexi() and extended at run time by
 * addkey().  The used part of the table is terminated by an entry whose rwd
 * is NULL; the remaining slots are spare room for added keywords.
 *
 * rwcode classes, as handled by the switch in lexi():
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum (also remembered for the next token)
 *	4	declaration keywords (types, storage classes, qualifiers)
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 *	0	everything else; treated like an ordinary identifier
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef" and never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},		/* NOTE(review): not a C keyword; kept as is */
    {"extern", 4},
    {"void", 4},
    {"const", 4},
    {"volatile", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
92 
/*
 * Character classification table, indexed by 7-bit ASCII code, used to decide
 * what type each character is: 1 for alphanumeric characters (letters,
 * digits, '_' and '$'), 3 for operator characters, 0 for everything else.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space and punctuation up to the slash */
    0, 3, 0, 0, 1, 3, 3, 0,  0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: digits, then colon through question mark */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: at sign, then A through O */
    0, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P through Z, brackets, caret, underscore */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: backquote, then a through o */
    0, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p through z, braces, bar, tilde, DEL */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 0, 3, 0, 3, 0
};
114 
115 int
116 lexi(void)
117 {
118     int         unary_delim;	/* this is set to 1 if the current token
119 				 * forces a following operator to be unary */
120     static int  last_code;	/* the last token type returned */
121     static int  l_struct;	/* set to 1 if the last token was 'struct' */
122     int         code;		/* internal code to be returned */
123     char        qchar;		/* the delimiter character for a string */
124 
125     e_token = s_token;		/* point to start of place to save token */
126     unary_delim = false;
127     ps.col_1 = ps.last_nl;	/* tell world that this token started in
128 				 * column 1 iff the last thing scanned was nl */
129     ps.last_nl = false;
130 
131     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
132 	ps.col_1 = false;	/* leading blanks imply token is not in column
133 				 * 1 */
134 	if (++buf_ptr >= buf_end)
135 	    fill_buffer();
136     }
137 
138     /* Scan an alphanumeric token */
139     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
140 	/*
141 	 * we have a character or number
142 	 */
143 	const char *j;		/* used for searching thru list of
144 				 *
145 				 * reserved words */
146 	struct templ *p;
147 
148 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
149 	    int         seendot = 0,
150 	                seenexp = 0,
151 			seensfx = 0;
152 	    if (*buf_ptr == '0' &&
153 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
154 		*e_token++ = *buf_ptr++;
155 		*e_token++ = *buf_ptr++;
156 		while (isxdigit(*buf_ptr)) {
157 		    CHECK_SIZE_TOKEN;
158 		    *e_token++ = *buf_ptr++;
159 		}
160 	    }
161 	    else
162 		while (1) {
163 		    if (*buf_ptr == '.') {
164 			if (seendot)
165 			    break;
166 			else
167 			    seendot++;
168 		    }
169 		    CHECK_SIZE_TOKEN;
170 		    *e_token++ = *buf_ptr++;
171 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
172 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
173 			    break;
174 			else {
175 			    seenexp++;
176 			    seendot++;
177 			    CHECK_SIZE_TOKEN;
178 			    *e_token++ = *buf_ptr++;
179 			    if (*buf_ptr == '+' || *buf_ptr == '-')
180 				*e_token++ = *buf_ptr++;
181 			}
182 		    }
183 		}
184 	    while (1) {
185 		if (!(seensfx & 1) &&
186 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
187 		    CHECK_SIZE_TOKEN;
188 		    *e_token++ = *buf_ptr++;
189 		    seensfx |= 1;
190 		    continue;
191 		}
192         	if (!(seensfx & 2) &&
193 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
194 		    CHECK_SIZE_TOKEN;
195 		    if (buf_ptr[1] == buf_ptr[0])
196 		        *e_token++ = *buf_ptr++;
197 		    *e_token++ = *buf_ptr++;
198 		    seensfx |= 2;
199 		    continue;
200 		}
201 		break;
202 	    }
203 	}
204 	else
205 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
206 		/* fill_buffer() terminates buffer with newline */
207 		if (*buf_ptr == BACKSLASH) {
208 		    if (*(buf_ptr + 1) == '\n') {
209 			buf_ptr += 2;
210 			if (buf_ptr >= buf_end)
211 			    fill_buffer();
212 			} else
213 			    break;
214 		}
215 		CHECK_SIZE_TOKEN;
216 		/* copy it over */
217 		*e_token++ = *buf_ptr++;
218 		if (buf_ptr >= buf_end)
219 		    fill_buffer();
220 	    }
221 	*e_token++ = '\0';
222 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
223 	    if (++buf_ptr >= buf_end)
224 		fill_buffer();
225 	}
226 	ps.its_a_keyword = false;
227 	ps.sizeof_keyword = false;
228 	if (l_struct && !ps.p_l_follow) {
229 				/* if last token was 'struct' and we're not
230 				 * in parentheses, then this token
231 				 * should be treated as a declaration */
232 	    l_struct = false;
233 	    last_code = ident;
234 	    ps.last_u_d = true;
235 	    return (decl);
236 	}
237 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
238 				 * unless last token was 'struct' */
239 	l_struct = false;
240 	last_code = ident;	/* Remember that this is the code we will
241 				 * return */
242 
243 	if (auto_typedefs) {
244 	    const char *q = s_token;
245 	    size_t q_len = strlen(q);
246 	    /* Check if we have an "_t" in the end */
247 	    if (q_len > 2 &&
248 	        (strcmp(q + q_len - 2, "_t") == 0)) {
249 	        ps.its_a_keyword = true;
250 		ps.last_u_d = true;
251 	        goto found_auto_typedef;
252 	    }
253 	}
254 
255 	/*
256 	 * This loop will check if the token is a keyword.
257 	 */
258 	for (p = specials; (j = p->rwd) != NULL; p++) {
259 	    const char *q = s_token;	/* point at scanned token */
260 	    if (*j++ != *q++ || *j++ != *q++)
261 		continue;	/* This test depends on the fact that
262 				 * identifiers are always at least 1 character
263 				 * long (ie. the first two bytes of the
264 				 * identifier are always meaningful) */
265 	    if (q[-1] == 0)
266 		break;		/* If its a one-character identifier */
267 	    while (*q++ == *j)
268 		if (*j++ == 0)
269 		    goto found_keyword;	/* I wish that C had a multi-level
270 					 * break... */
271 	}
272 	if (p->rwd) {		/* we have a keyword */
273     found_keyword:
274 	    ps.its_a_keyword = true;
275 	    ps.last_u_d = true;
276 	    switch (p->rwcode) {
277 	    case 1:		/* it is a switch */
278 		return (swstmt);
279 	    case 2:		/* a case or default */
280 		return (casestmt);
281 
282 	    case 3:		/* a "struct" */
283 		/*
284 		 * Next time around, we will want to know that we have had a
285 		 * 'struct'
286 		 */
287 		l_struct = true;
288 		/* FALLTHROUGH */
289 
290 	    case 4:		/* one of the declaration keywords */
291 	    found_auto_typedef:
292 		if (ps.p_l_follow) {
293 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
294 		    break;	/* inside parens: cast, param list or sizeof */
295 		}
296 		last_code = decl;
297 		return (decl);
298 
299 	    case 5:		/* if, while, for */
300 		return (sp_paren);
301 
302 	    case 6:		/* do, else */
303 		return (sp_nparen);
304 
305 	    case 7:
306 		ps.sizeof_keyword = true;
307 		/* FALLTHROUGH */
308 	    default:		/* all others are treated like any other
309 				 * identifier */
310 		return (ident);
311 	    }			/* end of switch */
312 	}			/* end of if (found_it) */
313 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
314 	    char *tp = buf_ptr;
315 	    while (tp < buf_end)
316 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
317 		    goto not_proc;
318 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
319 	    ps.in_parameter_declaration = 1;
320 	    rparen_count = 1;
321     not_proc:;
322 	}
323 	/*
324 	 * The following hack attempts to guess whether or not the current
325 	 * token is in fact a declaration keyword -- one that has been
326 	 * typedefd
327 	 */
328 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
329 		&& !ps.p_l_follow
330 	        && !ps.block_init
331 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
332 		    ps.last_token == decl ||
333 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
334 	    ps.its_a_keyword = true;
335 	    ps.last_u_d = true;
336 	    last_code = decl;
337 	    return decl;
338 	}
339 	if (last_code == decl)	/* if this is a declared variable, then
340 				 * following sign is unary */
341 	    ps.last_u_d = true;	/* will make "int a -1" work */
342 	last_code = ident;
343 	return (ident);		/* the ident is not in the list */
344     }				/* end of procesing for alpanum character */
345 
346     /* Scan a non-alphanumeric token */
347 
348     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
349 				 * moved here */
350     *e_token = '\0';
351     if (++buf_ptr >= buf_end)
352 	fill_buffer();
353 
354     switch (*token) {
355     case '\n':
356 	unary_delim = ps.last_u_d;
357 	ps.last_nl = true;	/* remember that we just had a newline */
358 	code = (had_eof ? 0 : newline);
359 
360 	/*
361 	 * if data has been exhausted, the newline is a dummy, and we should
362 	 * return code to stop
363 	 */
364 	break;
365 
366     case '\'':			/* start of quoted character */
367     case '"':			/* start of string */
368 	qchar = *token;
369 	if (troff) {
370 	    e_token[-1] = '`';
371 	    if (qchar == '"')
372 		*e_token++ = '`';
373 	    e_token = chfont(&bodyf, &stringf, e_token);
374 	}
375 	do {			/* copy the string */
376 	    while (1) {		/* move one character or [/<char>]<char> */
377 		if (*buf_ptr == '\n') {
378 		    diag2(1, "Unterminated literal");
379 		    goto stop_lit;
380 		}
381 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
382 					 * since CHECK_SIZE guarantees that there
383 					 * are at least 5 entries left */
384 		*e_token = *buf_ptr++;
385 		if (buf_ptr >= buf_end)
386 		    fill_buffer();
387 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
388 		    if (*buf_ptr == '\n')	/* check for escaped newline */
389 			++line_no;
390 		    if (troff) {
391 			*++e_token = BACKSLASH;
392 			if (*buf_ptr == BACKSLASH)
393 			    *++e_token = BACKSLASH;
394 		    }
395 		    *++e_token = *buf_ptr++;
396 		    ++e_token;	/* we must increment this again because we
397 				 * copied two chars */
398 		    if (buf_ptr >= buf_end)
399 			fill_buffer();
400 		}
401 		else
402 		    break;	/* we copied one character */
403 	    }			/* end of while (1) */
404 	} while (*e_token++ != qchar);
405 	if (troff) {
406 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
407 	    if (qchar == '"')
408 		*e_token++ = '\'';
409 	}
410 stop_lit:
411 	code = ident;
412 	break;
413 
414     case ('('):
415     case ('['):
416 	unary_delim = true;
417 	code = lparen;
418 	break;
419 
420     case (')'):
421     case (']'):
422 	code = rparen;
423 	break;
424 
425     case '#':
426 	unary_delim = ps.last_u_d;
427 	code = preesc;
428 	break;
429 
430     case '?':
431 	unary_delim = true;
432 	code = question;
433 	break;
434 
435     case (':'):
436 	code = colon;
437 	unary_delim = true;
438 	break;
439 
440     case (';'):
441 	unary_delim = true;
442 	code = semicolon;
443 	break;
444 
445     case ('{'):
446 	unary_delim = true;
447 
448 	/*
449 	 * if (ps.in_or_st) ps.block_init = 1;
450 	 */
451 	/* ?	code = ps.block_init ? lparen : lbrace; */
452 	code = lbrace;
453 	break;
454 
455     case ('}'):
456 	unary_delim = true;
457 	/* ?	code = ps.block_init ? rparen : rbrace; */
458 	code = rbrace;
459 	break;
460 
461     case 014:			/* a form feed */
462 	unary_delim = ps.last_u_d;
463 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
464 				 * right */
465 	code = form_feed;
466 	break;
467 
468     case (','):
469 	unary_delim = true;
470 	code = comma;
471 	break;
472 
473     case '.':
474 	unary_delim = false;
475 	code = period;
476 	break;
477 
478     case '-':
479     case '+':			/* check for -, +, --, ++ */
480 	code = (ps.last_u_d ? unary_op : binary_op);
481 	unary_delim = true;
482 
483 	if (*buf_ptr == token[0]) {
484 	    /* check for doubled character */
485 	    *e_token++ = *buf_ptr++;
486 	    /* buffer overflow will be checked at end of loop */
487 	    if (last_code == ident || last_code == rparen) {
488 		code = (ps.last_u_d ? unary_op : postop);
489 		/* check for following ++ or -- */
490 		unary_delim = false;
491 	    }
492 	}
493 	else if (*buf_ptr == '=')
494 	    /* check for operator += */
495 	    *e_token++ = *buf_ptr++;
496 	else if (*buf_ptr == '>') {
497 	    /* check for operator -> */
498 	    *e_token++ = *buf_ptr++;
499 	    if (!pointer_as_binop) {
500 		unary_delim = false;
501 		code = unary_op;
502 		ps.want_blank = false;
503 	    }
504 	}
505 	break;			/* buffer overflow will be checked at end of
506 				 * switch */
507 
508     case '=':
509 	if (ps.in_or_st)
510 	    ps.block_init = 1;
511 #ifdef undef
512 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
513 	    e_token[-1] = *buf_ptr++;
514 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
515 		*e_token++ = *buf_ptr++;
516 	    *e_token++ = '=';	/* Flip =+ to += */
517 	    *e_token = 0;
518 	}
519 #else
520 	if (*buf_ptr == '=') {/* == */
521 	    *e_token++ = '=';	/* Flip =+ to += */
522 	    buf_ptr++;
523 	    *e_token = 0;
524 	}
525 #endif
526 	code = binary_op;
527 	unary_delim = true;
528 	break;
529 	/* can drop thru!!! */
530 
531     case '>':
532     case '<':
533     case '!':			/* ops like <, <<, <=, !=, etc */
534 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
535 	    *e_token++ = *buf_ptr;
536 	    if (++buf_ptr >= buf_end)
537 		fill_buffer();
538 	}
539 	if (*buf_ptr == '=')
540 	    *e_token++ = *buf_ptr++;
541 	code = (ps.last_u_d ? unary_op : binary_op);
542 	unary_delim = true;
543 	break;
544 
545     default:
546 	if (token[0] == '/' && *buf_ptr == '*') {
547 	    /* it is start of comment */
548 	    *e_token++ = '*';
549 
550 	    if (++buf_ptr >= buf_end)
551 		fill_buffer();
552 
553 	    code = comment;
554 	    unary_delim = ps.last_u_d;
555 	    break;
556 	}
557 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
558 	    /*
559 	     * handle ||, &&, etc, and also things as in int *****i
560 	     */
561 	    *e_token++ = *buf_ptr;
562 	    if (++buf_ptr >= buf_end)
563 		fill_buffer();
564 	}
565 	code = (ps.last_u_d ? unary_op : binary_op);
566 	unary_delim = true;
567 
568 
569     }				/* end of switch */
570     if (code != newline) {
571 	l_struct = false;
572 	last_code = code;
573     }
574     if (buf_ptr >= buf_end)	/* check for input buffer empty */
575 	fill_buffer();
576     ps.last_u_d = unary_delim;
577     *e_token = '\0';		/* null terminate the token */
578     return (code);
579 }
580 
581 /*
582  * Add the given keyword to the keyword table, using val as the keyword type
583  */
584 void
585 addkey(char *key, int val)
586 {
587     struct templ *p = specials;
588     while (p->rwd)
589 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
590 	    return;
591 	else
592 	    p++;
593     if (p >= specials + sizeof specials / sizeof specials[0])
594 	return;			/* For now, table overflows are silently
595 				 * ignored */
596     p->rwd = key;
597     p->rwcode = val;
598     p[1].rwd = NULL;
599     p[1].rwcode = 0;
600 }
601