xref: /dragonfly/usr.bin/indent/lexi.c (revision f746689a)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  * @(#)lexi.c	8.1 (Berkeley) 6/6/93
36  * $FreeBSD: src/usr.bin/indent/lexi.c,v 1.19 2005/11/20 13:48:15 dds Exp $
37  * $DragonFly: src/usr.bin/indent/lexi.c,v 1.3 2005/04/10 20:55:38 drhodus Exp $
38  */
39 
40 /*
41  * Here we have the token scanner for indent.  It scans off one token and puts
42  * it in the global variable "token".  It returns a code, indicating the type
43  * of token scanned.
44  */
45 
46 #include <err.h>
47 #include <stdio.h>
48 #include <ctype.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include "indent_globs.h"
52 #include "indent_codes.h"
53 #include "indent.h"
54 
55 #define alphanum 1
56 #define opchar 3
57 
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * internal class code that lexi() dispatches on when the word is scanned.
 */
struct templ {
    const char *rwd;		/* keyword text; a null pointer ends the table */
    int         rwcode;		/* keyword class, see the table below */
};

/*
 * Reserved words known to the scanner.  The table is terminated by an entry
 * whose rwd is null; addkey() appends further entries at run time, which is
 * why the array is larger than the initial contents.
 *
 * rwcode values, as interpreted by lexi():
 *   0  no special treatment (returned as an ordinary identifier)
 *   1  "switch"
 *   2  "case" / "default"
 *   3  "struct", "union", "enum"
 *   4  declaration keywords (types, storage classes, qualifiers)
 *   5  keywords followed by a parenthesized expression: if, while, for
 *   6  keywords not followed by parentheses: else, do
 *   7  "sizeof"
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef" and never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"const", 4},
    {"volatile", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {NULL, 0}
};
97 
/*
 * Character classification table, indexed by 7-bit ASCII code.  It lets the
 * scanner decide cheaply what kind of token a character can start or extend:
 *   1  alphanumeric (letters, digits, '_' and '$')
 *   3  operator character
 *   0  anything else
 */
char        chartype[128] = {
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: @ A B C D E F G H I J K L M N O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P Q R S T U V W X Y Z [ backslash ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: ` a b c d e f g h i j k l m n o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p q r s t u v w x y z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
119 
120 int
121 lexi(void)
122 {
123     int         unary_delim;	/* this is set to 1 if the current token
124 				 * forces a following operator to be unary */
125     static int  last_code;	/* the last token type returned */
126     static int  l_struct;	/* set to 1 if the last token was 'struct' */
127     int         code;		/* internal code to be returned */
128     char        qchar;		/* the delimiter character for a string */
129 
130     e_token = s_token;		/* point to start of place to save token */
131     unary_delim = false;
132     ps.col_1 = ps.last_nl;	/* tell world that this token started in
133 				 * column 1 iff the last thing scanned was nl */
134     ps.last_nl = false;
135 
136     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
137 	ps.col_1 = false;	/* leading blanks imply token is not in column
138 				 * 1 */
139 	if (++buf_ptr >= buf_end)
140 	    fill_buffer();
141     }
142 
143     /* Scan an alphanumeric token */
144     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
145 	/*
146 	 * we have a character or number
147 	 */
148 	const char *j;		/* used for searching thru list of
149 				 *
150 				 * reserved words */
151 	struct templ *p;
152 
153 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
154 	    int         seendot = 0,
155 	                seenexp = 0,
156 			seensfx = 0;
157 	    if (*buf_ptr == '0' &&
158 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
159 		*e_token++ = *buf_ptr++;
160 		*e_token++ = *buf_ptr++;
161 		while (isxdigit(*buf_ptr)) {
162 		    CHECK_SIZE_TOKEN;
163 		    *e_token++ = *buf_ptr++;
164 		}
165 	    }
166 	    else
167 		while (1) {
168 		    if (*buf_ptr == '.') {
169 			if (seendot)
170 			    break;
171 			else
172 			    seendot++;
173 		    }
174 		    CHECK_SIZE_TOKEN;
175 		    *e_token++ = *buf_ptr++;
176 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
177 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
178 			    break;
179 			else {
180 			    seenexp++;
181 			    seendot++;
182 			    CHECK_SIZE_TOKEN;
183 			    *e_token++ = *buf_ptr++;
184 			    if (*buf_ptr == '+' || *buf_ptr == '-')
185 				*e_token++ = *buf_ptr++;
186 			}
187 		    }
188 		}
189 	    while (1) {
190 		if (!(seensfx & 1) &&
191 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
192 		    CHECK_SIZE_TOKEN;
193 		    *e_token++ = *buf_ptr++;
194 		    seensfx |= 1;
195 		    continue;
196 		}
197         	if (!(seensfx & 2) &&
198 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
199 		    CHECK_SIZE_TOKEN;
200 		    if (buf_ptr[1] == buf_ptr[0])
201 		        *e_token++ = *buf_ptr++;
202 		    *e_token++ = *buf_ptr++;
203 		    seensfx |= 2;
204 		    continue;
205 		}
206 		break;
207 	    }
208 	}
209 	else
210 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
211 		/* fill_buffer() terminates buffer with newline */
212 		if (*buf_ptr == BACKSLASH) {
213 		    if (*(buf_ptr + 1) == '\n') {
214 			buf_ptr += 2;
215 			if (buf_ptr >= buf_end)
216 			    fill_buffer();
217 			} else
218 			    break;
219 		}
220 		CHECK_SIZE_TOKEN;
221 		/* copy it over */
222 		*e_token++ = *buf_ptr++;
223 		if (buf_ptr >= buf_end)
224 		    fill_buffer();
225 	    }
226 	*e_token++ = '\0';
227 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
228 	    if (++buf_ptr >= buf_end)
229 		fill_buffer();
230 	}
231 	ps.its_a_keyword = false;
232 	ps.sizeof_keyword = false;
233 	if (l_struct && !ps.p_l_follow) {
234 				/* if last token was 'struct' and we're not
235 				 * in parentheses, then this token
236 				 * should be treated as a declaration */
237 	    l_struct = false;
238 	    last_code = ident;
239 	    ps.last_u_d = true;
240 	    return (decl);
241 	}
242 	ps.last_u_d = l_struct;	/* Operator after identifier is binary
243 				 * unless last token was 'struct' */
244 	l_struct = false;
245 	last_code = ident;	/* Remember that this is the code we will
246 				 * return */
247 
248 	/*
249 	 * This loop will check if the token is a keyword.
250 	 */
251 	for (p = specials; (j = p->rwd) != 0; p++) {
252 	    const char *q = s_token;	/* point at scanned token */
253 	    if (*j++ != *q++ || *j++ != *q++)
254 		continue;	/* This test depends on the fact that
255 				 * identifiers are always at least 1 character
256 				 * long (ie. the first two bytes of the
257 				 * identifier are always meaningful) */
258 	    if (q[-1] == 0)
259 		break;		/* If its a one-character identifier */
260 	    while (*q++ == *j)
261 		if (*j++ == 0)
262 		    goto found_keyword;	/* I wish that C had a multi-level
263 					 * break... */
264 	}
265 	if (p->rwd) {		/* we have a keyword */
266     found_keyword:
267 	    ps.its_a_keyword = true;
268 	    ps.last_u_d = true;
269 	    switch (p->rwcode) {
270 	    case 1:		/* it is a switch */
271 		return (swstmt);
272 	    case 2:		/* a case or default */
273 		return (casestmt);
274 
275 	    case 3:		/* a "struct" */
276 		/*
277 		 * Next time around, we will want to know that we have had a
278 		 * 'struct'
279 		 */
280 		l_struct = true;
281 		/* FALLTHROUGH */
282 
283 	    case 4:		/* one of the declaration keywords */
284 		if (ps.p_l_follow) {
285 		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
286 		    break;	/* inside parens: cast, param list or sizeof */
287 		}
288 		last_code = decl;
289 		return (decl);
290 
291 	    case 5:		/* if, while, for */
292 		return (sp_paren);
293 
294 	    case 6:		/* do, else */
295 		return (sp_nparen);
296 
297 	    case 7:
298 		ps.sizeof_keyword = true;
299 	    default:		/* all others are treated like any other
300 				 * identifier */
301 		return (ident);
302 	    }			/* end of switch */
303 	}			/* end of if (found_it) */
304 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
305 	    char *tp = buf_ptr;
306 	    while (tp < buf_end)
307 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
308 		    goto not_proc;
309 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
310 	    ps.in_parameter_declaration = 1;
311 	    rparen_count = 1;
312     not_proc:;
313 	}
314 	/*
315 	 * The following hack attempts to guess whether or not the current
316 	 * token is in fact a declaration keyword -- one that has been
317 	 * typedefd
318 	 */
319 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
320 		&& !ps.p_l_follow
321 	        && !ps.block_init
322 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
323 		    ps.last_token == decl ||
324 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
325 	    ps.its_a_keyword = true;
326 	    ps.last_u_d = true;
327 	    last_code = decl;
328 	    return decl;
329 	}
330 	if (last_code == decl)	/* if this is a declared variable, then
331 				 * following sign is unary */
332 	    ps.last_u_d = true;	/* will make "int a -1" work */
333 	last_code = ident;
334 	return (ident);		/* the ident is not in the list */
335     }				/* end of procesing for alpanum character */
336 
337     /* Scan a non-alphanumeric token */
338 
339     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
340 				 * moved here */
341     *e_token = '\0';
342     if (++buf_ptr >= buf_end)
343 	fill_buffer();
344 
345     switch (*token) {
346     case '\n':
347 	unary_delim = ps.last_u_d;
348 	ps.last_nl = true;	/* remember that we just had a newline */
349 	code = (had_eof ? 0 : newline);
350 
351 	/*
352 	 * if data has been exhausted, the newline is a dummy, and we should
353 	 * return code to stop
354 	 */
355 	break;
356 
357     case '\'':			/* start of quoted character */
358     case '"':			/* start of string */
359 	qchar = *token;
360 	if (troff) {
361 	    e_token[-1] = '`';
362 	    if (qchar == '"')
363 		*e_token++ = '`';
364 	    e_token = chfont(&bodyf, &stringf, e_token);
365 	}
366 	do {			/* copy the string */
367 	    while (1) {		/* move one character or [/<char>]<char> */
368 		if (*buf_ptr == '\n') {
369 		    diag2(1, "Unterminated literal");
370 		    goto stop_lit;
371 		}
372 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
373 					 * since CHECK_SIZE guarantees that there
374 					 * are at least 5 entries left */
375 		*e_token = *buf_ptr++;
376 		if (buf_ptr >= buf_end)
377 		    fill_buffer();
378 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
379 		    if (*buf_ptr == '\n')	/* check for escaped newline */
380 			++line_no;
381 		    if (troff) {
382 			*++e_token = BACKSLASH;
383 			if (*buf_ptr == BACKSLASH)
384 			    *++e_token = BACKSLASH;
385 		    }
386 		    *++e_token = *buf_ptr++;
387 		    ++e_token;	/* we must increment this again because we
388 				 * copied two chars */
389 		    if (buf_ptr >= buf_end)
390 			fill_buffer();
391 		}
392 		else
393 		    break;	/* we copied one character */
394 	    }			/* end of while (1) */
395 	} while (*e_token++ != qchar);
396 	if (troff) {
397 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
398 	    if (qchar == '"')
399 		*e_token++ = '\'';
400 	}
401 stop_lit:
402 	code = ident;
403 	break;
404 
405     case ('('):
406     case ('['):
407 	unary_delim = true;
408 	code = lparen;
409 	break;
410 
411     case (')'):
412     case (']'):
413 	code = rparen;
414 	break;
415 
416     case '#':
417 	unary_delim = ps.last_u_d;
418 	code = preesc;
419 	break;
420 
421     case '?':
422 	unary_delim = true;
423 	code = question;
424 	break;
425 
426     case (':'):
427 	code = colon;
428 	unary_delim = true;
429 	break;
430 
431     case (';'):
432 	unary_delim = true;
433 	code = semicolon;
434 	break;
435 
436     case ('{'):
437 	unary_delim = true;
438 
439 	/*
440 	 * if (ps.in_or_st) ps.block_init = 1;
441 	 */
442 	/* ?	code = ps.block_init ? lparen : lbrace; */
443 	code = lbrace;
444 	break;
445 
446     case ('}'):
447 	unary_delim = true;
448 	/* ?	code = ps.block_init ? rparen : rbrace; */
449 	code = rbrace;
450 	break;
451 
452     case 014:			/* a form feed */
453 	unary_delim = ps.last_u_d;
454 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
455 				 * right */
456 	code = form_feed;
457 	break;
458 
459     case (','):
460 	unary_delim = true;
461 	code = comma;
462 	break;
463 
464     case '.':
465 	unary_delim = false;
466 	code = period;
467 	break;
468 
469     case '-':
470     case '+':			/* check for -, +, --, ++ */
471 	code = (ps.last_u_d ? unary_op : binary_op);
472 	unary_delim = true;
473 
474 	if (*buf_ptr == token[0]) {
475 	    /* check for doubled character */
476 	    *e_token++ = *buf_ptr++;
477 	    /* buffer overflow will be checked at end of loop */
478 	    if (last_code == ident || last_code == rparen) {
479 		code = (ps.last_u_d ? unary_op : postop);
480 		/* check for following ++ or -- */
481 		unary_delim = false;
482 	    }
483 	}
484 	else if (*buf_ptr == '=')
485 	    /* check for operator += */
486 	    *e_token++ = *buf_ptr++;
487 	else if (*buf_ptr == '>') {
488 	    /* check for operator -> */
489 	    *e_token++ = *buf_ptr++;
490 	    if (!pointer_as_binop) {
491 		unary_delim = false;
492 		code = unary_op;
493 		ps.want_blank = false;
494 	    }
495 	}
496 	break;			/* buffer overflow will be checked at end of
497 				 * switch */
498 
499     case '=':
500 	if (ps.in_or_st)
501 	    ps.block_init = 1;
502 #ifdef undef
503 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
504 	    e_token[-1] = *buf_ptr++;
505 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
506 		*e_token++ = *buf_ptr++;
507 	    *e_token++ = '=';	/* Flip =+ to += */
508 	    *e_token = 0;
509 	}
510 #else
511 	if (*buf_ptr == '=') {/* == */
512 	    *e_token++ = '=';	/* Flip =+ to += */
513 	    buf_ptr++;
514 	    *e_token = 0;
515 	}
516 #endif
517 	code = binary_op;
518 	unary_delim = true;
519 	break;
520 	/* can drop thru!!! */
521 
522     case '>':
523     case '<':
524     case '!':			/* ops like <, <<, <=, !=, etc */
525 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
526 	    *e_token++ = *buf_ptr;
527 	    if (++buf_ptr >= buf_end)
528 		fill_buffer();
529 	}
530 	if (*buf_ptr == '=')
531 	    *e_token++ = *buf_ptr++;
532 	code = (ps.last_u_d ? unary_op : binary_op);
533 	unary_delim = true;
534 	break;
535 
536     default:
537 	if (token[0] == '/' && *buf_ptr == '*') {
538 	    /* it is start of comment */
539 	    *e_token++ = '*';
540 
541 	    if (++buf_ptr >= buf_end)
542 		fill_buffer();
543 
544 	    code = comment;
545 	    unary_delim = ps.last_u_d;
546 	    break;
547 	}
548 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
549 	    /*
550 	     * handle ||, &&, etc, and also things as in int *****i
551 	     */
552 	    *e_token++ = *buf_ptr;
553 	    if (++buf_ptr >= buf_end)
554 		fill_buffer();
555 	}
556 	code = (ps.last_u_d ? unary_op : binary_op);
557 	unary_delim = true;
558 
559 
560     }				/* end of switch */
561     if (code != newline) {
562 	l_struct = false;
563 	last_code = code;
564     }
565     if (buf_ptr >= buf_end)	/* check for input buffer empty */
566 	fill_buffer();
567     ps.last_u_d = unary_delim;
568     *e_token = '\0';		/* null terminate the token */
569     return (code);
570 }
571 
572 /*
573  * Add the given keyword to the keyword table, using val as the keyword type
574  */
575 void
576 addkey(char *key, int val)
577 {
578     struct templ *p = specials;
579     while (p->rwd)
580 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
581 	    return;
582 	else
583 	    p++;
584     if (p >= specials + sizeof specials / sizeof specials[0])
585 	return;			/* For now, table overflows are silently
586 				 * ignored */
587     p->rwd = key;
588     p->rwcode = val;
589     p[1].rwd = 0;
590     p[1].rwcode = 0;
591 }
592