xref: /dragonfly/usr.bin/indent/lexi.c (revision 6e285212)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  * @(#)lexi.c	8.1 (Berkeley) 6/6/93
36  * $FreeBSD: src/usr.bin/indent/lexi.c,v 1.3.6.3 2001/12/06 19:28:47 schweikh Exp $
37  * $DragonFly: src/usr.bin/indent/lexi.c,v 1.2 2003/06/17 04:29:27 dillon Exp $
38  */
39 
40 #if 0
41 #endif
42 
43 /*
44  * Here we have the token scanner for indent.  It scans off one token and puts
45  * it in the global variable "token".  It returns a code, indicating the type
46  * of token scanned.
47  */
48 
49 #include <stdio.h>
50 #include <ctype.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include "indent_globs.h"
54 #include "indent_codes.h"
55 
/*
 * Character classes stored in chartype[] and tested by lexi().
 * A class of 0 (no macro) means the character is neither of these.
 */
#define alphanum 1		/* letters, digits, '_' and '$' */
#define opchar 3		/* characters that can form operators */

/* lexi() calls this whenever buf_ptr reaches buf_end. */
void fill_buffer(void);
60 
/*
 * One entry of the keyword table.
 *
 * rwd is the keyword text (a null pointer marks the end of the table) and
 * rwcode selects how lexi() classifies it:
 *	0  no special handling (returned as a plain identifier)
 *	1  switch
 *	2  case, default
 *	3  struct, union, enum
 *	4  declaration keyword (type or storage class)
 *	5  if, while, for
 *	6  else, do
 *	7  sizeof
 *	9  const, volatile (no dedicated case in lexi(); treated like 0)
 */
struct templ {
    char       *rwd;
    int         rwcode;
};

/*
 * The keyword table searched by lexi() and extended at run time by
 * addkey().  The initialised entries are followed by a {0, 0} sentinel; the
 * remaining slots are spare room for added keywords.
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef" */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {"const", 9},
    {"volatile", 9},
    {0, 0}
};
100 
/*
 * Lexical class of each 7-bit ASCII code, indexed by character value:
 * 0 = neither, 1 = alphanumeric (may appear in an identifier or number),
 * 3 = operator character.  Each row below covers eight consecutive codes.
 */
char        chartype[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00-0x07  control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x08-0x0f  control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10-0x17  control characters */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x18-0x1f  control characters */
    0, 3, 0, 0, 1, 3, 3, 0,	/* 0x20-0x27  SP ! " # $ % & ' */
    0, 0, 3, 3, 0, 3, 0, 3,	/* 0x28-0x2f  ( ) * + , - . / */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x30-0x37  0 1 2 3 4 5 6 7 */
    1, 1, 0, 0, 3, 3, 3, 3,	/* 0x38-0x3f  8 9 : ; < = > ? */
    0, 1, 1, 1, 1, 1, 1, 1,	/* 0x40-0x47  @ A B C D E F G */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x48-0x4f  H I J K L M N O */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x50-0x57  P Q R S T U V W */
    1, 1, 1, 0, 0, 0, 3, 1,	/* 0x58-0x5f  X Y Z [ \ ] ^ _ */
    0, 1, 1, 1, 1, 1, 1, 1,	/* 0x60-0x67  ` a b c d e f g */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x68-0x6f  h i j k l m n o */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x70-0x77  p q r s t u v w */
    1, 1, 1, 0, 3, 0, 3, 0	/* 0x78-0x7f  x y z { | } ~ DEL */
};
122 
123 int
124 lexi(void)
125 {
126     int         unary_delim;	/* this is set to 1 if the current token
127 				 * forces a following operator to be unary */
128     static int  last_code;	/* the last token type returned */
129     static int  l_struct;	/* set to 1 if the last token was 'struct' */
130     int         code;		/* internal code to be returned */
131     char        qchar;		/* the delimiter character for a string */
132 
133     e_token = s_token;		/* point to start of place to save token */
134     unary_delim = false;
135     ps.col_1 = ps.last_nl;	/* tell world that this token started in
136 				 * column 1 iff the last thing scanned was nl */
137     ps.last_nl = false;
138 
139     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
140 	ps.col_1 = false;	/* leading blanks imply token is not in column
141 				 * 1 */
142 	if (++buf_ptr >= buf_end)
143 	    fill_buffer();
144     }
145 
146     /* Scan an alphanumeric token */
147     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
148 	/*
149 	 * we have a character or number
150 	 */
151 	register char *j;	/* used for searching thru list of
152 				 *
153 				 * reserved words */
154 	register struct templ *p;
155 
156 	if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
157 	    int         seendot = 0,
158 	                seenexp = 0,
159 			seensfx = 0;
160 	    if (*buf_ptr == '0' &&
161 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
162 		*e_token++ = *buf_ptr++;
163 		*e_token++ = *buf_ptr++;
164 		while (isxdigit(*buf_ptr)) {
165 		    CHECK_SIZE_TOKEN;
166 		    *e_token++ = *buf_ptr++;
167 		}
168 	    }
169 	    else
170 		while (1) {
171 		    if (*buf_ptr == '.') {
172 			if (seendot)
173 			    break;
174 			else
175 			    seendot++;
176 		    }
177 		    CHECK_SIZE_TOKEN;
178 		    *e_token++ = *buf_ptr++;
179 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
180 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
181 			    break;
182 			else {
183 			    seenexp++;
184 			    seendot++;
185 			    CHECK_SIZE_TOKEN;
186 			    *e_token++ = *buf_ptr++;
187 			    if (*buf_ptr == '+' || *buf_ptr == '-')
188 				*e_token++ = *buf_ptr++;
189 			}
190 		    }
191 		}
192 	    while (1) {
193 		if (!(seensfx & 1) &&
194 			(*buf_ptr == 'U' || *buf_ptr == 'u')) {
195 		    CHECK_SIZE_TOKEN;
196 		    *e_token++ = *buf_ptr++;
197 		    seensfx |= 1;
198 		    continue;
199 		}
200         	if (!(seensfx & 2) &&
201 			(*buf_ptr == 'L' || *buf_ptr == 'l')) {
202 		    CHECK_SIZE_TOKEN;
203 		    if (buf_ptr[1] == buf_ptr[0])
204 		        *e_token++ = *buf_ptr++;
205 		    *e_token++ = *buf_ptr++;
206 		    seensfx |= 2;
207 		    continue;
208 		}
209 		break;
210 	    }
211 	}
212 	else
213 	    while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
214 		/* fill_buffer() terminates buffer with newline */
215 		if (*buf_ptr == BACKSLASH) {
216 		    if (*(buf_ptr + 1) == '\n') {
217 			buf_ptr += 2;
218 			if (buf_ptr >= buf_end)
219 			    fill_buffer();
220 			} else
221 			    break;
222 		}
223 		CHECK_SIZE_TOKEN;
224 		/* copy it over */
225 		*e_token++ = *buf_ptr++;
226 		if (buf_ptr >= buf_end)
227 		    fill_buffer();
228 	    }
229 	*e_token++ = '\0';
230 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
231 	    if (++buf_ptr >= buf_end)
232 		fill_buffer();
233 	}
234 	ps.its_a_keyword = false;
235 	ps.sizeof_keyword = false;
236 	if (l_struct) {		/* if last token was 'struct', then this token
237 				 * should be treated as a declaration */
238 	    l_struct = false;
239 	    last_code = ident;
240 	    ps.last_u_d = true;
241 	    return (decl);
242 	}
243 	ps.last_u_d = false;	/* Operator after indentifier is binary */
244 	last_code = ident;	/* Remember that this is the code we will
245 				 * return */
246 
247 	/*
248 	 * This loop will check if the token is a keyword.
249 	 */
250 	for (p = specials; (j = p->rwd) != 0; p++) {
251 	    register char *p = s_token;	/* point at scanned token */
252 	    if (*j++ != *p++ || *j++ != *p++)
253 		continue;	/* This test depends on the fact that
254 				 * identifiers are always at least 1 character
255 				 * long (ie. the first two bytes of the
256 				 * identifier are always meaningful) */
257 	    if (p[-1] == 0)
258 		break;		/* If its a one-character identifier */
259 	    while (*p++ == *j)
260 		if (*j++ == 0)
261 		    goto found_keyword;	/* I wish that C had a multi-level
262 					 * break... */
263 	}
264 	if (p->rwd) {		/* we have a keyword */
265     found_keyword:
266 	    ps.its_a_keyword = true;
267 	    ps.last_u_d = true;
268 	    switch (p->rwcode) {
269 	    case 1:		/* it is a switch */
270 		return (swstmt);
271 	    case 2:		/* a case or default */
272 		return (casestmt);
273 
274 	    case 3:		/* a "struct" */
275 		if (ps.p_l_follow)
276 			break;	/* inside parens: cast */
277 		/*
278 		 * Next time around, we may want to know that we have had a
279 		 * 'struct'
280 		 */
281 		l_struct = true;
282 
283 		/*
284 		 * Fall through to test for a cast, function prototype or
285 		 * sizeof().
286 		 */
287 	    case 4:		/* one of the declaration keywords */
288 		if (ps.p_l_follow) {
289 		    ps.cast_mask |= 1 << ps.p_l_follow;
290 
291 		    /*
292 		     * Forget that we saw `struct' if we're in a sizeof().
293 		     */
294 		    if (ps.sizeof_mask)
295 			l_struct = false;
296 
297 		    break;	/* inside parens: cast, prototype or sizeof() */
298 		}
299 		last_code = decl;
300 		return (decl);
301 
302 	    case 5:		/* if, while, for */
303 		return (sp_paren);
304 
305 	    case 6:		/* do, else */
306 		return (sp_nparen);
307 
308 	    case 7:
309 		ps.sizeof_keyword = true;
310 	    default:		/* all others are treated like any other
311 				 * identifier */
312 		return (ident);
313 	    }			/* end of switch */
314 	}			/* end of if (found_it) */
315 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
316 	    register char *tp = buf_ptr;
317 	    while (tp < buf_end)
318 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
319 		    goto not_proc;
320 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
321 	    ps.in_parameter_declaration = 1;
322 	    rparen_count = 1;
323     not_proc:;
324 	}
325 	/*
326 	 * The following hack attempts to guess whether or not the current
327 	 * token is in fact a declaration keyword -- one that has been
328 	 * typedefd
329 	 */
330 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
331 		&& !ps.p_l_follow
332 	        && !ps.block_init
333 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
334 		    ps.last_token == decl ||
335 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
336 	    ps.its_a_keyword = true;
337 	    ps.last_u_d = true;
338 	    last_code = decl;
339 	    return decl;
340 	}
341 	if (last_code == decl)	/* if this is a declared variable, then
342 				 * following sign is unary */
343 	    ps.last_u_d = true;	/* will make "int a -1" work */
344 	last_code = ident;
345 	return (ident);		/* the ident is not in the list */
346     }				/* end of procesing for alpanum character */
347 
348     /* Scan a non-alphanumeric token */
349 
350     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
351 				 * moved here */
352     *e_token = '\0';
353     if (++buf_ptr >= buf_end)
354 	fill_buffer();
355 
356     switch (*token) {
357     case '\n':
358 	unary_delim = ps.last_u_d;
359 	ps.last_nl = true;	/* remember that we just had a newline */
360 	code = (had_eof ? 0 : newline);
361 
362 	/*
363 	 * if data has been exausted, the newline is a dummy, and we should
364 	 * return code to stop
365 	 */
366 	break;
367 
368     case '\'':			/* start of quoted character */
369     case '"':			/* start of string */
370 	qchar = *token;
371 	if (troff) {
372 	    e_token[-1] = '`';
373 	    if (qchar == '"')
374 		*e_token++ = '`';
375 	    e_token = chfont(&bodyf, &stringf, e_token);
376 	}
377 	do {			/* copy the string */
378 	    while (1) {		/* move one character or [/<char>]<char> */
379 		if (*buf_ptr == '\n') {
380 		    printf("%d: Unterminated literal\n", line_no);
381 		    goto stop_lit;
382 		}
383 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
384 					 * since CHECK_SIZE guarantees that there
385 					 * are at least 5 entries left */
386 		*e_token = *buf_ptr++;
387 		if (buf_ptr >= buf_end)
388 		    fill_buffer();
389 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
390 		    if (*buf_ptr == '\n')	/* check for escaped newline */
391 			++line_no;
392 		    if (troff) {
393 			*++e_token = BACKSLASH;
394 			if (*buf_ptr == BACKSLASH)
395 			    *++e_token = BACKSLASH;
396 		    }
397 		    *++e_token = *buf_ptr++;
398 		    ++e_token;	/* we must increment this again because we
399 				 * copied two chars */
400 		    if (buf_ptr >= buf_end)
401 			fill_buffer();
402 		}
403 		else
404 		    break;	/* we copied one character */
405 	    }			/* end of while (1) */
406 	} while (*e_token++ != qchar);
407 	if (troff) {
408 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
409 	    if (qchar == '"')
410 		*e_token++ = '\'';
411 	}
412 stop_lit:
413 	code = ident;
414 	break;
415 
416     case ('('):
417     case ('['):
418 	unary_delim = true;
419 	code = lparen;
420 	break;
421 
422     case (')'):
423     case (']'):
424 	code = rparen;
425 	break;
426 
427     case '#':
428 	unary_delim = ps.last_u_d;
429 	code = preesc;
430 	break;
431 
432     case '?':
433 	unary_delim = true;
434 	code = question;
435 	break;
436 
437     case (':'):
438 	code = colon;
439 	unary_delim = true;
440 	break;
441 
442     case (';'):
443 	unary_delim = true;
444 	code = semicolon;
445 	break;
446 
447     case ('{'):
448 	unary_delim = true;
449 
450 	/*
451 	 * if (ps.in_or_st) ps.block_init = 1;
452 	 */
453 	/* ?	code = ps.block_init ? lparen : lbrace; */
454 	code = lbrace;
455 	break;
456 
457     case ('}'):
458 	unary_delim = true;
459 	/* ?	code = ps.block_init ? rparen : rbrace; */
460 	code = rbrace;
461 	break;
462 
463     case 014:			/* a form feed */
464 	unary_delim = ps.last_u_d;
465 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
466 				 * right */
467 	code = form_feed;
468 	break;
469 
470     case (','):
471 	unary_delim = true;
472 	code = comma;
473 	break;
474 
475     case '.':
476 	unary_delim = false;
477 	code = period;
478 	break;
479 
480     case '-':
481     case '+':			/* check for -, +, --, ++ */
482 	code = (ps.last_u_d ? unary_op : binary_op);
483 	unary_delim = true;
484 
485 	if (*buf_ptr == token[0]) {
486 	    /* check for doubled character */
487 	    *e_token++ = *buf_ptr++;
488 	    /* buffer overflow will be checked at end of loop */
489 	    if (last_code == ident || last_code == rparen) {
490 		code = (ps.last_u_d ? unary_op : postop);
491 		/* check for following ++ or -- */
492 		unary_delim = false;
493 	    }
494 	}
495 	else if (*buf_ptr == '=')
496 	    /* check for operator += */
497 	    *e_token++ = *buf_ptr++;
498 	else if (*buf_ptr == '>') {
499 	    /* check for operator -> */
500 	    *e_token++ = *buf_ptr++;
501 	    if (!pointer_as_binop) {
502 		unary_delim = false;
503 		code = unary_op;
504 		ps.want_blank = false;
505 	    }
506 	}
507 	break;			/* buffer overflow will be checked at end of
508 				 * switch */
509 
510     case '=':
511 	if (ps.in_or_st)
512 	    ps.block_init = 1;
513 #ifdef undef
514 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
515 	    e_token[-1] = *buf_ptr++;
516 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
517 		*e_token++ = *buf_ptr++;
518 	    *e_token++ = '=';	/* Flip =+ to += */
519 	    *e_token = 0;
520 	}
521 #else
522 	if (*buf_ptr == '=') {/* == */
523 	    *e_token++ = '=';	/* Flip =+ to += */
524 	    buf_ptr++;
525 	    *e_token = 0;
526 	}
527 #endif
528 	code = binary_op;
529 	unary_delim = true;
530 	break;
531 	/* can drop thru!!! */
532 
533     case '>':
534     case '<':
535     case '!':			/* ops like <, <<, <=, !=, etc */
536 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
537 	    *e_token++ = *buf_ptr;
538 	    if (++buf_ptr >= buf_end)
539 		fill_buffer();
540 	}
541 	if (*buf_ptr == '=')
542 	    *e_token++ = *buf_ptr++;
543 	code = (ps.last_u_d ? unary_op : binary_op);
544 	unary_delim = true;
545 	break;
546 
547     default:
548 	if (token[0] == '/' && *buf_ptr == '*') {
549 	    /* it is start of comment */
550 	    *e_token++ = '*';
551 
552 	    if (++buf_ptr >= buf_end)
553 		fill_buffer();
554 
555 	    code = comment;
556 	    unary_delim = ps.last_u_d;
557 	    break;
558 	}
559 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
560 	    /*
561 	     * handle ||, &&, etc, and also things as in int *****i
562 	     */
563 	    *e_token++ = *buf_ptr;
564 	    if (++buf_ptr >= buf_end)
565 		fill_buffer();
566 	}
567 	code = (ps.last_u_d ? unary_op : binary_op);
568 	unary_delim = true;
569 
570 
571     }				/* end of switch */
572     if (code != newline) {
573 	l_struct = false;
574 	last_code = code;
575     }
576     if (buf_ptr >= buf_end)	/* check for input buffer empty */
577 	fill_buffer();
578     ps.last_u_d = unary_delim;
579     *e_token = '\0';		/* null terminate the token */
580     return (code);
581 }
582 
583 /*
584  * Add the given keyword to the keyword table, using val as the keyword type
585  */
586 void
587 addkey(char *key, int val)
588 {
589     register struct templ *p = specials;
590     while (p->rwd)
591 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
592 	    return;
593 	else
594 	    p++;
595     if (p >= specials + sizeof specials / sizeof specials[0])
596 	return;			/* For now, table overflows are silently
597 				 * ignored */
598     p->rwd = key;
599     p->rwcode = val;
600     p[1].rwd = 0;
601     p[1].rwcode = 0;
602 }
603