xref: /original-bsd/usr.bin/indent/lexi.c (revision d272e02a)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980 The Regents of the University of California.
4  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
5  * All rights reserved.
6  *
7  * %sccs.include.redist.c%
8  */
9 
10 #ifndef lint
11 static char sccsid[] = "@(#)lexi.c	5.16 (Berkeley) 02/26/91";
12 #endif /* not lint */
13 
14 /*
15  * Here we have the token scanner for indent.  It scans off one token and puts
16  * it in the global variable "token".  It returns a code, indicating the type
17  * of token scanned.
18  */
19 
20 #include <stdio.h>
21 #include <ctype.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include "indent_globs.h"
25 #include "indent_codes.h"
26 
#define alphanum 1		/* chartype[] class: may appear in an
				 * identifier or number */
#define opchar 3		/* chartype[] class: operator character */

/*
 * One entry of the keyword table: the spelling of a reserved word and the
 * code that lexi() switches on when it scans that word.
 */
struct templ {
    char       *rwd;		/* keyword text; 0 marks the end of the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Table of reserved words.  The entry whose rwd is 0 terminates the list;
 * the remaining slots of the array are room for keywords appended by
 * addkey().  The rwcode values are the ones lexi() dispatches on:
 *
 *	0	no special treatment (returned as an ordinary identifier)
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords (types and storage classes)
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so the real
				 * keyword was never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
67 
/*
 * Character classification table, indexed by 7-bit ASCII code.  It lets the
 * scanner decide quickly what kind of character it is looking at:
 *
 *	0	neither of the classes below
 *	1	alphanumeric (letters, digits, '_' and '$')
 *	3	operator character
 *
 * Each row below covers sixteen consecutive character codes.
 */
char        chartype[128] =
{
    /* 000-017: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 020-037: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 040-057: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 060-077: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 100-117: @ A B C D E F G H I J K L M N O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 120-137: P Q R S T U V W X Y Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 140-157: ` a b c d e f g h i j k l m n o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 160-177: p q r s t u v w x y z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
89 
90 
91 
92 
93 int
94 lexi()
95 {
96     int         unary_delim;	/* this is set to 1 if the current token
97 				 *
98 				 * forces a following operator to be unary */
99     static int  last_code;	/* the last token type returned */
100     static int  l_struct;	/* set to 1 if the last token was 'struct' */
101     int         code;		/* internal code to be returned */
102     char        qchar;		/* the delimiter character for a string */
103 
104     e_token = s_token;		/* point to start of place to save token */
105     unary_delim = false;
106     ps.col_1 = ps.last_nl;	/* tell world that this token started in
107 				 * column 1 iff the last thing scanned was nl */
108     ps.last_nl = false;
109 
110     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
111 	ps.col_1 = false;	/* leading blanks imply token is not in column
112 				 * 1 */
113 	if (++buf_ptr >= buf_end)
114 	    fill_buffer();
115     }
116 
117     /* Scan an alphanumeric token */
118     if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
119 	/*
120 	 * we have a character or number
121 	 */
122 	register char *j;	/* used for searching thru list of
123 				 *
124 				 * reserved words */
125 	register struct templ *p;
126 
127 	if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
128 	    int         seendot = 0,
129 	                seenexp = 0;
130 	    if (*buf_ptr == '0' &&
131 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
132 		*e_token++ = *buf_ptr++;
133 		*e_token++ = *buf_ptr++;
134 		while (isxdigit(*buf_ptr)) {
135 		    CHECK_SIZE_TOKEN;
136 		    *e_token++ = *buf_ptr++;
137 		}
138 	    }
139 	    else
140 		while (1) {
141 		    if (*buf_ptr == '.')
142 			if (seendot)
143 			    break;
144 			else
145 			    seendot++;
146 		    CHECK_SIZE_TOKEN;
147 		    *e_token++ = *buf_ptr++;
148 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.')
149 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
150 			    break;
151 			else {
152 			    seenexp++;
153 			    seendot++;
154 			    CHECK_SIZE_TOKEN;
155 			    *e_token++ = *buf_ptr++;
156 			    if (*buf_ptr == '+' || *buf_ptr == '-')
157 				*e_token++ = *buf_ptr++;
158 			}
159 		}
160 	    if (*buf_ptr == 'L' || *buf_ptr == 'l')
161 		*e_token++ = *buf_ptr++;
162 	}
163 	else
164 	    while (chartype[*buf_ptr] == alphanum) {	/* copy it over */
165 		CHECK_SIZE_TOKEN;
166 		*e_token++ = *buf_ptr++;
167 		if (buf_ptr >= buf_end)
168 		    fill_buffer();
169 	    }
170 	*e_token++ = '\0';
171 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
172 	    if (++buf_ptr >= buf_end)
173 		fill_buffer();
174 	}
175 	ps.its_a_keyword = false;
176 	ps.sizeof_keyword = false;
177 	if (l_struct) {		/* if last token was 'struct', then this token
178 				 * should be treated as a declaration */
179 	    l_struct = false;
180 	    last_code = ident;
181 	    ps.last_u_d = true;
182 	    return (decl);
183 	}
184 	ps.last_u_d = false;	/* Operator after indentifier is binary */
185 	last_code = ident;	/* Remember that this is the code we will
186 				 * return */
187 
188 	/*
189 	 * This loop will check if the token is a keyword.
190 	 */
191 	for (p = specials; (j = p->rwd) != 0; p++) {
192 	    register char *p = s_token;	/* point at scanned token */
193 	    if (*j++ != *p++ || *j++ != *p++)
194 		continue;	/* This test depends on the fact that
195 				 * identifiers are always at least 1 character
196 				 * long (ie. the first two bytes of the
197 				 * identifier are always meaningful) */
198 	    if (p[-1] == 0)
199 		break;		/* If its a one-character identifier */
200 	    while (*p++ == *j)
201 		if (*j++ == 0)
202 		    goto found_keyword;	/* I wish that C had a multi-level
203 					 * break... */
204 	}
205 	if (p->rwd) {		/* we have a keyword */
206     found_keyword:
207 	    ps.its_a_keyword = true;
208 	    ps.last_u_d = true;
209 	    switch (p->rwcode) {
210 	    case 1:		/* it is a switch */
211 		return (swstmt);
212 	    case 2:		/* a case or default */
213 		return (casestmt);
214 
215 	    case 3:		/* a "struct" */
216 		if (ps.p_l_follow)
217 		    break;	/* inside parens: cast */
218 		l_struct = true;
219 
220 		/*
221 		 * Next time around, we will want to know that we have had a
222 		 * 'struct'
223 		 */
224 	    case 4:		/* one of the declaration keywords */
225 		if (ps.p_l_follow) {
226 		    ps.cast_mask |= 1 << ps.p_l_follow;
227 		    break;	/* inside parens: cast */
228 		}
229 		last_code = decl;
230 		return (decl);
231 
232 	    case 5:		/* if, while, for */
233 		return (sp_paren);
234 
235 	    case 6:		/* do, else */
236 		return (sp_nparen);
237 
238 	    case 7:
239 		ps.sizeof_keyword = true;
240 	    default:		/* all others are treated like any other
241 				 * identifier */
242 		return (ident);
243 	    }			/* end of switch */
244 	}			/* end of if (found_it) */
245 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
246 	    register char *tp = buf_ptr;
247 	    while (tp < buf_end)
248 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
249 		    goto not_proc;
250 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
251 	    ps.in_parameter_declaration = 1;
252 	    rparen_count = 1;
253     not_proc:;
254 	}
255 	/*
256 	 * The following hack attempts to guess whether or not the current
257 	 * token is in fact a declaration keyword -- one that has been
258 	 * typedefd
259 	 */
260 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
261 		&& !ps.p_l_follow
262 	        && !ps.block_init
263 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
264 		    ps.last_token == decl ||
265 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
266 	    ps.its_a_keyword = true;
267 	    ps.last_u_d = true;
268 	    last_code = decl;
269 	    return decl;
270 	}
271 	if (last_code == decl)	/* if this is a declared variable, then
272 				 * following sign is unary */
273 	    ps.last_u_d = true;	/* will make "int a -1" work */
274 	last_code = ident;
275 	return (ident);		/* the ident is not in the list */
276     }				/* end of procesing for alpanum character */
277 
278     /* Scan a non-alphanumeric token */
279 
280     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
281 				 * moved here */
282     *e_token = '\0';
283     if (++buf_ptr >= buf_end)
284 	fill_buffer();
285 
286     switch (*token) {
287     case '\n':
288 	unary_delim = ps.last_u_d;
289 	ps.last_nl = true;	/* remember that we just had a newline */
290 	code = (had_eof ? 0 : newline);
291 
292 	/*
293 	 * if data has been exausted, the newline is a dummy, and we should
294 	 * return code to stop
295 	 */
296 	break;
297 
298     case '\'':			/* start of quoted character */
299     case '"':			/* start of string */
300 	qchar = *token;
301 	if (troff) {
302 	    e_token[-1] = '`';
303 	    if (qchar == '"')
304 		*e_token++ = '`';
305 	    e_token = chfont(&bodyf, &stringf, e_token);
306 	}
307 	do {			/* copy the string */
308 	    while (1) {		/* move one character or [/<char>]<char> */
309 		if (*buf_ptr == '\n') {
310 		    printf("%d: Unterminated literal\n", line_no);
311 		    goto stop_lit;
312 		}
313 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
314 					 * since CHECK_SIZE guarantees that there
315 					 * are at least 5 entries left */
316 		*e_token = *buf_ptr++;
317 		if (buf_ptr >= buf_end)
318 		    fill_buffer();
319 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
320 		    if (*buf_ptr == '\n')	/* check for escaped newline */
321 			++line_no;
322 		    if (troff) {
323 			*++e_token = BACKSLASH;
324 			if (*buf_ptr == BACKSLASH)
325 			    *++e_token = BACKSLASH;
326 		    }
327 		    *++e_token = *buf_ptr++;
328 		    ++e_token;	/* we must increment this again because we
329 				 * copied two chars */
330 		    if (buf_ptr >= buf_end)
331 			fill_buffer();
332 		}
333 		else
334 		    break;	/* we copied one character */
335 	    }			/* end of while (1) */
336 	} while (*e_token++ != qchar);
337 	if (troff) {
338 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
339 	    if (qchar == '"')
340 		*e_token++ = '\'';
341 	}
342 stop_lit:
343 	code = ident;
344 	break;
345 
346     case ('('):
347     case ('['):
348 	unary_delim = true;
349 	code = lparen;
350 	break;
351 
352     case (')'):
353     case (']'):
354 	code = rparen;
355 	break;
356 
357     case '#':
358 	unary_delim = ps.last_u_d;
359 	code = preesc;
360 	break;
361 
362     case '?':
363 	unary_delim = true;
364 	code = question;
365 	break;
366 
367     case (':'):
368 	code = colon;
369 	unary_delim = true;
370 	break;
371 
372     case (';'):
373 	unary_delim = true;
374 	code = semicolon;
375 	break;
376 
377     case ('{'):
378 	unary_delim = true;
379 
380 	/*
381 	 * if (ps.in_or_st) ps.block_init = 1;
382 	 */
383 	/* ?	code = ps.block_init ? lparen : lbrace; */
384 	code = lbrace;
385 	break;
386 
387     case ('}'):
388 	unary_delim = true;
389 	/* ?	code = ps.block_init ? rparen : rbrace; */
390 	code = rbrace;
391 	break;
392 
393     case 014:			/* a form feed */
394 	unary_delim = ps.last_u_d;
395 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
396 				 * right */
397 	code = form_feed;
398 	break;
399 
400     case (','):
401 	unary_delim = true;
402 	code = comma;
403 	break;
404 
405     case '.':
406 	unary_delim = false;
407 	code = period;
408 	break;
409 
410     case '-':
411     case '+':			/* check for -, +, --, ++ */
412 	code = (ps.last_u_d ? unary_op : binary_op);
413 	unary_delim = true;
414 
415 	if (*buf_ptr == token[0]) {
416 	    /* check for doubled character */
417 	    *e_token++ = *buf_ptr++;
418 	    /* buffer overflow will be checked at end of loop */
419 	    if (last_code == ident || last_code == rparen) {
420 		code = (ps.last_u_d ? unary_op : postop);
421 		/* check for following ++ or -- */
422 		unary_delim = false;
423 	    }
424 	}
425 	else if (*buf_ptr == '=')
426 	    /* check for operator += */
427 	    *e_token++ = *buf_ptr++;
428 	else if (*buf_ptr == '>') {
429 	    /* check for operator -> */
430 	    *e_token++ = *buf_ptr++;
431 	    if (!pointer_as_binop) {
432 		unary_delim = false;
433 		code = unary_op;
434 		ps.want_blank = false;
435 	    }
436 	}
437 	break;			/* buffer overflow will be checked at end of
438 				 * switch */
439 
440     case '=':
441 	if (ps.in_or_st)
442 	    ps.block_init = 1;
443 #ifdef undef
444 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
445 	    e_token[-1] = *buf_ptr++;
446 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
447 		*e_token++ = *buf_ptr++;
448 	    *e_token++ = '=';	/* Flip =+ to += */
449 	    *e_token = 0;
450 	}
451 #else
452 	if (*buf_ptr == '=') {/* == */
453 	    *e_token++ = '=';	/* Flip =+ to += */
454 	    buf_ptr++;
455 	    *e_token = 0;
456 	}
457 #endif
458 	code = binary_op;
459 	unary_delim = true;
460 	break;
461 	/* can drop thru!!! */
462 
463     case '>':
464     case '<':
465     case '!':			/* ops like <, <<, <=, !=, etc */
466 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
467 	    *e_token++ = *buf_ptr;
468 	    if (++buf_ptr >= buf_end)
469 		fill_buffer();
470 	}
471 	if (*buf_ptr == '=')
472 	    *e_token++ = *buf_ptr++;
473 	code = (ps.last_u_d ? unary_op : binary_op);
474 	unary_delim = true;
475 	break;
476 
477     default:
478 	if (token[0] == '/' && *buf_ptr == '*') {
479 	    /* it is start of comment */
480 	    *e_token++ = '*';
481 
482 	    if (++buf_ptr >= buf_end)
483 		fill_buffer();
484 
485 	    code = comment;
486 	    unary_delim = ps.last_u_d;
487 	    break;
488 	}
489 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
490 	    /*
491 	     * handle ||, &&, etc, and also things as in int *****i
492 	     */
493 	    *e_token++ = *buf_ptr;
494 	    if (++buf_ptr >= buf_end)
495 		fill_buffer();
496 	}
497 	code = (ps.last_u_d ? unary_op : binary_op);
498 	unary_delim = true;
499 
500 
501     }				/* end of switch */
502     if (code != newline) {
503 	l_struct = false;
504 	last_code = code;
505     }
506     if (buf_ptr >= buf_end)	/* check for input buffer empty */
507 	fill_buffer();
508     ps.last_u_d = unary_delim;
509     *e_token = '\0';		/* null terminate the token */
510     return (code);
511 }
512 
513 /*
514  * Add the given keyword to the keyword table, using val as the keyword type
515  */
516 addkey(key, val)
517     char       *key;
518 {
519     register struct templ *p = specials;
520     while (p->rwd)
521 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
522 	    return;
523 	else
524 	    p++;
525     if (p >= specials + sizeof specials / sizeof specials[0])
526 	return;			/* For now, table overflows are silently
527 				 * ignored */
528     p->rwd = key;
529     p->rwcode = val;
530     p[1].rwd = 0;
531     p[1].rwcode = 0;
532     return;
533 }
534