xref: /original-bsd/usr.bin/indent/lexi.c (revision 9acaf688)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980 The Regents of the University of California.
4  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
5  * All rights reserved.
6  *
7  * %sccs.include.redist.c%
8  */
9 
#ifndef lint
/* SCCS identification string for this file; compiled out when lint is defined. */
static char sccsid[] = "@(#)lexi.c	5.15 (Berkeley) 06/01/90";
#endif /* not lint */
13 
14 /*
15  * Here we have the token scanner for indent.  It scans off one token and puts
16  * it in the global variable "token".  It returns a code, indicating the type
17  * of token scanned.
18  */
19 
#include "indent_globs.h"
#include "indent_codes.h"
#include <ctype.h>
#include <string.h>
23 
/*
 * Character class codes stored in chartype[] (a value of 0 there means the
 * character belongs to neither class).
 */
#define alphanum 1		/* character may appear in an identifier or
				 * number */
#define opchar 3		/* character may appear in an operator */
26 
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() switches on when the word is scanned.
 */
struct templ {
    char       *rwd;		/* keyword text; a null pointer terminates the
				 * table */
    int         rwcode;		/* keyword class, see below */
};

/*
 * Reserved-word table searched by lexi() and extended by addkey().
 *
 * Keyword classes (rwcode), as interpreted by lexi():
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords (types and storage classes)
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 *	0	anything else; returned as an ordinary identifier
 *
 * The list ends at the first entry whose rwd is null; the remaining slots of
 * the array are spare room for keywords added at run time.
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", so it never
				 * matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
64 
/*
 * Character classification table, indexed by 7-bit ASCII code.  It is used to
 * decide what kind of token a character can belong to:
 *	1	alphanumeric (letters, digits, '$' and '_')
 *	3	operator character
 *	0	neither
 * Each row below covers sixteen consecutive character codes.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space and punctuation */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: digits, then punctuation */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: at-sign, upper case A-O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: upper case P-Z, brackets, caret, underscore */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: back quote, lower case a-o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: lower case p-z, braces, bar, tilde, DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
86 
87 
88 
89 
90 int
91 lexi()
92 {
93     int         unary_delim;	/* this is set to 1 if the current token
94 				 *
95 				 * forces a following operator to be unary */
96     static int  last_code;	/* the last token type returned */
97     static int  l_struct;	/* set to 1 if the last token was 'struct' */
98     int         code;		/* internal code to be returned */
99     char        qchar;		/* the delimiter character for a string */
100 
101     e_token = s_token;		/* point to start of place to save token */
102     unary_delim = false;
103     ps.col_1 = ps.last_nl;	/* tell world that this token started in
104 				 * column 1 iff the last thing scanned was nl */
105     ps.last_nl = false;
106 
107     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
108 	ps.col_1 = false;	/* leading blanks imply token is not in column
109 				 * 1 */
110 	if (++buf_ptr >= buf_end)
111 	    fill_buffer();
112     }
113 
114     /* Scan an alphanumeric token */
115     if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
116 	/*
117 	 * we have a character or number
118 	 */
119 	register char *j;	/* used for searching thru list of
120 				 *
121 				 * reserved words */
122 	register struct templ *p;
123 
124 	if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
125 	    int         seendot = 0,
126 	                seenexp = 0;
127 	    if (*buf_ptr == '0' &&
128 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
129 		*e_token++ = *buf_ptr++;
130 		*e_token++ = *buf_ptr++;
131 		while (isxdigit(*buf_ptr)) {
132 		    CHECK_SIZE_TOKEN;
133 		    *e_token++ = *buf_ptr++;
134 		}
135 	    }
136 	    else
137 		while (1) {
138 		    if (*buf_ptr == '.')
139 			if (seendot)
140 			    break;
141 			else
142 			    seendot++;
143 		    CHECK_SIZE_TOKEN;
144 		    *e_token++ = *buf_ptr++;
145 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.')
146 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
147 			    break;
148 			else {
149 			    seenexp++;
150 			    seendot++;
151 			    CHECK_SIZE_TOKEN;
152 			    *e_token++ = *buf_ptr++;
153 			    if (*buf_ptr == '+' || *buf_ptr == '-')
154 				*e_token++ = *buf_ptr++;
155 			}
156 		}
157 	    if (*buf_ptr == 'L' || *buf_ptr == 'l')
158 		*e_token++ = *buf_ptr++;
159 	}
160 	else
161 	    while (chartype[*buf_ptr] == alphanum) {	/* copy it over */
162 		CHECK_SIZE_TOKEN;
163 		*e_token++ = *buf_ptr++;
164 		if (buf_ptr >= buf_end)
165 		    fill_buffer();
166 	    }
167 	*e_token++ = '\0';
168 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
169 	    if (++buf_ptr >= buf_end)
170 		fill_buffer();
171 	}
172 	ps.its_a_keyword = false;
173 	ps.sizeof_keyword = false;
174 	if (l_struct) {		/* if last token was 'struct', then this token
175 				 * should be treated as a declaration */
176 	    l_struct = false;
177 	    last_code = ident;
178 	    ps.last_u_d = true;
179 	    return (decl);
180 	}
181 	ps.last_u_d = false;	/* Operator after indentifier is binary */
182 	last_code = ident;	/* Remember that this is the code we will
183 				 * return */
184 
185 	/*
186 	 * This loop will check if the token is a keyword.
187 	 */
188 	for (p = specials; (j = p->rwd) != 0; p++) {
189 	    register char *p = s_token;	/* point at scanned token */
190 	    if (*j++ != *p++ || *j++ != *p++)
191 		continue;	/* This test depends on the fact that
192 				 * identifiers are always at least 1 character
193 				 * long (ie. the first two bytes of the
194 				 * identifier are always meaningful) */
195 	    if (p[-1] == 0)
196 		break;		/* If its a one-character identifier */
197 	    while (*p++ == *j)
198 		if (*j++ == 0)
199 		    goto found_keyword;	/* I wish that C had a multi-level
200 					 * break... */
201 	}
202 	if (p->rwd) {		/* we have a keyword */
203     found_keyword:
204 	    ps.its_a_keyword = true;
205 	    ps.last_u_d = true;
206 	    switch (p->rwcode) {
207 	    case 1:		/* it is a switch */
208 		return (swstmt);
209 	    case 2:		/* a case or default */
210 		return (casestmt);
211 
212 	    case 3:		/* a "struct" */
213 		if (ps.p_l_follow)
214 		    break;	/* inside parens: cast */
215 		l_struct = true;
216 
217 		/*
218 		 * Next time around, we will want to know that we have had a
219 		 * 'struct'
220 		 */
221 	    case 4:		/* one of the declaration keywords */
222 		if (ps.p_l_follow) {
223 		    ps.cast_mask |= 1 << ps.p_l_follow;
224 		    break;	/* inside parens: cast */
225 		}
226 		last_code = decl;
227 		return (decl);
228 
229 	    case 5:		/* if, while, for */
230 		return (sp_paren);
231 
232 	    case 6:		/* do, else */
233 		return (sp_nparen);
234 
235 	    case 7:
236 		ps.sizeof_keyword = true;
237 	    default:		/* all others are treated like any other
238 				 * identifier */
239 		return (ident);
240 	    }			/* end of switch */
241 	}			/* end of if (found_it) */
242 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
243 	    register char *tp = buf_ptr;
244 	    while (tp < buf_end)
245 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
246 		    goto not_proc;
247 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
248 	    ps.in_parameter_declaration = 1;
249 	    rparen_count = 1;
250     not_proc:;
251 	}
252 	/*
253 	 * The following hack attempts to guess whether or not the current
254 	 * token is in fact a declaration keyword -- one that has been
255 	 * typedefd
256 	 */
257 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
258 		&& !ps.p_l_follow
259 	        && !ps.block_init
260 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
261 		    ps.last_token == decl ||
262 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
263 	    ps.its_a_keyword = true;
264 	    ps.last_u_d = true;
265 	    last_code = decl;
266 	    return decl;
267 	}
268 	if (last_code == decl)	/* if this is a declared variable, then
269 				 * following sign is unary */
270 	    ps.last_u_d = true;	/* will make "int a -1" work */
271 	last_code = ident;
272 	return (ident);		/* the ident is not in the list */
273     }				/* end of procesing for alpanum character */
274 
275     /* Scan a non-alphanumeric token */
276 
277     *e_token++ = *buf_ptr;		/* if it is only a one-character token, it is
278 				 * moved here */
279     *e_token = '\0';
280     if (++buf_ptr >= buf_end)
281 	fill_buffer();
282 
283     switch (*token) {
284     case '\n':
285 	unary_delim = ps.last_u_d;
286 	ps.last_nl = true;	/* remember that we just had a newline */
287 	code = (had_eof ? 0 : newline);
288 
289 	/*
290 	 * if data has been exausted, the newline is a dummy, and we should
291 	 * return code to stop
292 	 */
293 	break;
294 
295     case '\'':			/* start of quoted character */
296     case '"':			/* start of string */
297 	qchar = *token;
298 	if (troff) {
299 	    e_token[-1] = '`';
300 	    if (qchar == '"')
301 		*e_token++ = '`';
302 	    e_token = chfont(&bodyf, &stringf, e_token);
303 	}
304 	do {			/* copy the string */
305 	    while (1) {		/* move one character or [/<char>]<char> */
306 		if (*buf_ptr == '\n') {
307 		    printf("%d: Unterminated literal\n", line_no);
308 		    goto stop_lit;
309 		}
310 		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
311 					 * since CHECK_SIZE guarantees that there
312 					 * are at least 5 entries left */
313 		*e_token = *buf_ptr++;
314 		if (buf_ptr >= buf_end)
315 		    fill_buffer();
316 		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
317 		    if (*buf_ptr == '\n')	/* check for escaped newline */
318 			++line_no;
319 		    if (troff) {
320 			*++e_token = BACKSLASH;
321 			if (*buf_ptr == BACKSLASH)
322 			    *++e_token = BACKSLASH;
323 		    }
324 		    *++e_token = *buf_ptr++;
325 		    ++e_token;	/* we must increment this again because we
326 				 * copied two chars */
327 		    if (buf_ptr >= buf_end)
328 			fill_buffer();
329 		}
330 		else
331 		    break;	/* we copied one character */
332 	    }			/* end of while (1) */
333 	} while (*e_token++ != qchar);
334 	if (troff) {
335 	    e_token = chfont(&stringf, &bodyf, e_token - 1);
336 	    if (qchar == '"')
337 		*e_token++ = '\'';
338 	}
339 stop_lit:
340 	code = ident;
341 	break;
342 
343     case ('('):
344     case ('['):
345 	unary_delim = true;
346 	code = lparen;
347 	break;
348 
349     case (')'):
350     case (']'):
351 	code = rparen;
352 	break;
353 
354     case '#':
355 	unary_delim = ps.last_u_d;
356 	code = preesc;
357 	break;
358 
359     case '?':
360 	unary_delim = true;
361 	code = question;
362 	break;
363 
364     case (':'):
365 	code = colon;
366 	unary_delim = true;
367 	break;
368 
369     case (';'):
370 	unary_delim = true;
371 	code = semicolon;
372 	break;
373 
374     case ('{'):
375 	unary_delim = true;
376 
377 	/*
378 	 * if (ps.in_or_st) ps.block_init = 1;
379 	 */
380 	/* ?	code = ps.block_init ? lparen : lbrace; */
381 	code = lbrace;
382 	break;
383 
384     case ('}'):
385 	unary_delim = true;
386 	/* ?	code = ps.block_init ? rparen : rbrace; */
387 	code = rbrace;
388 	break;
389 
390     case 014:			/* a form feed */
391 	unary_delim = ps.last_u_d;
392 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
393 				 * right */
394 	code = form_feed;
395 	break;
396 
397     case (','):
398 	unary_delim = true;
399 	code = comma;
400 	break;
401 
402     case '.':
403 	unary_delim = false;
404 	code = period;
405 	break;
406 
407     case '-':
408     case '+':			/* check for -, +, --, ++ */
409 	code = (ps.last_u_d ? unary_op : binary_op);
410 	unary_delim = true;
411 
412 	if (*buf_ptr == token[0]) {
413 	    /* check for doubled character */
414 	    *e_token++ = *buf_ptr++;
415 	    /* buffer overflow will be checked at end of loop */
416 	    if (last_code == ident || last_code == rparen) {
417 		code = (ps.last_u_d ? unary_op : postop);
418 		/* check for following ++ or -- */
419 		unary_delim = false;
420 	    }
421 	}
422 	else if (*buf_ptr == '=')
423 	    /* check for operator += */
424 	    *e_token++ = *buf_ptr++;
425 	else if (*buf_ptr == '>') {
426 	    /* check for operator -> */
427 	    *e_token++ = *buf_ptr++;
428 	    if (!pointer_as_binop) {
429 		unary_delim = false;
430 		code = unary_op;
431 		ps.want_blank = false;
432 	    }
433 	}
434 	break;			/* buffer overflow will be checked at end of
435 				 * switch */
436 
437     case '=':
438 	if (ps.in_or_st)
439 	    ps.block_init = 1;
440 #ifdef undef
441 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
442 	    e_token[-1] = *buf_ptr++;
443 	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
444 		*e_token++ = *buf_ptr++;
445 	    *e_token++ = '=';	/* Flip =+ to += */
446 	    *e_token = 0;
447 	}
448 #else
449 	if (*buf_ptr == '=') {/* == */
450 	    *e_token++ = '=';	/* Flip =+ to += */
451 	    buf_ptr++;
452 	    *e_token = 0;
453 	}
454 #endif
455 	code = binary_op;
456 	unary_delim = true;
457 	break;
458 	/* can drop thru!!! */
459 
460     case '>':
461     case '<':
462     case '!':			/* ops like <, <<, <=, !=, etc */
463 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
464 	    *e_token++ = *buf_ptr;
465 	    if (++buf_ptr >= buf_end)
466 		fill_buffer();
467 	}
468 	if (*buf_ptr == '=')
469 	    *e_token++ = *buf_ptr++;
470 	code = (ps.last_u_d ? unary_op : binary_op);
471 	unary_delim = true;
472 	break;
473 
474     default:
475 	if (token[0] == '/' && *buf_ptr == '*') {
476 	    /* it is start of comment */
477 	    *e_token++ = '*';
478 
479 	    if (++buf_ptr >= buf_end)
480 		fill_buffer();
481 
482 	    code = comment;
483 	    unary_delim = ps.last_u_d;
484 	    break;
485 	}
486 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
487 	    /*
488 	     * handle ||, &&, etc, and also things as in int *****i
489 	     */
490 	    *e_token++ = *buf_ptr;
491 	    if (++buf_ptr >= buf_end)
492 		fill_buffer();
493 	}
494 	code = (ps.last_u_d ? unary_op : binary_op);
495 	unary_delim = true;
496 
497 
498     }				/* end of switch */
499     if (code != newline) {
500 	l_struct = false;
501 	last_code = code;
502     }
503     if (buf_ptr >= buf_end)	/* check for input buffer empty */
504 	fill_buffer();
505     ps.last_u_d = unary_delim;
506     *e_token = '\0';		/* null terminate the token */
507     return (code);
508 }
509 
510 /*
511  * Add the given keyword to the keyword table, using val as the keyword type
512  */
513 addkey(key, val)
514     char       *key;
515 {
516     register struct templ *p = specials;
517     while (p->rwd)
518 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
519 	    return;
520 	else
521 	    p++;
522     if (p >= specials + sizeof specials / sizeof specials[0])
523 	return;			/* For now, table overflows are silently
524 				 * ignored */
525     p->rwd = key;
526     p->rwcode = val;
527     p[1].rwd = 0;
528     p[1].rwcode = 0;
529     return;
530 }
531