xref: /original-bsd/usr.bin/indent/lexi.c (revision a9c19d04)
1 /*
2  * Copyright (c) 1980 Regents of the University of California.
3  * All rights reserved.  The Berkeley software License Agreement
4  * specifies the terms and conditions for redistribution.
5  */
6 
/*
 * SCCS version identification string for this file.  It is compiled out
 * when the `lint' macro is defined.
 */
#ifndef lint
static char sccsid[] = "@(#)lexi.c	5.4 (Berkeley) 09/10/85";
#endif /* not lint */
10 
11 /*-
12  *
13  *			  Copyright (C) 1976
14  *				by the
15  *			  Board of Trustees
16  *				of the
17  *			University of Illinois
18  *
19  *			 All rights reserved
20  *
21  *
22  * NAME:
23  *	lexi
24  *
25  * FUNCTION:
26  *	This is the token scanner for indent
27  *
28  * ALGORITHM:
29  *	1) Strip off intervening blanks and/or tabs.
30  *	2) If it is an alphanumeric token, move it to the token buffer "token".
31  *	   Check if it is a special reserved word that indent will want to
32  *	   know about.
33  *	3) Non-alphanumeric tokens are handled with a big switch statement.  A
34  *	   flag is kept to remember if the last token was a "unary delimiter",
35  *	   which forces a following operator to be unary as opposed to binary.
36  *
37  * PARAMETERS:
38  *	None
39  *
40  * RETURNS:
41  *	An integer code indicating the type of token scanned.
42  *
43  * GLOBALS:
44  *	buf_ptr =
45  *	had_eof
46  *	ps.last_u_d =	Set to true iff this token is a "unary delimiter"
47  *
48  * CALLS:
49  *	fill_buffer
50  *	printf (lib)
51  *
52  * CALLED BY:
53  *	main
54  *
55  * NOTES:
56  *	Start of comment is passed back so that the comment can be scanned by
57  *	pr_comment.
58  *
59  *	Strings and character literals are returned just like identifiers.
60  *
61  * HISTORY:
62  *	initial coding 	November 1976	D A Willcox of CAC
63  *	1/7/77		D A Willcox of CAC	Fix to provide proper handling
64  *						of "int a -1;"
65  *
66  */
67 
68 /*
69  * Here we have the token scanner for indent.  It scans off one token and
70  * puts it in the global variable "token".  It returns a code, indicating
71  * the type of token scanned.
72  */
73 
74 #include "indent_globs.h";
75 #include "indent_codes.h";
76 #include "ctype.h"
77 
/*
 * Character classes stored in chartype[].  Any other value found in the
 * table (0) means the character is neither of these.
 */
#define alphanum 1		/* letters, digits and '_' */
#define opchar 3		/* characters that can form an operator */
80 
/*
 * One entry of the reserved-word table searched by lexi().
 */
struct templ {
    char       *rwd;		/* spelling of the keyword; a null pointer
				 * marks the end of the table */
    int         rwcode;		/* keyword class, see below */
};

/*
 * Table of reserved words.  The rwcode values are dispatched on in lexi():
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 *	0	anything else; returned like an ordinary identifier
 * The table is terminated by an entry whose rwd is null.  addkey() appends
 * further entries at run time, which is why the array is larger than the
 * initial contents.
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
118 
/*
 * Classification of the 128 seven-bit character codes, indexed by the
 * character value: 1 for letters, digits and underscore, 3 for characters
 * that can be part of an operator, 0 for everything else.
 */
char        chartype[128] =
{
    /* 000-017: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 020-037: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 040-057: blank and punctuation, from space up to slash */
    0, 3, 0, 0, 0, 3, 3, 0, 0, 0, 3, 3, 0, 3, 3, 3,
    /* 060-077: the digits, then colon up to question mark */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 100-117: at-sign, then upper case A to O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 120-137: upper case P to Z, then brackets, caret, underscore */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 140-157: grave accent, then lower case a to o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 160-177: lower case p to z, then braces, bar, tilde, DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
140 
141 
142 
143 
144 int
145 lexi()
146 {
147     register char *tok;		/* local pointer to next char in token */
148     int         unary_delim;	/* this is set to 1 if the current token
149 				 *
150 				 * forces a following operator to be unary */
151     static int  last_code;	/* the last token type returned */
152     static int  l_struct;	/* set to 1 if the last token was 'struct' */
153     int         code;		/* internal code to be returned */
154     char        qchar;		/* the delimiter character for a string */
155 
156     tok = token;		/* point to start of place to save token */
157     unary_delim = false;
158     ps.col_1 = ps.last_nl;	/* tell world that this token started in
159 				 * column 1 iff the last thing scanned was
160 				 * nl */
161     ps.last_nl = false;
162 
163     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
164 	ps.col_1 = false;	/* leading blanks imply token is not in
165 				 * column 1 */
166 	if (++buf_ptr >= buf_end)
167 	    fill_buffer();
168     }
169 
170     /* Scan an alphanumeric token.  Note that we must also handle
171      * stuff like "1.0e+03" and "7e-6". */
172     if (chartype[*buf_ptr & 0177] == alphanum) {	/* we have a character
173 							 * or number */
174 	register char *j;	/* used for searching thru list of
175 				 * reserved words */
176 	register struct templ *p;
177 	register int c;
178 
179 	do {			/* copy it over */
180 	    *tok++ = *buf_ptr++;
181 	    if (buf_ptr >= buf_end)
182 		fill_buffer();
183 	} while (chartype[c = *buf_ptr & 0177] == alphanum ||
184 		isdigit(token[0]) && (c == '+' || c == '-') &&
185 		(tok[-1] == 'e' || tok[-1] == 'E'));
186 	*tok++ = '\0';
187 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
188 	    if (++buf_ptr >= buf_end)
189 		fill_buffer();
190 	}
191 	ps.its_a_keyword = false;
192 	ps.sizeof_keyword = false;
193 	if (l_struct) {		/* if last token was 'struct', then this
194 				 * token should be treated as a
195 				 * declaration */
196 	    l_struct = false;
197 	    last_code = ident;
198 	    ps.last_u_d = true;
199 	    return (decl);
200 	}
201 	ps.last_u_d = false;	/* Operator after indentifier is binary */
202 	last_code = ident;	/* Remember that this is the code we will
203 				 * return */
204 
205 	/*
206 	 * This loop will check if the token is a keyword.
207 	 */
208 	for (p = specials; (j = p->rwd) != 0; p++) {
209 	    tok = token;	/* point at scanned token */
210 	    if (*j++ != *tok++ || *j++ != *tok++)
211 		continue;	/* This test depends on the fact that
212 				 * identifiers are always at least 1
213 				 * character long (ie. the first two bytes
214 				 * of the identifier are always
215 				 * meaningful) */
216 	    if (tok[-1] == 0)
217 		break;		/* If its a one-character identifier */
218 	    while (*tok++ == *j)
219 		if (*j++ == 0)
220 		    goto found_keyword;	/* I wish that C had a multi-level
221 					 * break... */
222 	}
223 	if (p->rwd) {		/* we have a keyword */
224     found_keyword:
225 	    ps.its_a_keyword = true;
226 	    ps.last_u_d = true;
227 	    switch (p->rwcode) {
228 		case 1:	/* it is a switch */
229 		    return (swstmt);
230 		case 2:	/* a case or default */
231 		    return (casestmt);
232 
233 		case 3:	/* a "struct" */
234 		    if (ps.p_l_follow)
235 			break;	/* inside parens: cast */
236 		    l_struct = true;
237 
238 		    /*
239 		     * Next time around, we will want to know that we have
240 		     * had a 'struct'
241 		     */
242 		case 4:	/* one of the declaration keywords */
243 		    if (ps.p_l_follow) {
244 			ps.cast_mask |= 1 << ps.p_l_follow;
245 			break;	/* inside parens: cast */
246 		    }
247 		    last_code = decl;
248 		    return (decl);
249 
250 		case 5:	/* if, while, for */
251 		    return (sp_paren);
252 
253 		case 6:	/* do, else */
254 		    return (sp_nparen);
255 
256 		case 7:
257 		    ps.sizeof_keyword = true;
258 		default:	/* all others are treated like any other
259 				 * identifier */
260 		    return (ident);
261 	    }			/* end of switch */
262 	}			/* end of if (found_it) */
263 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0
264 	    && (buf_ptr[1] != ')' || buf_ptr[2] != ';')) {
265 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
266 	    ps.in_parameter_declaration = 1;
267 	}
268 
269 	/*
270 	 * The following hack attempts to guess whether or not the current
271 	 * token is in fact a declaration keyword -- one that has been
272 	 * typedefd
273 	 */
274 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr))
275 	    && !ps.p_l_follow
276 	    && (ps.last_token == rparen || ps.last_token == semicolon ||
277 		ps.last_token == decl ||
278 		ps.last_token == lbrace || ps.last_token == rbrace)) {
279 	    ps.its_a_keyword = true;
280 	    ps.last_u_d = true;
281 	    last_code = decl;
282 	    return decl;
283 	}
284 	if (last_code == decl)	/* if this is a declared variable, then
285 				 * following sign is unary */
286 	    ps.last_u_d = true;	/* will make "int a -1" work */
287 	last_code = ident;
288 	return (ident);		/* the ident is not in the list */
289     }				/* end of procesing for alpanum character */
290     /* Scan a non-alphanumeric token */
291 
292     *tok++ = *buf_ptr;		/* if it is only a one-character token, it
293 				 * is moved here */
294     *tok = '\0';
295     if (++buf_ptr >= buf_end)
296 	fill_buffer();
297 
298     switch (*token) {
299 	case '\n':
300 	    unary_delim = ps.last_u_d;
301 	    ps.last_nl = true;	/* remember that we just had a newline */
302 	    code = (had_eof ? 0 : newline);
303 
304 	    /*
305 	     * if data has been exausted, the newline is a dummy, and we
306 	     * should return code to stop
307 	     */
308 	    break;
309 
310 	case '\'':		/* start of quoted character */
311 	case '"':		/* start of string */
312 	    qchar = *token;
313 	    if (troff) {
314 		tok[-1] = '`';
315 		if (qchar == '"')
316 		    *tok++ = '`';
317 		*tok++ = BACKSLASH;
318 		*tok++ = 'f';
319 		*tok++ = 'L';
320 	    }
321 	    do {		/* copy the string */
322 		while (1) {	/* move one character or [/<char>]<char> */
323 		    if (*buf_ptr == '\n') {
324 			printf("%d: Unterminated literal\n", line_no);
325 			goto stop_lit;
326 		    }
327 		    *tok = *buf_ptr++;
328 		    if (buf_ptr >= buf_end)
329 			fill_buffer();
330 		    if (had_eof || ((tok - token) > (bufsize - 2))) {
331 			printf("Unterminated literal\n");
332 			++tok;
333 			goto stop_lit;
334 			/* get outof literal copying loop */
335 		    }
336 		    if (*tok == BACKSLASH) {	/* if escape, copy extra
337 						 * char */
338 			if (*buf_ptr == '\n')	/* check for escaped
339 						 * newline */
340 			    ++line_no;
341 			if (troff) {
342 			    *++tok = BACKSLASH;
343 			    if (*buf_ptr == BACKSLASH)
344 				*++tok = BACKSLASH;
345 			}
346 			*++tok = *buf_ptr++;
347 			++tok;	/* we must increment this again because we
348 				 * copied two chars */
349 			if (buf_ptr >= buf_end)
350 			    fill_buffer();
351 		    }
352 		    else
353 			break;	/* we copied one character */
354 		}		/* end of while (1) */
355 	    } while (*tok++ != qchar);
356 	    if (troff) {
357 		tok[-1] = BACKSLASH;
358 		*tok++ = 'f';
359 		*tok++ = 'R';
360 		*tok++ = '\'';
361 		if (qchar == '"')
362 		    *tok++ = '\'';
363 	    }
364     stop_lit:
365 	    code = ident;
366 	    break;
367 
368 	case ('('):
369 	case ('['):
370 	    unary_delim = true;
371 	    code = lparen;
372 	    break;
373 
374 	case (')'):
375 	case (']'):
376 	    code = rparen;
377 	    break;
378 
379 	case '#':
380 	    unary_delim = ps.last_u_d;
381 	    code = preesc;
382 	    break;
383 
384 	case '?':
385 	    unary_delim = true;
386 	    code = question;
387 	    break;
388 
389 	case (':'):
390 	    code = colon;
391 	    unary_delim = true;
392 	    break;
393 
394 	case (';'):
395 	    unary_delim = true;
396 	    code = semicolon;
397 	    break;
398 
399 	case ('{'):
400 	    unary_delim = true;
401 
402 	    /*
403 	     * if (ps.in_or_st) ps.block_init = 1;
404 	     */
405 	    code = ps.block_init ? lparen : lbrace;
406 	    break;
407 
408 	case ('}'):
409 	    unary_delim = true;
410 	    code = ps.block_init ? rparen : rbrace;
411 	    break;
412 
413 	case 014:		/* a form feed */
414 	    unary_delim = ps.last_u_d;
415 	    ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
416 				 * right */
417 	    code = form_feed;
418 	    break;
419 
420 	case (','):
421 	    unary_delim = true;
422 	    code = comma;
423 	    break;
424 
425 	case '.':
426 	    unary_delim = false;
427 	    code = period;
428 	    break;
429 
430 	case '-':
431 	case '+':		/* check for -, +, --, ++ */
432 	    code = (ps.last_u_d ? unary_op : binary_op);
433 	    unary_delim = true;
434 
435 	    if (*buf_ptr == token[0]) {
436 		/* check for doubled character */
437 		*tok++ = *buf_ptr++;
438 		/* buffer overflow will be checked at end of loop */
439 		if (last_code == ident || last_code == rparen) {
440 		    code = (ps.last_u_d ? unary_op : postop);
441 		    /* check for following ++ or -- */
442 		    unary_delim = false;
443 		}
444 	    }
445 	    else if (*buf_ptr == '=')
446 		/* check for operator += */
447 		*tok++ = *buf_ptr++;
448 	    else if (token[0] == '-' && *buf_ptr == '>') {
449 		/* check for operator -> */
450 		*tok++ = *buf_ptr++;
451 		if (!pointer_as_binop) {
452 		    code = unary_op;
453 		    unary_delim = false;
454 		    ps.want_blank = false;
455 		}
456 	    }
457 	    /* buffer overflow will be checked at end of switch */
458 
459 	    break;
460 
461 	case '=':
462 	    if (ps.in_or_st)
463 		ps.block_init = 1;
464 	    if (chartype[*buf_ptr] == opchar) {	/* we have two char
465 						 * assignment */
466 		tok[-1] = *buf_ptr++;
467 		if ((tok[-1] == '<' || tok[-1] == '>') && tok[-1] == *buf_ptr)
468 		    *tok++ = *buf_ptr++;
469 		*tok++ = '=';	/* Flip =+ to += */
470 		*tok = 0;
471 	    }
472 	    code = binary_op;
473 	    unary_delim = true;
474 	    break;
475 	    /* can drop thru!!! */
476 
477 	case '>':
478 	case '<':
479 	case '!':		/* ops like <, <<, <=, !=, etc */
480 	    if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
481 		*tok++ = *buf_ptr;
482 		if (++buf_ptr >= buf_end)
483 		    fill_buffer();
484 	    }
485 	    if (*buf_ptr == '=')
486 		*tok++ = *buf_ptr++;
487 	    code = (ps.last_u_d ? unary_op : binary_op);
488 	    unary_delim = true;
489 	    break;
490 
491 	default:
492 	    if (token[0] == '/' && *buf_ptr == '*') {
493 		/* it is start of comment */
494 		*tok++ = '*';
495 
496 		if (++buf_ptr >= buf_end)
497 		    fill_buffer();
498 
499 		code = comment;
500 		unary_delim = ps.last_u_d;
501 		break;
502 	    }
503 	    while (*(tok - 1) == *buf_ptr || *buf_ptr == '=') {
504 		/* handle ||, &&, etc, and also things as in int *****i */
505 		*tok++ = *buf_ptr;
506 		if (++buf_ptr >= buf_end)
507 		    fill_buffer();
508 	    }
509 	    code = (ps.last_u_d ? unary_op : binary_op);
510 	    unary_delim = true;
511 
512 
513     }				/* end of switch */
514     if (code != newline) {
515 	l_struct = false;
516 	last_code = code;
517     }
518     if (buf_ptr >= buf_end)	/* check for input buffer empty */
519 	fill_buffer();
520     ps.last_u_d = unary_delim;
521     *tok = '\0';		/* null terminate the token */
522     return (code);
523 };
524 
525 /* Add the given keyword to the keyword table, using val as the keyword type
526    */
527 addkey (key, val)
528 char       *key;
529 {
530     register struct templ *p = specials;
531     while (p->rwd)
532 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
533 	    return;
534 	else
535 	    p++;
536     if (p >= specials + sizeof specials / sizeof specials[0])
537 	return;			/* For now, table overflows are silently
538 				   ignored */
539     p->rwd = key;
540     p->rwcode = val;
541     p[1].rwd = 0;
542     p[1].rwcode = 0;
543     return;
544 }
545