xref: /original-bsd/usr.bin/indent/lexi.c (revision dddc135c)
1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980 The Regents of the University of California.
4  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms are permitted
8  * provided that the above copyright notice and this paragraph are
9  * duplicated in all such forms and that any documentation,
10  * advertising materials, and other materials related to such
11  * distribution and use acknowledge that the software was developed
12  * by the University of California, Berkeley, the University of Illinois,
13  * Urbana, and Sun Microsystems, Inc.  The name of either University
14  * or Sun Microsystems may not be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
18  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
19  */
20 
21 #ifndef lint
22 static char sccsid[] = "@(#)lexi.c	5.11 (Berkeley) 09/15/88";
23 #endif /* not lint */
24 
25 /*
26  * Here we have the token scanner for indent.  It scans off one token and puts
27  * it in the global variable "token".  It returns a code, indicating the type
28  * of token scanned.
29  */
30 
31 #include "indent_globs.h"
32 #include "indent_codes.h"
33 #include "ctype.h"
34 
35 #define alphanum 1
36 #define opchar 3
37 
/*
 * One entry of the keyword table: the text of a reserved word and the class
 * code that lexi() switches on once the word has been recognized.
 */
struct templ {
    char       *rwd;		/* keyword text; 0 marks the end of the table */
    int         rwcode;		/* keyword class, see the list above specials */
};

/*
 * Keyword table searched by lexi() and extended at run time by addkey().
 * The list ends at the first entry whose rwd is 0; the remaining slots of
 * the 100-element array are free for addkey().
 *
 * Class codes, as dispatched by the switch in lexi():
 *	0  no special handling, the word is returned as an ordinary ident
 *	1  switch
 *	2  case, default
 *	3  struct, union, enum
 *	4  declaration keywords (types and storage classes)
 *	5  if, while, for
 *	6  else, do
 *	7  sizeof
 *
 * "typedef" used to be misspelled "typdef" here, which meant the real
 * keyword was never matched and was scanned as a plain identifier.
 */
struct templ specials[100] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
75 
/*
 * Per-character class table, indexed by 7-bit character code.  An entry is
 * 1 (alphanum) for characters that may appear in identifiers and numbers,
 * 3 (opchar) for operator characters, and 0 for everything else.  Each row
 * below covers sixteen consecutive codes.
 */
char        chartype[128] =
{
    /* 000 - 017: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 020 - 037: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 040 - 057: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 060 - 077: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 100 - 117: @ A B C D E F G H I J K L M N O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 120 - 137: P Q R S T U V W X Y Z [ backslash ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 140 - 157: ` a b c d e f g h i j k l m n o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 160 - 177: p q r s t u v w x y z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
97 
98 
99 
100 
101 int
102 lexi()
103 {
104     register char *tok;		/* local pointer to next char in token */
105     int         unary_delim;	/* this is set to 1 if the current token
106 				 *
107 				 * forces a following operator to be unary */
108     static int  last_code;	/* the last token type returned */
109     static int  l_struct;	/* set to 1 if the last token was 'struct' */
110     int         code;		/* internal code to be returned */
111     char        qchar;		/* the delimiter character for a string */
112 
113     tok = token;		/* point to start of place to save token */
114     unary_delim = false;
115     ps.col_1 = ps.last_nl;	/* tell world that this token started in
116 				 * column 1 iff the last thing scanned was nl */
117     ps.last_nl = false;
118 
119     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
120 	ps.col_1 = false;	/* leading blanks imply token is not in column
121 				 * 1 */
122 	if (++buf_ptr >= buf_end)
123 	    fill_buffer();
124     }
125 
126     /* Scan an alphanumeric token */
127     if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
128 	/*
129 	 * we have a character or number
130 	 */
131 	register char *j;	/* used for searching thru list of
132 				 *
133 				 * reserved words */
134 	register struct templ *p;
135 
136 	if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
137 	    int         seendot = 0,
138 	                seenexp = 0;
139 	    if (*buf_ptr == '0' &&
140 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
141 		*tok++ = *buf_ptr++;
142 		*tok++ = *buf_ptr++;
143 		while (isxdigit(*buf_ptr))
144 		    *tok++ = *buf_ptr++;
145 	    }
146 	    else
147 		while (1) {
148 		    if (*buf_ptr == '.')
149 			if (seendot)
150 			    break;
151 			else
152 			    seendot++;
153 		    *tok++ = *buf_ptr++;
154 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.')
155 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
156 			    break;
157 			else {
158 			    seenexp++;
159 			    seendot++;
160 			    *tok++ = *buf_ptr++;
161 			    if (*buf_ptr == '+' || *buf_ptr == '-')
162 				*tok++ = *buf_ptr++;
163 			}
164 		}
165 	    if (*buf_ptr == 'L' || *buf_ptr == 'l')
166 		*tok++ = *buf_ptr++;
167 	}
168 	else
169 	    while (chartype[*buf_ptr] == alphanum) {	/* copy it over */
170 		*tok++ = *buf_ptr++;
171 		if (buf_ptr >= buf_end)
172 		    fill_buffer();
173 	    }
174 	*tok++ = '\0';
175 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
176 	    if (++buf_ptr >= buf_end)
177 		fill_buffer();
178 	}
179 	ps.its_a_keyword = false;
180 	ps.sizeof_keyword = false;
181 	if (l_struct) {		/* if last token was 'struct', then this token
182 				 * should be treated as a declaration */
183 	    l_struct = false;
184 	    last_code = ident;
185 	    ps.last_u_d = true;
186 	    return (decl);
187 	}
188 	ps.last_u_d = false;	/* Operator after indentifier is binary */
189 	last_code = ident;	/* Remember that this is the code we will
190 				 * return */
191 
192 	/*
193 	 * This loop will check if the token is a keyword.
194 	 */
195 	for (p = specials; (j = p->rwd) != 0; p++) {
196 	    tok = token;	/* point at scanned token */
197 	    if (*j++ != *tok++ || *j++ != *tok++)
198 		continue;	/* This test depends on the fact that
199 				 * identifiers are always at least 1 character
200 				 * long (ie. the first two bytes of the
201 				 * identifier are always meaningful) */
202 	    if (tok[-1] == 0)
203 		break;		/* If its a one-character identifier */
204 	    while (*tok++ == *j)
205 		if (*j++ == 0)
206 		    goto found_keyword;	/* I wish that C had a multi-level
207 					 * break... */
208 	}
209 	if (p->rwd) {		/* we have a keyword */
210     found_keyword:
211 	    ps.its_a_keyword = true;
212 	    ps.last_u_d = true;
213 	    switch (p->rwcode) {
214 	    case 1:		/* it is a switch */
215 		return (swstmt);
216 	    case 2:		/* a case or default */
217 		return (casestmt);
218 
219 	    case 3:		/* a "struct" */
220 		if (ps.p_l_follow)
221 		    break;	/* inside parens: cast */
222 		l_struct = true;
223 
224 		/*
225 		 * Next time around, we will want to know that we have had a
226 		 * 'struct'
227 		 */
228 	    case 4:		/* one of the declaration keywords */
229 		if (ps.p_l_follow) {
230 		    ps.cast_mask |= 1 << ps.p_l_follow;
231 		    break;	/* inside parens: cast */
232 		}
233 		last_code = decl;
234 		return (decl);
235 
236 	    case 5:		/* if, while, for */
237 		return (sp_paren);
238 
239 	    case 6:		/* do, else */
240 		return (sp_nparen);
241 
242 	    case 7:
243 		ps.sizeof_keyword = true;
244 	    default:		/* all others are treated like any other
245 				 * identifier */
246 		return (ident);
247 	    }			/* end of switch */
248 	}			/* end of if (found_it) */
249 	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
250 	    register char *tp = buf_ptr;
251 	    while (tp < buf_end)
252 		if (*tp++ == ')' && *tp == ';')
253 		    goto not_proc;
254 	    strncpy(ps.procname, token, sizeof ps.procname - 1);
255 	    ps.in_parameter_declaration = 1;
256     not_proc:;
257 	}
258 	/*
259 	 * The following hack attempts to guess whether or not the current
260 	 * token is in fact a declaration keyword -- one that has been
261 	 * typedefd
262 	 */
263 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
264 		&& !ps.p_l_follow
265 	        && !ps.block_init
266 		&& (ps.last_token == rparen || ps.last_token == semicolon ||
267 		    ps.last_token == decl ||
268 		    ps.last_token == lbrace || ps.last_token == rbrace)) {
269 	    ps.its_a_keyword = true;
270 	    ps.last_u_d = true;
271 	    last_code = decl;
272 	    return decl;
273 	}
274 	if (last_code == decl)	/* if this is a declared variable, then
275 				 * following sign is unary */
276 	    ps.last_u_d = true;	/* will make "int a -1" work */
277 	last_code = ident;
278 	return (ident);		/* the ident is not in the list */
279     }				/* end of procesing for alpanum character */
280     /* l l l Scan a non-alphanumeric token */
281 
282     *tok++ = *buf_ptr;		/* if it is only a one-character token, it is
283 				 * moved here */
284     *tok = '\0';
285     if (++buf_ptr >= buf_end)
286 	fill_buffer();
287 
288     switch (*token) {
289     case '\n':
290 	unary_delim = ps.last_u_d;
291 	ps.last_nl = true;	/* remember that we just had a newline */
292 	code = (had_eof ? 0 : newline);
293 
294 	/*
295 	 * if data has been exausted, the newline is a dummy, and we should
296 	 * return code to stop
297 	 */
298 	break;
299 
300     case '\'':			/* start of quoted character */
301     case '"':			/* start of string */
302 	qchar = *token;
303 	if (troff) {
304 	    tok[-1] = '`';
305 	    if (qchar == '"')
306 		*tok++ = '`';
307 	    tok = chfont(&bodyf, &stringf, tok);
308 	}
309 	do {			/* copy the string */
310 	    while (1) {		/* move one character or [/<char>]<char> */
311 		if (*buf_ptr == '\n') {
312 		    printf("%d: Unterminated literal\n", line_no);
313 		    goto stop_lit;
314 		}
315 		*tok = *buf_ptr++;
316 		if (buf_ptr >= buf_end)
317 		    fill_buffer();
318 		if (had_eof || ((tok - token) > (bufsize - 2))) {
319 		    printf("Unterminated literal\n");
320 		    ++tok;
321 		    goto stop_lit;
322 		    /* get outof literal copying loop */
323 		}
324 		if (*tok == BACKSLASH) {	/* if escape, copy extra char */
325 		    if (*buf_ptr == '\n')	/* check for escaped newline */
326 			++line_no;
327 		    if (troff) {
328 			*++tok = BACKSLASH;
329 			if (*buf_ptr == BACKSLASH)
330 			    *++tok = BACKSLASH;
331 		    }
332 		    *++tok = *buf_ptr++;
333 		    ++tok;	/* we must increment this again because we
334 				 * copied two chars */
335 		    if (buf_ptr >= buf_end)
336 			fill_buffer();
337 		}
338 		else
339 		    break;	/* we copied one character */
340 	    }			/* end of while (1) */
341 	} while (*tok++ != qchar);
342 	if (troff) {
343 	    tok = chfont(&stringf, &bodyf, tok - 1);
344 	    if (qchar == '"')
345 		*tok++ = '\'';
346 	}
347 stop_lit:
348 	code = ident;
349 	break;
350 
351     case ('('):
352     case ('['):
353 	unary_delim = true;
354 	code = lparen;
355 	break;
356 
357     case (')'):
358     case (']'):
359 	code = rparen;
360 	break;
361 
362     case '#':
363 	unary_delim = ps.last_u_d;
364 	code = preesc;
365 	break;
366 
367     case '?':
368 	unary_delim = true;
369 	code = question;
370 	break;
371 
372     case (':'):
373 	code = colon;
374 	unary_delim = true;
375 	break;
376 
377     case (';'):
378 	unary_delim = true;
379 	code = semicolon;
380 	break;
381 
382     case ('{'):
383 	unary_delim = true;
384 
385 	/*
386 	 * if (ps.in_or_st) ps.block_init = 1;
387 	 */
388 	/* ?	code = ps.block_init ? lparen : lbrace; */
389 	code = lbrace;
390 	break;
391 
392     case ('}'):
393 	unary_delim = true;
394 	/* ?	code = ps.block_init ? rparen : rbrace; */
395 	code = rbrace;
396 	break;
397 
398     case 014:			/* a form feed */
399 	unary_delim = ps.last_u_d;
400 	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
401 				 * right */
402 	code = form_feed;
403 	break;
404 
405     case (','):
406 	unary_delim = true;
407 	code = comma;
408 	break;
409 
410     case '.':
411 	unary_delim = false;
412 	code = period;
413 	break;
414 
415     case '-':
416     case '+':			/* check for -, +, --, ++ */
417 	code = (ps.last_u_d ? unary_op : binary_op);
418 	unary_delim = true;
419 
420 	if (*buf_ptr == token[0]) {
421 	    /* check for doubled character */
422 	    *tok++ = *buf_ptr++;
423 	    /* buffer overflow will be checked at end of loop */
424 	    if (last_code == ident || last_code == rparen) {
425 		code = (ps.last_u_d ? unary_op : postop);
426 		/* check for following ++ or -- */
427 		unary_delim = false;
428 	    }
429 	}
430 	else if (*buf_ptr == '=')
431 	    /* check for operator += */
432 	    *tok++ = *buf_ptr++;
433 	else if (*buf_ptr == '>') {
434 	    /* check for operator -> */
435 	    *tok++ = *buf_ptr++;
436 	    if (!pointer_as_binop) {
437 		unary_delim = false;
438 		code = unary_op;
439 		ps.want_blank = false;
440 	    }
441 	}
442 	break;			/* buffer overflow will be checked at end of
443 				 * switch */
444 
445     case '=':
446 	if (ps.in_or_st)
447 	    ps.block_init = 1;
448 #ifdef undef
449 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
450 	    tok[-1] = *buf_ptr++;
451 	    if ((tok[-1] == '<' || tok[-1] == '>') && tok[-1] == *buf_ptr)
452 		*tok++ = *buf_ptr++;
453 	    *tok++ = '=';	/* Flip =+ to += */
454 	    *tok = 0;
455 	}
456 #else
457 	if (*buf_ptr == '=') {/* == */
458 	    *tok++ = '=';	/* Flip =+ to += */
459 	    buf_ptr++;
460 	    *tok = 0;
461 	}
462 #endif
463 	code = binary_op;
464 	unary_delim = true;
465 	break;
466 	/* can drop thru!!! */
467 
468     case '>':
469     case '<':
470     case '!':			/* ops like <, <<, <=, !=, etc */
471 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
472 	    *tok++ = *buf_ptr;
473 	    if (++buf_ptr >= buf_end)
474 		fill_buffer();
475 	}
476 	if (*buf_ptr == '=')
477 	    *tok++ = *buf_ptr++;
478 	code = (ps.last_u_d ? unary_op : binary_op);
479 	unary_delim = true;
480 	break;
481 
482     default:
483 	if (token[0] == '/' && *buf_ptr == '*') {
484 	    /* it is start of comment */
485 	    *tok++ = '*';
486 
487 	    if (++buf_ptr >= buf_end)
488 		fill_buffer();
489 
490 	    code = comment;
491 	    unary_delim = ps.last_u_d;
492 	    break;
493 	}
494 	while (*(tok - 1) == *buf_ptr || *buf_ptr == '=') {
495 	    /*
496 	     * handle ||, &&, etc, and also things as in int *****i
497 	     */
498 	    *tok++ = *buf_ptr;
499 	    if (++buf_ptr >= buf_end)
500 		fill_buffer();
501 	}
502 	code = (ps.last_u_d ? unary_op : binary_op);
503 	unary_delim = true;
504 
505 
506     }				/* end of switch */
507     if (code != newline) {
508 	l_struct = false;
509 	last_code = code;
510     }
511     if (buf_ptr >= buf_end)	/* check for input buffer empty */
512 	fill_buffer();
513     ps.last_u_d = unary_delim;
514     *tok = '\0';		/* null terminate the token */
515     return (code);
516 };
517 
518 /*
519  * Add the given keyword to the keyword table, using val as the keyword type
520  */
521 addkey(key, val)
522     char       *key;
523 {
524     register struct templ *p = specials;
525     while (p->rwd)
526 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
527 	    return;
528 	else
529 	    p++;
530     if (p >= specials + sizeof specials / sizeof specials[0])
531 	return;			/* For now, table overflows are silently
532 				 * ignored */
533     p->rwd = key;
534     p->rwcode = val;
535     p[1].rwd = 0;
536     p[1].rwcode = 0;
537     return;
538 }
539