1 /*
2  * Copyright (c) 1985 Sun Microsystems, Inc.
3  * Copyright (c) 1980 The Regents of the University of California.
4  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms are permitted
8  * provided that the above copyright notice and this paragraph are
9  * duplicated in all such forms and that any documentation,
10  * advertising materials, and other materials related to such
11  * distribution and use acknowledge that the software was developed
12  * by the University of California, Berkeley, the University of Illinois,
13  * Urbana, and Sun Microsystems, Inc.  The name of either University
14  * or Sun Microsystems may not be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
18  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
19  */
20 
21 #ifndef lint
22 static char sccsid[] = "@(#)lexi.c	5.11 (Berkeley) 9/15/88";
23 #endif /* not lint */
24 
25 /*
26  * Here we have the token scanner for indent.  It scans off one token and puts
27  * it in the global variable "token".  It returns a code, indicating the type
28  * of token scanned.
29  */
30 
31 #include "indent_globs.h"
32 #include "ctype.h"
33 
34 #define alphanum 1
35 #define opchar 3
36 
/*
 * Classes of reserved word.  Each entry of the keyword table carries one of
 * these, and lexi() switches on it to choose the token code it returns.
 */
enum rwcodes
{
    rw_break,			/* no special handling in lexi() */
    rw_switch,			/* "switch" */
    rw_case,			/* "case" and "default" */
    rw_struct_like,		/* struct, enum, union */
    rw_decl,			/* declaration keywords */
    rw_sp_paren,		/* if, while, for */
    rw_sp_nparen,		/* do, else */
    rw_sizeof			/* "sizeof" */
};
47 
48 struct templ {
49     char       *rwd;
50     enum rwcodes rwcode;
51 };
52 
53 struct templ *user_specials = 0;
54 unsigned int user_specials_max, user_specials_idx;
55 struct templ specials[] =
56 {
57     {"switch", rw_switch},
58     {"case", rw_case},
59     {"break", rw_break},
60     {"struct", rw_struct_like},
61     {"union", rw_struct_like},
62     {"enum", rw_struct_like},
63     {"default", rw_case},
64     {"int", rw_decl},
65     {"char", rw_decl},
66     {"float", rw_decl},
67     {"double", rw_decl},
68 /*    {"long", rw_decl},
69     {"short", rw_decl},*/
70     {"typdef", rw_decl},
71     {"unsigned", rw_decl},
72     {"register", rw_decl},
73     {"static", rw_decl},
74     {"global", rw_decl},
75     {"extern", rw_decl},
76     {"void", rw_decl},
77     {"va_dcl", rw_decl},
78     {"goto", rw_break},
79     {"return", rw_break},
80     {"if", rw_sp_paren},
81     {"while", rw_sp_paren},
82     {"for", rw_sp_paren},
83     {"else", rw_sp_nparen},
84     {"do", rw_sp_nparen},
85     {"sizeof", rw_sizeof},
86     {0, 0}
87 };
88 
/*
 * Character classification table, indexed by 7-bit character code:
 *   0  neither of the classes below
 *   1  alphanumeric (letters, digits, '_' and '$')
 *   3  operator character
 * Sixteen codes per row; the comment gives the range each row covers.
 */
char chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: 0-9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: @ A-O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P-Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: ` a-o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p-z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
110 
111 
112 
113 
114 enum codes
lexi()115 lexi()
116 {
117     /* used to walk through the token */
118     char *tok;
119 
120     int         unary_delim;	/* this is set to 1 if the current token
121 				 *
122 				 * forces a following operator to be unary */
123     static enum codes last_code;	/* the last token type returned */
124     static int  l_struct;	/* set to 1 if the last token was 'struct' */
125     int         code;		/* internal code to be returned */
126     char        qchar;		/* the delimiter character for a string */
127 
128     unary_delim = false;
129     parser_state_tos->col_1 = parser_state_tos->last_nl;	/* tell world that this token started in
130 				 * column 1 iff the last thing scanned was nl */
131     parser_state_tos->last_nl = false;
132 
133     while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
134 	parser_state_tos->col_1 = false;	/* leading blanks imply token is not in column
135 				 * 1 */
136 	if (++buf_ptr >= buf_end)
137 	    fill_buffer();
138     }
139 
140     token = buf_ptr;
141 
142     /* Scan an alphanumeric token */
143     if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
144 	/*
145 	 * we have a character or number
146 	 */
147 	register char *j;	/* used for searching thru list of
148 				 *
149 				 * reserved words */
150 	register struct templ *p;
151 
152 	if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
153 	    int         seendot = 0,
154 	                seenexp = 0;
155 	    if (*buf_ptr == '0' &&
156 		    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
157 	        buf_ptr += 2;
158 		while (isxdigit(*buf_ptr))
159 		    buf_ptr++;
160 	    }
161 	    else
162 		while (1) {
163 		    if (*buf_ptr == '.')
164 			if (seendot)
165 			    break;
166 			else
167 			    seendot++;
168 		    buf_ptr++;
169 		    if (!isdigit(*buf_ptr) && *buf_ptr != '.')
170 			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
171 			    break;
172 			else {
173 			    seenexp++;
174 			    seendot++;
175 			    buf_ptr++;
176 			    if (*buf_ptr == '+' || *buf_ptr == '-')
177 				buf_ptr++;
178 			}
179 		}
180 	    if (*buf_ptr == 'L' || *buf_ptr == 'l')
181 		buf_ptr++;
182 	}
183 	else
184 	    while (chartype[*buf_ptr] == alphanum) {	/* copy it over */
185 		buf_ptr++;
186 		if (buf_ptr >= buf_end)
187 		    fill_buffer();
188 	    }
189 	token_end = buf_ptr;
190 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
191 	    if (++buf_ptr >= buf_end)
192 		fill_buffer();
193 	}
194 	parser_state_tos->its_a_keyword = false;
195 	parser_state_tos->sizeof_keyword = false;
196 	if (l_struct) {		/* if last token was 'struct', then this token
197 				 * should be treated as a declaration */
198 	    l_struct = false;
199 	    last_code = ident;
200 	    parser_state_tos->last_u_d = true;
201 	    return (decl);
202 	}
203 	parser_state_tos->last_u_d = false;	/* Operator after indentifier is binary */
204 	last_code = ident;	/* Remember that this is the code we will
205 				 * return */
206 
207 	/*
208 	 * This loop will check if the token is a keyword.
209 	 */
210 	for (p = specials; (j = p->rwd) != 0; p++) {
211 	    tok = token;	/* point at scanned token */
212 	    if (*j++ != *tok++ || *j++ != *tok++)
213 		continue;	/* This test depends on the fact that
214 				 * identifiers are always at least 1 character
215 				 * long (ie. the first two bytes of the
216 				 * identifier are always meaningful) */
217 	    if (tok >= token_end)
218 		break;		/* If its a 1 or 2 character identifier */
219 	    while (tok < token_end && *tok++ == *j++)
220 		if (*j == 0 && tok == token_end)
221 		    goto found_keyword;	/* I wish that C had a multi-level
222 					 * break... */
223 	}
224 	if (p->rwd) {		/* we have a keyword */
225     found_keyword:
226 	    parser_state_tos->its_a_keyword = true;
227 	    parser_state_tos->last_u_d = true;
228 	    switch (p->rwcode) {
229 	    case rw_switch:		/* it is a switch */
230 		return (swstmt);
231 	    case rw_case:		/* a case or default */
232 		return (casestmt);
233 
234 	    case rw_struct_like:		/* a "struct" */
235 		if (parser_state_tos->p_l_follow)
236 		    break;	/* inside parens: cast */
237 		l_struct = true;
238 
239 		/*
240 		 * Next time around, we will want to know that we have had a
241 		 * 'struct'
242 		 */
243 	    case rw_decl:		/* one of the declaration keywords */
244 		if (parser_state_tos->p_l_follow) {
245 		    parser_state_tos->cast_mask |= 1 << parser_state_tos->p_l_follow;
246 		    break;	/* inside parens: cast */
247 		}
248 		last_code = decl;
249 		return (decl);
250 
251 	    case rw_sp_paren:		/* if, while, for */
252 		return (sp_paren);
253 
254 	    case rw_sp_nparen:		/* do, else */
255 		return (sp_nparen);
256 
257 	    case rw_sizeof:
258 		parser_state_tos->sizeof_keyword = true;
259 	    default:		/* all others are treated like any other
260 				 * identifier */
261 		return (ident);
262 	    }			/* end of switch */
263 	}			/* end of if (found_it) */
264 	if (*buf_ptr == '(' && parser_state_tos->tos <= 1 && parser_state_tos->ind_level == 0) {
265 	    register char *tp = buf_ptr;
266 	    while (tp < buf_end)
267 		if (*tp++ == ')' && *tp == ';')
268 		    goto not_proc;
269 	    parser_state_tos->procname = token;
270 	    parser_state_tos->procname_end = token_end;
271 	    parser_state_tos->in_parameter_declaration = 1;
272     not_proc:;
273 	}
274 	/*
275 	 * The following hack attempts to guess whether or not the current
276 	 * token is in fact a declaration keyword -- one that has been
277 	 * typedefd
278 	 */
279 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
280 		&& !parser_state_tos->p_l_follow
281 	        && !parser_state_tos->block_init
282 		&& (parser_state_tos->last_token == rparen || parser_state_tos->last_token == semicolon ||
283 		    parser_state_tos->last_token == decl ||
284 		    parser_state_tos->last_token == lbrace || parser_state_tos->last_token == rbrace)) {
285 	    parser_state_tos->its_a_keyword = true;
286 	    parser_state_tos->last_u_d = true;
287 	    last_code = decl;
288 	    return decl;
289 	}
290 	if (last_code == decl)	/* if this is a declared variable, then
291 				 * following sign is unary */
292 	    parser_state_tos->last_u_d = true;	/* will make "int a -1" work */
293 	last_code = ident;
294 	return (ident);		/* the ident is not in the list */
295     }				/* end of procesing for alpanum character */
296     /* l l l Scan a non-alphanumeric token */
297 
298     /* If it is not a one character token, token_end will get changed
299        later.  */
300     token_end = buf_ptr + 1;
301 
302     if (++buf_ptr >= buf_end)
303 	fill_buffer();
304 
305     switch (*token) {
306     case '\n':
307 	unary_delim = parser_state_tos->last_u_d;
308 	parser_state_tos->last_nl = true;	/* remember that we just had a newline */
309 	code = (had_eof ? 0 : newline);
310 
311 	/*
312 	 * if data has been exausted, the newline is a dummy, and we should
313 	 * return code to stop
314 	 */
315 	break;
316 
317     case '\'':			/* start of quoted character */
318     case '"':			/* start of string */
319 	qchar = *token;
320 
321 	/* Find out how big the literal is so we can set token_end.  */
322 
323 	/* Invariant:  before loop test buf_ptr points to the next */
324 	/* character that we have not yet checked. */
325 	while (*buf_ptr != qchar && *buf_ptr != 0 && *buf_ptr != '\n')
326 	  {
327 	    if (*buf_ptr == '\\')
328 	      {
329 		buf_ptr++;
330 		if (buf_ptr >= buf_end)
331 		  fill_buffer ();
332 		if (*buf_ptr == '\n')
333 		  ++line_no;
334 		if (*buf_ptr == 0)
335 		  break;
336 	      }
337 	    buf_ptr++;
338 	    if (buf_ptr >= buf_end)
339 	      fill_buffer ();
340 	  }
341 	if (*buf_ptr == '\n' || *buf_ptr == 0)
342 	  {
343 	    diag (1,
344 		  qchar == '\''
345 		    ? "Unterminated character constant"
346 		    : "Unterminated string constant"
347 		 );
348 	  }
349 	else
350 	  {
351 	    /* Advance over end quote char.  */
352 	    buf_ptr++;
353 	    if (buf_ptr >= buf_end)
354 	      fill_buffer ();
355 	  }
356 
357 	code = ident;
358 	break;
359 
360     case ('('):
361         if (lpc && *buf_ptr == '{') {
362 	    buf_ptr++;
363 	}
364     case ('['):
365 	unary_delim = true;
366 	code = lparen;
367 	break;
368 
369     case (')'):
370     case (']'):
371 	code = rparen;
372 	break;
373 
374     case '#':
375 	unary_delim = parser_state_tos->last_u_d;
376 	code = preesc;
377 	break;
378 
379     case '?':
380 	unary_delim = true;
381 	code = question;
382 	break;
383 
384     case (':'):
385         if (lpc && *buf_ptr == ':') {
386 	    buf_ptr++;
387 	    code = unary_op;
388 	    unary_delim = true;
389 	    break;
390 	}
391 	code = colon;
392 	unary_delim = true;
393 	break;
394 
395     case (';'):
396 	unary_delim = true;
397 	code = semicolon;
398 	break;
399 
400     case ('{'):
401 	unary_delim = true;
402 
403 	/* This check is made in the code for '='.  No one who writes
404 	   initializers without '=' these days deserves to have indent
405 	   work on their code (besides which, uncommenting this would
406 	   screw up anything which assumes that parser_state_tos->block_init really
407 	   means you are in an initializer.  */
408 	/*
409 	 * if (parser_state_tos->in_or_st) parser_state_tos->block_init = 1;
410 	 */
411 
412 	/* The following neat hack causes the braces in structure
413 	   initializations to be treated as parentheses, thus causing
414 	   initializations to line up correctly, e.g.
415 	   struct foo bar =
416 	   {{a,
417 	     b,
418 	     c},
419 	    {1,
420 	     2}};
421 	   If lparen is returned, token can be used to distinguish
422 	   between '{' and '(' where necessary.  */
423 
424 	code = parser_state_tos->block_init ? lparen : lbrace;
425 	break;
426 
427     case ('}'):
428         if (lpc && *buf_ptr == ')') {
429 	    buf_ptr++;
430 	    code = rparen;
431 	    break;
432 	}
433 	unary_delim = true;
434 	/* The following neat hack is explained under '{' above.  */
435 	code = parser_state_tos->block_init ? rparen : rbrace;
436 
437 	break;
438 
439     case 014:			/* a form feed */
440 	unary_delim = parser_state_tos->last_u_d;
441 	parser_state_tos->last_nl = true;	/* remember this so we can set 'parser_state_tos->col_1'
442 				 * right */
443 	code = form_feed;
444 	break;
445 
446     case (','):
447 	unary_delim = true;
448 	code = comma;
449 	break;
450 
451     case '.':
452 	unary_delim = false;
453 	code = period;
454 	break;
455 
456     case '-':
457     case '+':			/* check for -, +, --, ++ */
458 	code = (parser_state_tos->last_u_d ? unary_op : binary_op);
459 	unary_delim = true;
460 
461 	if (*buf_ptr == token[0]) {
462 	    /* check for doubled character */
463 	    buf_ptr++;
464 	    /* buffer overflow will be checked at end of loop */
465 	    if (last_code == ident || last_code == rparen) {
466 		code = (parser_state_tos->last_u_d ? unary_op : postop);
467 		/* check for following ++ or -- */
468 		unary_delim = false;
469 	    }
470 	}
471 	else if (*buf_ptr == '=')
472 	    /* check for operator += */
473 	    buf_ptr++;
474 	else if (*buf_ptr == '>') {
475 	    /* check for operator -> */
476 	    buf_ptr++;
477 	    if (!pointer_as_binop) {
478 		unary_delim = false;
479 		code = unary_op;
480 		parser_state_tos->want_blank = false;
481 	    }
482 	}
483 	break;			/* buffer overflow will be checked at end of
484 				 * switch */
485 
486     case '=':
487 	if (parser_state_tos->in_or_st)
488 	    parser_state_tos->block_init = 1;
489 
490 	if (*buf_ptr == '=') /* == */
491 	    buf_ptr++;
492 
493 	code = binary_op;
494 	unary_delim = true;
495 	break;
496 	/* can drop thru!!! */
497 
498     case '>':
499     case '<':
500     case '!':			/* ops like <, <<, <=, !=, etc */
501 	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
502 	    if (++buf_ptr >= buf_end)
503 		fill_buffer();
504 	}
505 
506 	code = (parser_state_tos->last_u_d ? unary_op : binary_op);
507 	unary_delim = true;
508 	break;
509 
510     default:
511 	if (token[0] == '/' && *buf_ptr == '*') {
512 	    /* it is start of comment */
513 
514 	    if (++buf_ptr >= buf_end)
515 		fill_buffer();
516 
517 	    code = comment;
518 	    unary_delim = parser_state_tos->last_u_d;
519 	    break;
520 	}
521 	while (*(buf_ptr - 1) == *buf_ptr || *buf_ptr == '=') {
522 	    /*
523 	     * handle ||, &&, etc, and also things as in int *****i
524 	     */
525 	    if (++buf_ptr >= buf_end)
526 		fill_buffer();
527 	}
528 	code = (parser_state_tos->last_u_d ? unary_op : binary_op);
529 	unary_delim = true;
530 
531 
532     }				/* end of switch */
533     if (code != newline) {
534 	l_struct = false;
535 	last_code = code;
536     }
537     token_end = buf_ptr;
538     if (buf_ptr >= buf_end)	/* check for input buffer empty */
539 	fill_buffer();
540     parser_state_tos->last_u_d = unary_delim;
541 
542     return (code);
543 }
544 
545 /*
546  * Add the given keyword to the keyword table, using val as the keyword type
547  */
addkey(key,val)548 addkey(key, val)
549     char       *key;
550      enum rwcodes val;
551 {
552     register struct templ *p = specials;
553     while (p->rwd)
554 	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
555 	    return;
556 	else
557 	    p++;
558 
559     if (user_specials == 0)
560       {
561 	user_specials = (struct templ *) xmalloc (5 * sizeof (struct templ));
562 	if (user_specials == 0)
563 	  {
564 	    fputs ("indent: out of memory\n", stderr);
565 	    exit (1);
566 	  }
567 	user_specials_max = 5;
568 	user_specials_idx = 0;
569       }
570     else if (user_specials_idx == user_specials_max)
571       {
572 	user_specials_max += 5;
573 	user_specials = (struct templ *) xrealloc ((char *) user_specials,
574 						  user_specials_max
575 						  * sizeof (struct templ));
576       }
577     p = &user_specials[user_specials_idx++];
578 
579     p->rwd = key;
580     p->rwcode = val;
581     p[1].rwd = 0;
582     p[1].rwcode = 0;
583     return;
584 }
585