1 /*
2 * Copyright (c) 1985 Sun Microsystems, Inc.
3 * Copyright (c) 1980 The Regents of the University of California.
4 * Copyright (c) 1976 Board of Trustees of the University of Illinois.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms are permitted
8 * provided that the above copyright notice and this paragraph are
9 * duplicated in all such forms and that any documentation,
10 * advertising materials, and other materials related to such
11 * distribution and use acknowledge that the software was developed
12 * by the University of California, Berkeley, the University of Illinois,
13 * Urbana, and Sun Microsystems, Inc. The name of either University
14 * or Sun Microsystems may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
16 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
18 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
19 */
20
/* SCCS identification string for this file; left out when `lint' is defined. */
#ifndef lint
static char sccsid[] =
    "@(#)lexi.c 5.11 (Berkeley) 9/15/88";
#endif /* not lint */
24
25 /*
26 * Here we have the token scanner for indent. It scans off one token and puts
27 * it in the global variable "token". It returns a code, indicating the type
28 * of token scanned.
29 */
30
31 #include "indent_globs.h"
32 #include "ctype.h"
33
/*
 * Character classes stored in the chartype[] table.  A table value of 0
 * means the character belongs to neither class.
 */
enum
{
    alphanum = 1,               /* character may be part of a word or number */
    opchar = 3                  /* character may be part of an operator */
};
36
/*
 * Classification of a reserved word.  lexi() switches on this value to
 * decide which token code to return for a keyword.
 */
enum rwcodes
{
    rw_break = 0,               /* break, goto, return */
    rw_switch = 1,              /* switch */
    rw_case = 2,                /* case, default */
    rw_struct_like = 3,         /* struct, enum, union */
    rw_decl = 4,                /* declaration keywords such as int, static */
    rw_sp_paren = 5,            /* if, while, for */
    rw_sp_nparen = 6,           /* do, else */
    rw_sizeof = 7               /* sizeof */
};
47
48 struct templ {
49 char *rwd;
50 enum rwcodes rwcode;
51 };
52
/*
 * Table of keywords added at run time by addkey().  The pointer stays null
 * until the first keyword is added.
 */
struct templ *user_specials = 0;
unsigned int user_specials_max;         /* entries allocated in user_specials */
unsigned int user_specials_idx;         /* entries of user_specials in use */
55 struct templ specials[] =
56 {
57 {"switch", rw_switch},
58 {"case", rw_case},
59 {"break", rw_break},
60 {"struct", rw_struct_like},
61 {"union", rw_struct_like},
62 {"enum", rw_struct_like},
63 {"default", rw_case},
64 {"int", rw_decl},
65 {"char", rw_decl},
66 {"float", rw_decl},
67 {"double", rw_decl},
68 /* {"long", rw_decl},
69 {"short", rw_decl},*/
70 {"typdef", rw_decl},
71 {"unsigned", rw_decl},
72 {"register", rw_decl},
73 {"static", rw_decl},
74 {"global", rw_decl},
75 {"extern", rw_decl},
76 {"void", rw_decl},
77 {"va_dcl", rw_decl},
78 {"goto", rw_break},
79 {"return", rw_break},
80 {"if", rw_sp_paren},
81 {"while", rw_sp_paren},
82 {"for", rw_sp_paren},
83 {"else", rw_sp_nparen},
84 {"do", rw_sp_nparen},
85 {"sizeof", rw_sizeof},
86 {0, 0}
87 };
88
/*
 * Character class of each 7-bit ASCII code, indexed by character value:
 *   1  the character can be part of an identifier or number
 *   3  the character can be part of an operator
 *   0  anything else
 * Each row below covers eight consecutive character codes.
 */
char chartype[128] =
{
    0, 0, 0, 0, 0, 0, 0, 0,     /* 000 - 007: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 010 - 017: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 020 - 027: control characters */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 030 - 037: control characters */
    0, 3, 0, 0, 1, 3, 3, 0,     /* space ! " # $ % & ' */
    0, 0, 3, 3, 0, 3, 0, 3,     /* ( ) * + , - . / */
    1, 1, 1, 1, 1, 1, 1, 1,     /* 0 1 2 3 4 5 6 7 */
    1, 1, 0, 0, 3, 3, 3, 3,     /* 8 9 : ; < = > ? */
    0, 1, 1, 1, 1, 1, 1, 1,     /* @ A B C D E F G */
    1, 1, 1, 1, 1, 1, 1, 1,     /* H I J K L M N O */
    1, 1, 1, 1, 1, 1, 1, 1,     /* P Q R S T U V W */
    1, 1, 1, 0, 0, 0, 3, 1,     /* X Y Z [ \ ] ^ _ */
    0, 1, 1, 1, 1, 1, 1, 1,     /* ` a b c d e f g */
    1, 1, 1, 1, 1, 1, 1, 1,     /* h i j k l m n o */
    1, 1, 1, 1, 1, 1, 1, 1,     /* p q r s t u v w */
    1, 1, 1, 0, 3, 0, 3, 0      /* x y z { | } ~ DEL */
};
110
111
112
113
114 enum codes
lexi()115 lexi()
116 {
117 /* used to walk through the token */
118 char *tok;
119
120 int unary_delim; /* this is set to 1 if the current token
121 *
122 * forces a following operator to be unary */
123 static enum codes last_code; /* the last token type returned */
124 static int l_struct; /* set to 1 if the last token was 'struct' */
125 int code; /* internal code to be returned */
126 char qchar; /* the delimiter character for a string */
127
128 unary_delim = false;
129 parser_state_tos->col_1 = parser_state_tos->last_nl; /* tell world that this token started in
130 * column 1 iff the last thing scanned was nl */
131 parser_state_tos->last_nl = false;
132
133 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
134 parser_state_tos->col_1 = false; /* leading blanks imply token is not in column
135 * 1 */
136 if (++buf_ptr >= buf_end)
137 fill_buffer();
138 }
139
140 token = buf_ptr;
141
142 /* Scan an alphanumeric token */
143 if (chartype[*buf_ptr] == alphanum || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
144 /*
145 * we have a character or number
146 */
147 register char *j; /* used for searching thru list of
148 *
149 * reserved words */
150 register struct templ *p;
151
152 if (isdigit(*buf_ptr) || buf_ptr[0] == '.' && isdigit(buf_ptr[1])) {
153 int seendot = 0,
154 seenexp = 0;
155 if (*buf_ptr == '0' &&
156 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
157 buf_ptr += 2;
158 while (isxdigit(*buf_ptr))
159 buf_ptr++;
160 }
161 else
162 while (1) {
163 if (*buf_ptr == '.')
164 if (seendot)
165 break;
166 else
167 seendot++;
168 buf_ptr++;
169 if (!isdigit(*buf_ptr) && *buf_ptr != '.')
170 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
171 break;
172 else {
173 seenexp++;
174 seendot++;
175 buf_ptr++;
176 if (*buf_ptr == '+' || *buf_ptr == '-')
177 buf_ptr++;
178 }
179 }
180 if (*buf_ptr == 'L' || *buf_ptr == 'l')
181 buf_ptr++;
182 }
183 else
184 while (chartype[*buf_ptr] == alphanum) { /* copy it over */
185 buf_ptr++;
186 if (buf_ptr >= buf_end)
187 fill_buffer();
188 }
189 token_end = buf_ptr;
190 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
191 if (++buf_ptr >= buf_end)
192 fill_buffer();
193 }
194 parser_state_tos->its_a_keyword = false;
195 parser_state_tos->sizeof_keyword = false;
196 if (l_struct) { /* if last token was 'struct', then this token
197 * should be treated as a declaration */
198 l_struct = false;
199 last_code = ident;
200 parser_state_tos->last_u_d = true;
201 return (decl);
202 }
203 parser_state_tos->last_u_d = false; /* Operator after indentifier is binary */
204 last_code = ident; /* Remember that this is the code we will
205 * return */
206
207 /*
208 * This loop will check if the token is a keyword.
209 */
210 for (p = specials; (j = p->rwd) != 0; p++) {
211 tok = token; /* point at scanned token */
212 if (*j++ != *tok++ || *j++ != *tok++)
213 continue; /* This test depends on the fact that
214 * identifiers are always at least 1 character
215 * long (ie. the first two bytes of the
216 * identifier are always meaningful) */
217 if (tok >= token_end)
218 break; /* If its a 1 or 2 character identifier */
219 while (tok < token_end && *tok++ == *j++)
220 if (*j == 0 && tok == token_end)
221 goto found_keyword; /* I wish that C had a multi-level
222 * break... */
223 }
224 if (p->rwd) { /* we have a keyword */
225 found_keyword:
226 parser_state_tos->its_a_keyword = true;
227 parser_state_tos->last_u_d = true;
228 switch (p->rwcode) {
229 case rw_switch: /* it is a switch */
230 return (swstmt);
231 case rw_case: /* a case or default */
232 return (casestmt);
233
234 case rw_struct_like: /* a "struct" */
235 if (parser_state_tos->p_l_follow)
236 break; /* inside parens: cast */
237 l_struct = true;
238
239 /*
240 * Next time around, we will want to know that we have had a
241 * 'struct'
242 */
243 case rw_decl: /* one of the declaration keywords */
244 if (parser_state_tos->p_l_follow) {
245 parser_state_tos->cast_mask |= 1 << parser_state_tos->p_l_follow;
246 break; /* inside parens: cast */
247 }
248 last_code = decl;
249 return (decl);
250
251 case rw_sp_paren: /* if, while, for */
252 return (sp_paren);
253
254 case rw_sp_nparen: /* do, else */
255 return (sp_nparen);
256
257 case rw_sizeof:
258 parser_state_tos->sizeof_keyword = true;
259 default: /* all others are treated like any other
260 * identifier */
261 return (ident);
262 } /* end of switch */
263 } /* end of if (found_it) */
264 if (*buf_ptr == '(' && parser_state_tos->tos <= 1 && parser_state_tos->ind_level == 0) {
265 register char *tp = buf_ptr;
266 while (tp < buf_end)
267 if (*tp++ == ')' && *tp == ';')
268 goto not_proc;
269 parser_state_tos->procname = token;
270 parser_state_tos->procname_end = token_end;
271 parser_state_tos->in_parameter_declaration = 1;
272 not_proc:;
273 }
274 /*
275 * The following hack attempts to guess whether or not the current
276 * token is in fact a declaration keyword -- one that has been
277 * typedefd
278 */
279 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
280 && !parser_state_tos->p_l_follow
281 && !parser_state_tos->block_init
282 && (parser_state_tos->last_token == rparen || parser_state_tos->last_token == semicolon ||
283 parser_state_tos->last_token == decl ||
284 parser_state_tos->last_token == lbrace || parser_state_tos->last_token == rbrace)) {
285 parser_state_tos->its_a_keyword = true;
286 parser_state_tos->last_u_d = true;
287 last_code = decl;
288 return decl;
289 }
290 if (last_code == decl) /* if this is a declared variable, then
291 * following sign is unary */
292 parser_state_tos->last_u_d = true; /* will make "int a -1" work */
293 last_code = ident;
294 return (ident); /* the ident is not in the list */
295 } /* end of procesing for alpanum character */
296 /* l l l Scan a non-alphanumeric token */
297
298 /* If it is not a one character token, token_end will get changed
299 later. */
300 token_end = buf_ptr + 1;
301
302 if (++buf_ptr >= buf_end)
303 fill_buffer();
304
305 switch (*token) {
306 case '\n':
307 unary_delim = parser_state_tos->last_u_d;
308 parser_state_tos->last_nl = true; /* remember that we just had a newline */
309 code = (had_eof ? 0 : newline);
310
311 /*
312 * if data has been exausted, the newline is a dummy, and we should
313 * return code to stop
314 */
315 break;
316
317 case '\'': /* start of quoted character */
318 case '"': /* start of string */
319 qchar = *token;
320
321 /* Find out how big the literal is so we can set token_end. */
322
323 /* Invariant: before loop test buf_ptr points to the next */
324 /* character that we have not yet checked. */
325 while (*buf_ptr != qchar && *buf_ptr != 0 && *buf_ptr != '\n')
326 {
327 if (*buf_ptr == '\\')
328 {
329 buf_ptr++;
330 if (buf_ptr >= buf_end)
331 fill_buffer ();
332 if (*buf_ptr == '\n')
333 ++line_no;
334 if (*buf_ptr == 0)
335 break;
336 }
337 buf_ptr++;
338 if (buf_ptr >= buf_end)
339 fill_buffer ();
340 }
341 if (*buf_ptr == '\n' || *buf_ptr == 0)
342 {
343 diag (1,
344 qchar == '\''
345 ? "Unterminated character constant"
346 : "Unterminated string constant"
347 );
348 }
349 else
350 {
351 /* Advance over end quote char. */
352 buf_ptr++;
353 if (buf_ptr >= buf_end)
354 fill_buffer ();
355 }
356
357 code = ident;
358 break;
359
360 case ('('):
361 if (lpc && *buf_ptr == '{') {
362 buf_ptr++;
363 }
364 case ('['):
365 unary_delim = true;
366 code = lparen;
367 break;
368
369 case (')'):
370 case (']'):
371 code = rparen;
372 break;
373
374 case '#':
375 unary_delim = parser_state_tos->last_u_d;
376 code = preesc;
377 break;
378
379 case '?':
380 unary_delim = true;
381 code = question;
382 break;
383
384 case (':'):
385 if (lpc && *buf_ptr == ':') {
386 buf_ptr++;
387 code = unary_op;
388 unary_delim = true;
389 break;
390 }
391 code = colon;
392 unary_delim = true;
393 break;
394
395 case (';'):
396 unary_delim = true;
397 code = semicolon;
398 break;
399
400 case ('{'):
401 unary_delim = true;
402
403 /* This check is made in the code for '='. No one who writes
404 initializers without '=' these days deserves to have indent
405 work on their code (besides which, uncommenting this would
406 screw up anything which assumes that parser_state_tos->block_init really
407 means you are in an initializer. */
408 /*
409 * if (parser_state_tos->in_or_st) parser_state_tos->block_init = 1;
410 */
411
412 /* The following neat hack causes the braces in structure
413 initializations to be treated as parentheses, thus causing
414 initializations to line up correctly, e.g.
415 struct foo bar =
416 {{a,
417 b,
418 c},
419 {1,
420 2}};
421 If lparen is returned, token can be used to distinguish
422 between '{' and '(' where necessary. */
423
424 code = parser_state_tos->block_init ? lparen : lbrace;
425 break;
426
427 case ('}'):
428 if (lpc && *buf_ptr == ')') {
429 buf_ptr++;
430 code = rparen;
431 break;
432 }
433 unary_delim = true;
434 /* The following neat hack is explained under '{' above. */
435 code = parser_state_tos->block_init ? rparen : rbrace;
436
437 break;
438
439 case 014: /* a form feed */
440 unary_delim = parser_state_tos->last_u_d;
441 parser_state_tos->last_nl = true; /* remember this so we can set 'parser_state_tos->col_1'
442 * right */
443 code = form_feed;
444 break;
445
446 case (','):
447 unary_delim = true;
448 code = comma;
449 break;
450
451 case '.':
452 unary_delim = false;
453 code = period;
454 break;
455
456 case '-':
457 case '+': /* check for -, +, --, ++ */
458 code = (parser_state_tos->last_u_d ? unary_op : binary_op);
459 unary_delim = true;
460
461 if (*buf_ptr == token[0]) {
462 /* check for doubled character */
463 buf_ptr++;
464 /* buffer overflow will be checked at end of loop */
465 if (last_code == ident || last_code == rparen) {
466 code = (parser_state_tos->last_u_d ? unary_op : postop);
467 /* check for following ++ or -- */
468 unary_delim = false;
469 }
470 }
471 else if (*buf_ptr == '=')
472 /* check for operator += */
473 buf_ptr++;
474 else if (*buf_ptr == '>') {
475 /* check for operator -> */
476 buf_ptr++;
477 if (!pointer_as_binop) {
478 unary_delim = false;
479 code = unary_op;
480 parser_state_tos->want_blank = false;
481 }
482 }
483 break; /* buffer overflow will be checked at end of
484 * switch */
485
486 case '=':
487 if (parser_state_tos->in_or_st)
488 parser_state_tos->block_init = 1;
489
490 if (*buf_ptr == '=') /* == */
491 buf_ptr++;
492
493 code = binary_op;
494 unary_delim = true;
495 break;
496 /* can drop thru!!! */
497
498 case '>':
499 case '<':
500 case '!': /* ops like <, <<, <=, !=, etc */
501 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
502 if (++buf_ptr >= buf_end)
503 fill_buffer();
504 }
505
506 code = (parser_state_tos->last_u_d ? unary_op : binary_op);
507 unary_delim = true;
508 break;
509
510 default:
511 if (token[0] == '/' && *buf_ptr == '*') {
512 /* it is start of comment */
513
514 if (++buf_ptr >= buf_end)
515 fill_buffer();
516
517 code = comment;
518 unary_delim = parser_state_tos->last_u_d;
519 break;
520 }
521 while (*(buf_ptr - 1) == *buf_ptr || *buf_ptr == '=') {
522 /*
523 * handle ||, &&, etc, and also things as in int *****i
524 */
525 if (++buf_ptr >= buf_end)
526 fill_buffer();
527 }
528 code = (parser_state_tos->last_u_d ? unary_op : binary_op);
529 unary_delim = true;
530
531
532 } /* end of switch */
533 if (code != newline) {
534 l_struct = false;
535 last_code = code;
536 }
537 token_end = buf_ptr;
538 if (buf_ptr >= buf_end) /* check for input buffer empty */
539 fill_buffer();
540 parser_state_tos->last_u_d = unary_delim;
541
542 return (code);
543 }
544
545 /*
546 * Add the given keyword to the keyword table, using val as the keyword type
547 */
addkey(key,val)548 addkey(key, val)
549 char *key;
550 enum rwcodes val;
551 {
552 register struct templ *p = specials;
553 while (p->rwd)
554 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
555 return;
556 else
557 p++;
558
559 if (user_specials == 0)
560 {
561 user_specials = (struct templ *) xmalloc (5 * sizeof (struct templ));
562 if (user_specials == 0)
563 {
564 fputs ("indent: out of memory\n", stderr);
565 exit (1);
566 }
567 user_specials_max = 5;
568 user_specials_idx = 0;
569 }
570 else if (user_specials_idx == user_specials_max)
571 {
572 user_specials_max += 5;
573 user_specials = (struct templ *) xrealloc ((char *) user_specials,
574 user_specials_max
575 * sizeof (struct templ));
576 }
577 p = &user_specials[user_specials_idx++];
578
579 p->rwd = key;
580 p->rwcode = val;
581 p[1].rwd = 0;
582 p[1].rwcode = 0;
583 return;
584 }
585