xref: /original-bsd/usr.bin/indent/lexi.c (revision f0fd5f8a)
/* SCCS identification string for this file: name, version and date.  The
   leading "@(#)" is the marker that what(1) searches for. */
static char sccsid[] = "@(#)lexi.c	4.1	(Berkeley)	10/21/82";
2 
3 /*
4 
5 			  Copyright (C) 1976
6 				by the
7 			  Board of Trustees
8 				of the
9 			University of Illinois
10 
11 			 All rights reserved
12 
13 
14 NAME:
15 	lexi
16 
17 FUNCTION:
18 	This is the token scanner for indent
19 
20 ALGORITHM:
21 	1) Strip off intervening blanks and/or tabs.
22 	2) If it is an alphanumeric token, move it to the token buffer "token".
23 	   Check if it is a special reserved word that indent will want to
24 	   know about.
25 	3) Non-alphanumeric tokens are handled with a big switch statement.  A
26 	   flag is kept to remember if the last token was a "unary delimiter",
27 	   which forces a following operator to be unary as opposed to binary.
28 
29 PARAMETERS:
30 	None
31 
32 RETURNS:
33 	An integer code indicating the type of token scanned.
34 
35 GLOBALS:
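	buf_ptr =	Points at the next unscanned character of the input
			buffer; advanced past each token that is scanned
	had_eof =	Tested when a newline is scanned; if it is set, the
			newline is a dummy and 0 is returned to stop the caller
	last_u_d =	Set to true iff this token is a "unary delimiter"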
39 
40 CALLS:
41 	fill_buffer
42 	printf (lib)
43 
44 CALLED BY:
45 	main
46 
47 NOTES:
48 	Start of comment is passed back so that the comment can be scanned by
49 	pr_comment.
50 
51 	Strings and character literals are returned just like identifiers.
52 
53 HISTORY:
54 	initial coding 	November 1976	D A Willcox of CAC
55 	1/7/77		D A Willcox of CAC	Fix to provide proper handling
56 						of "int a -1;"
57 
58 */
59 
60 /* Here we have the token scanner for indent.  It scans off one token and
61    puts it in the global variable "token".  It returns a code, indicating the
62    type of token scanned. */
63 
64 #include "indent_globs.h";
65 #include "indent_codes.h";
66 
67 
68 
/* Character classes stored in chartype[].  A class of 0 marks every
   character that is neither of these. */
enum {
    alphanum = 1,	       /* may appear in an identifier or a number */
    opchar = 3		       /* may appear in an operator */
};
71 
/* One entry of the reserved-word table: the spelling of the word and the
   class that lexi uses to decide which token code to return for it. */
struct templ {
    char   *rwd;	       /* the reserved word itself */
    int     rwcode;	       /* its class, see the list above specials[] */
};

/* Reserved words that lexi treats specially.  The rwcode classes are:
	1	switch
	2	case, default
	3	struct (the identifier that follows is a declaration too)
	4	declaration keywords
	5	keywords followed by a parenthesized expression
	6	keywords not followed by a parenthesized expression
	0	recognized, but returned like any other identifier
   The table is terminated by an entry whose rwd is 0. */
struct templ    specials[] =
{
    {"switch", 1},
    {"case", 2},
    {"struct", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},	       /* was misspelled "typdef", so the real
			          keyword was never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 0},
    {0, 0}
};
103 
/* Class of each 7-bit character code, indexed by the code itself:
   1 for characters that can be part of an identifier or number, 3 for
   characters that can be part of an operator, 0 for everything else.
   Each row below covers sixteen consecutive codes (octal range shown). */
char    chartype[128] =
{
    /* 000-017  control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 020-037  control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 040-057  space and punctuation, through the slash */
    0, 3, 0, 0, 0, 3, 3, 0, 0, 0, 3, 3, 0, 3, 3, 3,
    /* 060-077  digits, then colon through question mark */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 100-117  at sign, then A through O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 120-137  P through Z, then brackets, caret, underscore */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 140-157  grave accent, then a through o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 160-177  p through z, then braces, bar, tilde, DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
124 
/* True if the last thing scanned was a newline or a form feed.  lexi copies
   this into col_1 at the start of every scan, so that a token is reported
   as starting in column 1 only when it immediately follows one of those. */
int     last_nl = true;
127 
128 
129 
130 int     lexi () {
131     register char  *tok;
132  /* local pointer to next char in token */
133     register int    i;
134  /* local loop counter */
135     register char  *j;
136  /* used for searching thru list of reserved words */
137     int     unary_delim;
138  /* this is set to 1 if the current token forces a following operator to be
139     unary */
140     static int  last_code;
141  /* the last token type returned */
142     static int  l_struct;
143  /* set to 1 if the last token was 'struct' */
144     int     found_it;
145     int     code;  /* internal code to be returned */
146     char    qchar; /* the delimiter character for a string */
147 
148     tok = token;	       /* point to start of place to save token */
149     unary_delim = false;
150     col_1 = last_nl;	       /* tell world that this token started in column
151 			          1 iff the last thing scanned was nl */
152     last_nl = false;
153 
154     while (*buf_ptr == ' ' || *buf_ptr == '\t') {
155     /* get rid of blanks */
156 	col_1 = false;	       /* leading blanks imply token is not in column 1
157 			          */
158 	if (++buf_ptr >= buf_end)
159 	    fill_buffer ();
160     }
161 
162 /*----------------------------------------------------------*\
163 |    Scan an alphanumeric token
164 \*----------------------------------------------------------*/
165 
166     if (chartype[*buf_ptr & 0177] == alphanum) {
167     /* we have a character or number */
168 	while (chartype[*buf_ptr & 0177] == alphanum) {
169 	/* copy it over */
170 	    *tok++ = *buf_ptr++;
171 	    if (buf_ptr >= buf_end)
172 		fill_buffer ();
173 	}
174 
175 	*tok++ = '\0';
176 
177 	if (l_struct) {	       /* if last token was 'struct', then this token
178 			          should be treated as a declaration */
179 	    l_struct = false;
180 	    last_code = ident;
181 	    last_u_d = true;
182 	    return (decl);
183 	}
184 
185 	last_u_d = false;      /* operator after indentifier is binary */
186 
187 	for (i = 0; specials[i].rwd != 0; ++i) {
188 	/* this loop will check if the token is a keyword.  if so, a following
189 	   operator is unary */
190 	    last_code = ident; /* remember that this is the code we will return
191 			          */
192 	    j = specials[i].rwd;
193 	/* point at ith reserved word */
194 	    tok = token;       /* point at scanned toekn */
195 	    found_it = true;   /* set to false if not found */
196 	    do {
197 		if (*tok++ != *j) {
198 		    found_it = false;
199 		    break;
200 		}
201 	    } while (*j++);
202 
203 	    if (found_it) {    /* we have a keyword */
204 		last_u_d = true;
205 		switch (specials[i].rwcode) {
206 		    case 1:    /* it is a switch */
207 			return (swstmt);
208 		    case 2:    /* a case or default */
209 			return (casestmt);
210 
211 		    case 3:    /* a "struct" */
212 			l_struct = true;
213 		    /* Next time around, we will want to know that we have had
214 		       a 'struct' */
215 		    case 4:    /* one of the declaration keywords */
216 			if(p_l_follow) break;	/* inside parens: cast */
217 			last_code = decl;
218 			return (decl);
219 
220 		    case 5:    /* if, while, for */
221 			return (sp_paren);
222 
223 		    case 6:    /* do, else */
224 			return (sp_nparen);
225 
226 		    default:   /* all others are treated like any other
227 			          identifier */
228 			return (ident);
229 		}	       /* end of switch */
230 	    }		       /* end of if (found_it) */
231 
232 	}
233 
234 	if (last_code == decl) /* if this is a declared variable, then
235 			          following sign is unary */
236 	    last_u_d = true;   /* will make "int a -1" work */
237 	last_code = ident;
238 	return (ident);	       /* the ident is not in the list */
239     }			       /* end of procesing for alpanum character */
240 
241 
242 
243 /*----------------------------------------------------------*\
244 |   Scan a non-alphanumeric token
245 \*----------------------------------------------------------*/
246 
247     *tok++ = *buf_ptr;	       /* if it is only a one-character token, it is
248 			          moved here */
249     *tok = '\0';
250     if (++buf_ptr >= buf_end)
251 	fill_buffer ();
252 
253     switch (*token) {
254 	case '\n':
255 	    unary_delim = last_u_d;
256 	    last_nl = true;    /* remember that we just had a newline */
257 	    code = (had_eof ? 0 : newline);
258 	/* if data has been exausted, the newline is a dummy, and we should
259 	   return code to stop */
260 	    break;
261 
262 	case '\'': 	       /* start of quoted character */
263 	    qchar = '\'';      /* remember final delimiter */
264 	    goto copy_lit;     /* and go to common literal code */
265 
266 	case '"': 	       /* start of string */
267 	    qchar = '"';
268 
269     copy_lit:
270 	    do {	       /* copy the string */
271 		while (1) {    /* move one character or [/<char>]<char> */
272 		    if (*buf_ptr == '\n') {
273 		    /* check for unterminated literal */
274 			printf ("%d: Unterminated literal\n", line_no);
275 			goto stop_lit;
276 		    /* Don't copy any more */
277 		    }
278 
279 		    *tok = *buf_ptr++;
280 		    if (buf_ptr >= buf_end)
281 			fill_buffer ();
282 		    if (had_eof || ((tok - token) > (bufsize - 2))) {
283 			printf ("Unterminated literal\n");
284 			++tok;
285 			goto stop_lit;
286 		    /* get outof literal copying loop */
287 		    }
288 
289 		    if (*tok == '\\') {
290 		    /* if escape, copy extra char */
291 			if (*buf_ptr == '\n')
292 			       /* check for escaped newline */
293 			    ++line_no;
294 			*(++tok) = *buf_ptr++;
295 			++tok; /* we must increment this again because we
296 			          copied two chars */
297 			if (buf_ptr >= buf_end)
298 			    fill_buffer ();
299 		    }
300 		    else
301 			break; /* we copied one character */
302 		}	       /* end of while (1) */
303 	    } while (*tok++ != qchar);
304 
305     stop_lit:
306 	    code = ident;
307 	    break;
308 
309 	case ('('):
310 	case ('['):
311 	    unary_delim = true;
312 	    code = lparen;
313 	    break;
314 
315 	case (')'):
316 	case (']'):
317 	    code = rparen;
318 	    break;
319 
320 	case '#':
321 	    unary_delim = last_u_d;
322 	    code = preesc;
323 	    break;
324 
325 	case '?':
326 	    unary_delim = true;
327 	    code = question;
328 	    break;
329 
330 	case (':'):
331 	    code = colon;
332 	    unary_delim = true;
333 	    break;
334 
335 	case (';'):
336 	    unary_delim = true;
337 	    code = semicolon;
338 	    break;
339 
340 	case ('{'):
341 	    unary_delim = true;
342 	    code = lbrace;
343 	    break;
344 
345 	case ('}'):
346 	    unary_delim = true;
347 	    code = rbrace;
348 	    break;
349 
350 	case 014: 	       /* a form feed */
351 	    unary_delim = last_u_d;
352 	    last_nl = true;    /* remember this so we can set 'col_1' right */
353 	    code = form_feed;
354 	    break;
355 
356 	case (','):
357 	    unary_delim = true;
358 	    code = comma;
359 	    break;
360 
361 	case '.':
362 	    unary_delim = false;
363 	    code = period;
364 	    break;
365 
366 	case '-':
367 	case '+': 	       /* check for -, +, --, ++ */
368 	    code = (last_u_d ? unary_op : binary_op);
369 	    unary_delim = true;
370 
371 	    if (*buf_ptr == token[0]) {
372 	    /* check for doubled character */
373 		*tok++ = *buf_ptr++;
374 	    /* buffer overflow will be checked at end of loop */
375 		if (last_code == ident || last_code == rparen) {
376 		    code = (last_u_d ? unary_op : postop);
377 		/* check for following ++ or -- */
378 		    unary_delim = false;
379 		}
380 	    }
381 	    else
382 		if (*buf_ptr == '>' || *buf_ptr == '=')
383 			       /* check for operator -> or += */
384 		    *tok++ = *buf_ptr++;
385 	/* buffer overflow will be checked at end of switch */
386 
387 	    break;
388 
389 	case '=':
390 	    if (chartype[*buf_ptr] == opchar) {
391 	    /* we have two char assignment */
392 		*tok++ = *buf_ptr;
393 	    /* move second character */
394 		if (++buf_ptr >= buf_end)
395 		    fill_buffer ();
396 	    }
397 
398 	    code = binary_op;
399 	    unary_delim = true;
400 	    if (token[1] != '<' && token[1] != '>')
401 			       /* check for possible 3 char operator */
402 		break;
403 	/* can drop thru!!! */
404 
405 	case '>':
406 	case '<':
407 	case '!': 	       /* ops like <, <<, <=, !=, etc */
408 	    if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
409 		*tok++ = *buf_ptr;
410 		if (++buf_ptr >= buf_end)
411 		    fill_buffer ();
412 	    }
413 
414 	    if (*buf_ptr == '=')
415 		 *tok++ = *buf_ptr++;
416 	    code = (last_u_d ? unary_op : binary_op);
417 	    unary_delim = true;
418 	    break;
419 
420 	default:
421 	    if (token[0] == '/' && *buf_ptr == '*') {
422 	    /* it is start of comment */
423 		*tok++ = '*';
424 
425 		if (++buf_ptr >= buf_end)
426 		    fill_buffer ();
427 
428 		code = comment;
429 		unary_delim = last_u_d;
430 		break;
431 	    }
432 
433 	    while (*(tok - 1) == *buf_ptr || *buf_ptr=='=') {
434 	    /* handle ||, &&, etc, and also things as in int *****i */
435 		*tok++ = *buf_ptr;
436 		if (++buf_ptr >= buf_end)
437 		    fill_buffer ();
438 	    }
439 
440 
441 	    code = (last_u_d ? unary_op : binary_op);
442 	    unary_delim = true;
443 
444 
445     }			       /* end of switch */
446 
447     if (code != newline) {
448 	l_struct = false;
449 	last_code = code;
450     }
451 
452     if (buf_ptr >= buf_end)    /* check for input buffer empty */
453 	fill_buffer ();
454     last_u_d = unary_delim;
455     *tok = '\0';	       /* null terminate the token */
456     return (code);
457 };
458