xref: /freebsd/usr.bin/ctags/C.c (revision 87b0195a)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1987, 1993, 1994
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 #include <limits.h>
34 #include <stddef.h>
35 #include <stdio.h>
36 #include <string.h>
37 
38 #include "ctags.h"
39 
/* Helpers private to this file; each is defined further down. */
static bool	func_entry(void);
static void	hash_entry(void);
static void	skip_string(int key);
static bool	str_entry(int c);
44 
45 /*
46  * c_entries --
47  *	read .c and .h files and call appropriate routines
48  */
49 void
c_entries(void)50 c_entries(void)
51 {
52 	int	c;			/* current character */
53 	int	level;			/* brace level */
54 	int	token;			/* if reading a token */
55 	bool	t_def;			/* if reading a typedef */
56 	int	t_level;		/* typedef's brace level */
57 	char	*sp;			/* buffer pointer */
58 	char	tok[MAXTOKEN];		/* token buffer */
59 
60 	lineftell = ftell(inf);
61 	sp = tok; token = t_def = false; t_level = -1; level = 0; lineno = 1;
62 	while (GETC(!=, EOF)) {
63 		switch (c) {
64 		/*
65 		 * Here's where it DOESN'T handle: {
66 		 *	foo(a)
67 		 *	{
68 		 *	#ifdef notdef
69 		 *		}
70 		 *	#endif
71 		 *		if (a)
72 		 *			puts("hello, world");
73 		 *	}
74 		 */
75 		case '{':
76 			++level;
77 			goto endtok;
78 		case '}':
79 			/*
80 			 * if level goes below zero, try and fix
81 			 * it, even though we've already messed up
82 			 */
83 			if (--level < 0)
84 				level = 0;
85 			goto endtok;
86 
87 		case '\n':
88 			SETLINE;
89 			/*
90 			 * the above 3 cases are similar in that they
91 			 * are special characters that also end tokens.
92 			 */
93 	endtok:			if (sp > tok) {
94 				*sp = EOS;
95 				token = true;
96 				sp = tok;
97 			}
98 			else
99 				token = false;
100 			continue;
101 
102 		/*
103 		 * We ignore quoted strings and character constants
104 		 * completely.
105 		 */
106 		case '"':
107 		case '\'':
108 			skip_string(c);
109 			break;
110 
111 		/*
112 		 * comments can be fun; note the state is unchanged after
113 		 * return, in case we found:
114 		 *	"foo() XX comment XX { int bar; }"
115 		 */
116 		case '/':
117 			if (GETC(==, '*') || c == '/') {
118 				skip_comment(c);
119 				continue;
120 			}
121 			(void)ungetc(c, inf);
122 			c = '/';
123 			goto storec;
124 
125 		/* hash marks flag #define's. */
126 		case '#':
127 			if (sp == tok) {
128 				hash_entry();
129 				break;
130 			}
131 			goto storec;
132 
133 		/*
134 		 * if we have a current token, parenthesis on
135 		 * level zero indicates a function.
136 		 */
137 		case '(':
138 			if (!level && token) {
139 				int	curline;
140 
141 				if (sp != tok)
142 					*sp = EOS;
143 				/*
144 				 * grab the line immediately, we may
145 				 * already be wrong, for example,
146 				 *	foo\n
147 				 *	(arg1,
148 				 */
149 				get_line();
150 				curline = lineno;
151 				if (func_entry()) {
152 					++level;
153 					pfnote(tok, curline);
154 				}
155 				break;
156 			}
157 			goto storec;
158 
159 		/*
160 		 * semi-colons indicate the end of a typedef; if we find a
161 		 * typedef we search for the next semi-colon of the same
162 		 * level as the typedef.  Ignoring "structs", they are
163 		 * tricky, since you can find:
164 		 *
165 		 *	"typedef long time_t;"
166 		 *	"typedef unsigned int u_int;"
167 		 *	"typedef unsigned int u_int [10];"
168 		 *
169 		 * If looking at a typedef, we save a copy of the last token
170 		 * found.  Then, when we find the ';' we take the current
171 		 * token if it starts with a valid token name, else we take
172 		 * the one we saved.  There's probably some reasonable
173 		 * alternative to this...
174 		 */
175 		case ';':
176 			if (t_def && level == t_level) {
177 				t_def = false;
178 				get_line();
179 				if (sp != tok)
180 					*sp = EOS;
181 				pfnote(tok, lineno);
182 				break;
183 			}
184 			goto storec;
185 
186 		/*
187 		 * store characters until one that can't be part of a token
188 		 * comes along; check the current token against certain
189 		 * reserved words.
190 		 */
191 		default:
192 			/* ignore whitespace */
193 			if (c == ' ' || c == '\t') {
194 				int save = c;
195 				while (GETC(!=, EOF) && (c == ' ' || c == '\t'))
196 					;
197 				if (c == EOF)
198 					return;
199 				(void)ungetc(c, inf);
200 				c = save;
201 			}
202 	storec:		if (!intoken(c)) {
203 				if (sp == tok)
204 					break;
205 				*sp = EOS;
206 				if (tflag) {
207 					/* no typedefs inside typedefs */
208 					if (!t_def &&
209 						   !memcmp(tok, "typedef",8)) {
210 						t_def = true;
211 						t_level = level;
212 						break;
213 					}
214 					/* catch "typedef struct" */
215 					if ((!t_def || t_level < level)
216 					    && (!memcmp(tok, "struct", 7)
217 					    || !memcmp(tok, "union", 6)
218 					    || !memcmp(tok, "enum", 5))) {
219 						/*
220 						 * get line immediately;
221 						 * may change before '{'
222 						 */
223 						get_line();
224 						if (str_entry(c))
225 							++level;
226 						break;
227 						/* } */
228 					}
229 				}
230 				sp = tok;
231 			}
232 			else if (sp != tok || begtoken(c)) {
233 				if (sp == tok + sizeof tok - 1)
234 					/* Too long -- truncate it */
235 					*sp = EOS;
236 				else
237 					*sp++ = c;
238 				token = true;
239 			}
240 			continue;
241 		}
242 
243 		sp = tok;
244 		token = false;
245 	}
246 }
247 
248 /*
249  * func_entry --
250  *	handle a function reference
251  */
252 static bool
func_entry(void)253 func_entry(void)
254 {
255 	int	c;			/* current character */
256 	int	level = 0;		/* for matching '()' */
257 	static char attribute[] = "__attribute__";
258 	char	maybe_attribute[sizeof attribute + 1],
259 		*anext;
260 
261 	/*
262 	 * Find the end of the assumed function declaration.
263 	 * Note that ANSI C functions can have type definitions so keep
264 	 * track of the parentheses nesting level.
265 	 */
266 	while (GETC(!=, EOF)) {
267 		switch (c) {
268 		case '\'':
269 		case '"':
270 			/* skip strings and character constants */
271 			skip_string(c);
272 			break;
273 		case '/':
274 			/* skip comments */
275 			if (GETC(==, '*') || c == '/')
276 				skip_comment(c);
277 			break;
278 		case '(':
279 			level++;
280 			break;
281 		case ')':
282 			if (level == 0)
283 				goto fnd;
284 			level--;
285 			break;
286 		case '\n':
287 			SETLINE;
288 		}
289 	}
290 	return (false);
291 fnd:
292 	/*
293 	 * we assume that the character after a function's right paren
294 	 * is a token character if it's a function and a non-token
295 	 * character if it's a declaration.  Comments don't count...
296 	 */
297 	for (anext = maybe_attribute;;) {
298 		while (GETC(!=, EOF) && iswhite(c))
299 			if (c == '\n')
300 				SETLINE;
301 		if (c == EOF)
302 			return false;
303 		/*
304 		 * Recognize the gnu __attribute__ extension, which would
305 		 * otherwise make the heuristic test DTWT
306 		 */
307 		if (anext == maybe_attribute) {
308 			if (intoken(c)) {
309 				*anext++ = c;
310 				continue;
311 			}
312 		} else {
313 			if (intoken(c)) {
314 				if (anext - maybe_attribute
315 				 < (ptrdiff_t)(sizeof attribute - 1))
316 					*anext++ = c;
317 				else	break;
318 				continue;
319 			} else {
320 				*anext++ = '\0';
321 				if (strcmp(maybe_attribute, attribute) == 0) {
322 					(void)ungetc(c, inf);
323 					return false;
324 				}
325 				break;
326 			}
327 		}
328 		if (intoken(c) || c == '{')
329 			break;
330 		if (c == '/' && (GETC(==, '*') || c == '/'))
331 			skip_comment(c);
332 		else {				/* don't ever "read" '/' */
333 			(void)ungetc(c, inf);
334 			return (false);
335 		}
336 	}
337 	if (c != '{')
338 		(void)skip_key('{');
339 	return (true);
340 }
341 
342 /*
343  * hash_entry --
344  *	handle a line starting with a '#'
345  */
346 static void
hash_entry(void)347 hash_entry(void)
348 {
349 	int	c;			/* character read */
350 	int	curline;		/* line started on */
351 	char	*sp;			/* buffer pointer */
352 	char	tok[MAXTOKEN];		/* storage buffer */
353 
354 	/* ignore leading whitespace */
355 	while (GETC(!=, EOF) && (c == ' ' || c == '\t'))
356 		;
357 	(void)ungetc(c, inf);
358 
359 	curline = lineno;
360 	for (sp = tok;;) {		/* get next token */
361 		if (GETC(==, EOF))
362 			return;
363 		if (iswhite(c))
364 			break;
365 		if (sp == tok + sizeof tok - 1)
366 			/* Too long -- truncate it */
367 			*sp = EOS;
368 		else
369 			*sp++ = c;
370 	}
371 	*sp = EOS;
372 	if (memcmp(tok, "define", 6))	/* only interested in #define's */
373 		goto skip;
374 	for (;;) {			/* this doesn't handle "#define \n" */
375 		if (GETC(==, EOF))
376 			return;
377 		if (!iswhite(c))
378 			break;
379 	}
380 	for (sp = tok;;) {		/* get next token */
381 		if (sp == tok + sizeof tok - 1)
382 			/* Too long -- truncate it */
383 			*sp = EOS;
384 		else
385 			*sp++ = c;
386 		if (GETC(==, EOF))
387 			return;
388 		/*
389 		 * this is where it DOESN'T handle
390 		 * "#define \n"
391 		 */
392 		if (!intoken(c))
393 			break;
394 	}
395 	*sp = EOS;
396 	if (dflag || c == '(') {	/* only want macros */
397 		get_line();
398 		pfnote(tok, curline);
399 	}
400 skip:	if (c == '\n') {		/* get rid of rest of define */
401 		SETLINE
402 		if (*(sp - 1) != '\\')
403 			return;
404 	}
405 	(void)skip_key('\n');
406 }
407 
408 /*
409  * str_entry --
410  *	handle a struct, union or enum entry
411  */
412 static bool
str_entry(int c)413 str_entry(int c) /* c is current character */
414 {
415 	int	curline;		/* line started on */
416 	char	*sp;			/* buffer pointer */
417 	char	tok[LINE_MAX];		/* storage buffer */
418 
419 	curline = lineno;
420 	while (iswhite(c))
421 		if (GETC(==, EOF))
422 			return (false);
423 	if (c == '{')		/* it was "struct {" */
424 		return (true);
425 	for (sp = tok;;) {		/* get next token */
426 		if (sp == tok + sizeof tok - 1)
427 			/* Too long -- truncate it */
428 			*sp = EOS;
429 		else
430 			*sp++ = c;
431 		if (GETC(==, EOF))
432 			return (false);
433 		if (!intoken(c))
434 			break;
435 	}
436 	switch (c) {
437 		case '{':		/* it was "struct foo{" */
438 			--sp;
439 			break;
440 		case '\n':		/* it was "struct foo\n" */
441 			SETLINE;
442 			/*FALLTHROUGH*/
443 		default:		/* probably "struct foo " */
444 			while (GETC(!=, EOF))
445 				if (!iswhite(c))
446 					break;
447 			if (c != '{') {
448 				(void)ungetc(c, inf);
449 				return (false);
450 			}
451 	}
452 	*sp = EOS;
453 	pfnote(tok, curline);
454 	return (true);
455 }
456 
457 /*
458  * skip_comment --
459  *	skip over comment
460  */
461 void
skip_comment(int t)462 skip_comment(int t) /* t is comment character */
463 {
464 	int	c;			/* character read */
465 	int	star;			/* '*' flag */
466 
467 	for (star = 0; GETC(!=, EOF);)
468 		switch(c) {
469 		/* comments don't nest, nor can they be escaped. */
470 		case '*':
471 			star = true;
472 			break;
473 		case '/':
474 			if (star && t == '*')
475 				return;
476 			break;
477 		case '\n':
478 			SETLINE;
479 			if (t == '/')
480 				return;
481 			/*FALLTHROUGH*/
482 		default:
483 			star = false;
484 			break;
485 		}
486 }
487 
488 /*
489  * skip_string --
490  *	skip to the end of a string or character constant.
491  */
492 void
skip_string(int key)493 skip_string(int key)
494 {
495 	int	c,
496 		skip;
497 
498 	for (skip = false; GETC(!=, EOF); )
499 		switch (c) {
500 		case '\\':		/* a backslash escapes anything */
501 			skip = !skip;	/* we toggle in case it's "\\" */
502 			break;
503 		case '\n':
504 			SETLINE;
505 			/*FALLTHROUGH*/
506 		default:
507 			if (c == key && !skip)
508 				return;
509 			skip = false;
510 		}
511 }
512 
513 /*
514  * skip_key --
515  *	skip to next char "key"
516  */
517 bool
skip_key(int key)518 skip_key(int key)
519 {
520 	int	c;
521 	bool	skip;
522 	bool	retval;
523 
524 	for (skip = retval = false; GETC(!=, EOF);)
525 		switch(c) {
526 		case '\\':		/* a backslash escapes anything */
527 			skip = !skip;	/* we toggle in case it's "\\" */
528 			break;
529 		case ';':		/* special case for yacc; if one */
530 		case '|':		/* of these chars occurs, we may */
531 			retval = true;	/* have moved out of the rule */
532 			break;		/* not used by C */
533 		case '\'':
534 		case '"':
535 			/* skip strings and character constants */
536 			skip_string(c);
537 			break;
538 		case '/':
539 			/* skip comments */
540 			if (GETC(==, '*') || c == '/') {
541 				skip_comment(c);
542 				break;
543 			}
544 			(void)ungetc(c, inf);
545 			c = '/';
546 			goto norm;
547 		case '\n':
548 			SETLINE;
549 			/*FALLTHROUGH*/
550 		default:
551 		norm:
552 			if (c == key && !skip)
553 				return (retval);
554 			skip = false;
555 		}
556 	return (retval);
557 }
558