1 /*
2 *   $Id: lregex.c 747 2009-11-06 02:33:37Z dhiebert $
3 *
4 *   Copyright (c) 2000-2003, Darren Hiebert
5 *
6 *   This source code is released for free distribution under the terms of the
7 *   GNU General Public License.
8 *
9 *   This module contains functions for applying regular expression matching.
10 *
*   The code for utilizing the GNU regex package with regard to processing the
*   regex option and checking for regex matches was adapted from routines in
*   GNU etags.
14 */
15 
16 /*
17 *   INCLUDE FILES
18 */
19 #include "general.h"  /* must always come first */
20 
21 #include <string.h>
22 
23 #ifdef HAVE_REGCOMP
24 # include <ctype.h>
25 # include <stddef.h>
26 # ifdef HAVE_SYS_TYPES_H
27 #  include <sys/types.h>  /* declare off_t (not known to regex.h on FreeBSD) */
28 # endif
29 # include <regex.h>
30 #endif
31 
32 #include "debug.h"
33 #include "entry.h"
34 #include "parse.h"
35 #include "read.h"
36 #include "routines.h"
37 
38 #ifdef HAVE_REGEX
39 
40 /*
41 *   MACROS
42 */
43 
/* Number of match slots handed to regexec(): back-references \0 through \9 */
#define BACK_REFERENCE_COUNT 10

/* POSIX regex support is used only when regcomp() exists and is not flagged
 * as broken by the build configuration.
 */
#if defined (HAVE_REGCOMP) && !defined (REGCOMP_BROKEN)
# define POSIX_REGEX
#endif

#define REGEX_NAME "Regex"
52 
53 /*
54 *   DATA DECLARATIONS
55 */
56 #if defined (POSIX_REGEX)
57 
58 struct sKind {
59 	boolean enabled;
60 	char letter;
61 	char* name;
62 	char* description;
63 };
64 
/* Selects which member of the union inside regexPattern is in use. */
enum pType {
	PTRN_TAG,       /* a match produces a tag entry (u.tag) */
	PTRN_CALLBACK   /* a match invokes a callback (u.callback) */
};
66 
67 typedef struct {
68 	regex_t *pattern;
69 	enum pType type;
70 	union {
71 		struct {
72 			char *name_pattern;
73 			struct sKind kind;
74 		} tag;
75 		struct {
76 			regexCallback function;
77 		} callback;
78 	} u;
79 } regexPattern;
80 
81 #endif
82 
83 typedef struct {
84 	regexPattern *patterns;
85 	unsigned int count;
86 } patternSet;
87 
88 /*
89 *   DATA DEFINITIONS
90 */
91 
92 static boolean regexBroken = FALSE;
93 
94 /* Array of pattern sets, indexed by language */
95 static patternSet* Sets = NULL;
96 static int SetUpper = -1;  /* upper language index in list */
97 
98 /*
99 *   FUNCTION DEFINITIONS
100 */
101 
clearPatternSet(const langType language)102 static void clearPatternSet (const langType language)
103 {
104 	if (language <= SetUpper)
105 	{
106 		patternSet* const set = Sets + language;
107 		unsigned int i;
108 		for (i = 0  ;  i < set->count  ;  ++i)
109 		{
110 			regexPattern *p = &set->patterns [i];
111 #if defined (POSIX_REGEX)
112 			regfree (p->pattern);
113 #endif
114 			eFree (p->pattern);
115 			p->pattern = NULL;
116 
117 			if (p->type == PTRN_TAG)
118 			{
119 				eFree (p->u.tag.name_pattern);
120 				p->u.tag.name_pattern = NULL;
121 				eFree (p->u.tag.kind.name);
122 				p->u.tag.kind.name = NULL;
123 				if (p->u.tag.kind.description != NULL)
124 				{
125 					eFree (p->u.tag.kind.description);
126 					p->u.tag.kind.description = NULL;
127 				}
128 			}
129 		}
130 		if (set->patterns != NULL)
131 			eFree (set->patterns);
132 		set->patterns = NULL;
133 		set->count = 0;
134 	}
135 }
136 
/*
*   Regex pseudo-parser
*/
140 
makeRegexTag(const vString * const name,const struct sKind * const kind)141 static void makeRegexTag (
142 		const vString* const name, const struct sKind* const kind)
143 {
144 	if (kind->enabled)
145 	{
146 		tagEntryInfo e;
147 		Assert (name != NULL  &&  vStringLength (name) > 0);
148 		Assert (kind != NULL);
149 		initTagEntry (&e, vStringValue (name));
150 		e.kind     = kind->letter;
151 		e.kindName = kind->name;
152 		makeTagEntry (&e);
153 	}
154 }
155 
156 /*
157 *   Regex pattern definition
158 */
159 
/* Take a string like "/blah/..." and turn its first field into "blah".
 * The first character of `name' is the separator. Scanning stops at the
 * first unquoted occurrence of that separator, or at the end of the string.
 * A backslash quotes the next character: a quoted separator becomes a plain
 * separator, "\t" becomes a Tab character, and any other quoted character
 * keeps its backslash. A backslash that is the very last character is
 * dropped.
 *
 * Works in place: the unquoted field is written to the start of the buffer
 * and null terminated. Returns a pointer to the terminating separator, or to
 * the terminating null character if no unquoted separator was found. An empty
 * string is returned untouched.
 */
static char* scanSeparators (char* name)
{
	const char sep = name [0];
	char *copyto = name;
	int quoted = 0;

	/* Without a separator there is nothing to scan; stepping over the
	 * terminator would run past the end of the buffer.
	 */
	if (sep == '\0')
		return name;

	for (++name ; *name != '\0' ; ++name)
	{
		if (quoted)
		{
			if (*name == sep)
				*copyto++ = sep;
			else if (*name == 't')
				*copyto++ = '\t';
			else
			{
				/* Something else is quoted, so preserve the quote. */
				*copyto++ = '\\';
				*copyto++ = *name;
			}
			quoted = 0;
		}
		else if (*name == '\\')
			quoted = 1;
		else if (*name == sep)
			break;
		else
			*copyto++ = *name;
	}
	*copyto = '\0';
	return name;
}
201 
202 /* Parse `regexp', in form "/regex/name/[k,Kind/]flags" (where the separator
203  * character is whatever the first character of `regexp' is), by breaking it
204  * up into null terminated strings, removing the separators, and expanding
205  * '\t' into tabs. When complete, `regexp' points to the line matching
206  * pattern, a pointer to the name matching pattern is written to `name', a
207  * pointer to the kinds is written to `kinds' (possibly NULL), and a pointer
208  * to the trailing flags is written to `flags'. If the pattern is not in the
209  * correct format, a false value is returned.
210  */
parseTagRegex(char * const regexp,char ** const name,char ** const kinds,char ** const flags)211 static boolean parseTagRegex (
212 		char* const regexp, char** const name,
213 		char** const kinds, char** const flags)
214 {
215 	boolean result = FALSE;
216 	const int separator = (unsigned char) regexp [0];
217 
218 	*name = scanSeparators (regexp);
219 	if (*regexp == '\0')
220 		error (WARNING, "empty regexp");
221 	else if (**name != separator)
222 		error (WARNING, "%s: incomplete regexp", regexp);
223 	else
224 	{
225 		char* const third = scanSeparators (*name);
226 		if (**name == '\0')
227 			error (WARNING, "%s: regexp missing name pattern", regexp);
228 		if ((*name) [strlen (*name) - 1] == '\\')
229 			error (WARNING, "error in name pattern: \"%s\"", *name);
230 		if (*third != separator)
231 			error (WARNING, "%s: regexp missing final separator", regexp);
232 		else
233 		{
234 			char* const fourth = scanSeparators (third);
235 			if (*fourth == separator)
236 			{
237 				*kinds = third;
238 				scanSeparators (fourth);
239 				*flags = fourth;
240 			}
241 			else
242 			{
243 				*flags = third;
244 				*kinds = NULL;
245 			}
246 			result = TRUE;
247 		}
248 	}
249 	return result;
250 }
251 
addCompiledTagPattern(const langType language,regex_t * const pattern,char * const name,const char kind,char * const kindName,char * const description)252 static void addCompiledTagPattern (
253 		const langType language, regex_t* const pattern,
254 		char* const name, const char kind, char* const kindName,
255 		char *const description)
256 {
257 	patternSet* set;
258 	regexPattern *ptrn;
259 	if (language > SetUpper)
260 	{
261 		int i;
262 		Sets = xRealloc (Sets, (language + 1), patternSet);
263 		for (i = SetUpper + 1  ;  i <= language  ;  ++i)
264 		{
265 			Sets [i].patterns = NULL;
266 			Sets [i].count = 0;
267 		}
268 		SetUpper = language;
269 	}
270 	set = Sets + language;
271 	set->patterns = xRealloc (set->patterns, (set->count + 1), regexPattern);
272 	ptrn = &set->patterns [set->count];
273 	set->count += 1;
274 
275 	ptrn->pattern = pattern;
276 	ptrn->type    = PTRN_TAG;
277 	ptrn->u.tag.name_pattern = name;
278 	ptrn->u.tag.kind.enabled = TRUE;
279 	ptrn->u.tag.kind.letter  = kind;
280 	ptrn->u.tag.kind.name    = kindName;
281 	ptrn->u.tag.kind.description = description;
282 }
283 
addCompiledCallbackPattern(const langType language,regex_t * const pattern,const regexCallback callback)284 static void addCompiledCallbackPattern (
285 		const langType language, regex_t* const pattern,
286 		const regexCallback callback)
287 {
288 	patternSet* set;
289 	regexPattern *ptrn;
290 	if (language > SetUpper)
291 	{
292 		int i;
293 		Sets = xRealloc (Sets, (language + 1), patternSet);
294 		for (i = SetUpper + 1  ;  i <= language  ;  ++i)
295 		{
296 			Sets [i].patterns = NULL;
297 			Sets [i].count = 0;
298 		}
299 		SetUpper = language;
300 	}
301 	set = Sets + language;
302 	set->patterns = xRealloc (set->patterns, (set->count + 1), regexPattern);
303 	ptrn = &set->patterns [set->count];
304 	set->count += 1;
305 
306 	ptrn->pattern = pattern;
307 	ptrn->type    = PTRN_CALLBACK;
308 	ptrn->u.callback.function = callback;
309 }
310 
311 #if defined (POSIX_REGEX)
312 
compileRegex(const char * const regexp,const char * const flags)313 static regex_t* compileRegex (const char* const regexp, const char* const flags)
314 {
315 	int cflags = REG_EXTENDED | REG_NEWLINE;
316 	regex_t *result = NULL;
317 	int errcode;
318 	int i;
319 	for (i = 0  ; flags != NULL  &&  flags [i] != '\0'  ;  ++i)
320 	{
321 		switch ((int) flags [i])
322 		{
323 			case 'b': cflags &= ~REG_EXTENDED; break;
324 			case 'e': cflags |= REG_EXTENDED;  break;
325 			case 'i': cflags |= REG_ICASE;     break;
326 			default: error (WARNING, "unknown regex flag: '%c'", *flags); break;
327 		}
328 	}
329 	result = xMalloc (1, regex_t);
330 	errcode = regcomp (result, regexp, cflags);
331 	if (errcode != 0)
332 	{
333 		char errmsg[256];
334 		regerror (errcode, result, errmsg, 256);
335 		error (WARNING, "regcomp %s: %s", regexp, errmsg);
336 		regfree (result);
337 		eFree (result);
338 		result = NULL;
339 	}
340 	return result;
341 }
342 
343 #endif
344 
/* Splits a kind specification of the form "k,name,description" into its
 * parts. `*kindName' always receives a newly allocated string and
 * `*description' a newly allocated string or NULL.
 *
 *   - NULL or empty specification: letter 'r', name "regex".
 *   - A single leading character followed by ',' or the end of the string is
 *     the kind letter; otherwise the letter is 'r' and the text is taken as
 *     starting with the name.
 *   - A missing name becomes "regex".
 *   - Text after the comma that ends the name, if not empty, is the
 *     description.
 */
static void parseKinds (
		const char* const kinds, char* const kind, char** const kindName,
		char **description)
{
	const char *cursor = kinds;
	const char *nameEnd;
	size_t nameLength;

	*kind = '\0';
	*kindName = NULL;
	*description = NULL;

	if (kinds == NULL  ||  kinds [0] == '\0')
	{
		*kind = 'r';
		*kindName = eStrdup ("regex");
		return;
	}

	/* Optional one-letter kind, then an optional comma. */
	if (cursor [0] != ','  &&  (cursor [1] == ','  ||  cursor [1] == '\0'))
		*kind = *cursor++;
	else
		*kind = 'r';
	if (*cursor == ',')
		++cursor;

	if (*cursor == '\0')
	{
		*kindName = eStrdup ("regex");
		return;
	}

	nameEnd = strchr (cursor, ',');
	if (nameEnd == NULL)
	{
		/* The remainder is the name; there is no description. */
		*kindName = eStrdup (cursor);
		return;
	}

	nameLength = (size_t) (nameEnd - cursor);
	*kindName = (char*) eMalloc (nameLength + 1);
	memcpy (*kindName, cursor, nameLength);
	(*kindName) [nameLength] = '\0';

	if (nameEnd [1] != '\0')
		*description = eStrdup (nameEnd + 1);
}
385 
/* Prints one line describing the kind of the tag pattern `pat [i]': its
 * letter ('?' if it has none), its description (or its name when there is no
 * description) and an " [off]" marker while the kind is disabled. The line is
 * indented by four spaces when `indent' is set.
 */
static void printRegexKind (const regexPattern *pat, unsigned int i, boolean indent)
{
	const struct sKind *const kind = &pat [i].u.tag.kind;
	const char *prefix = "";
	const char *text;
	const char *state = " [off]";
	char letter = '?';

	Assert (pat [i].type == PTRN_TAG);

	if (indent)
		prefix = "    ";
	if (kind->letter != '\0')
		letter = kind->letter;
	if (kind->description != NULL)
		text = kind->description;
	else
		text = kind->name;
	if (kind->enabled)
		state = "";

	printf ("%s%c  %s %s\n", prefix, letter, text, state);
}
396 
processLanguageRegex(const langType language,const char * const parameter)397 static void processLanguageRegex (const langType language,
398 		const char* const parameter)
399 {
400 	if (parameter == NULL  ||  parameter [0] == '\0')
401 		clearPatternSet (language);
402 	else if (parameter [0] != '@')
403 		addLanguageRegex (language, parameter);
404 	else if (! doesFileExist (parameter + 1))
405 		error (WARNING, "cannot open regex file");
406 	else
407 	{
408 		const char* regexfile = parameter + 1;
409 		FILE* const fp = fopen (regexfile, "r");
410 		if (fp == NULL)
411 			error (WARNING | PERROR, "%s", regexfile);
412 		else
413 		{
414 			vString* const regex = vStringNew ();
415 			while (readLine (regex, fp))
416 				addLanguageRegex (language, vStringValue (regex));
417 			fclose (fp);
418 			vStringDelete (regex);
419 		}
420 	}
421 }
422 
423 /*
424 *   Regex pattern matching
425 */
426 
427 #if defined (POSIX_REGEX)
428 
substitute(const char * const in,const char * out,const int nmatch,const regmatch_t * const pmatch)429 static vString* substitute (
430 		const char* const in, const char* out,
431 		const int nmatch, const regmatch_t* const pmatch)
432 {
433 	vString* result = vStringNew ();
434 	const char* p;
435 	for (p = out  ;  *p != '\0'  ;  p++)
436 	{
437 		if (*p == '\\'  &&  isdigit ((int) *++p))
438 		{
439 			const int dig = *p - '0';
440 			if (0 < dig  &&  dig < nmatch  &&  pmatch [dig].rm_so != -1)
441 			{
442 				const int diglen = pmatch [dig].rm_eo - pmatch [dig].rm_so;
443 				vStringNCatS (result, in + pmatch [dig].rm_so, diglen);
444 			}
445 		}
446 		else if (*p != '\n'  &&  *p != '\r')
447 			vStringPut (result, *p);
448 	}
449 	vStringTerminate (result);
450 	return result;
451 }
452 
matchTagPattern(const vString * const line,const regexPattern * const patbuf,const regmatch_t * const pmatch)453 static void matchTagPattern (const vString* const line,
454 		const regexPattern* const patbuf,
455 		const regmatch_t* const pmatch)
456 {
457 	vString *const name = substitute (vStringValue (line),
458 			patbuf->u.tag.name_pattern, BACK_REFERENCE_COUNT, pmatch);
459 	vStringStripLeading (name);
460 	vStringStripTrailing (name);
461 	if (vStringLength (name) > 0)
462 		makeRegexTag (name, &patbuf->u.tag.kind);
463 	else
464 		error (WARNING, "%s:%ld: null expansion of name pattern \"%s\"",
465 			getInputFileName (), getInputLineNumber (),
466 			patbuf->u.tag.name_pattern);
467 	vStringDelete (name);
468 }
469 
matchCallbackPattern(const vString * const line,const regexPattern * const patbuf,const regmatch_t * const pmatch)470 static void matchCallbackPattern (
471 		const vString* const line, const regexPattern* const patbuf,
472 		const regmatch_t* const pmatch)
473 {
474 	regexMatch matches [BACK_REFERENCE_COUNT];
475 	unsigned int count = 0;
476 	int i;
477 	for (i = 0  ;  i < BACK_REFERENCE_COUNT  &&  pmatch [i].rm_so != -1  ;  ++i)
478 	{
479 		matches [i].start  = pmatch [i].rm_so;
480 		matches [i].length = pmatch [i].rm_eo - pmatch [i].rm_so;
481 		++count;
482 	}
483 	patbuf->u.callback.function (vStringValue (line), matches, count);
484 }
485 
matchRegexPattern(const vString * const line,const regexPattern * const patbuf)486 static boolean matchRegexPattern (const vString* const line,
487 		const regexPattern* const patbuf)
488 {
489 	boolean result = FALSE;
490 	regmatch_t pmatch [BACK_REFERENCE_COUNT];
491 	const int match = regexec (patbuf->pattern, vStringValue (line),
492 							   BACK_REFERENCE_COUNT, pmatch, 0);
493 	if (match == 0)
494 	{
495 		result = TRUE;
496 		if (patbuf->type == PTRN_TAG)
497 			matchTagPattern (line, patbuf, pmatch);
498 		else if (patbuf->type == PTRN_CALLBACK)
499 			matchCallbackPattern (line, patbuf, pmatch);
500 		else
501 		{
502 			Assert ("invalid pattern type" == NULL);
503 			result = FALSE;
504 		}
505 	}
506 	return result;
507 }
508 
509 #endif
510 
511 /* PUBLIC INTERFACE */
512 
513 /* Match against all patterns for specified language. Returns true if at least
514  * on pattern matched.
515  */
matchRegex(const vString * const line,const langType language)516 extern boolean matchRegex (const vString* const line, const langType language)
517 {
518 	boolean result = FALSE;
519 	if (language != LANG_IGNORE  &&  language <= SetUpper  &&
520 		Sets [language].count > 0)
521 	{
522 		const patternSet* const set = Sets + language;
523 		unsigned int i;
524 		for (i = 0  ;  i < set->count  ;  ++i)
525 			if (matchRegexPattern (line, set->patterns + i))
526 				result = TRUE;
527 	}
528 	return result;
529 }
530 
/* Reads the input file to its end, discarding each line returned. */
extern void findRegexTags (void)
{
	/* reading every line is all this routine does itself */
	for (;;)
	{
		if (fileReadLine () == NULL)
			break;
	}
}
537 
538 #endif  /* HAVE_REGEX */
539 
/* Compiles `regex' with `flags' and registers it for `language' as a tag
 * pattern whose tag names are built from the template `name' and whose kind
 * is described by `kinds'. `name' is duplicated before being stored. Nothing
 * is registered if regex support is marked broken or the expression does not
 * compile. Without HAVE_REGEX this does nothing.
 */
extern void addTagRegex (
		const langType language __unused__,
		const char* const regex __unused__,
		const char* const name __unused__,
		const char* const kinds __unused__,
		const char* const flags __unused__)
{
#ifdef HAVE_REGEX
	regex_t *compiled;
	char letter;
	char *longName;
	char *text;

	Assert (regex != NULL);
	Assert (name != NULL);

	if (regexBroken)
		return;
	compiled = compileRegex (regex, flags);
	if (compiled == NULL)
		return;

	parseKinds (kinds, &letter, &longName, &text);
	addCompiledTagPattern (language, compiled, eStrdup (name),
			letter, longName, text);
#endif
}
565 
/* Compiles `regex' with `flags' and registers it for `language' as a pattern
 * that invokes `callback' on each matching line. Nothing is registered if
 * regex support is marked broken or the expression does not compile. Without
 * HAVE_REGEX this does nothing.
 */
extern void addCallbackRegex (
		const langType language __unused__,
		const char* const regex __unused__,
		const char* const flags __unused__,
		const regexCallback callback __unused__)
{
#ifdef HAVE_REGEX
	regex_t *compiled;

	Assert (regex != NULL);

	if (regexBroken)
		return;
	compiled = compileRegex (regex, flags);
	if (compiled == NULL)
		return;
	addCompiledCallbackPattern (language, compiled, callback);
#endif
}
582 
/* Registers for `language' the tag pattern given by `regex', a specification
 * of the form "/regex/name/[k,Kind/]flags". A malformed specification is
 * reported by the parser and otherwise ignored. Does nothing if regex support
 * is marked broken or HAVE_REGEX is not defined.
 */
extern void addLanguageRegex (
		const langType language __unused__, const char* const regex __unused__)
{
#ifdef HAVE_REGEX
	if (! regexBroken)
	{
		/* The parser splits its argument in place, so work on a copy. */
		char *const regex_pat = eStrdup (regex);
		char *name, *kinds, *flags;
		if (parseTagRegex (regex_pat, &name, &kinds, &flags))
			addTagRegex (language, regex_pat, name, kinds, flags);
		/* Release the working copy whether or not parsing succeeded;
		 * addTagRegex() duplicates or compiles whatever it keeps.
		 */
		eFree (regex_pat);
	}
#endif
}
599 
600 /*
601 *   Regex option parsing
602 */
603 
processRegexOption(const char * const option,const char * const parameter __unused__)604 extern boolean processRegexOption (const char *const option,
605 								   const char *const parameter __unused__)
606 {
607 	boolean handled = FALSE;
608 	const char* const dash = strchr (option, '-');
609 	if (dash != NULL  &&  strncmp (option, "regex", dash - option) == 0)
610 	{
611 #ifdef HAVE_REGEX
612 		langType language;
613 		language = getNamedLanguage (dash + 1);
614 		if (language == LANG_IGNORE)
615 			error (WARNING, "unknown language \"%s\" in --%s option", (dash + 1), option);
616 		else
617 			processLanguageRegex (language, parameter);
618 #else
619 		error (WARNING, "regex support not available; required for --%s option",
620 		   option);
621 #endif
622 		handled = TRUE;
623 	}
624 	return handled;
625 }
626 
/* Turns off the kind of every tag pattern registered for `language'.
 * Callback patterns are left alone. Without HAVE_REGEX this does nothing.
 */
extern void disableRegexKinds (const langType language __unused__)
{
#ifdef HAVE_REGEX
	regexPattern *entry;
	unsigned int remaining;

	if (language > SetUpper  ||  Sets [language].count == 0)
		return;

	entry = Sets [language].patterns;
	for (remaining = Sets [language].count  ;  remaining > 0  ;  --remaining, ++entry)
	{
		if (entry->type == PTRN_TAG)
			entry->u.tag.kind.enabled = FALSE;
	}
#endif
}
640 
enableRegexKind(const langType language __unused__,const int kind __unused__,const boolean mode __unused__)641 extern boolean enableRegexKind (
642 		const langType language __unused__,
643 		const int kind __unused__, const boolean mode __unused__)
644 {
645 	boolean result = FALSE;
646 #ifdef HAVE_REGEX
647 	if (language <= SetUpper  &&  Sets [language].count > 0)
648 	{
649 		patternSet* const set = Sets + language;
650 		unsigned int i;
651 		for (i = 0  ;  i < set->count  ;  ++i)
652 			if (set->patterns [i].type == PTRN_TAG &&
653 				set->patterns [i].u.tag.kind.letter == kind)
654 			{
655 				set->patterns [i].u.tag.kind.enabled = mode;
656 				result = TRUE;
657 			}
658 	}
659 #endif
660 	return result;
661 }
662 
/* Prints one line for the kind of each tag pattern registered for
 * `language', indented when `indent' is set. Callback patterns are skipped.
 * Without HAVE_REGEX this does nothing.
 */
extern void printRegexKinds (const langType language __unused__, boolean indent __unused__)
{
#ifdef HAVE_REGEX
	const patternSet *set;
	unsigned int n;

	if (language > SetUpper  ||  Sets [language].count == 0)
		return;

	set = &Sets [language];
	for (n = 0  ;  n < set->count  ;  ++n)
	{
		if (set->patterns [n].type != PTRN_TAG)
			continue;
		printRegexKind (set->patterns, n, indent);
	}
#endif
}
676 
/* Releases the patterns of every language and then the per-language table
 * itself, returning the module to its initial empty state. Without HAVE_REGEX
 * this does nothing.
 */
extern void freeRegexResources (void)
{
#ifdef HAVE_REGEX
	int lang = 0;

	while (lang <= SetUpper)
	{
		clearPatternSet (lang);
		++lang;
	}
	if (Sets != NULL)
		eFree (Sets);
	SetUpper = -1;
	Sets = NULL;
#endif
}
689 
/* Check for broken regcomp() on Cygwin: compiles a trivial expression and,
 * if that fails, issues a warning and marks regex support as broken so that
 * no patterns are registered afterwards. Performed only when both HAVE_REGEX
 * and CHECK_REGCOMP are defined.
 */
extern void checkRegex (void)
{
#if defined (HAVE_REGEX) && defined (CHECK_REGCOMP)
	regex_t patbuf;
	if (regcomp (&patbuf, "/hello/", 0) != 0)
	{
		error (WARNING, "Disabling broken regex");
		regexBroken = TRUE;
	}
	else
	{
		/* the probe expression is no longer needed once it has compiled */
		regfree (&patbuf);
	}
#endif
}
703 
704 /* vi:set tabstop=4 shiftwidth=4: */
705