1 /*
2  * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
3  * All rights reserved
4  *
5  * "THE BEER-WARE LICENSE" (Revision 42):
6  * Sergey Lyubka wrote this file.  As long as you retain this notice you
7  * can do whatever you want with this stuff. If we meet some day, and you think
8  * this stuff is worth it, you can buy me a beer in return.
9  */
10 
11 /*
12  * Downloaded Sat Nov  5 17:43:06 CET 2011 at
13  * http://slre.sourceforge.net/1.0/slre.c
14  */
15 
16 #ifdef SLRE_TEST
17 #include <stdio.h>
18 #include <assert.h>
19 #include <ctype.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #else
23 #include <log.h>
24 #include <common.h>
25 #include <linux/ctype.h>
26 #endif /* SLRE_TEST */
27 
28 #include <errno.h>
29 
30 #include <slre.h>
31 
32 enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
33 	STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
34 
35 #ifdef SLRE_TEST
36 static struct {
37 	const char	*name;
38 	int		narg;
39 	const char	*flags;
40 } opcodes[] = {
41 	{"END",		0, ""},		/* End of code block or program	*/
42 	{"BRANCH",	2, "oo"},	/* Alternative operator, "|"	*/
43 	{"ANY",		0, ""},		/* Match any character, "."	*/
44 	{"EXACT",	2, "d"},	/* Match exact string		*/
45 	{"ANYOF",	2, "D"},	/* Match any from set, "[]"	*/
46 	{"ANYBUT",	2, "D"},	/* Match any but from set, "[^]"*/
47 	{"OPEN ",	1, "i"},	/* Capture start, "("		*/
48 	{"CLOSE",	1, "i"},	/* Capture end, ")"		*/
49 	{"BOL",		0, ""},		/* Beginning of string, "^"	*/
50 	{"EOL",		0, ""},		/* End of string, "$"		*/
51 	{"STAR",	1, "o"},	/* Match zero or more times "*"	*/
52 	{"PLUS",	1, "o"},	/* Match one or more times, "+"	*/
53 	{"STARQ",	1, "o"},	/* Non-greedy STAR,  "*?"	*/
54 	{"PLUSQ",	1, "o"},	/* Non-greedy PLUS, "+?"	*/
55 	{"QUEST",	1, "o"},	/* Match zero or one time, "?"	*/
56 	{"SPACE",	0, ""},		/* Match whitespace, "\s"	*/
57 	{"NONSPACE",	0, ""},		/* Match non-space, "\S"	*/
58 	{"DIGIT",	0, ""}		/* Match digit, "\d"		*/
59 };
60 #endif /* SLRE_TEST */
61 
62 /*
63  * Commands and operands are all unsigned char (1 byte long). All code offsets
64  * are relative to current address, and positive (always point forward). Data
65  * offsets are absolute. Commands with operands:
66  *
67  * BRANCH offset1 offset2
68  *	Try to match the code block that follows the BRANCH instruction
69  *	(code block ends with END). If no match, try to match code block that
70  *	starts at offset1. If either of these match, jump to offset2.
71  *
72  * EXACT data_offset data_length
73  *	Try to match exact string. String is recorded in data section from
74  *	data_offset, and has length data_length.
75  *
76  * OPEN capture_number
77  * CLOSE capture_number
78  *	If the user have passed 'struct cap' array for captures, OPEN
79  *	records the beginning of the matched substring (cap->ptr), CLOSE
80  *	sets the length (cap->len) for respective capture_number.
81  *
82  * STAR code_offset
83  * PLUS code_offset
84  * QUEST code_offset
85  *	*, +, ?, respectively. Try to gobble as much as possible from the
86  *	matched buffer, until code block that follows these instructions
87  *	matches. When the longest possible string is matched,
88  *	jump to code_offset
89  *
90  * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
91  */
92 
93 static const char *meta_chars = "|.^$*+?()[\\";
94 
95 #ifdef SLRE_TEST
96 
97 static void
print_character_set(FILE * fp,const unsigned char * p,int len)98 print_character_set(FILE *fp, const unsigned char *p, int len)
99 {
100 	int	i;
101 
102 	for (i = 0; i < len; i++) {
103 		if (i > 0)
104 			(void) fputc(',', fp);
105 		if (p[i] == 0) {
106 			i++;
107 			if (p[i] == 0)
108 				(void) fprintf(fp, "\\x%02x", p[i]);
109 			else
110 				(void) fprintf(fp, "%s", opcodes[p[i]].name);
111 		} else if (isprint(p[i])) {
112 			(void) fputc(p[i], fp);
113 		} else {
114 			(void) fprintf(fp, "\\x%02x", p[i]);
115 		}
116 	}
117 }
118 
119 void
slre_dump(const struct slre * r,FILE * fp)120 slre_dump(const struct slre *r, FILE *fp)
121 {
122 	int	i, j, ch, op, pc;
123 
124 	for (pc = 0; pc < r->code_size; pc++) {
125 
126 		op = r->code[pc];
127 		(void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
128 
129 		for (i = 0; opcodes[op].flags[i] != '\0'; i++)
130 			switch (opcodes[op].flags[i]) {
131 			case 'i':
132 				(void) fprintf(fp, "%d ", r->code[pc + 1]);
133 				pc++;
134 				break;
135 			case 'o':
136 				(void) fprintf(fp, "%d ",
137 				    pc + r->code[pc + 1] - i);
138 				pc++;
139 				break;
140 			case 'D':
141 				print_character_set(fp, r->data +
142 				    r->code[pc + 1], r->code[pc + 2]);
143 				pc += 2;
144 				break;
145 			case 'd':
146 				(void) fputc('"', fp);
147 				for (j = 0; j < r->code[pc + 2]; j++) {
148 					ch = r->data[r->code[pc + 1] + j];
149 					if (isprint(ch)) {
150 						(void) fputc(ch, fp);
151 					} else {
152 						(void) fprintf(fp,
153 							"\\x%02x", ch);
154 					}
155 				}
156 				(void) fputc('"', fp);
157 				pc += 2;
158 				break;
159 			}
160 
161 		(void) fputc('\n', fp);
162 	}
163 }
164 #endif /* SLRE_TEST */
165 
166 static void
set_jump_offset(struct slre * r,int pc,int offset)167 set_jump_offset(struct slre *r, int pc, int offset)
168 {
169 	assert(offset < r->code_size);
170 
171 	if (r->code_size - offset > 0xff)
172 		r->err_str = "Jump offset is too big";
173 	else
174 		r->code[pc] = (unsigned char) (r->code_size - offset);
175 }
176 
177 static void
emit(struct slre * r,int code)178 emit(struct slre *r, int code)
179 {
180 	if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
181 		r->err_str = "RE is too long (code overflow)";
182 	else
183 		r->code[r->code_size++] = (unsigned char) code;
184 }
185 
186 static void
store_char_in_data(struct slre * r,int ch)187 store_char_in_data(struct slre *r, int ch)
188 {
189 	if (r->data_size >= (int) sizeof(r->data))
190 		r->err_str = "RE is too long (data overflow)";
191 	else
192 		r->data[r->data_size++] = ch;
193 }
194 
195 static void
exact(struct slre * r,const char ** re)196 exact(struct slre *r, const char **re)
197 {
198 	int	old_data_size = r->data_size;
199 
200 	while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
201 		store_char_in_data(r, *(*re)++);
202 
203 	emit(r, EXACT);
204 	emit(r, old_data_size);
205 	emit(r, r->data_size - old_data_size);
206 }
207 
208 static int
get_escape_char(const char ** re)209 get_escape_char(const char **re)
210 {
211 	int	res;
212 
213 	switch (*(*re)++) {
214 	case 'n':
215 		res = '\n';
216 		break;
217 	case 'r':
218 		res = '\r';
219 		break;
220 	case 't':
221 		res = '\t';
222 		break;
223 	case '0':
224 		res = 0;
225 		break;
226 	case 'S':
227 		res = NONSPACE << 8;
228 		break;
229 	case 's':
230 		res = SPACE << 8;
231 		break;
232 	case 'd':
233 		res = DIGIT << 8;
234 		break;
235 	default:
236 		res = (*re)[-1];
237 		break;
238 	}
239 
240 	return res;
241 }
242 
243 static void
anyof(struct slre * r,const char ** re)244 anyof(struct slre *r, const char **re)
245 {
246 	int	esc, old_data_size = r->data_size, op = ANYOF;
247 
248 	if (**re == '^') {
249 		op = ANYBUT;
250 		(*re)++;
251 	}
252 
253 	while (**re != '\0')
254 
255 		switch (*(*re)++) {
256 		case ']':
257 			emit(r, op);
258 			emit(r, old_data_size);
259 			emit(r, r->data_size - old_data_size);
260 			return;
261 			/* NOTREACHED */
262 			break;
263 		case '\\':
264 			esc = get_escape_char(re);
265 			if ((esc & 0xff) == 0) {
266 				store_char_in_data(r, 0);
267 				store_char_in_data(r, esc >> 8);
268 			} else {
269 				store_char_in_data(r, esc);
270 			}
271 			break;
272 		default:
273 			store_char_in_data(r, (*re)[-1]);
274 			break;
275 		}
276 
277 	r->err_str = "No closing ']' bracket";
278 }
279 
280 static void
relocate(struct slre * r,int begin,int shift)281 relocate(struct slre *r, int begin, int shift)
282 {
283 	emit(r, END);
284 	memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
285 	r->code_size += shift;
286 }
287 
288 static void
quantifier(struct slre * r,int prev,int op)289 quantifier(struct slre *r, int prev, int op)
290 {
291 	if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
292 		r->code[prev + 2]--;
293 		emit(r, EXACT);
294 		emit(r, r->code[prev + 1] + r->code[prev + 2]);
295 		emit(r, 1);
296 		prev = r->code_size - 3;
297 	}
298 	relocate(r, prev, 2);
299 	r->code[prev] = op;
300 	set_jump_offset(r, prev + 1, prev);
301 }
302 
303 static void
exact_one_char(struct slre * r,int ch)304 exact_one_char(struct slre *r, int ch)
305 {
306 	emit(r, EXACT);
307 	emit(r, r->data_size);
308 	emit(r, 1);
309 	store_char_in_data(r, ch);
310 }
311 
312 static void
fixup_branch(struct slre * r,int fixup)313 fixup_branch(struct slre *r, int fixup)
314 {
315 	if (fixup > 0) {
316 		emit(r, END);
317 		set_jump_offset(r, fixup, fixup - 2);
318 	}
319 }
320 
321 static void
compile(struct slre * r,const char ** re)322 compile(struct slre *r, const char **re)
323 {
324 	int	op, esc, branch_start, last_op, fixup, cap_no, level;
325 
326 	fixup = 0;
327 	level = r->num_caps;
328 	branch_start = last_op = r->code_size;
329 
330 	for (;;)
331 		switch (*(*re)++) {
332 		case '\0':
333 			(*re)--;
334 			return;
335 			/* NOTREACHED */
336 			break;
337 		case '^':
338 			emit(r, BOL);
339 			break;
340 		case '$':
341 			emit(r, EOL);
342 			break;
343 		case '.':
344 			last_op = r->code_size;
345 			emit(r, ANY);
346 			break;
347 		case '[':
348 			last_op = r->code_size;
349 			anyof(r, re);
350 			break;
351 		case '\\':
352 			last_op = r->code_size;
353 			esc = get_escape_char(re);
354 			if (esc & 0xff00)
355 				emit(r, esc >> 8);
356 			else
357 				exact_one_char(r, esc);
358 			break;
359 		case '(':
360 			last_op = r->code_size;
361 			cap_no = ++r->num_caps;
362 			emit(r, OPEN);
363 			emit(r, cap_no);
364 
365 			compile(r, re);
366 			if (*(*re)++ != ')') {
367 				r->err_str = "No closing bracket";
368 				return;
369 			}
370 
371 			emit(r, CLOSE);
372 			emit(r, cap_no);
373 			break;
374 		case ')':
375 			(*re)--;
376 			fixup_branch(r, fixup);
377 			if (level == 0) {
378 				r->err_str = "Unbalanced brackets";
379 				return;
380 			}
381 			return;
382 			/* NOTREACHED */
383 			break;
384 		case '+':
385 		case '*':
386 			op = (*re)[-1] == '*' ? STAR : PLUS;
387 			if (**re == '?') {
388 				(*re)++;
389 				op = op == STAR ? STARQ : PLUSQ;
390 			}
391 			quantifier(r, last_op, op);
392 			break;
393 		case '?':
394 			quantifier(r, last_op, QUEST);
395 			break;
396 		case '|':
397 			fixup_branch(r, fixup);
398 			relocate(r, branch_start, 3);
399 			r->code[branch_start] = BRANCH;
400 			set_jump_offset(r, branch_start + 1, branch_start);
401 			fixup = branch_start + 2;
402 			r->code[fixup] = 0xff;
403 			break;
404 		default:
405 			(*re)--;
406 			last_op = r->code_size;
407 			exact(r, re);
408 			break;
409 		}
410 }
411 
412 int
slre_compile(struct slre * r,const char * re)413 slre_compile(struct slre *r, const char *re)
414 {
415 	r->err_str = NULL;
416 	r->code_size = r->data_size = r->num_caps = r->anchored = 0;
417 
418 	if (*re == '^')
419 		r->anchored++;
420 
421 	emit(r, OPEN);	/* This will capture what matches full RE */
422 	emit(r, 0);
423 
424 	while (*re != '\0')
425 		compile(r, &re);
426 
427 	if (r->code[2] == BRANCH)
428 		fixup_branch(r, 4);
429 
430 	emit(r, CLOSE);
431 	emit(r, 0);
432 	emit(r, END);
433 
434 	return (r->err_str == NULL ? 1 : 0);
435 }
436 
437 static int match(const struct slre *, int,
438 		const char *, int, int *, struct cap *);
439 
440 static void
loop_greedy(const struct slre * r,int pc,const char * s,int len,int * ofs)441 loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
442 {
443 	int	saved_offset, matched_offset;
444 
445 	matched_offset = *ofs;
446 
447 	while (match(r, pc + 2, s, len, ofs, NULL)) {
448 		saved_offset = *ofs;
449 		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
450 			matched_offset = saved_offset;
451 		*ofs = saved_offset;
452 	}
453 
454 	*ofs = matched_offset;
455 }
456 
457 static void
loop_non_greedy(const struct slre * r,int pc,const char * s,int len,int * ofs)458 loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
459 {
460 	int	saved_offset = *ofs;
461 
462 	while (match(r, pc + 2, s, len, ofs, NULL)) {
463 		saved_offset = *ofs;
464 		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
465 			break;
466 	}
467 
468 	*ofs = saved_offset;
469 }
470 
471 static int
is_any_of(const unsigned char * p,int len,const char * s,int * ofs)472 is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
473 {
474 	int	i, ch;
475 
476 	ch = s[*ofs];
477 
478 	for (i = 0; i < len; i++)
479 		if (p[i] == ch) {
480 			(*ofs)++;
481 			return 1;
482 		}
483 
484 	return 0;
485 }
486 
487 static int
is_any_but(const unsigned char * p,int len,const char * s,int * ofs)488 is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
489 {
490 	int	i, ch;
491 
492 	ch = s[*ofs];
493 
494 	for (i = 0; i < len; i++) {
495 		if (p[i] == ch)
496 			return 0;
497 	}
498 
499 	(*ofs)++;
500 	return 1;
501 }
502 
503 static int
match(const struct slre * r,int pc,const char * s,int len,int * ofs,struct cap * caps)504 match(const struct slre *r, int pc, const char *s, int len,
505 		int *ofs, struct cap *caps)
506 {
507 	int	n, saved_offset, res = 1;
508 
509 	while (res && r->code[pc] != END) {
510 
511 		assert(pc < r->code_size);
512 		assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
513 
514 		switch (r->code[pc]) {
515 		case BRANCH:
516 			saved_offset = *ofs;
517 			res = match(r, pc + 3, s, len, ofs, caps);
518 			if (res == 0) {
519 				*ofs = saved_offset;
520 				res = match(r, pc + r->code[pc + 1],
521 				    s, len, ofs, caps);
522 			}
523 			pc += r->code[pc + 2];
524 			break;
525 		case EXACT:
526 			res = 0;
527 			n = r->code[pc + 2];	/* String length */
528 			if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
529 			    r->code[pc + 1], n)) {
530 				(*ofs) += n;
531 				res = 1;
532 			}
533 			pc += 3;
534 			break;
535 		case QUEST:
536 			res = 1;
537 			saved_offset = *ofs;
538 			if (!match(r, pc + 2, s, len, ofs, caps))
539 				*ofs = saved_offset;
540 			pc += r->code[pc + 1];
541 			break;
542 		case STAR:
543 			res = 1;
544 			loop_greedy(r, pc, s, len, ofs);
545 			pc += r->code[pc + 1];
546 			break;
547 		case STARQ:
548 			res = 1;
549 			loop_non_greedy(r, pc, s, len, ofs);
550 			pc += r->code[pc + 1];
551 			break;
552 		case PLUS:
553 			res = match(r, pc + 2, s, len, ofs, caps);
554 			if (res == 0)
555 				break;
556 
557 			loop_greedy(r, pc, s, len, ofs);
558 			pc += r->code[pc + 1];
559 			break;
560 		case PLUSQ:
561 			res = match(r, pc + 2, s, len, ofs, caps);
562 			if (res == 0)
563 				break;
564 
565 			loop_non_greedy(r, pc, s, len, ofs);
566 			pc += r->code[pc + 1];
567 			break;
568 		case SPACE:
569 			res = 0;
570 			if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
571 				(*ofs)++;
572 				res = 1;
573 			}
574 			pc++;
575 			break;
576 		case NONSPACE:
577 			res = 0;
578 			if (*ofs < len &&
579 					!isspace(((unsigned char *)s)[*ofs])) {
580 				(*ofs)++;
581 				res = 1;
582 			}
583 			pc++;
584 			break;
585 		case DIGIT:
586 			res = 0;
587 			if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
588 				(*ofs)++;
589 				res = 1;
590 			}
591 			pc++;
592 			break;
593 		case ANY:
594 			res = 0;
595 			if (*ofs < len) {
596 				(*ofs)++;
597 				res = 1;
598 			}
599 			pc++;
600 			break;
601 		case ANYOF:
602 			res = 0;
603 			if (*ofs < len)
604 				res = is_any_of(r->data + r->code[pc + 1],
605 					r->code[pc + 2], s, ofs);
606 			pc += 3;
607 			break;
608 		case ANYBUT:
609 			res = 0;
610 			if (*ofs < len)
611 				res = is_any_but(r->data + r->code[pc + 1],
612 					r->code[pc + 2], s, ofs);
613 			pc += 3;
614 			break;
615 		case BOL:
616 			res = *ofs == 0 ? 1 : 0;
617 			pc++;
618 			break;
619 		case EOL:
620 			res = *ofs == len ? 1 : 0;
621 			pc++;
622 			break;
623 		case OPEN:
624 			if (caps != NULL)
625 				caps[r->code[pc + 1]].ptr = s + *ofs;
626 			pc += 2;
627 			break;
628 		case CLOSE:
629 			if (caps != NULL)
630 				caps[r->code[pc + 1]].len = (s + *ofs) -
631 				    caps[r->code[pc + 1]].ptr;
632 			pc += 2;
633 			break;
634 		case END:
635 			pc++;
636 			break;
637 		default:
638 			printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
639 			assert(0);
640 			break;
641 		}
642 	}
643 
644 	return res;
645 }
646 
647 int
slre_match(const struct slre * r,const char * buf,int len,struct cap * caps)648 slre_match(const struct slre *r, const char *buf, int len,
649 		struct cap *caps)
650 {
651 	int	i, ofs = 0, res = 0;
652 
653 	if (r->anchored) {
654 		res = match(r, 0, buf, len, &ofs, caps);
655 	} else {
656 		for (i = 0; i < len && res == 0; i++) {
657 			ofs = i;
658 			res = match(r, 0, buf, len, &ofs, caps);
659 		}
660 	}
661 
662 	return res;
663 }
664 
665 #ifdef SLRE_TEST
666 #define N_CAPS	5
667 
main(int argc,char * argv[])668 int main(int argc, char *argv[])
669 {
670 	struct slre	slre;
671 	struct cap	caps[N_CAPS];
672 	unsigned char	data[1 * 1024 * 1024];
673 	FILE		*fp;
674 	int		i, res, len;
675 
676 	if (argc < 2) {
677 		fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
678 		return 1;
679 	}
680 
681 	fp = fopen(argv[2], "rb");
682 	if (fp == NULL) {
683 		fprintf(stderr, "Error: cannot open %s:%s\n",
684 			argv[2], strerror(errno));
685 		return 1;
686 	}
687 
688 	if (!slre_compile(&slre, argv[1])) {
689 		fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
690 		return 1;
691 	}
692 
693 	slre_dump(&slre, stderr);
694 
695 	while (fgets(data, sizeof(data), fp) != NULL) {
696 		len = strlen(data);
697 
698 		if ((len > 0) && (data[len-1] == '\n')) {
699 			data[len-1] = '\0';
700 			--len;
701 		}
702 
703 		printf("Data = \"%s\"\n", data);
704 
705 		(void) memset(caps, 0, sizeof(caps));
706 
707 		res = slre_match(&slre, data, len, caps);
708 		printf("Result [%d]: %d\n", i, res);
709 
710 		for (i = 0; i < N_CAPS; i++) {
711 			if (caps[i].len > 0) {
712 				printf("Substring %d: len=%d  [%.*s]\n", i,
713 					caps[i].len,
714 					caps[i].len, caps[i].ptr);
715 			}
716 		}
717 		printf("----------------------------------------------------\n");
718 	}
719 	(void) fclose(fp);
720 
721 	return 0;
722 }
723 #endif /* SLRE_TEST */
724