xref: /openbsd/usr.sbin/httpd/patterns.c (revision 3fbbbb84)
1 /*	$OpenBSD: patterns.c,v 1.5 2016/02/14 18:20:59 semarie Exp $	*/
2 
3 /*
4  * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5  * Copyright (C) 1994-2015 Lua.org, PUC-Rio.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining
8  * a copy of this software and associated documentation files (the
9  * "Software"), to deal in the Software without restriction, including
10  * without limitation the rights to use, copy, modify, merge, publish,
11  * distribute, sublicense, and/or sell copies of the Software, and to
12  * permit persons to whom the Software is furnished to do so, subject to
13  * the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be
16  * included in all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  */
26 
27 /*
28  * Derived from Lua 5.3.1:
29  * $Id: patterns.c,v 1.5 2016/02/14 18:20:59 semarie Exp $
30  * Standard library for string operations and pattern-matching
31  */
32 
33 #include <sys/types.h>
34 #include <ctype.h>
35 #include <errno.h>
36 #include <stddef.h>
37 #include <stdlib.h>
38 #include <string.h>
39 
40 #include "patterns.h"
41 
42 #define uchar(c)	((unsigned char)(c)) /* macro to 'unsign' a char */
43 #define CAP_UNFINISHED	(-1)
44 #define CAP_POSITION	(-2)
45 #define L_ESC		'%'
46 #define SPECIALS	"^$*+?.([%-"
47 
48 struct match_state {
49 	int matchdepth;		/* control for recursive depth (to avoid C
50 				 * stack overflow) */
51 	int repetitioncounter;	/* control the repetition items */
52 	int maxcaptures;	/* configured capture limit */
53 	const char *src_init;	/* init of source string */
54 	const char *src_end;	/* end ('\0') of source string */
55 	const char *p_end;	/* end ('\0') of pattern */
56 	const char *error;	/* should be NULL */
57 	int level;		/* total number of captures (finished or
58 				 * unfinished) */
59 	struct {
60 		const char *init;
61 		ptrdiff_t len;
62 	} capture[MAXCAPTURES];
63 };
64 
65 /* recursive function */
66 static const char *match(struct match_state *, const char *, const char *);
67 
68 static int
match_error(struct match_state * ms,const char * error)69 match_error(struct match_state *ms, const char *error)
70 {
71 	ms->error = ms->error == NULL ? error : ms->error;
72 	return (-1);
73 }
74 
75 static int
check_capture(struct match_state * ms,int l)76 check_capture(struct match_state *ms, int l)
77 {
78 	l -= '1';
79 	if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
80 		return match_error(ms, "invalid capture index");
81 	return (l);
82 }
83 
84 static int
capture_to_close(struct match_state * ms)85 capture_to_close(struct match_state *ms)
86 {
87 	int level = ms->level;
88 	for (level--; level >= 0; level--)
89 		if (ms->capture[level].len == CAP_UNFINISHED)
90 			return (level);
91 	return match_error(ms, "invalid pattern capture");
92 }
93 
94 static const char *
classend(struct match_state * ms,const char * p)95 classend(struct match_state *ms, const char *p)
96 {
97 	switch (*p++) {
98 	case L_ESC:
99 		if (p == ms->p_end)
100 			match_error(ms,
101 			    "malformed pattern (ends with '%')");
102 		return p + 1;
103 	case '[':
104 		if (*p == '^')
105 			p++;
106 		do {
107 			/* look for a ']' */
108 			if (p == ms->p_end) {
109 				match_error(ms,
110 				    "malformed pattern (missing ']')");
111 				break;
112 			}
113 			if (*(p++) == L_ESC && p < ms->p_end) {
114 				/* skip escapes (e.g. '%]') */
115 				p++;
116 			}
117 		} while (*p != ']');
118 		return p + 1;
119 	default:
120 		return p;
121 	}
122 }
123 
124 static int
match_class(int c,int cl)125 match_class(int c, int cl)
126 {
127 	int res;
128 	switch (tolower(cl)) {
129 	case 'a':
130 		res = isalpha(c);
131 		break;
132 	case 'c':
133 		res = iscntrl(c);
134 		break;
135 	case 'd':
136 		res = isdigit(c);
137 		break;
138 	case 'g':
139 		res = isgraph(c);
140 		break;
141 	case 'l':
142 		res = islower(c);
143 		break;
144 	case 'p':
145 		res = ispunct(c);
146 		break;
147 	case 's':
148 		res = isspace(c);
149 		break;
150 	case 'u':
151 		res = isupper(c);
152 		break;
153 	case 'w':
154 		res = isalnum(c);
155 		break;
156 	case 'x':
157 		res = isxdigit(c);
158 		break;
159 	default:
160 		return (cl == c);
161 	}
162 	return (islower(cl) ? res : !res);
163 }
164 
165 static int
matchbracketclass(int c,const char * p,const char * ec)166 matchbracketclass(int c, const char *p, const char *ec)
167 {
168 	int sig = 1;
169 	if (*(p + 1) == '^') {
170 		sig = 0;
171 		/* skip the '^' */
172 		p++;
173 	}
174 	while (++p < ec) {
175 		if (*p == L_ESC) {
176 			p++;
177 			if (match_class(c, uchar(*p)))
178 				return sig;
179 		} else if ((*(p + 1) == '-') && (p + 2 < ec)) {
180 			p += 2;
181 			if (uchar(*(p - 2)) <= c && c <= uchar(*p))
182 				return sig;
183 		} else if (uchar(*p) == c)
184 			return sig;
185 	}
186 	return !sig;
187 }
188 
189 static int
singlematch(struct match_state * ms,const char * s,const char * p,const char * ep)190 singlematch(struct match_state *ms, const char *s, const char *p,
191     const char *ep)
192 {
193 	if (s >= ms->src_end)
194 		return 0;
195 	else {
196 		int c = uchar(*s);
197 		switch (*p) {
198 		case '.':
199 			/* matches any char */
200 			return (1);
201 		case L_ESC:
202 			return match_class(c, uchar(*(p + 1)));
203 		case '[':
204 			return matchbracketclass(c, p, ep - 1);
205 		default:
206 			return (uchar(*p) == c);
207 		}
208 	}
209 }
210 
211 static const char *
matchbalance(struct match_state * ms,const char * s,const char * p)212 matchbalance(struct match_state *ms, const char *s, const char *p)
213 {
214 	if (p >= ms->p_end - 1) {
215 		match_error(ms,
216 		    "malformed pattern (missing arguments to '%b')");
217 		return (NULL);
218 	}
219 	if (*s != *p)
220 		return (NULL);
221 	else {
222 		int b = *p;
223 		int e = *(p + 1);
224 		int cont = 1;
225 		while (++s < ms->src_end) {
226 			if (*s == e) {
227 				if (--cont == 0)
228 					return s + 1;
229 			} else if (*s == b)
230 				cont++;
231 		}
232 	}
233 
234 	/* string ends out of balance */
235 	return (NULL);
236 }
237 
238 static const char *
max_expand(struct match_state * ms,const char * s,const char * p,const char * ep)239 max_expand(struct match_state *ms, const char *s, const char *p, const char *ep)
240 {
241 	ptrdiff_t i = 0;
242 	/* counts maximum expand for item */
243 	while (singlematch(ms, s + i, p, ep))
244 		i++;
245 	/* keeps trying to match with the maximum repetitions */
246 	while (i >= 0) {
247 		const char *res = match(ms, (s + i), ep + 1);
248 		if (res)
249 			return res;
250 		/* else didn't match; reduce 1 repetition to try again */
251 		i--;
252 	}
253 	return NULL;
254 }
255 
256 static const char *
min_expand(struct match_state * ms,const char * s,const char * p,const char * ep)257 min_expand(struct match_state *ms, const char *s, const char *p, const char *ep)
258 {
259 	for (;;) {
260 		const char *res = match(ms, s, ep + 1);
261 		if (res != NULL)
262 			return res;
263 		else if (singlematch(ms, s, p, ep))
264 			s++;	/* try with one more repetition */
265 		else
266 			return NULL;
267 	}
268 }
269 
270 static const char *
start_capture(struct match_state * ms,const char * s,const char * p,int what)271 start_capture(struct match_state *ms, const char *s, const char *p, int what)
272 {
273 	const char *res;
274 
275 	int level = ms->level;
276 	if (level >= ms->maxcaptures) {
277 		match_error(ms, "too many captures");
278 		return (NULL);
279 	}
280 	ms->capture[level].init = s;
281 	ms->capture[level].len = what;
282 	ms->level = level + 1;
283 	/* undo capture if match failed */
284 	if ((res = match(ms, s, p)) == NULL)
285 		ms->level--;
286 	return res;
287 }
288 
289 static const char *
end_capture(struct match_state * ms,const char * s,const char * p)290 end_capture(struct match_state *ms, const char *s, const char *p)
291 {
292 	int l = capture_to_close(ms);
293 	const char *res;
294 	if (l == -1)
295 		return NULL;
296 	/* close capture */
297 	ms->capture[l].len = s - ms->capture[l].init;
298 	/* undo capture if match failed */
299 	if ((res = match(ms, s, p)) == NULL)
300 		ms->capture[l].len = CAP_UNFINISHED;
301 	return res;
302 }
303 
304 static const char *
match_capture(struct match_state * ms,const char * s,int l)305 match_capture(struct match_state *ms, const char *s, int l)
306 {
307 	size_t len;
308 	l = check_capture(ms, l);
309 	if (l == -1)
310 		return NULL;
311 	len = ms->capture[l].len;
312 	if ((size_t) (ms->src_end - s) >= len &&
313 	    memcmp(ms->capture[l].init, s, len) == 0)
314 		return s + len;
315 	else
316 		return NULL;
317 }
318 
319 static const char *
match(struct match_state * ms,const char * s,const char * p)320 match(struct match_state *ms, const char *s, const char *p)
321 {
322 	const char *ep, *res;
323 	char previous;
324 
325 	if (ms->matchdepth-- == 0) {
326 		match_error(ms, "pattern too complex");
327 		return (NULL);
328 	}
329 
330 	/* using goto's to optimize tail recursion */
331  init:
332 	/* end of pattern? */
333 	if (p != ms->p_end) {
334 		switch (*p) {
335 		case '(':
336 			/* start capture */
337 			if (*(p + 1) == ')')
338 				/* position capture? */
339 				s = start_capture(ms, s, p + 2, CAP_POSITION);
340 			else
341 				s = start_capture(ms, s, p + 1, CAP_UNFINISHED);
342 			break;
343 		case ')':
344 			/* end capture */
345 			s = end_capture(ms, s, p + 1);
346 			break;
347 		case '$':
348 			/* is the '$' the last char in pattern? */
349 			if ((p + 1) != ms->p_end) {
350 				/* no; go to default */
351 				goto dflt;
352 			}
353 			 /* check end of string */
354 			s = (s == ms->src_end) ? s : NULL;
355 			break;
356 		case L_ESC:
357 			/* escaped sequences not in the format class[*+?-]? */
358 			switch (*(p + 1)) {
359 			case 'b':
360 				/* balanced string? */
361 				s = matchbalance(ms, s, p + 2);
362 				if (s != NULL) {
363 					p += 4;
364 					/* return match(ms, s, p + 4); */
365 					goto init;
366 				} /* else fail (s == NULL) */
367 				break;
368 			case 'f':
369 				/* frontier? */
370 				p += 2;
371 				if (*p != '[') {
372 					match_error(ms, "missing '['"
373 					    " after '%f' in pattern");
374 					break;
375 				}
376 				/* points to what is next */
377 				ep = classend(ms, p);
378 				if (ms->error != NULL)
379 					break;
380 				previous =
381 				    (s == ms->src_init) ? '\0' : *(s - 1);
382 				if (!matchbracketclass(uchar(previous),
383 				    p, ep - 1) &&
384 				    matchbracketclass(uchar(*s),
385 				    p, ep - 1)) {
386 					p = ep;
387 					/* return match(ms, s, ep); */
388 					goto init;
389 				}
390 				/* match failed */
391 				s = NULL;
392 				break;
393 			case '0':
394 			case '1':
395 			case '2':
396 			case '3':
397 			case '4':
398 			case '5':
399 			case '6':
400 			case '7':
401 			case '8':
402 			case '9':
403 				/* capture results (%0-%9)? */
404 				s = match_capture(ms, s, uchar(*(p + 1)));
405 				if (s != NULL) {
406 					p += 2;
407 					/* return match(ms, s, p + 2) */
408 					goto init;
409 				}
410 				break;
411 			default:
412 				goto dflt;
413 			}
414 			break;
415 		default:
416 
417 			/* pattern class plus optional suffix */
418 	dflt:
419 			/* points to optional suffix */
420 			ep = classend(ms, p);
421 			if (ms->error != NULL)
422 				break;
423 
424 			/* does not match at least once? */
425 			if (!singlematch(ms, s, p, ep)) {
426 				if (ms->repetitioncounter-- == 0) {
427 					match_error(ms, "max repetition items");
428 					s = NULL; /* fail */
429 				/* accept empty? */
430 				} else if
431 				    (*ep == '*' || *ep == '?' || *ep == '-') {
432 					 p = ep + 1;
433 					/* return match(ms, s, ep + 1); */
434 					 goto init;
435 				} else {
436 					/* '+' or no suffix */
437 					s = NULL; /* fail */
438 				}
439 			} else {
440 				/* matched once */
441 				/* handle optional suffix */
442 				switch (*ep) {
443 				case '?':
444 					/* optional */
445 					if ((res =
446 					    match(ms, s + 1, ep + 1)) != NULL)
447 						s = res;
448 					else {
449 						/*
450 						 * else return
451 						 *     match(ms, s, ep + 1);
452 						 */
453 						p = ep + 1;
454 						goto init;
455 					}
456 					break;
457 				case '+':
458 					/* 1 or more repetitions */
459 					s++; /* 1 match already done */
460 					/* FALLTHROUGH */
461 				case '*':
462 					/* 0 or more repetitions */
463 					s = max_expand(ms, s, p, ep);
464 					break;
465 				case '-':
466 					/* 0 or more repetitions (minimum) */
467 					s = min_expand(ms, s, p, ep);
468 					break;
469 				default:
470 					/* no suffix */
471 					s++;
472 					p = ep;
473 					/* return match(ms, s + 1, ep); */
474 					goto init;
475 				}
476 			}
477 			break;
478 		}
479 	}
480 	ms->matchdepth++;
481 	return s;
482 }
483 
484 static const char *
lmemfind(const char * s1,size_t l1,const char * s2,size_t l2)485 lmemfind(const char *s1, size_t l1,
486     const char *s2, size_t l2)
487 {
488 	const char *init;
489 
490 	if (l2 == 0) {
491 		/* empty strings are everywhere */
492 		return (s1);
493 	} else if (l2 > l1) {
494 		/* avoids a negative 'l1' */
495 		return (NULL);
496 	} else {
497 		/*
498 		 * to search for a '*s2' inside 's1'
499 		 * - 1st char will be checked by 'memchr'
500 		 * - 's2' cannot be found after that
501 		 */
502 		l2--;
503 		l1 = l1 - l2;
504 		while (l1 > 0 &&
505 		    (init = (const char *)memchr(s1, *s2, l1)) != NULL) {
506 			/* 1st char is already checked */
507 			init++;
508 			if (memcmp(init, s2 + 1, l2) == 0)
509 				return init - 1;
510 			else {
511 				/* correct 'l1' and 's1' to try again */
512 				l1 -= init - s1;
513 				s1 = init;
514 			}
515 		}
516 		/* not found */
517 		return (NULL);
518 	}
519 }
520 
521 static int
push_onecapture(struct match_state * ms,int i,const char * s,const char * e,struct str_find * sm)522 push_onecapture(struct match_state *ms, int i, const char *s,
523     const char *e, struct str_find *sm)
524 {
525 	if (i >= ms->level) {
526 		if (i == 0 || ms->level == 0) {
527 			/* add whole match */
528 			sm->sm_so = (off_t)(s - ms->src_init);
529 			sm->sm_eo = (off_t)(e - s) + sm->sm_so;
530 		} else
531 			return match_error(ms, "invalid capture index");
532 	} else {
533 		ptrdiff_t l = ms->capture[i].len;
534 		if (l == CAP_UNFINISHED)
535 			return match_error(ms, "unfinished capture");
536 		sm->sm_so = ms->capture[i].init - ms->src_init;
537 		sm->sm_eo = sm->sm_so + l;
538 	}
539 	sm->sm_eo = sm->sm_eo < sm->sm_so ? sm->sm_so : sm->sm_eo;
540 	return (0);
541 }
542 
543 static int
push_captures(struct match_state * ms,const char * s,const char * e,struct str_find * sm,size_t nsm)544 push_captures(struct match_state *ms, const char *s, const char *e,
545     struct str_find *sm, size_t nsm)
546 {
547 	unsigned int i;
548 	unsigned int nlevels = (ms->level <= 0 && s) ? 1 : ms->level;
549 
550 	if (nlevels > nsm)
551 		nlevels = nsm;
552 	for (i = 0; i < nlevels; i++)
553 		if (push_onecapture(ms, i, s, e, sm + i) == -1)
554 			break;
555 
556 	/* number of strings pushed */
557 	return (nlevels);
558 }
559 
560 /* check whether pattern has no special characters */
561 static int
nospecials(const char * p,size_t l)562 nospecials(const char *p, size_t l)
563 {
564 	size_t upto = 0;
565 
566 	do {
567 		if (strpbrk(p + upto, SPECIALS)) {
568 			/* pattern has a special character */
569 			return 0;
570 		}
571 		/* may have more after \0 */
572 		upto += strlen(p + upto) + 1;
573 	} while (upto <= l);
574 
575 	/* no special chars found */
576 	return (1);
577 }
578 
579 static int
str_find_aux(struct match_state * ms,const char * pattern,const char * string,struct str_find * sm,size_t nsm,off_t init)580 str_find_aux(struct match_state *ms, const char *pattern, const char *string,
581     struct str_find *sm, size_t nsm, off_t init)
582 {
583 	size_t		 ls = strlen(string);
584 	size_t		 lp = strlen(pattern);
585 	const char	*s = string;
586 	const char	*p = pattern;
587 	const char	*s1, *s2;
588 	int		 anchor, i;
589 
590 	if (init < 0)
591 		init = 0;
592 	else if (init > (off_t)ls)
593 		return match_error(ms, "starting after string's end");
594 	s1 = s + init;
595 
596 	if (nospecials(p, lp)) {
597 		/* do a plain search */
598 		s2 = lmemfind(s1, ls - (size_t)init, p, lp);
599 		if (s2 != NULL) {
600 			i = 0;
601 			sm[i].sm_so = 0;
602 			sm[i].sm_eo = ls;
603 			if (nsm > 1) {
604 				i++;
605 				sm[i].sm_so = s2 - s;
606 				sm[i].sm_eo = (s2 - s) + lp;
607 			}
608 			return (i + 1);
609 		}
610 		return (0);
611 	}
612 
613 	anchor = (*p == '^');
614 	if (anchor) {
615 		p++;
616 		lp--;	/* skip anchor character */
617 	}
618 	ms->maxcaptures = (nsm > MAXCAPTURES ? MAXCAPTURES : nsm) - 1;
619 	ms->matchdepth = MAXCCALLS;
620 	ms->repetitioncounter = MAXREPETITION;
621 	ms->src_init = s;
622 	ms->src_end = s + ls;
623 	ms->p_end = p + lp;
624 	do {
625 		const char *res;
626 		ms->level = 0;
627 		if ((res = match(ms, s1, p)) != NULL) {
628 			sm->sm_so = 0;
629 			sm->sm_eo = ls;
630 			return push_captures(ms, s1, res, sm + 1, nsm - 1) + 1;
631 
632 		} else if (ms->error != NULL) {
633 			return 0;
634 		}
635 	} while (s1++ < ms->src_end && !anchor);
636 
637 	return 0;
638 }
639 
640 int
str_find(const char * string,const char * pattern,struct str_find * sm,size_t nsm,const char ** errstr)641 str_find(const char *string, const char *pattern, struct str_find *sm,
642     size_t nsm, const char **errstr)
643 {
644 	struct match_state	ms;
645 	int			ret;
646 
647 	memset(&ms, 0, sizeof(ms));
648 	memset(sm, 0, nsm * sizeof(*sm));
649 
650 	ret = str_find_aux(&ms, pattern, string, sm, nsm, 0);
651 	if (ms.error != NULL) {
652 		/* Return 0 on error and store the error string */
653 		*errstr = ms.error;
654 		ret = 0;
655 	} else
656 		*errstr = NULL;
657 
658 	return (ret);
659 }
660 
661 int
str_match(const char * string,const char * pattern,struct str_match * m,const char ** errstr)662 str_match(const char *string, const char *pattern, struct str_match *m,
663     const char **errstr)
664 {
665 	struct str_find		 sm[MAXCAPTURES];
666 	struct match_state	 ms;
667 	int			 ret, i;
668 	size_t			 len, nsm;
669 
670 	nsm = MAXCAPTURES;
671 	memset(&ms, 0, sizeof(ms));
672 	memset(sm, 0, sizeof(sm));
673 	memset(m, 0, sizeof(*m));
674 
675 	ret = str_find_aux(&ms, pattern, string, sm, nsm, 0);
676 	if (ret <= 0 || ms.error != NULL) {
677 		/* Return -1 on error and store the error string */
678 		*errstr = ms.error;
679 		return (-1);
680 	}
681 
682 	if ((m->sm_match = calloc(ret, sizeof(char *))) == NULL) {
683 		*errstr = strerror(errno);
684 		return (-1);
685 	}
686 	m->sm_nmatch = ret;
687 
688 	for (i = 0; i < ret; i++) {
689 		if (sm[i].sm_so > sm[i].sm_eo)
690 			continue;
691 		len = sm[i].sm_eo - sm[i].sm_so;
692 		if ((m->sm_match[i] = strndup(string +
693 		    sm[i].sm_so, len)) == NULL) {
694 			*errstr = strerror(errno);
695 			str_match_free(m);
696 			return (-1);
697 		}
698 	}
699 
700 	*errstr = NULL;
701 	return (0);
702 }
703 
704 void
str_match_free(struct str_match * m)705 str_match_free(struct str_match *m)
706 {
707 	unsigned int	 i = 0;
708 	for (i = 0; i < m->sm_nmatch; i++)
709 		free(m->sm_match[i]);
710 	free(m->sm_match);
711 	m->sm_match = NULL;
712 	m->sm_nmatch = 0;
713 }
714