1 /* $OpenBSD: patterns.c,v 1.5 2016/02/14 18:20:59 semarie Exp $ */
2
3 /*
4 * Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
5 * Copyright (C) 1994-2015 Lua.org, PUC-Rio.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sublicense, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be
16 * included in all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 */
26
27 /*
28 * Derived from Lua 5.3.1:
29 * $Id: patterns.c,v 1.5 2016/02/14 18:20:59 semarie Exp $
30 * Standard library for string operations and pattern-matching
31 */
32
33 #include <sys/types.h>
34 #include <ctype.h>
35 #include <errno.h>
36 #include <stddef.h>
37 #include <stdlib.h>
38 #include <string.h>
39
40 #include "patterns.h"
41
42 #define uchar(c) ((unsigned char)(c)) /* macro to 'unsign' a char */
43 #define CAP_UNFINISHED (-1)
44 #define CAP_POSITION (-2)
45 #define L_ESC '%'
46 #define SPECIALS "^$*+?.([%-"
47
48 struct match_state {
49 int matchdepth; /* control for recursive depth (to avoid C
50 * stack overflow) */
51 int repetitioncounter; /* control the repetition items */
52 int maxcaptures; /* configured capture limit */
53 const char *src_init; /* init of source string */
54 const char *src_end; /* end ('\0') of source string */
55 const char *p_end; /* end ('\0') of pattern */
56 const char *error; /* should be NULL */
57 int level; /* total number of captures (finished or
58 * unfinished) */
59 struct {
60 const char *init;
61 ptrdiff_t len;
62 } capture[MAXCAPTURES];
63 };
64
65 /* recursive function */
66 static const char *match(struct match_state *, const char *, const char *);
67
68 static int
match_error(struct match_state * ms,const char * error)69 match_error(struct match_state *ms, const char *error)
70 {
71 ms->error = ms->error == NULL ? error : ms->error;
72 return (-1);
73 }
74
75 static int
check_capture(struct match_state * ms,int l)76 check_capture(struct match_state *ms, int l)
77 {
78 l -= '1';
79 if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
80 return match_error(ms, "invalid capture index");
81 return (l);
82 }
83
84 static int
capture_to_close(struct match_state * ms)85 capture_to_close(struct match_state *ms)
86 {
87 int level = ms->level;
88 for (level--; level >= 0; level--)
89 if (ms->capture[level].len == CAP_UNFINISHED)
90 return (level);
91 return match_error(ms, "invalid pattern capture");
92 }
93
94 static const char *
classend(struct match_state * ms,const char * p)95 classend(struct match_state *ms, const char *p)
96 {
97 switch (*p++) {
98 case L_ESC:
99 if (p == ms->p_end)
100 match_error(ms,
101 "malformed pattern (ends with '%')");
102 return p + 1;
103 case '[':
104 if (*p == '^')
105 p++;
106 do {
107 /* look for a ']' */
108 if (p == ms->p_end) {
109 match_error(ms,
110 "malformed pattern (missing ']')");
111 break;
112 }
113 if (*(p++) == L_ESC && p < ms->p_end) {
114 /* skip escapes (e.g. '%]') */
115 p++;
116 }
117 } while (*p != ']');
118 return p + 1;
119 default:
120 return p;
121 }
122 }
123
124 static int
match_class(int c,int cl)125 match_class(int c, int cl)
126 {
127 int res;
128 switch (tolower(cl)) {
129 case 'a':
130 res = isalpha(c);
131 break;
132 case 'c':
133 res = iscntrl(c);
134 break;
135 case 'd':
136 res = isdigit(c);
137 break;
138 case 'g':
139 res = isgraph(c);
140 break;
141 case 'l':
142 res = islower(c);
143 break;
144 case 'p':
145 res = ispunct(c);
146 break;
147 case 's':
148 res = isspace(c);
149 break;
150 case 'u':
151 res = isupper(c);
152 break;
153 case 'w':
154 res = isalnum(c);
155 break;
156 case 'x':
157 res = isxdigit(c);
158 break;
159 default:
160 return (cl == c);
161 }
162 return (islower(cl) ? res : !res);
163 }
164
165 static int
matchbracketclass(int c,const char * p,const char * ec)166 matchbracketclass(int c, const char *p, const char *ec)
167 {
168 int sig = 1;
169 if (*(p + 1) == '^') {
170 sig = 0;
171 /* skip the '^' */
172 p++;
173 }
174 while (++p < ec) {
175 if (*p == L_ESC) {
176 p++;
177 if (match_class(c, uchar(*p)))
178 return sig;
179 } else if ((*(p + 1) == '-') && (p + 2 < ec)) {
180 p += 2;
181 if (uchar(*(p - 2)) <= c && c <= uchar(*p))
182 return sig;
183 } else if (uchar(*p) == c)
184 return sig;
185 }
186 return !sig;
187 }
188
189 static int
singlematch(struct match_state * ms,const char * s,const char * p,const char * ep)190 singlematch(struct match_state *ms, const char *s, const char *p,
191 const char *ep)
192 {
193 if (s >= ms->src_end)
194 return 0;
195 else {
196 int c = uchar(*s);
197 switch (*p) {
198 case '.':
199 /* matches any char */
200 return (1);
201 case L_ESC:
202 return match_class(c, uchar(*(p + 1)));
203 case '[':
204 return matchbracketclass(c, p, ep - 1);
205 default:
206 return (uchar(*p) == c);
207 }
208 }
209 }
210
211 static const char *
matchbalance(struct match_state * ms,const char * s,const char * p)212 matchbalance(struct match_state *ms, const char *s, const char *p)
213 {
214 if (p >= ms->p_end - 1) {
215 match_error(ms,
216 "malformed pattern (missing arguments to '%b')");
217 return (NULL);
218 }
219 if (*s != *p)
220 return (NULL);
221 else {
222 int b = *p;
223 int e = *(p + 1);
224 int cont = 1;
225 while (++s < ms->src_end) {
226 if (*s == e) {
227 if (--cont == 0)
228 return s + 1;
229 } else if (*s == b)
230 cont++;
231 }
232 }
233
234 /* string ends out of balance */
235 return (NULL);
236 }
237
238 static const char *
max_expand(struct match_state * ms,const char * s,const char * p,const char * ep)239 max_expand(struct match_state *ms, const char *s, const char *p, const char *ep)
240 {
241 ptrdiff_t i = 0;
242 /* counts maximum expand for item */
243 while (singlematch(ms, s + i, p, ep))
244 i++;
245 /* keeps trying to match with the maximum repetitions */
246 while (i >= 0) {
247 const char *res = match(ms, (s + i), ep + 1);
248 if (res)
249 return res;
250 /* else didn't match; reduce 1 repetition to try again */
251 i--;
252 }
253 return NULL;
254 }
255
256 static const char *
min_expand(struct match_state * ms,const char * s,const char * p,const char * ep)257 min_expand(struct match_state *ms, const char *s, const char *p, const char *ep)
258 {
259 for (;;) {
260 const char *res = match(ms, s, ep + 1);
261 if (res != NULL)
262 return res;
263 else if (singlematch(ms, s, p, ep))
264 s++; /* try with one more repetition */
265 else
266 return NULL;
267 }
268 }
269
270 static const char *
start_capture(struct match_state * ms,const char * s,const char * p,int what)271 start_capture(struct match_state *ms, const char *s, const char *p, int what)
272 {
273 const char *res;
274
275 int level = ms->level;
276 if (level >= ms->maxcaptures) {
277 match_error(ms, "too many captures");
278 return (NULL);
279 }
280 ms->capture[level].init = s;
281 ms->capture[level].len = what;
282 ms->level = level + 1;
283 /* undo capture if match failed */
284 if ((res = match(ms, s, p)) == NULL)
285 ms->level--;
286 return res;
287 }
288
289 static const char *
end_capture(struct match_state * ms,const char * s,const char * p)290 end_capture(struct match_state *ms, const char *s, const char *p)
291 {
292 int l = capture_to_close(ms);
293 const char *res;
294 if (l == -1)
295 return NULL;
296 /* close capture */
297 ms->capture[l].len = s - ms->capture[l].init;
298 /* undo capture if match failed */
299 if ((res = match(ms, s, p)) == NULL)
300 ms->capture[l].len = CAP_UNFINISHED;
301 return res;
302 }
303
304 static const char *
match_capture(struct match_state * ms,const char * s,int l)305 match_capture(struct match_state *ms, const char *s, int l)
306 {
307 size_t len;
308 l = check_capture(ms, l);
309 if (l == -1)
310 return NULL;
311 len = ms->capture[l].len;
312 if ((size_t) (ms->src_end - s) >= len &&
313 memcmp(ms->capture[l].init, s, len) == 0)
314 return s + len;
315 else
316 return NULL;
317 }
318
319 static const char *
match(struct match_state * ms,const char * s,const char * p)320 match(struct match_state *ms, const char *s, const char *p)
321 {
322 const char *ep, *res;
323 char previous;
324
325 if (ms->matchdepth-- == 0) {
326 match_error(ms, "pattern too complex");
327 return (NULL);
328 }
329
330 /* using goto's to optimize tail recursion */
331 init:
332 /* end of pattern? */
333 if (p != ms->p_end) {
334 switch (*p) {
335 case '(':
336 /* start capture */
337 if (*(p + 1) == ')')
338 /* position capture? */
339 s = start_capture(ms, s, p + 2, CAP_POSITION);
340 else
341 s = start_capture(ms, s, p + 1, CAP_UNFINISHED);
342 break;
343 case ')':
344 /* end capture */
345 s = end_capture(ms, s, p + 1);
346 break;
347 case '$':
348 /* is the '$' the last char in pattern? */
349 if ((p + 1) != ms->p_end) {
350 /* no; go to default */
351 goto dflt;
352 }
353 /* check end of string */
354 s = (s == ms->src_end) ? s : NULL;
355 break;
356 case L_ESC:
357 /* escaped sequences not in the format class[*+?-]? */
358 switch (*(p + 1)) {
359 case 'b':
360 /* balanced string? */
361 s = matchbalance(ms, s, p + 2);
362 if (s != NULL) {
363 p += 4;
364 /* return match(ms, s, p + 4); */
365 goto init;
366 } /* else fail (s == NULL) */
367 break;
368 case 'f':
369 /* frontier? */
370 p += 2;
371 if (*p != '[') {
372 match_error(ms, "missing '['"
373 " after '%f' in pattern");
374 break;
375 }
376 /* points to what is next */
377 ep = classend(ms, p);
378 if (ms->error != NULL)
379 break;
380 previous =
381 (s == ms->src_init) ? '\0' : *(s - 1);
382 if (!matchbracketclass(uchar(previous),
383 p, ep - 1) &&
384 matchbracketclass(uchar(*s),
385 p, ep - 1)) {
386 p = ep;
387 /* return match(ms, s, ep); */
388 goto init;
389 }
390 /* match failed */
391 s = NULL;
392 break;
393 case '0':
394 case '1':
395 case '2':
396 case '3':
397 case '4':
398 case '5':
399 case '6':
400 case '7':
401 case '8':
402 case '9':
403 /* capture results (%0-%9)? */
404 s = match_capture(ms, s, uchar(*(p + 1)));
405 if (s != NULL) {
406 p += 2;
407 /* return match(ms, s, p + 2) */
408 goto init;
409 }
410 break;
411 default:
412 goto dflt;
413 }
414 break;
415 default:
416
417 /* pattern class plus optional suffix */
418 dflt:
419 /* points to optional suffix */
420 ep = classend(ms, p);
421 if (ms->error != NULL)
422 break;
423
424 /* does not match at least once? */
425 if (!singlematch(ms, s, p, ep)) {
426 if (ms->repetitioncounter-- == 0) {
427 match_error(ms, "max repetition items");
428 s = NULL; /* fail */
429 /* accept empty? */
430 } else if
431 (*ep == '*' || *ep == '?' || *ep == '-') {
432 p = ep + 1;
433 /* return match(ms, s, ep + 1); */
434 goto init;
435 } else {
436 /* '+' or no suffix */
437 s = NULL; /* fail */
438 }
439 } else {
440 /* matched once */
441 /* handle optional suffix */
442 switch (*ep) {
443 case '?':
444 /* optional */
445 if ((res =
446 match(ms, s + 1, ep + 1)) != NULL)
447 s = res;
448 else {
449 /*
450 * else return
451 * match(ms, s, ep + 1);
452 */
453 p = ep + 1;
454 goto init;
455 }
456 break;
457 case '+':
458 /* 1 or more repetitions */
459 s++; /* 1 match already done */
460 /* FALLTHROUGH */
461 case '*':
462 /* 0 or more repetitions */
463 s = max_expand(ms, s, p, ep);
464 break;
465 case '-':
466 /* 0 or more repetitions (minimum) */
467 s = min_expand(ms, s, p, ep);
468 break;
469 default:
470 /* no suffix */
471 s++;
472 p = ep;
473 /* return match(ms, s + 1, ep); */
474 goto init;
475 }
476 }
477 break;
478 }
479 }
480 ms->matchdepth++;
481 return s;
482 }
483
484 static const char *
lmemfind(const char * s1,size_t l1,const char * s2,size_t l2)485 lmemfind(const char *s1, size_t l1,
486 const char *s2, size_t l2)
487 {
488 const char *init;
489
490 if (l2 == 0) {
491 /* empty strings are everywhere */
492 return (s1);
493 } else if (l2 > l1) {
494 /* avoids a negative 'l1' */
495 return (NULL);
496 } else {
497 /*
498 * to search for a '*s2' inside 's1'
499 * - 1st char will be checked by 'memchr'
500 * - 's2' cannot be found after that
501 */
502 l2--;
503 l1 = l1 - l2;
504 while (l1 > 0 &&
505 (init = (const char *)memchr(s1, *s2, l1)) != NULL) {
506 /* 1st char is already checked */
507 init++;
508 if (memcmp(init, s2 + 1, l2) == 0)
509 return init - 1;
510 else {
511 /* correct 'l1' and 's1' to try again */
512 l1 -= init - s1;
513 s1 = init;
514 }
515 }
516 /* not found */
517 return (NULL);
518 }
519 }
520
521 static int
push_onecapture(struct match_state * ms,int i,const char * s,const char * e,struct str_find * sm)522 push_onecapture(struct match_state *ms, int i, const char *s,
523 const char *e, struct str_find *sm)
524 {
525 if (i >= ms->level) {
526 if (i == 0 || ms->level == 0) {
527 /* add whole match */
528 sm->sm_so = (off_t)(s - ms->src_init);
529 sm->sm_eo = (off_t)(e - s) + sm->sm_so;
530 } else
531 return match_error(ms, "invalid capture index");
532 } else {
533 ptrdiff_t l = ms->capture[i].len;
534 if (l == CAP_UNFINISHED)
535 return match_error(ms, "unfinished capture");
536 sm->sm_so = ms->capture[i].init - ms->src_init;
537 sm->sm_eo = sm->sm_so + l;
538 }
539 sm->sm_eo = sm->sm_eo < sm->sm_so ? sm->sm_so : sm->sm_eo;
540 return (0);
541 }
542
543 static int
push_captures(struct match_state * ms,const char * s,const char * e,struct str_find * sm,size_t nsm)544 push_captures(struct match_state *ms, const char *s, const char *e,
545 struct str_find *sm, size_t nsm)
546 {
547 unsigned int i;
548 unsigned int nlevels = (ms->level <= 0 && s) ? 1 : ms->level;
549
550 if (nlevels > nsm)
551 nlevels = nsm;
552 for (i = 0; i < nlevels; i++)
553 if (push_onecapture(ms, i, s, e, sm + i) == -1)
554 break;
555
556 /* number of strings pushed */
557 return (nlevels);
558 }
559
560 /* check whether pattern has no special characters */
561 static int
nospecials(const char * p,size_t l)562 nospecials(const char *p, size_t l)
563 {
564 size_t upto = 0;
565
566 do {
567 if (strpbrk(p + upto, SPECIALS)) {
568 /* pattern has a special character */
569 return 0;
570 }
571 /* may have more after \0 */
572 upto += strlen(p + upto) + 1;
573 } while (upto <= l);
574
575 /* no special chars found */
576 return (1);
577 }
578
579 static int
str_find_aux(struct match_state * ms,const char * pattern,const char * string,struct str_find * sm,size_t nsm,off_t init)580 str_find_aux(struct match_state *ms, const char *pattern, const char *string,
581 struct str_find *sm, size_t nsm, off_t init)
582 {
583 size_t ls = strlen(string);
584 size_t lp = strlen(pattern);
585 const char *s = string;
586 const char *p = pattern;
587 const char *s1, *s2;
588 int anchor, i;
589
590 if (init < 0)
591 init = 0;
592 else if (init > (off_t)ls)
593 return match_error(ms, "starting after string's end");
594 s1 = s + init;
595
596 if (nospecials(p, lp)) {
597 /* do a plain search */
598 s2 = lmemfind(s1, ls - (size_t)init, p, lp);
599 if (s2 != NULL) {
600 i = 0;
601 sm[i].sm_so = 0;
602 sm[i].sm_eo = ls;
603 if (nsm > 1) {
604 i++;
605 sm[i].sm_so = s2 - s;
606 sm[i].sm_eo = (s2 - s) + lp;
607 }
608 return (i + 1);
609 }
610 return (0);
611 }
612
613 anchor = (*p == '^');
614 if (anchor) {
615 p++;
616 lp--; /* skip anchor character */
617 }
618 ms->maxcaptures = (nsm > MAXCAPTURES ? MAXCAPTURES : nsm) - 1;
619 ms->matchdepth = MAXCCALLS;
620 ms->repetitioncounter = MAXREPETITION;
621 ms->src_init = s;
622 ms->src_end = s + ls;
623 ms->p_end = p + lp;
624 do {
625 const char *res;
626 ms->level = 0;
627 if ((res = match(ms, s1, p)) != NULL) {
628 sm->sm_so = 0;
629 sm->sm_eo = ls;
630 return push_captures(ms, s1, res, sm + 1, nsm - 1) + 1;
631
632 } else if (ms->error != NULL) {
633 return 0;
634 }
635 } while (s1++ < ms->src_end && !anchor);
636
637 return 0;
638 }
639
640 int
str_find(const char * string,const char * pattern,struct str_find * sm,size_t nsm,const char ** errstr)641 str_find(const char *string, const char *pattern, struct str_find *sm,
642 size_t nsm, const char **errstr)
643 {
644 struct match_state ms;
645 int ret;
646
647 memset(&ms, 0, sizeof(ms));
648 memset(sm, 0, nsm * sizeof(*sm));
649
650 ret = str_find_aux(&ms, pattern, string, sm, nsm, 0);
651 if (ms.error != NULL) {
652 /* Return 0 on error and store the error string */
653 *errstr = ms.error;
654 ret = 0;
655 } else
656 *errstr = NULL;
657
658 return (ret);
659 }
660
661 int
str_match(const char * string,const char * pattern,struct str_match * m,const char ** errstr)662 str_match(const char *string, const char *pattern, struct str_match *m,
663 const char **errstr)
664 {
665 struct str_find sm[MAXCAPTURES];
666 struct match_state ms;
667 int ret, i;
668 size_t len, nsm;
669
670 nsm = MAXCAPTURES;
671 memset(&ms, 0, sizeof(ms));
672 memset(sm, 0, sizeof(sm));
673 memset(m, 0, sizeof(*m));
674
675 ret = str_find_aux(&ms, pattern, string, sm, nsm, 0);
676 if (ret <= 0 || ms.error != NULL) {
677 /* Return -1 on error and store the error string */
678 *errstr = ms.error;
679 return (-1);
680 }
681
682 if ((m->sm_match = calloc(ret, sizeof(char *))) == NULL) {
683 *errstr = strerror(errno);
684 return (-1);
685 }
686 m->sm_nmatch = ret;
687
688 for (i = 0; i < ret; i++) {
689 if (sm[i].sm_so > sm[i].sm_eo)
690 continue;
691 len = sm[i].sm_eo - sm[i].sm_so;
692 if ((m->sm_match[i] = strndup(string +
693 sm[i].sm_so, len)) == NULL) {
694 *errstr = strerror(errno);
695 str_match_free(m);
696 return (-1);
697 }
698 }
699
700 *errstr = NULL;
701 return (0);
702 }
703
704 void
str_match_free(struct str_match * m)705 str_match_free(struct str_match *m)
706 {
707 unsigned int i = 0;
708 for (i = 0; i < m->sm_nmatch; i++)
709 free(m->sm_match[i]);
710 free(m->sm_match);
711 m->sm_match = NULL;
712 m->sm_nmatch = 0;
713 }
714