1 /*
2 Bacula(R) - The Network Backup Solution
3
4 Copyright (C) 2000-2020 Kern Sibbald
5
6 The original author of Bacula is Kern Sibbald, with contributions
7 from many others, a complete list can be found in the file AUTHORS.
8
9 You may use this file and others of this release according to the
10 license defined in the LICENSE file, which includes the Affero General
11 Public License, v3.0 ("AGPLv3") and some additional permissions and
12 terms pursuant to its AGPLv3 Section 7.
13
14 This notice must be preserved when any source code is
15 conveyed and/or propagated.
16
17 Bacula(R) is a registered trademark of Kern Sibbald.
18 */
19
20 /* regexpr.c
21 *
22 * Author: Tatu Ylonen <ylo@ngs.fi>
23 *
24 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
25 *
26 * Permission to use, copy, modify, distribute, and sell this software
27 * and its documentation for any purpose is hereby granted without
28 * fee, provided that the above copyright notice appear in all copies.
29 * This software is provided "as is" without express or implied
30 * warranty.
31 *
32 * Created: Thu Sep 26 17:14:05 1991 ylo
33 * Last modified: Mon Nov 4 17:06:48 1991 ylo
34 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
35 *
36 * This code draws many ideas from the regular expression packages by
37 * Henry Spencer of the University of Toronto and Richard Stallman of
38 * the Free Software Foundation.
39 *
40 * Emacs-specific code and syntax table code is almost directly borrowed
41 * from GNU regexp.
42 *
43 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
44 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
45 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
46 * probably one or two others that I'm forgetting.
47 *
48 * This file modified to work with Bacula and C++ by
49 * Kern Sibbald, April 2006
50 *
51 * This file modified to work with REG_ICASE and Bacula by
52 * Eric Bollengier April 2007
53 */
54
55 #include "bacula.h"
56 #include "bregex.h"
57
58 #define set_error(x) bufp->errmsg=((char *)(x))
59 #define got_error bufp->errmsg!=NULL
60
61 /* The original code blithely assumed that sizeof(short) == 2. Not
62 * always true. Original instances of "(short)x" were replaced by
63 * SHORT(x), where SHORT is #defined below. */
64
65 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
66
67 /* The stack implementation is taken from an idea by Andrew Kuchling.
68 * It's a doubly linked list of arrays. The advantages of this over a
69 * simple linked list are that the number of mallocs required are
70 * reduced. It also makes it possible to statically allocate enough
71 * space so that small patterns don't ever need to call malloc.
72 *
73 * The advantages over a single array is that is periodically
74 * realloced when more space is needed is that we avoid ever copying
75 * the stack. */
76
77 /* item_t is the basic stack element. Defined as a union of
78 * structures so that both registers, failure points, and counters can
79 * be pushed/popped from the stack. There's nothing built into the
80 * item to keep track of whether a certain stack item is a register, a
81 * failure point, or a counter. */
82
83 typedef union item_t {
84 struct {
85 int num;
86 int level;
87 unsigned char *start;
88 unsigned char *end;
89 } reg;
90 struct {
91 int count;
92 int level;
93 int phantom;
94 unsigned char *code;
95 unsigned char *text;
96 } fail;
97 struct {
98 int num;
99 int level;
100 int count;
101 } cntr;
102 } item_t;
103
104 #define B_STACK_PAGE_SIZE 256
105 #define NUM_REGISTERS 256
106
107 /* A 'page' of stack items. */
108
109 typedef struct item_page_t {
110 item_t items[B_STACK_PAGE_SIZE];
111 struct item_page_t *prev;
112 struct item_page_t *next;
113 } item_page_t;
114
115
116 typedef struct match_state {
117 /* The number of registers that have been pushed onto the stack
118 * since the last failure point. */
119
120 int count;
121
122 /* Used to control when registers need to be pushed onto the
123 * stack. */
124
125 int level;
126
127 /* The number of failure points on the stack. */
128
129 int point;
130
131 /* Storage for the registers. Each register consists of two
132 * pointers to characters. So register N is represented as
133 * start[N] and end[N]. The pointers must be converted to
134 * offsets from the beginning of the string before returning the
135 * registers to the calling program. */
136
137 unsigned char *start[NUM_REGISTERS];
138 unsigned char *end[NUM_REGISTERS];
139
140 /* Keeps track of whether a register has changed recently. */
141
142 int changed[NUM_REGISTERS];
143
144 /* Structure to encapsulate the stack. */
145 struct {
146 /* index into the current page. If index == 0 and you need
147 * to pop an item, move to the previous page and set index
148 * = B_STACK_PAGE_SIZE - 1. Otherwise decrement index to
149 * push a page. If index == B_STACK_PAGE_SIZE and you need
150 * to push a page move to the next page and set index =
151 * 0. If there is no new next page, allocate a new page
152 * and link it in. Otherwise, increment index to push a
153 * page. */
154
155 int index;
156 item_page_t *current; /* Pointer to the current page. */
157 item_page_t first; /* First page is statically allocated. */
158 } stack;
159 } match_state;
160
161 /* Initialize a state object */
162
163 /* #define NEW_STATE(state) \ */
164 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
165 /* state.stack.current = &state.stack.first; \ */
166 /* state.stack.first.prev = NULL; \ */
167 /* state.stack.first.next = NULL; \ */
168 /* state.stack.index = 0; \ */
169 /* state.level = 1 */
170
171 #define NEW_STATE(state, nregs) \
172 { \
173 int i; \
174 for (i = 0; i < nregs; i++) \
175 { \
176 state.start[i] = NULL; \
177 state.end[i] = NULL; \
178 state.changed[i] = 0; \
179 } \
180 state.stack.current = &state.stack.first; \
181 state.stack.first.prev = NULL; \
182 state.stack.first.next = NULL; \
183 state.stack.index = 0; \
184 state.level = 1; \
185 state.count = 0; \
186 state.level = 0; \
187 state.point = 0; \
188 }
189
190 /* Free any memory that might have been malloc'd */
191
192 #define FREE_STATE(state) \
193 while(state.stack.first.next != NULL) \
194 { \
195 state.stack.current = state.stack.first.next; \
196 state.stack.first.next = state.stack.current->next; \
197 free(state.stack.current); \
198 }
199
200 /* Discard the top 'count' stack items. */
201
202 #define STACK_DISCARD(stack, count, on_error) \
203 stack.index -= count; \
204 while (stack.index < 0) \
205 { \
206 if (stack.current->prev == NULL) \
207 on_error; \
208 stack.current = stack.current->prev; \
209 stack.index += B_STACK_PAGE_SIZE; \
210 }
211
212 /* Store a pointer to the previous item on the stack. Used to pop an
213 * item off of the stack. */
214
215 #define STACK_PREV(stack, top, on_error) \
216 if (stack.index == 0) \
217 { \
218 if (stack.current->prev == NULL) \
219 on_error; \
220 stack.current = stack.current->prev; \
221 stack.index = B_STACK_PAGE_SIZE - 1; \
222 } \
223 else \
224 { \
225 stack.index--; \
226 } \
227 top = &(stack.current->items[stack.index])
228
229 /* Store a pointer to the next item on the stack. Used to push an item
230 * on to the stack. */
231
232 #define STACK_NEXT(stack, top, on_error) \
233 if (stack.index == B_STACK_PAGE_SIZE) \
234 { \
235 if (stack.current->next == NULL) \
236 { \
237 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
238 if (stack.current->next == NULL) \
239 on_error; \
240 stack.current->next->prev = stack.current; \
241 stack.current->next->next = NULL; \
242 } \
243 stack.current = stack.current->next; \
244 stack.index = 0; \
245 } \
246 top = &(stack.current->items[stack.index++])
247
248 /* Store a pointer to the item that is 'count' items back in the
249 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
250 * STACK_TOP(stack, top, on_error). */
251
252 #define STACK_BACK(stack, top, count, on_error) \
253 { \
254 int index; \
255 item_page_t *current; \
256 current = stack.current; \
257 index = stack.index - (count); \
258 while (index < 0) \
259 { \
260 if (current->prev == NULL) \
261 on_error; \
262 current = current->prev; \
263 index += B_STACK_PAGE_SIZE; \
264 } \
265 top = &(current->items[index]); \
266 }
267
268 /* Store a pointer to the top item on the stack. Execute the
269 * 'on_error' code if there are no items on the stack. */
270
271 #define STACK_TOP(stack, top, on_error) \
272 if (stack.index == 0) \
273 { \
274 if (stack.current->prev == NULL) \
275 on_error; \
276 top = &(stack.current->prev->items[B_STACK_PAGE_SIZE - 1]); \
277 } \
278 else \
279 { \
280 top = &(stack.current->items[stack.index - 1]); \
281 }
282
283 /* Test to see if the stack is empty */
284
285 #define STACK_EMPTY(stack) ((stack.index == 0) && \
286 (stack.current->prev == NULL))
287
288 /* Return the start of register 'reg' */
289
290 #define GET_REG_START(state, reg) (state.start[reg])
291
292 /* Return the end of register 'reg' */
293
294 #define GET_REG_END(state, reg) (state.end[reg])
295
296 /* Set the start of register 'reg'. If the state of the register needs
297 * saving, push it on the stack. */
298
299 #define SET_REG_START(state, reg, text, on_error) \
300 if(state.changed[reg] < state.level) \
301 { \
302 item_t *item; \
303 STACK_NEXT(state.stack, item, on_error); \
304 item->reg.num = reg; \
305 item->reg.start = state.start[reg]; \
306 item->reg.end = state.end[reg]; \
307 item->reg.level = state.changed[reg]; \
308 state.changed[reg] = state.level; \
309 state.count++; \
310 } \
311 state.start[reg] = text
312
313 /* Set the end of register 'reg'. If the state of the register needs
314 * saving, push it on the stack. */
315
316 #define SET_REG_END(state, reg, text, on_error) \
317 if(state.changed[reg] < state.level) \
318 { \
319 item_t *item; \
320 STACK_NEXT(state.stack, item, on_error); \
321 item->reg.num = reg; \
322 item->reg.start = state.start[reg]; \
323 item->reg.end = state.end[reg]; \
324 item->reg.level = state.changed[reg]; \
325 state.changed[reg] = state.level; \
326 state.count++; \
327 } \
328 state.end[reg] = text
329
330 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
331 { \
332 item_t *item; \
333 STACK_NEXT(state.stack, item, on_error); \
334 item->fail.code = xcode; \
335 item->fail.text = xtext; \
336 item->fail.count = state.count; \
337 item->fail.level = state.level; \
338 item->fail.phantom = 0; \
339 state.count = 0; \
340 state.level++; \
341 state.point++; \
342 }
343
344 /* Update the last failure point with a new position in the text. */
345
346 #define UPDATE_FAILURE(state, xtext, on_error) \
347 { \
348 item_t *item; \
349 STACK_BACK(state.stack, item, state.count + 1, on_error); \
350 if (!item->fail.phantom) \
351 { \
352 item_t *item2; \
353 STACK_NEXT(state.stack, item2, on_error); \
354 item2->fail.code = item->fail.code; \
355 item2->fail.text = xtext; \
356 item2->fail.count = state.count; \
357 item2->fail.level = state.level; \
358 item2->fail.phantom = 1; \
359 state.count = 0; \
360 state.level++; \
361 state.point++; \
362 } \
363 else \
364 { \
365 STACK_DISCARD(state.stack, state.count, on_error); \
366 STACK_TOP(state.stack, item, on_error); \
367 item->fail.text = xtext; \
368 state.count = 0; \
369 state.level++; \
370 } \
371 }
372
373 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
374 { \
375 item_t *item; \
376 do \
377 { \
378 while(state.count > 0) \
379 { \
380 STACK_PREV(state.stack, item, on_error); \
381 state.start[item->reg.num] = item->reg.start; \
382 state.end[item->reg.num] = item->reg.end; \
383 state.changed[item->reg.num] = item->reg.level; \
384 state.count--; \
385 } \
386 STACK_PREV(state.stack, item, on_empty); \
387 xcode = item->fail.code; \
388 xtext = item->fail.text; \
389 state.count = item->fail.count; \
390 state.level = item->fail.level; \
391 state.point--; \
392 } \
393 while (item->fail.text == NULL); \
394 }
395
396 enum regexp_compiled_ops { /* opcodes for compiled regexp */
397 Cend, /* end of pattern reached */
398 Cbol, /* beginning of line */
399 Ceol, /* end of line */
400 Cset, /* character set. Followed by 32 bytes of set. */
401 Cexact, /* followed by a byte to match */
402 Canychar, /* matches any character except newline */
403 Cstart_memory, /* set register start addr (followed by reg number) */
404 Cend_memory, /* set register end addr (followed by reg number) */
405 Cmatch_memory, /* match a duplicate of reg contents (regnum follows) */
406 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
407 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
408 Cfailure_jump, /* jump to addr on failure */
409 Cupdate_failure_jump, /* update topmost failure point and jump */
410 Cdummy_failure_jump, /* push a dummy failure point and jump */
411 Cbegbuf, /* match at beginning of buffer */
412 Cendbuf, /* match at end of buffer */
413 Cwordbeg, /* match at beginning of word */
414 Cwordend, /* match at end of word */
415 Cwordbound, /* match if at word boundary */
416 Cnotwordbound, /* match if not at word boundary */
417 Csyntaxspec, /* matches syntax code (1 byte follows) */
418 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
419 Crepeat1
420 };
421
422 enum regexp_syntax_op { /* syntax codes for plain and quoted characters */
423 Rend, /* special code for end of regexp */
424 Rnormal, /* normal character */
425 Ranychar, /* any character except newline */
426 Rquote, /* the quote character */
427 Rbol, /* match beginning of line */
428 Reol, /* match end of line */
429 Roptional, /* match preceding expression optionally */
430 Rstar, /* match preceding expr zero or more times */
431 Rplus, /* match preceding expr one or more times */
432 Ror, /* match either of alternatives */
433 Ropenpar, /* opening parenthesis */
434 Rclosepar, /* closing parenthesis */
435 Rmemory, /* match memory register */
436 Rextended_memory, /* \vnn to match registers 10-99 */
437 Ropenset, /* open set. Internal syntax hard-coded below. */
438 /* the following are gnu extensions to "normal" regexp syntax */
439 Rbegbuf, /* beginning of buffer */
440 Rendbuf, /* end of buffer */
441 Rwordchar, /* word character */
442 Rnotwordchar, /* not word character */
443 Rwordbeg, /* beginning of word */
444 Rwordend, /* end of word */
445 Rwordbound, /* word bound */
446 Rnotwordbound, /* not word bound */
447 Rnum_ops
448 };
449
450 static int re_compile_initialized = 0;
451 static int regexp_syntax = RE_SYNTAX_EGREP;
452 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
453 static unsigned char plain_ops[256];
454 static unsigned char quoted_ops[256];
455 static unsigned char precedences[Rnum_ops];
456 static int regexp_context_indep_ops;
457 static int regexp_ansi_sequences;
458
459 #define NUM_LEVELS 5 /* number of precedence levels in use */
460 #define MAX_NESTING 100 /* max nesting level of operators */
461
462 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
463
464 unsigned char re_syntax_table[256];
465
re_compile_initialize(void)466 void re_compile_initialize(void)
467 {
468 int a;
469
470 static int syntax_table_inited = 0;
471
472 if (!syntax_table_inited) {
473 syntax_table_inited = 1;
474 memset(re_syntax_table, 0, 256);
475 for (a = 'a'; a <= 'z'; a++)
476 re_syntax_table[a] = Sword;
477 for (a = 'A'; a <= 'Z'; a++)
478 re_syntax_table[a] = Sword;
479 for (a = '0'; a <= '9'; a++)
480 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
481 for (a = '0'; a <= '7'; a++)
482 re_syntax_table[a] |= Soctaldigit;
483 for (a = 'A'; a <= 'F'; a++)
484 re_syntax_table[a] |= Shexdigit;
485 for (a = 'a'; a <= 'f'; a++)
486 re_syntax_table[a] |= Shexdigit;
487 re_syntax_table[(int)'_'] = Sword;
488 for (a = 9; a <= 13; a++)
489 re_syntax_table[a] = Swhitespace;
490 re_syntax_table[(int)' '] = Swhitespace;
491 }
492 re_compile_initialized = 1;
493 for (a = 0; a < 256; a++) {
494 plain_ops[a] = Rnormal;
495 quoted_ops[a] = Rnormal;
496 }
497 for (a = '0'; a <= '9'; a++)
498 quoted_ops[a] = Rmemory;
499 plain_ops[(int)'\134'] = Rquote;
500 if (regexp_syntax & RE_NO_BK_PARENS) {
501 plain_ops[(int)'('] = Ropenpar;
502 plain_ops[(int)')'] = Rclosepar;
503 } else {
504 quoted_ops[(int)'('] = Ropenpar;
505 quoted_ops[(int)')'] = Rclosepar;
506 }
507 if (regexp_syntax & RE_NO_BK_VBAR) {
508 plain_ops[(int)'\174'] = Ror;
509 } else {
510 quoted_ops[(int)'\174'] = Ror;
511 }
512 plain_ops[(int)'*'] = Rstar;
513 if (regexp_syntax & RE_BK_PLUS_QM) {
514 quoted_ops[(int)'+'] = Rplus;
515 quoted_ops[(int)'?'] = Roptional;
516 } else {
517 plain_ops[(int)'+'] = Rplus;
518 plain_ops[(int)'?'] = Roptional;
519 }
520 if (regexp_syntax & RE_NEWLINE_OR) {
521 plain_ops[(int)'\n'] = Ror;
522 }
523 plain_ops[(int)'\133'] = Ropenset;
524 plain_ops[(int)'\136'] = Rbol;
525 plain_ops[(int)'$'] = Reol;
526 plain_ops[(int)'.'] = Ranychar;
527 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
528 quoted_ops[(int)'w'] = Rwordchar;
529 quoted_ops[(int)'W'] = Rnotwordchar;
530 quoted_ops[(int)'<'] = Rwordbeg;
531 quoted_ops[(int)'>'] = Rwordend;
532 quoted_ops[(int)'b'] = Rwordbound;
533 quoted_ops[(int)'B'] = Rnotwordbound;
534 quoted_ops[(int)'`'] = Rbegbuf;
535 quoted_ops[(int)'\''] = Rendbuf;
536 }
537 if (regexp_syntax & RE_ANSI_HEX) {
538 quoted_ops[(int)'v'] = Rextended_memory;
539 }
540 for (a = 0; a < Rnum_ops; a++) {
541 precedences[a] = 4;
542 }
543 if (regexp_syntax & RE_TIGHT_VBAR) {
544 precedences[Ror] = 3;
545 precedences[Rbol] = 2;
546 precedences[Reol] = 2;
547 } else {
548 precedences[Ror] = 2;
549 precedences[Rbol] = 3;
550 precedences[Reol] = 3;
551 }
552 precedences[Rclosepar] = 1;
553 precedences[Rend] = 0;
554 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
555 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
556 }
557
re_set_syntax(int syntax)558 int re_set_syntax(int syntax) {
559 int ret;
560
561 ret = regexp_syntax;
562 regexp_syntax = syntax;
563 re_syntax = syntax; /* Exported copy */
564 re_compile_initialize();
565 return ret;
566 }
567
hex_char_to_decimal(int ch)568 static int hex_char_to_decimal(int ch) {
569 if (ch >= '0' && ch <= '9')
570 return ch - '0';
571 if (ch >= 'a' && ch <= 'f')
572 return ch - 'a' + 10;
573 if (ch >= 'A' && ch <= 'F')
574 return ch - 'A' + 10;
575 return 16;
576 }
577
re_compile_fastmap_aux(regex_t * bufp,unsigned char * code,int pos,unsigned char * visited,unsigned char * can_be_null,unsigned char * fastmap)578 static void re_compile_fastmap_aux(regex_t * bufp, unsigned char *code, int pos,
579 unsigned char *visited,
580 unsigned char *can_be_null,
581 unsigned char *fastmap)
582 {
583 int a;
584 int b;
585 int syntaxcode;
586
587 if (visited[pos])
588 return; /* we have already been here */
589 visited[pos] = 1;
590 for (;;) {
591 switch (code[pos++]) {
592 case Cend:
593 *can_be_null = 1;
594 return;
595 case Cbol:
596 case Cbegbuf:
597 case Cendbuf:
598 case Cwordbeg:
599 case Cwordend:
600 case Cwordbound:
601 case Cnotwordbound:
602 for (a = 0; a < 256; a++)
603 fastmap[a] = 1;
604 break;
605 case Csyntaxspec:
606 syntaxcode = code[pos++];
607 for (a = 0; a < 256; a++)
608 if (SYNTAX(a) & syntaxcode)
609 fastmap[a] = 1;
610 return;
611 case Cnotsyntaxspec:
612 syntaxcode = code[pos++];
613 for (a = 0; a < 256; a++)
614 if (!(SYNTAX(a) & syntaxcode))
615 fastmap[a] = 1;
616 return;
617 case Ceol:
618 fastmap[(int)'\n'] = 1;
619 if (*can_be_null == 0)
620 *can_be_null = 2; /* can match null, but only at end of buffer */
621 return;
622 case Cset:
623 for (a = 0; a < 256 / 8; a++)
624 if (code[pos + a] != 0)
625 for (b = 0; b < 8; b++)
626 if (code[pos + a] & (1 << b))
627 fastmap[(a << 3) + b] = 1;
628 pos += 256 / 8;
629 return;
630 case Cexact:
631 fastmap[(unsigned char)code[pos]] = 1;
632 return;
633 case Canychar:
634 for (a = 0; a < 256; a++)
635 if (a != '\n')
636 fastmap[a] = 1;
637 return;
638 case Cstart_memory:
639 case Cend_memory:
640 pos++;
641 break;
642 case Cmatch_memory:
643 for (a = 0; a < 256; a++)
644 fastmap[a] = 1;
645 *can_be_null = 1;
646 return;
647 case Cjump:
648 case Cdummy_failure_jump:
649 case Cupdate_failure_jump:
650 case Cstar_jump:
651 a = (unsigned char)code[pos++];
652 a |= (unsigned char)code[pos++] << 8;
653 pos += (int)SHORT(a);
654 if (visited[pos]) {
655 /* argh... the regexp contains empty loops. This is not
656 good, as this may cause a failure stack overflow when
657 matching. Oh well. */
658 /* this path leads nowhere; pursue other paths. */
659 return;
660 }
661 visited[pos] = 1;
662 break;
663 case Cfailure_jump:
664 a = (unsigned char)code[pos++];
665 a |= (unsigned char)code[pos++] << 8;
666 a = pos + (int)SHORT(a);
667 re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
668 break;
669 case Crepeat1:
670 pos += 2;
671 break;
672 default:
673 set_error("Unknown regex opcode: memory corrupted?");
674 return;
675 }
676 }
677 }
678
re_do_compile_fastmap(regex_t * bufp,unsigned char * buffer,int used,int pos,unsigned char * can_be_null,unsigned char * fastmap)679 static int re_do_compile_fastmap(regex_t * bufp, unsigned char *buffer, int used,
680 int pos, unsigned char *can_be_null,
681 unsigned char *fastmap)
682 {
683 unsigned char small_visited[512], *visited;
684
685 if (used <= (int)sizeof(small_visited))
686 visited = small_visited;
687 else {
688 visited = (unsigned char *)malloc(used);
689 if (!visited)
690 return 0;
691 }
692 *can_be_null = 0;
693 memset(fastmap, 0, 256);
694 memset(visited, 0, used);
695 re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
696 if (visited != small_visited)
697 free(visited);
698 return 1;
699 }
700
re_compile_fastmap(regex_t * bufp)701 void re_compile_fastmap(regex_t * bufp)
702 {
703 if (!bufp->fastmap || bufp->fastmap_accurate)
704 return;
705 // assert(bufp->used > 0);
706 if (!re_do_compile_fastmap(bufp, bufp->buffer,
707 bufp->used, 0, &bufp->can_be_null, bufp->fastmap))
708 return;
709 if (got_error)
710 return;
711 if (bufp->buffer[0] == Cbol)
712 bufp->anchor = 1; /* begline */
713 else if (bufp->buffer[0] == Cbegbuf)
714 bufp->anchor = 2; /* begbuf */
715 else
716 bufp->anchor = 0; /* none */
717 bufp->fastmap_accurate = 1;
718 }
719
720 /*
721 * star is coded as:
722 * 1: failure_jump 2
723 * ... code for operand of star
724 * star_jump 1
725 * 2: ... code after star
726 *
727 * We change the star_jump to update_failure_jump if we can determine
728 * that it is safe to do so; otherwise we change it to an ordinary
729 * jump.
730 *
731 * plus is coded as
732 *
733 * jump 2
734 * 1: failure_jump 3
735 * 2: ... code for operand of plus
736 * star_jump 1
737 * 3: ... code after plus
738 *
739 * For star_jump considerations this is processed identically to star.
740 *
741 */
742
re_optimize_star_jump(regex_t * bufp,unsigned char * code)743 static int re_optimize_star_jump(regex_t * bufp, unsigned char *code)
744 {
745 unsigned char map[256];
746 unsigned char can_be_null;
747 unsigned char *p1;
748 unsigned char *p2;
749 unsigned char ch;
750 int a;
751 int b;
752 int num_instructions = 0;
753
754 a = (unsigned char)*code++;
755 a |= (unsigned char)*code++ << 8;
756 a = (int)SHORT(a);
757
758 p1 = code + a + 3; /* skip the failure_jump */
759 /* Check that the jump is within the pattern */
760 if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
761 set_error("Regex VM jump out of bounds (failure_jump opt)");
762 return 0;
763 }
764 // assert(p1[-3] == Cfailure_jump);
765 p2 = code;
766 /* p1 points inside loop, p2 points to after loop */
767 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
768 (int)(p2 - bufp->buffer), &can_be_null, map))
769 goto make_normal_jump;
770
771 /* If we might introduce a new update point inside the
772 * loop, we can't optimize because then update_jump would
773 * update a wrong failure point. Thus we have to be
774 * quite careful here.
775 */
776
777 /* loop until we find something that consumes a character */
778 for (;;) {
779 num_instructions++;
780 switch (*p1++) {
781 case Cbol:
782 case Ceol:
783 case Cbegbuf:
784 case Cendbuf:
785 case Cwordbeg:
786 case Cwordend:
787 case Cwordbound:
788 case Cnotwordbound:
789 continue;
790 case Cstart_memory:
791 case Cend_memory:
792 p1++;
793 continue;
794 case Cexact:
795 ch = (unsigned char)*p1++;
796 if (map[(int)ch])
797 goto make_normal_jump;
798 break;
799 case Canychar:
800 for (b = 0; b < 256; b++)
801 if (b != '\n' && map[b])
802 goto make_normal_jump;
803 break;
804 case Cset:
805 for (b = 0; b < 256; b++)
806 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
807 goto make_normal_jump;
808 p1 += 256 / 8;
809 break;
810 default:
811 goto make_normal_jump;
812 }
813 break;
814 }
815 /* now we know that we can't backtrack. */
816 while (p1 != p2 - 3) {
817 num_instructions++;
818 switch (*p1++) {
819 case Cend:
820 return 0;
821 case Cbol:
822 case Ceol:
823 case Canychar:
824 case Cbegbuf:
825 case Cendbuf:
826 case Cwordbeg:
827 case Cwordend:
828 case Cwordbound:
829 case Cnotwordbound:
830 break;
831 case Cset:
832 p1 += 256 / 8;
833 break;
834 case Cexact:
835 case Cstart_memory:
836 case Cend_memory:
837 case Cmatch_memory:
838 case Csyntaxspec:
839 case Cnotsyntaxspec:
840 p1++;
841 break;
842 case Cjump:
843 case Cstar_jump:
844 case Cfailure_jump:
845 case Cupdate_failure_jump:
846 case Cdummy_failure_jump:
847 goto make_normal_jump;
848 default:
849 return 0;
850 }
851 }
852
853 /* make_update_jump: */
854 code -= 3;
855 a += 3; /* jump to after the Cfailure_jump */
856 code[0] = Cupdate_failure_jump;
857 code[1] = a & 0xff;
858 code[2] = a >> 8;
859 if (num_instructions > 1)
860 return 1;
861 // assert(num_instructions == 1);
862 /* if the only instruction matches a single character, we can do
863 * better */
864 p1 = code + 3 + a; /* start of sole instruction */
865 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
866 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
867 code[0] = Crepeat1;
868 return 1;
869
870 make_normal_jump:
871 code -= 3;
872 *code = Cjump;
873 return 1;
874 }
875
re_optimize(regex_t * bufp)876 static int re_optimize(regex_t * bufp)
877 {
878 unsigned char *code;
879
880 code = bufp->buffer;
881
882 while (1) {
883 switch (*code++) {
884 case Cend:
885 return 1;
886 case Canychar:
887 case Cbol:
888 case Ceol:
889 case Cbegbuf:
890 case Cendbuf:
891 case Cwordbeg:
892 case Cwordend:
893 case Cwordbound:
894 case Cnotwordbound:
895 break;
896 case Cset:
897 code += 256 / 8;
898 break;
899 case Cexact:
900 case Cstart_memory:
901 case Cend_memory:
902 case Cmatch_memory:
903 case Csyntaxspec:
904 case Cnotsyntaxspec:
905 code++;
906 break;
907 case Cstar_jump:
908 if (!re_optimize_star_jump(bufp, code)) {
909 return 0;
910 }
911 /* fall through */
912 case Cupdate_failure_jump:
913 case Cjump:
914 case Cdummy_failure_jump:
915 case Cfailure_jump:
916 case Crepeat1:
917 code += 2;
918 break;
919 default:
920 return 0;
921 }
922 }
923 }
924
925 #define NEXTCHAR(var) \
926 { \
927 if (pos >= size) \
928 goto ends_prematurely; \
929 (var) = regex[pos]; \
930 pos++; \
931 }
932
933 #define ALLOC(amount) \
934 { \
935 if (pattern_offset+(amount) > alloc) \
936 { \
937 alloc += 256 + (amount); \
938 pattern = (unsigned char *)realloc(pattern, alloc); \
939 if (!pattern) \
940 goto out_of_memory; \
941 } \
942 }
943
944 #define STORE(ch) pattern[pattern_offset++] = (ch)
945
946 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
947
948 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
949
950 #define PUSH_LEVEL_STARTS \
951 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
952 starts_base += NUM_LEVELS; \
953 else \
954 goto too_complex \
955
956 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
957
958 #define PUT_ADDR(offset,addr) \
959 { \
960 int disp = (addr) - (offset) - 2; \
961 pattern[(offset)] = disp & 0xff; \
962 pattern[(offset)+1] = (disp>>8) & 0xff; \
963 }
964
965 #define INSERT_JUMP(pos,type,addr) \
966 { \
967 int a, p = (pos), t = (type), ad = (addr); \
968 for (a = pattern_offset - 1; a >= p; a--) \
969 pattern[a + 3] = pattern[a]; \
970 pattern[p] = t; \
971 PUT_ADDR(p+1,ad); \
972 pattern_offset += 3; \
973 }
974
975 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
976
977 #define SET_FIELDS \
978 { \
979 bufp->allocated = alloc; \
980 bufp->buffer = pattern; \
981 bufp->used = pattern_offset; \
982 }
983
984 #define GETHEX(var) \
985 { \
986 unsigned char gethex_ch, gethex_value; \
987 NEXTCHAR(gethex_ch); \
988 gethex_value = hex_char_to_decimal(gethex_ch); \
989 if (gethex_value == 16) \
990 goto hex_error; \
991 NEXTCHAR(gethex_ch); \
992 gethex_ch = hex_char_to_decimal(gethex_ch); \
993 if (gethex_ch == 16) \
994 goto hex_error; \
995 (var) = gethex_value * 16 + gethex_ch; \
996 }
997
998 #define ANSI_TRANSLATE(ch) \
999 { \
1000 switch (ch) \
1001 { \
1002 case 'a': \
1003 case 'A': \
1004 { \
1005 ch = 7; /* audible bell */ \
1006 break; \
1007 } \
1008 case 'b': \
1009 case 'B': \
1010 { \
1011 ch = 8; /* backspace */ \
1012 break; \
1013 } \
1014 case 'f': \
1015 case 'F': \
1016 { \
1017 ch = 12; /* form feed */ \
1018 break; \
1019 } \
1020 case 'n': \
1021 case 'N': \
1022 { \
1023 ch = 10; /* line feed */ \
1024 break; \
1025 } \
1026 case 'r': \
1027 case 'R': \
1028 { \
1029 ch = 13; /* carriage return */ \
1030 break; \
1031 } \
1032 case 't': \
1033 case 'T': \
1034 { \
1035 ch = 9; /* tab */ \
1036 break; \
1037 } \
1038 case 'v': \
1039 case 'V': \
1040 { \
1041 ch = 11; /* vertical tab */ \
1042 break; \
1043 } \
1044 case 'x': /* hex code */ \
1045 case 'X': \
1046 { \
1047 GETHEX(ch); \
1048 break; \
1049 } \
1050 default: \
1051 { \
1052 /* other characters passed through */ \
1053 if (translate) \
1054 ch = translate[(unsigned char)ch]; \
1055 break; \
1056 } \
1057 } \
1058 }
1059
re_compile_pattern(regex_t * bufp,unsigned char * regex)1060 const char *re_compile_pattern(regex_t * bufp, unsigned char *regex)
1061 {
1062 int a;
1063 int pos;
1064 int op;
1065 int current_level;
1066 int level;
1067 int opcode;
1068 int pattern_offset = 0, alloc;
1069 int starts[NUM_LEVELS * MAX_NESTING];
1070 int starts_base;
1071 int future_jumps[MAX_NESTING];
1072 int num_jumps;
1073 unsigned char ch = '\0';
1074 unsigned char *pattern;
1075 unsigned char *translate;
1076 int next_register;
1077 int paren_depth;
1078 int num_open_registers;
1079 int open_registers[RE_NREGS];
1080 int beginning_context;
1081 int size = strlen((char *)regex);
1082
1083 if (!re_compile_initialized)
1084 re_compile_initialize();
1085 bufp->used = 0;
1086 bufp->fastmap_accurate = 0;
1087 bufp->uses_registers = 1;
1088 bufp->num_registers = 1;
1089 translate = bufp->translate;
1090 pattern = bufp->buffer;
1091 alloc = bufp->allocated;
1092 if (alloc == 0 || pattern == NULL) {
1093 alloc = 256;
1094 bufp->buffer = pattern = (unsigned char *)malloc(alloc);
1095 if (!pattern)
1096 goto out_of_memory;
1097 }
1098 pattern_offset = 0;
1099 starts_base = 0;
1100 num_jumps = 0;
1101 current_level = 0;
1102 SET_LEVEL_START;
1103 num_open_registers = 0;
1104 next_register = 1;
1105 paren_depth = 0;
1106 beginning_context = 1;
1107 op = -1;
1108 /* we use Rend dummy to ensure that pending jumps are updated
1109 (due to low priority of Rend) before exiting the loop. */
1110 pos = 0;
1111 while (op != Rend) {
1112 if (pos >= size)
1113 op = Rend;
1114 else {
1115 NEXTCHAR(ch);
1116 if (translate)
1117 ch = translate[(unsigned char)ch];
1118 op = plain_ops[(unsigned char)ch];
1119 if (op == Rquote) {
1120 NEXTCHAR(ch);
1121 op = quoted_ops[(unsigned char)ch];
1122 if (op == Rnormal && regexp_ansi_sequences)
1123 ANSI_TRANSLATE(ch);
1124 }
1125 }
1126 level = precedences[op];
1127 /* printf("ch='%c' op=%d level=%d current_level=%d
1128 curlevstart=%d\n", ch, op, level, current_level,
1129 CURRENT_LEVEL_START); */
1130 if (level > current_level) {
1131 for (current_level++; current_level < level; current_level++)
1132 SET_LEVEL_START;
1133 SET_LEVEL_START;
1134 } else if (level < current_level) {
1135 current_level = level;
1136 for (; num_jumps > 0 &&
1137 future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START; num_jumps--)
1138 PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1139 }
1140 switch (op) {
1141 case Rend:
1142 break;
1143 case Rnormal:
1144 normal_char:
1145 opcode = Cexact;
1146 store_opcode_and_arg: /* opcode & ch must be set */
1147 SET_LEVEL_START;
1148 ALLOC(2);
1149 STORE(opcode);
1150 STORE(ch);
1151 break;
1152 case Ranychar:
1153 opcode = Canychar;
1154 store_opcode:
1155 SET_LEVEL_START;
1156 ALLOC(1);
1157 STORE(opcode);
1158 break;
1159 case Rquote:
1160 set_error("Rquote");
1161 /*NOTREACHED*/ case Rbol:
1162 if (!beginning_context) {
1163 if (regexp_context_indep_ops)
1164 goto op_error;
1165 else
1166 goto normal_char;
1167 }
1168 opcode = Cbol;
1169 goto store_opcode;
1170 case Reol:
1171 if (!((pos >= size) ||
1172 ((regexp_syntax & RE_NO_BK_VBAR) ?
1173 (regex[pos] == '\174') :
1174 (pos + 1 < size && regex[pos] == '\134' &&
1175 regex[pos + 1] == '\174')) ||
1176 ((regexp_syntax & RE_NO_BK_PARENS) ?
1177 (regex[pos] == ')') :
1178 (pos + 1 < size && regex[pos] == '\134' &&
1179 regex[pos + 1] == ')')))) {
1180 if (regexp_context_indep_ops)
1181 goto op_error;
1182 else
1183 goto normal_char;
1184 }
1185 opcode = Ceol;
1186 goto store_opcode;
1187 /* NOTREACHED */
1188 break;
1189 case Roptional:
1190 if (beginning_context) {
1191 if (regexp_context_indep_ops)
1192 goto op_error;
1193 else
1194 goto normal_char;
1195 }
1196 if (CURRENT_LEVEL_START == pattern_offset)
1197 break; /* ignore empty patterns for ? */
1198 ALLOC(3);
1199 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1200 break;
1201 case Rstar:
1202 case Rplus:
1203 if (beginning_context) {
1204 if (regexp_context_indep_ops)
1205 goto op_error;
1206 else
1207 goto normal_char;
1208 }
1209 if (CURRENT_LEVEL_START == pattern_offset)
1210 break; /* ignore empty patterns for + and * */
1211 ALLOC(9);
1212 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1213 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1214 if (op == Rplus) /* jump over initial failure_jump */
1215 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1216 CURRENT_LEVEL_START + 6);
1217 break;
1218 case Ror:
1219 ALLOC(6);
1220 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1221 if (num_jumps >= MAX_NESTING)
1222 goto too_complex;
1223 STORE(Cjump);
1224 future_jumps[num_jumps++] = pattern_offset;
1225 STORE(0);
1226 STORE(0);
1227 SET_LEVEL_START;
1228 break;
1229 case Ropenpar:
1230 {
1231 SET_LEVEL_START;
1232 if (next_register < RE_NREGS) {
1233 bufp->uses_registers = 1;
1234 ALLOC(2);
1235 STORE(Cstart_memory);
1236 STORE(next_register);
1237 open_registers[num_open_registers++] = next_register;
1238 bufp->num_registers++;
1239 next_register++;
1240 }
1241 paren_depth++;
1242 PUSH_LEVEL_STARTS;
1243 current_level = 0;
1244 SET_LEVEL_START;
1245 break;
1246 }
1247 case Rclosepar:
1248 if (paren_depth <= 0)
1249 goto parenthesis_error;
1250 POP_LEVEL_STARTS;
1251 current_level = precedences[Ropenpar];
1252 paren_depth--;
1253 if (paren_depth < num_open_registers) {
1254 bufp->uses_registers = 1;
1255 ALLOC(2);
1256 STORE(Cend_memory);
1257 num_open_registers--;
1258 STORE(open_registers[num_open_registers]);
1259 }
1260 break;
1261 case Rmemory:
1262 if (ch == '0')
1263 goto bad_match_register;
1264 if (!(ch >= '0' && ch <= '9')) {
1265 goto bad_match_register;
1266 }
1267 bufp->uses_registers = 1;
1268 opcode = Cmatch_memory;
1269 ch -= '0';
1270 goto store_opcode_and_arg;
1271 case Rextended_memory:
1272 NEXTCHAR(ch);
1273 if (ch < '0' || ch > '9')
1274 goto bad_match_register;
1275 NEXTCHAR(a);
1276 if (a < '0' || a > '9')
1277 goto bad_match_register;
1278 ch = 10 * (a - '0') + ch - '0';
1279 if (ch == 0 || ch >= RE_NREGS)
1280 goto bad_match_register;
1281 bufp->uses_registers = 1;
1282 opcode = Cmatch_memory;
1283 goto store_opcode_and_arg;
1284 case Ropenset:
1285 {
1286 int complement;
1287 int prev;
1288 int offset;
1289 int range;
1290 int firstchar;
1291
1292 SET_LEVEL_START;
1293 ALLOC(1 + 256 / 8);
1294 STORE(Cset);
1295 offset = pattern_offset;
1296 for (a = 0; a < 256 / 8; a++)
1297 STORE(0);
1298 NEXTCHAR(ch);
1299 if (translate)
1300 ch = translate[(unsigned char)ch];
1301 if (ch == '\136') {
1302 complement = 1;
1303 NEXTCHAR(ch);
1304 if (translate)
1305 ch = translate[(unsigned char)ch];
1306 } else
1307 complement = 0;
1308 prev = -1;
1309 range = 0;
1310 firstchar = 1;
1311 while (ch != '\135' || firstchar) {
1312 firstchar = 0;
1313 if (regexp_ansi_sequences && ch == '\134') {
1314 NEXTCHAR(ch);
1315 ANSI_TRANSLATE(ch);
1316 }
1317 if (range) {
1318 for (a = prev; a <= (int)ch; a++)
1319 SETBIT(pattern, offset, a);
1320 prev = -1;
1321 range = 0;
1322 } else if (prev != -1 && ch == '-')
1323 range = 1;
1324 else {
1325 SETBIT(pattern, offset, ch);
1326 prev = ch;
1327 }
1328 NEXTCHAR(ch);
1329 if (translate)
1330 ch = translate[(unsigned char)ch];
1331 }
1332 if (range)
1333 SETBIT(pattern, offset, '-');
1334 if (complement) {
1335 for (a = 0; a < 256 / 8; a++)
1336 pattern[offset + a] ^= 0xff;
1337 }
1338 break;
1339 }
1340 case Rbegbuf:
1341 {
1342 opcode = Cbegbuf;
1343 goto store_opcode;
1344 }
1345 case Rendbuf:
1346 {
1347 opcode = Cendbuf;
1348 goto store_opcode;
1349 }
1350 case Rwordchar:
1351 {
1352 opcode = Csyntaxspec;
1353 ch = Sword;
1354 goto store_opcode_and_arg;
1355 }
1356 case Rnotwordchar:
1357 {
1358 opcode = Cnotsyntaxspec;
1359 ch = Sword;
1360 goto store_opcode_and_arg;
1361 }
1362 case Rwordbeg:
1363 {
1364 opcode = Cwordbeg;
1365 goto store_opcode;
1366 }
1367 case Rwordend:
1368 {
1369 opcode = Cwordend;
1370 goto store_opcode;
1371 }
1372 case Rwordbound:
1373 {
1374 opcode = Cwordbound;
1375 goto store_opcode;
1376 }
1377 case Rnotwordbound:
1378 {
1379 opcode = Cnotwordbound;
1380 goto store_opcode;
1381 }
1382 default:
1383 {
1384 abort();
1385 }
1386 }
1387 beginning_context = (op == Ropenpar || op == Ror);
1388 }
1389 if (starts_base != 0)
1390 goto parenthesis_error;
1391 // assert(num_jumps == 0);
1392 ALLOC(1);
1393 STORE(Cend);
1394 SET_FIELDS;
1395 if (!re_optimize(bufp))
1396 return "Optimization error";
1397 return NULL;
1398
1399 op_error:
1400 SET_FIELDS;
1401 return "Badly placed special character";
1402
1403 bad_match_register:
1404 SET_FIELDS;
1405 return "Bad match register number";
1406
1407 hex_error:
1408 SET_FIELDS;
1409 return "Bad hexadecimal number";
1410
1411 parenthesis_error:
1412 SET_FIELDS;
1413 return "Badly placed parenthesis";
1414
1415 out_of_memory:
1416 SET_FIELDS;
1417 return "Out of memory";
1418
1419 ends_prematurely:
1420 SET_FIELDS;
1421 return "Regular expression ends prematurely";
1422
1423 too_complex:
1424 SET_FIELDS;
1425 return "Regular expression too complex";
1426 }
1427
1428 #undef CHARAT
1429 #undef NEXTCHAR
1430 #undef GETHEX
1431 #undef ALLOC
1432 #undef STORE
1433 #undef CURRENT_LEVEL_START
1434 #undef SET_LEVEL_START
1435 #undef PUSH_LEVEL_STARTS
1436 #undef POP_LEVEL_STARTS
1437 #undef PUT_ADDR
1438 #undef INSERT_JUMP
1439 #undef SETBIT
1440 #undef SET_FIELDS
1441
1442 #define PREFETCH if (text == textend) goto fail
1443
1444 #define NEXTCHAR(var) \
1445 PREFETCH; \
1446 var = (unsigned char)*text++; \
1447 if (translate) \
1448 var = translate[var]
1449
1450
regcomp(regex_t * bufp,const char * regex,int cflags)1451 int regcomp(regex_t * bufp, const char *regex, int cflags)
1452 {
1453 memset(bufp, 0, sizeof(regex_t));
1454 bufp->cflags = cflags;
1455 if (bufp->cflags & REG_ICASE) {
1456 char *p, *lcase = bstrdup(regex);
1457 for( p = lcase; *p ; p++) {
1458 *p = tolower(*p);
1459 }
1460 re_compile_pattern(bufp, (unsigned char *)lcase);
1461 bfree(lcase);
1462 } else {
1463 re_compile_pattern(bufp, (unsigned char *)regex);
1464 }
1465 if (got_error) {
1466 return -1;
1467 }
1468 return 0;
1469 }
1470
re_registers_to_regmatch(regexp_registers_t old_regs,regmatch_t pmatch[],size_t nmatch)1471 void re_registers_to_regmatch(regexp_registers_t old_regs,
1472 regmatch_t pmatch[],
1473 size_t nmatch)
1474 {
1475 size_t i=0;
1476
1477 /* We have to set the last entry to -1 */
1478 nmatch = nmatch - 1;
1479 for (i=0; (i < nmatch) && (old_regs->start[i] > -1) ; i++) {
1480 pmatch[i].rm_so = old_regs->start[i];
1481 pmatch[i].rm_eo = old_regs->end[i];
1482 }
1483
1484 pmatch[i].rm_eo = pmatch[i].rm_so = -1;
1485 }
1486
regexec(regex_t * preg,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)1487 int regexec(regex_t * preg, const char *string, size_t nmatch,
1488 regmatch_t pmatch[], int eflags)
1489 {
1490 int stat;
1491 int len = strlen(string);
1492 struct re_registers regs;
1493 stat = re_search(preg, (unsigned char *)string, len, 0, len, ®s);
1494 if (stat >= 0 && nmatch > 0) {
1495 re_registers_to_regmatch(®s, pmatch, nmatch);
1496 }
1497 /* stat is the start position in the string base 0 where
1498 * the pattern was found or negative if not found.
1499 */
1500 return stat < 0 ? -1 : 0;
1501 }
1502
regerror(int errcode,regex_t * preg,char * errbuf,size_t errbuf_size)1503 size_t regerror(int errcode, regex_t * preg, char *errbuf, size_t errbuf_size)
1504 {
1505 bstrncpy(errbuf, preg->errmsg, errbuf_size);
1506 return 0;
1507 }
1508
regfree(regex_t * preg)1509 void regfree(regex_t * preg)
1510 {
1511 if (preg->lcase) {
1512 free_pool_memory(preg->lcase);
1513 preg->lcase = NULL;
1514 }
1515 if (preg->buffer) {
1516 free(preg->buffer);
1517 preg->buffer = NULL;
1518 }
1519 }
1520
re_match(regex_t * bufp,unsigned char * string,int size,int pos,regexp_registers_t old_regs)1521 int re_match(regex_t * bufp, unsigned char *string, int size, int pos,
1522 regexp_registers_t old_regs)
1523 {
1524 unsigned char *code;
1525 unsigned char *translate;
1526 unsigned char *text;
1527 unsigned char *textstart;
1528 unsigned char *textend;
1529 int a;
1530 int b;
1531 int ch;
1532 int reg;
1533 int match_end;
1534 unsigned char *regstart;
1535 unsigned char *regend;
1536 int regsize;
1537 match_state state;
1538
1539 // assert(pos >= 0 && size >= 0);
1540 // assert(pos <= size);
1541
1542 text = string + pos;
1543 textstart = string;
1544 textend = string + size;
1545
1546 code = bufp->buffer;
1547
1548 translate = bufp->translate;
1549
1550 NEW_STATE(state, bufp->num_registers);
1551
1552 continue_matching:
1553 switch (*code++) {
1554 case Cend:
1555 {
1556 match_end = text - textstart;
1557 if (old_regs) {
1558 old_regs->start[0] = pos;
1559 old_regs->end[0] = match_end;
1560 if (!bufp->uses_registers) {
1561 for (a = 1; a < RE_NREGS; a++) {
1562 old_regs->start[a] = -1;
1563 old_regs->end[a] = -1;
1564 }
1565 } else {
1566 for (a = 1; a < bufp->num_registers; a++) {
1567 if ((GET_REG_START(state, a) == NULL) ||
1568 (GET_REG_END(state, a) == NULL)) {
1569 old_regs->start[a] = -1;
1570 old_regs->end[a] = -1;
1571 continue;
1572 }
1573 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1574 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1575 }
1576 for (; a < RE_NREGS; a++) {
1577 old_regs->start[a] = -1;
1578 old_regs->end[a] = -1;
1579 }
1580 }
1581 }
1582 FREE_STATE(state);
1583 return match_end - pos;
1584 }
1585 case Cbol:
1586 {
1587 if (text == textstart || text[-1] == '\n')
1588 goto continue_matching;
1589 goto fail;
1590 }
1591 case Ceol:
1592 {
1593 if (text == textend || *text == '\n')
1594 goto continue_matching;
1595 goto fail;
1596 }
1597 case Cset:
1598 {
1599 NEXTCHAR(ch);
1600 if (code[ch / 8] & (1 << (ch & 7))) {
1601 code += 256 / 8;
1602 goto continue_matching;
1603 }
1604 goto fail;
1605 }
1606 case Cexact:
1607 {
1608 NEXTCHAR(ch);
1609 if (ch != (unsigned char)*code++)
1610 goto fail;
1611 goto continue_matching;
1612 }
1613 case Canychar:
1614 {
1615 NEXTCHAR(ch);
1616 if (ch == '\n')
1617 goto fail;
1618 goto continue_matching;
1619 }
1620 case Cstart_memory:
1621 {
1622 reg = *code++;
1623 SET_REG_START(state, reg, text, goto error);
1624 goto continue_matching;
1625 }
1626 case Cend_memory:
1627 {
1628 reg = *code++;
1629 SET_REG_END(state, reg, text, goto error);
1630 goto continue_matching;
1631 }
1632 case Cmatch_memory:
1633 {
1634 reg = *code++;
1635 regstart = GET_REG_START(state, reg);
1636 regend = GET_REG_END(state, reg);
1637 if ((regstart == NULL) || (regend == NULL))
1638 goto fail; /* or should we just match nothing? */
1639 regsize = regend - regstart;
1640
1641 if (regsize > (textend - text))
1642 goto fail;
1643 if (translate) {
1644 for (; regstart < regend; regstart++, text++)
1645 if (translate[*regstart] != translate[*text])
1646 goto fail;
1647 } else
1648 for (; regstart < regend; regstart++, text++)
1649 if (*regstart != *text)
1650 goto fail;
1651 goto continue_matching;
1652 }
1653 case Cupdate_failure_jump:
1654 {
1655 UPDATE_FAILURE(state, text, goto error);
1656 /* fall to next case */
1657 }
1658 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1659 case Cstar_jump:
1660 case Cjump:
1661 {
1662 a = (unsigned char)*code++;
1663 a |= (unsigned char)*code++ << 8;
1664 code += (int)SHORT(a);
1665 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1666 set_error("Regex VM jump out of bounds (Cjump)");
1667 FREE_STATE(state);
1668 return -2;
1669 }
1670 goto continue_matching;
1671 }
1672 case Cdummy_failure_jump:
1673 {
1674 unsigned char *failuredest;
1675
1676 a = (unsigned char)*code++;
1677 a |= (unsigned char)*code++ << 8;
1678 a = (int)SHORT(a);
1679 // assert(*code == Cfailure_jump);
1680 b = (unsigned char)code[1];
1681 b |= (unsigned char)code[2] << 8;
1682 failuredest = code + (int)SHORT(b) + 3;
1683 if (failuredest < bufp->buffer || bufp->buffer + bufp->used < failuredest) {
1684 set_error
1685 ("Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1686 FREE_STATE(state);
1687 return -2;
1688 }
1689 PUSH_FAILURE(state, failuredest, NULL, goto error);
1690 code += a;
1691 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1692 set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1693 FREE_STATE(state);
1694 return -2;
1695 }
1696 goto continue_matching;
1697 }
1698 case Cfailure_jump:
1699 {
1700 a = (unsigned char)*code++;
1701 a |= (unsigned char)*code++ << 8;
1702 a = (int)SHORT(a);
1703 if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1704 set_error("Regex VM jump out of bounds (Cfailure_jump)");
1705 FREE_STATE(state);
1706 return -2;
1707 }
1708 PUSH_FAILURE(state, code + a, text, goto error);
1709 goto continue_matching;
1710 }
1711 case Crepeat1:
1712 {
1713 unsigned char *pinst;
1714 a = (unsigned char)*code++;
1715 a |= (unsigned char)*code++ << 8;
1716 a = (int)SHORT(a);
1717 pinst = code + a;
1718 if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1719 set_error("Regex VM jump out of bounds (Crepeat1)");
1720 FREE_STATE(state);
1721 return -2;
1722 }
1723 /* pinst is sole instruction in loop, and it matches a
1724 * single character. Since Crepeat1 was originally a
1725 * Cupdate_failure_jump, we also know that backtracking
1726 * is useless: so long as the single-character
1727 * expression matches, it must be used. Also, in the
1728 * case of +, we've already matched one character, so +
1729 * can't fail: nothing here can cause a failure. */
1730 switch (*pinst++) {
1731 case Cset:
1732 {
1733 if (translate) {
1734 while (text < textend) {
1735 ch = translate[(unsigned char)*text];
1736 if (pinst[ch / 8] & (1 << (ch & 7)))
1737 text++;
1738 else
1739 break;
1740 }
1741 } else {
1742 while (text < textend) {
1743 ch = (unsigned char)*text;
1744 if (pinst[ch / 8] & (1 << (ch & 7)))
1745 text++;
1746 else
1747 break;
1748 }
1749 }
1750 break;
1751 }
1752 case Cexact:
1753 {
1754 ch = (unsigned char)*pinst;
1755 if (translate) {
1756 while (text < textend && translate[(unsigned char)*text] == ch)
1757 text++;
1758 } else {
1759 while (text < textend && (unsigned char)*text == ch)
1760 text++;
1761 }
1762 break;
1763 }
1764 case Canychar:
1765 {
1766 while (text < textend && (unsigned char)*text != '\n')
1767 text++;
1768 break;
1769 }
1770 case Csyntaxspec:
1771 {
1772 a = (unsigned char)*pinst;
1773 if (translate) {
1774 while (text < textend && (SYNTAX(translate[*text]) & a))
1775 text++;
1776 } else {
1777 while (text < textend && (SYNTAX(*text) & a))
1778 text++;
1779 }
1780 break;
1781 }
1782 case Cnotsyntaxspec:
1783 {
1784 a = (unsigned char)*pinst;
1785 if (translate) {
1786 while (text < textend && !(SYNTAX(translate[*text]) & a))
1787 text++;
1788 } else {
1789 while (text < textend && !(SYNTAX(*text) & a))
1790 text++;
1791 }
1792 break;
1793 }
1794 default:
1795 {
1796 FREE_STATE(state);
1797 set_error("Unknown regex opcode: memory corrupted?");
1798 return -2;
1799 /*NOTREACHED*/}
1800 }
1801 /* due to the funky way + and * are compiled, the top
1802 * failure- stack entry at this point is actually a
1803 * success entry -- update it & pop it */
1804 UPDATE_FAILURE(state, text, goto error);
1805 goto fail; /* i.e., succeed <wink/sigh> */
1806 }
1807 case Cbegbuf:
1808 {
1809 if (text == textstart)
1810 goto continue_matching;
1811 goto fail;
1812 }
1813 case Cendbuf:
1814 {
1815 if (text == textend)
1816 goto continue_matching;
1817 goto fail;
1818 }
1819 case Cwordbeg:
1820 {
1821 if (text == textend)
1822 goto fail;
1823 if (!(SYNTAX(*text) & Sword))
1824 goto fail;
1825 if (text == textstart)
1826 goto continue_matching;
1827 if (!(SYNTAX(text[-1]) & Sword))
1828 goto continue_matching;
1829 goto fail;
1830 }
1831 case Cwordend:
1832 {
1833 if (text == textstart)
1834 goto fail;
1835 if (!(SYNTAX(text[-1]) & Sword))
1836 goto fail;
1837 if (text == textend)
1838 goto continue_matching;
1839 if (!(SYNTAX(*text) & Sword))
1840 goto continue_matching;
1841 goto fail;
1842 }
1843 case Cwordbound:
1844 {
1845 /* Note: as in gnu regexp, this also matches at the
1846 * beginning and end of buffer. */
1847
1848 if (text == textstart || text == textend)
1849 goto continue_matching;
1850 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1851 goto continue_matching;
1852 goto fail;
1853 }
1854 case Cnotwordbound:
1855 {
1856 /* Note: as in gnu regexp, this never matches at the
1857 * beginning and end of buffer. */
1858 if (text == textstart || text == textend)
1859 goto fail;
1860 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1861 goto continue_matching;
1862 goto fail;
1863 }
1864 case Csyntaxspec:
1865 {
1866 NEXTCHAR(ch);
1867 if (!(SYNTAX(ch) & (unsigned char)*code++))
1868 goto fail;
1869 goto continue_matching;
1870 }
1871 case Cnotsyntaxspec:
1872 {
1873 NEXTCHAR(ch);
1874 if (SYNTAX(ch) & (unsigned char)*code++)
1875 goto fail;
1876 goto continue_matching;
1877 }
1878 default:
1879 {
1880 FREE_STATE(state);
1881 set_error("Unknown regex opcode: memory corrupted?");
1882 return -2;
1883 /*NOTREACHED*/}
1884 }
1885
1886
1887
1888 #if 0 /* This line is never reached --Guido */
1889 abort();
1890 #endif
1891 /*
1892 *NOTREACHED
1893 */
1894
1895 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1896 fail:
1897 POP_FAILURE(state, code, text, goto done_matching, goto error);
1898 goto continue_matching;
1899
1900 done_matching:
1901 /* if(translated != NULL) */
1902 /* free(translated); */
1903 FREE_STATE(state);
1904 return -1;
1905
1906 error:
1907 /* if (translated != NULL) */
1908 /* free(translated); */
1909 FREE_STATE(state);
1910 return -2;
1911 }
1912
1913
1914 #undef PREFETCH
1915 #undef NEXTCHAR
1916
re_search(regex_t * bufp,unsigned char * str,int size,int pos,int range,regexp_registers_t regs)1917 int re_search(regex_t * bufp, unsigned char *str, int size, int pos,
1918 int range, regexp_registers_t regs)
1919 {
1920 unsigned char *fastmap;
1921 unsigned char *translate;
1922 unsigned char *text;
1923 unsigned char *partstart;
1924 unsigned char *partend;
1925 int dir;
1926 int ret;
1927 unsigned char anchor;
1928 unsigned char *string = str;
1929
1930 if (bufp->cflags & REG_ICASE) { /* we must use string in lowercase */
1931 int len = strlen((const char *)str);
1932 if (!bufp->lcase) {
1933 bufp->lcase = get_pool_memory(PM_FNAME);
1934 }
1935 bufp->lcase = check_pool_memory_size(bufp->lcase, len+1);
1936 unsigned char *dst = (unsigned char *)bufp->lcase;
1937 while (*string) {
1938 *dst++ = tolower(*string++);
1939 }
1940 *dst = '\0';
1941 string = (unsigned char *)bufp->lcase;
1942 }
1943
1944 // assert(size >= 0 && pos >= 0);
1945 // assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
1946
1947 fastmap = bufp->fastmap;
1948 translate = bufp->translate;
1949 if (fastmap && !bufp->fastmap_accurate) {
1950 re_compile_fastmap(bufp);
1951 if (got_error)
1952 return -2;
1953 }
1954
1955 anchor = bufp->anchor;
1956 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1957 fastmap = NULL;
1958
1959 if (range < 0) {
1960 dir = -1;
1961 range = -range;
1962 } else
1963 dir = 1;
1964
1965 if (anchor == 2) {
1966 if (pos != 0)
1967 return -1;
1968 else
1969 range = 0;
1970 }
1971
1972 for (; range >= 0; range--, pos += dir) {
1973 if (fastmap) {
1974 if (dir == 1) { /* searching forwards */
1975
1976 text = string + pos;
1977 partend = string + size;
1978 partstart = text;
1979 if (translate)
1980 while (text != partend &&
1981 !fastmap[(unsigned char)translate[(unsigned char)*text]])
1982 text++;
1983 else
1984 while (text != partend && !fastmap[(unsigned char)*text])
1985 text++;
1986 pos += text - partstart;
1987 range -= text - partstart;
1988 if (pos == size && bufp->can_be_null == 0)
1989 return -1;
1990 } else { /* searching backwards */
1991 text = string + pos;
1992 partstart = string + pos - range;
1993 partend = text;
1994 if (translate)
1995 while (text != partstart && !fastmap[(unsigned char)
1996 translate[(unsigned char)*text]])
1997 text--;
1998 else
1999 while (text != partstart && !fastmap[(unsigned char)*text])
2000 text--;
2001 pos -= partend - text;
2002 range -= partend - text;
2003 }
2004 }
2005 if (anchor == 1) { /* anchored to begline */
2006 if (pos > 0 && (string[pos - 1] != '\n'))
2007 continue;
2008 }
2009 // assert(pos >= 0 && pos <= size);
2010 ret = re_match(bufp, string, size, pos, regs);
2011 if (ret >= 0)
2012 return pos;
2013 if (ret == -2)
2014 return -2;
2015 }
2016 return -1;
2017 }
2018
2019 /*
2020 ** Local Variables:
2021 ** mode: c
2022 ** c-file-style: "python"
2023 ** End:
2024 */
2025