1 /*
2 BAREOS® - Backup Archiving REcovery Open Sourced
3
4 Copyright (C) 2006-2010 Free Software Foundation Europe e.V.
5
6 This program is Free Software; you can redistribute it and/or
7 modify it under the terms of version three of the GNU Affero General Public
8 License as published by the Free Software Foundation and included
9 in the file LICENSE.
10
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Affero General Public License for more details.
15
16 You should have received a copy of the GNU Affero General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301, USA.
20 */
21 /* regexpr.c
22 *
23 * Author: Tatu Ylonen <ylo@ngs.fi>
24 *
25 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
26 *
27 * Permission to use, copy, modify, distribute, and sell this software
28 * and its documentation for any purpose is hereby granted without
29 * fee, provided that the above copyright notice appear in all copies.
30 * This software is provided "as is" without express or implied
31 * warranty.
32 *
33 * Created: Thu Sep 26 17:14:05 1991 ylo
34 * Last modified: Mon Nov 4 17:06:48 1991 ylo
35 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
36 *
37 * This code draws many ideas from the regular expression packages by
38 * Henry Spencer of the University of Toronto and Richard Stallman of
39 * the Free Software Foundation.
40 *
41 * Emacs-specific code and syntax table code is almost directly borrowed
42 * from GNU regexp.
43 *
44 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
45 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
46 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
47 * probably one or two others that I'm forgetting.
48 *
49 * This file modified to work with BAREOS and C++ by
50 * Kern Sibbald, April 2006
51 *
52 * This file modified to work with REG_ICASE and BAREOS by
53 * Eric Bollengier April 2007
54 */
55
56 #include "include/bareos.h"
57 #include "bregex.h"
58
59 #define set_error(x) bufp->errmsg=((char *)(x))
60 #define got_error bufp->errmsg!=NULL
61
62 /* The original code blithely assumed that sizeof(short) == 2. Not
63 * always true. Original instances of "(short)x" were replaced by
64 * SHORT(x), where SHORT is #defined below. */
65
66 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
67
68 /* The stack implementation is taken from an idea by Andrew Kuchling.
69 * It's a doubly linked list of arrays. The advantages of this over a
70 * simple linked list are that the number of mallocs required are
71 * reduced. It also makes it possible to statically allocate enough
72 * space so that small patterns don't ever need to call malloc.
73 *
74 * The advantages over a single array is that is periodically
75 * realloced when more space is needed is that we avoid ever copying
76 * the stack. */
77
78 /* item_t is the basic stack element. Defined as a union of
79 * structures so that both registers, failure points, and counters can
80 * be pushed/popped from the stack. There's nothing built into the
81 * item to keep track of whether a certain stack item is a register, a
82 * failure point, or a counter. */
83
84 typedef union item_t {
85 struct {
86 int num;
87 int level;
88 unsigned char *start;
89 unsigned char *end;
90 } reg;
91 struct {
92 int count;
93 int level;
94 int phantom;
95 unsigned char *code;
96 unsigned char *text;
97 } fail;
98 struct {
99 int num;
100 int level;
101 int count;
102 } cntr;
103 } item_t;
104
105 #define B_STACK_PAGE_SIZE 256
106 #define NUM_REGISTERS 256
107
108 /* A 'page' of stack items. */
109
110 typedef struct item_page_t {
111 item_t items[B_STACK_PAGE_SIZE];
112 struct item_page_t *prev;
113 struct item_page_t *next;
114 } item_page_t;
115
116
117 typedef struct match_state {
118 /* The number of registers that have been pushed onto the stack
119 * since the last failure point. */
120
121 int count;
122
123 /* Used to control when registers need to be pushed onto the
124 * stack. */
125
126 int level;
127
128 /* The number of failure points on the stack. */
129
130 int point;
131
132 /* Storage for the registers. Each register consists of two
133 * pointers to characters. So register N is represented as
134 * start[N] and end[N]. The pointers must be converted to
135 * offsets from the beginning of the string before returning the
136 * registers to the calling program. */
137
138 unsigned char *start[NUM_REGISTERS];
139 unsigned char *end[NUM_REGISTERS];
140
141 /* Keeps track of whether a register has changed recently. */
142
143 int changed[NUM_REGISTERS];
144
145 /* Structure to encapsulate the stack. */
146 struct {
147 /* index into the current page. If index == 0 and you need
148 * to pop an item, move to the previous page and set index
149 * = B_STACK_PAGE_SIZE - 1. Otherwise decrement index to
150 * push a page. If index == B_STACK_PAGE_SIZE and you need
151 * to push a page move to the next page and set index =
152 * 0. If there is no new next page, allocate a new page
153 * and link it in. Otherwise, increment index to push a
154 * page. */
155
156 int index;
157 item_page_t *current; /* Pointer to the current page. */
158 item_page_t first; /* First page is statically allocated. */
159 } stack;
160 } match_state;
161
162 /* Initialize a state object */
163
164 /* #define NEW_STATE(state) \ */
165 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
166 /* state.stack.current = &state.stack.first; \ */
167 /* state.stack.first.prev = NULL; \ */
168 /* state.stack.first.next = NULL; \ */
169 /* state.stack.index = 0; \ */
170 /* state.level = 1 */
171
172 #define NEW_STATE(state, nregs) \
173 { \
174 int i; \
175 for (i = 0; i < nregs; i++) \
176 { \
177 state.start[i] = NULL; \
178 state.end[i] = NULL; \
179 state.changed[i] = 0; \
180 } \
181 state.stack.current = &state.stack.first; \
182 state.stack.first.prev = NULL; \
183 state.stack.first.next = NULL; \
184 state.stack.index = 0; \
185 state.level = 1; \
186 state.count = 0; \
187 state.level = 0; \
188 state.point = 0; \
189 }
190
191 /* Free any memory that might have been malloc'd */
192
193 #define FREE_STATE(state) \
194 while(state.stack.first.next != NULL) \
195 { \
196 state.stack.current = state.stack.first.next; \
197 state.stack.first.next = state.stack.current->next; \
198 free(state.stack.current); \
199 }
200
201 /* Discard the top 'count' stack items. */
202
203 #define STACK_DISCARD(stack, count, on_error) \
204 stack.index -= count; \
205 while (stack.index < 0) \
206 { \
207 if (stack.current->prev == NULL) \
208 on_error; \
209 stack.current = stack.current->prev; \
210 stack.index += B_STACK_PAGE_SIZE; \
211 }
212
213 /* Store a pointer to the previous item on the stack. Used to pop an
214 * item off of the stack. */
215
216 #define STACK_PREV(stack, top, on_error) \
217 if (stack.index == 0) \
218 { \
219 if (stack.current->prev == NULL) \
220 on_error; \
221 stack.current = stack.current->prev; \
222 stack.index = B_STACK_PAGE_SIZE - 1; \
223 } \
224 else \
225 { \
226 stack.index--; \
227 } \
228 top = &(stack.current->items[stack.index])
229
230 /* Store a pointer to the next item on the stack. Used to push an item
231 * on to the stack. */
232
233 #define STACK_NEXT(stack, top, on_error) \
234 if (stack.index == B_STACK_PAGE_SIZE) \
235 { \
236 if (stack.current->next == NULL) \
237 { \
238 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
239 if (stack.current->next == NULL) \
240 on_error; \
241 stack.current->next->prev = stack.current; \
242 stack.current->next->next = NULL; \
243 } \
244 stack.current = stack.current->next; \
245 stack.index = 0; \
246 } \
247 top = &(stack.current->items[stack.index++])
248
249 /* Store a pointer to the item that is 'count' items back in the
250 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
251 * STACK_TOP(stack, top, on_error). */
252
253 #define STACK_BACK(stack, top, count, on_error) \
254 { \
255 int index; \
256 item_page_t *current; \
257 current = stack.current; \
258 index = stack.index - (count); \
259 while (index < 0) \
260 { \
261 if (current->prev == NULL) \
262 on_error; \
263 current = current->prev; \
264 index += B_STACK_PAGE_SIZE; \
265 } \
266 top = &(current->items[index]); \
267 }
268
269 /* Store a pointer to the top item on the stack. Execute the
270 * 'on_error' code if there are no items on the stack. */
271
272 #define STACK_TOP(stack, top, on_error) \
273 if (stack.index == 0) \
274 { \
275 if (stack.current->prev == NULL) \
276 on_error; \
277 top = &(stack.current->prev->items[B_STACK_PAGE_SIZE - 1]); \
278 } \
279 else \
280 { \
281 top = &(stack.current->items[stack.index - 1]); \
282 }
283
284 /* Test to see if the stack is empty */
285
286 #define STACK_EMPTY(stack) ((stack.index == 0) && \
287 (stack.current->prev == NULL))
288
289 /* Return the start of register 'reg' */
290
291 #define GET_REG_START(state, reg) (state.start[reg])
292
293 /* Return the end of register 'reg' */
294
295 #define GET_REG_END(state, reg) (state.end[reg])
296
297 /* Set the start of register 'reg'. If the state of the register needs
298 * saving, push it on the stack. */
299
300 #define SET_REG_START(state, reg, text, on_error) \
301 if(state.changed[reg] < state.level) \
302 { \
303 item_t *item; \
304 STACK_NEXT(state.stack, item, on_error); \
305 item->reg.num = reg; \
306 item->reg.start = state.start[reg]; \
307 item->reg.end = state.end[reg]; \
308 item->reg.level = state.changed[reg]; \
309 state.changed[reg] = state.level; \
310 state.count++; \
311 } \
312 state.start[reg] = text
313
314 /* Set the end of register 'reg'. If the state of the register needs
315 * saving, push it on the stack. */
316
317 #define SET_REG_END(state, reg, text, on_error) \
318 if(state.changed[reg] < state.level) \
319 { \
320 item_t *item; \
321 STACK_NEXT(state.stack, item, on_error); \
322 item->reg.num = reg; \
323 item->reg.start = state.start[reg]; \
324 item->reg.end = state.end[reg]; \
325 item->reg.level = state.changed[reg]; \
326 state.changed[reg] = state.level; \
327 state.count++; \
328 } \
329 state.end[reg] = text
330
331 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
332 { \
333 item_t *item; \
334 STACK_NEXT(state.stack, item, on_error); \
335 item->fail.code = xcode; \
336 item->fail.text = xtext; \
337 item->fail.count = state.count; \
338 item->fail.level = state.level; \
339 item->fail.phantom = 0; \
340 state.count = 0; \
341 state.level++; \
342 state.point++; \
343 }
344
345 /* Update the last failure point with a new position in the text. */
346
347 #define UPDATE_FAILURE(state, xtext, on_error) \
348 { \
349 item_t *item; \
350 STACK_BACK(state.stack, item, state.count + 1, on_error); \
351 if (!item->fail.phantom) \
352 { \
353 item_t *item2; \
354 STACK_NEXT(state.stack, item2, on_error); \
355 item2->fail.code = item->fail.code; \
356 item2->fail.text = xtext; \
357 item2->fail.count = state.count; \
358 item2->fail.level = state.level; \
359 item2->fail.phantom = 1; \
360 state.count = 0; \
361 state.level++; \
362 state.point++; \
363 } \
364 else \
365 { \
366 STACK_DISCARD(state.stack, state.count, on_error); \
367 STACK_TOP(state.stack, item, on_error); \
368 item->fail.text = xtext; \
369 state.count = 0; \
370 state.level++; \
371 } \
372 }
373
374 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
375 { \
376 item_t *item; \
377 do \
378 { \
379 while(state.count > 0) \
380 { \
381 STACK_PREV(state.stack, item, on_error); \
382 state.start[item->reg.num] = item->reg.start; \
383 state.end[item->reg.num] = item->reg.end; \
384 state.changed[item->reg.num] = item->reg.level; \
385 state.count--; \
386 } \
387 STACK_PREV(state.stack, item, on_empty); \
388 xcode = item->fail.code; \
389 xtext = item->fail.text; \
390 state.count = item->fail.count; \
391 state.level = item->fail.level; \
392 state.point--; \
393 } \
394 while (item->fail.text == NULL); \
395 }
396
397 enum regexp_compiled_ops { /* opcodes for compiled regexp */
398 Cend, /* end of pattern reached */
399 Cbol, /* beginning of line */
400 Ceol, /* end of line */
401 Cset, /* character set. Followed by 32 bytes of set. */
402 Cexact, /* followed by a byte to match */
403 Canychar, /* matches any character except newline */
404 Cstart_memory, /* set register start addr (followed by reg number) */
405 Cend_memory, /* set register end addr (followed by reg number) */
406 Cmatch_memory, /* match a duplicate of reg contents (regnum follows) */
407 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
408 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
409 Cfailure_jump, /* jump to addr on failure */
410 Cupdate_failure_jump, /* update topmost failure point and jump */
411 Cdummy_failure_jump, /* push a dummy failure point and jump */
412 Cbegbuf, /* match at beginning of buffer */
413 Cendbuf, /* match at end of buffer */
414 Cwordbeg, /* match at beginning of word */
415 Cwordend, /* match at end of word */
416 Cwordbound, /* match if at word boundary */
417 Cnotwordbound, /* match if not at word boundary */
418 Csyntaxspec, /* matches syntax code (1 byte follows) */
419 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
420 Crepeat1
421 };
422
423 enum regexp_syntax_op { /* syntax codes for plain and quoted characters */
424 Rend, /* special code for end of regexp */
425 Rnormal, /* normal character */
426 Ranychar, /* any character except newline */
427 Rquote, /* the quote character */
428 Rbol, /* match beginning of line */
429 Reol, /* match end of line */
430 Roptional, /* match preceding expression optionally */
431 Rstar, /* match preceding expr zero or more times */
432 Rplus, /* match preceding expr one or more times */
433 Ror, /* match either of alternatives */
434 Ropenpar, /* opening parenthesis */
435 Rclosepar, /* closing parenthesis */
436 Rmemory, /* match memory register */
437 Rextended_memory, /* \vnn to match registers 10-99 */
438 Ropenset, /* open set. Internal syntax hard-coded below. */
439 /* the following are gnu extensions to "normal" regexp syntax */
440 Rbegbuf, /* beginning of buffer */
441 Rendbuf, /* end of buffer */
442 Rwordchar, /* word character */
443 Rnotwordchar, /* not word character */
444 Rwordbeg, /* beginning of word */
445 Rwordend, /* end of word */
446 Rwordbound, /* word bound */
447 Rnotwordbound, /* not word bound */
448 Rnum_ops
449 };
450
451 static int re_compile_initialized = 0;
452 static int regexp_syntax = RE_SYNTAX_EGREP;
453 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
454 static unsigned char plain_ops[256];
455 static unsigned char quoted_ops[256];
456 static unsigned char precedences[Rnum_ops];
457 static int regexp_context_indep_ops;
458 static int regexp_ansi_sequences;
459
460 #define NUM_LEVELS 5 /* number of precedence levels in use */
461 #define MAX_NESTING 100 /* max nesting level of operators */
462
463 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
464
465 unsigned char re_syntax_table[256];
466
ReCompileInitialize(void)467 void ReCompileInitialize(void)
468 {
469 int a;
470
471 static int syntax_table_inited = 0;
472
473 if (!syntax_table_inited) {
474 syntax_table_inited = 1;
475 memset(re_syntax_table, 0, 256);
476 for (a = 'a'; a <= 'z'; a++)
477 re_syntax_table[a] = Sword;
478 for (a = 'A'; a <= 'Z'; a++)
479 re_syntax_table[a] = Sword;
480 for (a = '0'; a <= '9'; a++)
481 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
482 for (a = '0'; a <= '7'; a++)
483 re_syntax_table[a] |= Soctaldigit;
484 for (a = 'A'; a <= 'F'; a++)
485 re_syntax_table[a] |= Shexdigit;
486 for (a = 'a'; a <= 'f'; a++)
487 re_syntax_table[a] |= Shexdigit;
488 re_syntax_table[(int)'_'] = Sword;
489 for (a = 9; a <= 13; a++)
490 re_syntax_table[a] = Swhitespace;
491 re_syntax_table[(int)' '] = Swhitespace;
492 }
493 re_compile_initialized = 1;
494 for (a = 0; a < 256; a++) {
495 plain_ops[a] = Rnormal;
496 quoted_ops[a] = Rnormal;
497 }
498 for (a = '0'; a <= '9'; a++)
499 quoted_ops[a] = Rmemory;
500 plain_ops[(int)'\134'] = Rquote;
501 if (regexp_syntax & RE_NO_BK_PARENS) {
502 plain_ops[(int)'('] = Ropenpar;
503 plain_ops[(int)')'] = Rclosepar;
504 } else {
505 quoted_ops[(int)'('] = Ropenpar;
506 quoted_ops[(int)')'] = Rclosepar;
507 }
508 if (regexp_syntax & RE_NO_BK_VBAR) {
509 plain_ops[(int)'\174'] = Ror;
510 } else {
511 quoted_ops[(int)'\174'] = Ror;
512 }
513 plain_ops[(int)'*'] = Rstar;
514 if (regexp_syntax & RE_BK_PLUS_QM) {
515 quoted_ops[(int)'+'] = Rplus;
516 quoted_ops[(int)'?'] = Roptional;
517 } else {
518 plain_ops[(int)'+'] = Rplus;
519 plain_ops[(int)'?'] = Roptional;
520 }
521 if (regexp_syntax & RE_NEWLINE_OR) {
522 plain_ops[(int)'\n'] = Ror;
523 }
524 plain_ops[(int)'\133'] = Ropenset;
525 plain_ops[(int)'\136'] = Rbol;
526 plain_ops[(int)'$'] = Reol;
527 plain_ops[(int)'.'] = Ranychar;
528 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
529 quoted_ops[(int)'w'] = Rwordchar;
530 quoted_ops[(int)'W'] = Rnotwordchar;
531 quoted_ops[(int)'<'] = Rwordbeg;
532 quoted_ops[(int)'>'] = Rwordend;
533 quoted_ops[(int)'b'] = Rwordbound;
534 quoted_ops[(int)'B'] = Rnotwordbound;
535 quoted_ops[(int)'`'] = Rbegbuf;
536 quoted_ops[(int)'\''] = Rendbuf;
537 }
538 if (regexp_syntax & RE_ANSI_HEX) {
539 quoted_ops[(int)'v'] = Rextended_memory;
540 }
541 for (a = 0; a < Rnum_ops; a++) {
542 precedences[a] = 4;
543 }
544 if (regexp_syntax & RE_TIGHT_VBAR) {
545 precedences[Ror] = 3;
546 precedences[Rbol] = 2;
547 precedences[Reol] = 2;
548 } else {
549 precedences[Ror] = 2;
550 precedences[Rbol] = 3;
551 precedences[Reol] = 3;
552 }
553 precedences[Rclosepar] = 1;
554 precedences[Rend] = 0;
555 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
556 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
557 }
558
ReSetSyntax(int syntax)559 int ReSetSyntax(int syntax) {
560 int ret;
561
562 ret = regexp_syntax;
563 regexp_syntax = syntax;
564 re_syntax = syntax; /* Exported copy */
565 ReCompileInitialize();
566 return ret;
567 }
568
HexCharToDecimal(int ch)569 static int HexCharToDecimal(int ch) {
570 if (ch >= '0' && ch <= '9')
571 return ch - '0';
572 if (ch >= 'a' && ch <= 'f')
573 return ch - 'a' + 10;
574 if (ch >= 'A' && ch <= 'F')
575 return ch - 'A' + 10;
576 return 16;
577 }
578
re_compile_fastmap_aux(regex_t * bufp,unsigned char * code,int pos,unsigned char * visited,unsigned char * can_be_null,unsigned char * fastmap)579 static void re_compile_fastmap_aux(regex_t * bufp, unsigned char *code, int pos,
580 unsigned char *visited,
581 unsigned char *can_be_null,
582 unsigned char *fastmap)
583 {
584 int a;
585 int b;
586 int syntaxcode;
587
588 if (visited[pos])
589 return; /* we have already been here */
590 visited[pos] = 1;
591 for (;;) {
592 switch (code[pos++]) {
593 case Cend:
594 *can_be_null = 1;
595 return;
596 case Cbol:
597 case Cbegbuf:
598 case Cendbuf:
599 case Cwordbeg:
600 case Cwordend:
601 case Cwordbound:
602 case Cnotwordbound:
603 for (a = 0; a < 256; a++)
604 fastmap[a] = 1;
605 break;
606 case Csyntaxspec:
607 syntaxcode = code[pos++];
608 for (a = 0; a < 256; a++)
609 if (SYNTAX(a) & syntaxcode)
610 fastmap[a] = 1;
611 return;
612 case Cnotsyntaxspec:
613 syntaxcode = code[pos++];
614 for (a = 0; a < 256; a++)
615 if (!(SYNTAX(a) & syntaxcode))
616 fastmap[a] = 1;
617 return;
618 case Ceol:
619 fastmap[(int)'\n'] = 1;
620 if (*can_be_null == 0)
621 *can_be_null = 2; /* can match null, but only at end of buffer */
622 return;
623 case Cset:
624 for (a = 0; a < 256 / 8; a++)
625 if (code[pos + a] != 0)
626 for (b = 0; b < 8; b++)
627 if (code[pos + a] & (1 << b))
628 fastmap[(a << 3) + b] = 1;
629 pos += 256 / 8;
630 return;
631 case Cexact:
632 fastmap[(unsigned char)code[pos]] = 1;
633 return;
634 case Canychar:
635 for (a = 0; a < 256; a++)
636 if (a != '\n')
637 fastmap[a] = 1;
638 return;
639 case Cstart_memory:
640 case Cend_memory:
641 pos++;
642 break;
643 case Cmatch_memory:
644 for (a = 0; a < 256; a++)
645 fastmap[a] = 1;
646 *can_be_null = 1;
647 return;
648 case Cjump:
649 case Cdummy_failure_jump:
650 case Cupdate_failure_jump:
651 case Cstar_jump:
652 a = (unsigned char)code[pos++];
653 a |= (unsigned char)code[pos++] << 8;
654 pos += (int)SHORT(a);
655 if (visited[pos]) {
656 /* argh... the regexp contains empty loops. This is not
657 good, as this may cause a failure stack overflow when
658 matching. Oh well. */
659 /* this path leads nowhere; pursue other paths. */
660 return;
661 }
662 visited[pos] = 1;
663 break;
664 case Cfailure_jump:
665 a = (unsigned char)code[pos++];
666 a |= (unsigned char)code[pos++] << 8;
667 a = pos + (int)SHORT(a);
668 re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
669 break;
670 case Crepeat1:
671 pos += 2;
672 break;
673 default:
674 set_error("Unknown regex opcode: memory corrupted?");
675 return;
676 }
677 }
678 }
679
re_do_compile_fastmap(regex_t * bufp,unsigned char * buffer,int used,int pos,unsigned char * can_be_null,unsigned char * fastmap)680 static int re_do_compile_fastmap(regex_t * bufp, unsigned char *buffer, int used,
681 int pos, unsigned char *can_be_null,
682 unsigned char *fastmap)
683 {
684 unsigned char small_visited[512], *visited;
685
686 if (used <= (int)sizeof(small_visited))
687 visited = small_visited;
688 else {
689 visited = (unsigned char *)malloc(used);
690 if (!visited)
691 return 0;
692 }
693 *can_be_null = 0;
694 memset(fastmap, 0, 256);
695 memset(visited, 0, used);
696 re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
697 if (visited != small_visited)
698 free(visited);
699 return 1;
700 }
701
ReCompileFastmap(regex_t * bufp)702 void ReCompileFastmap(regex_t * bufp)
703 {
704 if (!bufp->fastmap || bufp->fastmap_accurate)
705 return;
706 // assert(bufp->used > 0);
707 if (!re_do_compile_fastmap(bufp, bufp->buffer,
708 bufp->used, 0, &bufp->can_be_null, bufp->fastmap))
709 return;
710 if (got_error)
711 return;
712 if (bufp->buffer[0] == Cbol)
713 bufp->anchor = 1; /* begline */
714 else if (bufp->buffer[0] == Cbegbuf)
715 bufp->anchor = 2; /* begbuf */
716 else
717 bufp->anchor = 0; /* none */
718 bufp->fastmap_accurate = 1;
719 }
720
721 /*
722 * star is coded as:
723 * 1: failure_jump 2
724 * ... code for operand of star
725 * star_jump 1
726 * 2: ... code after star
727 *
728 * We change the star_jump to update_failure_jump if we can determine
729 * that it is safe to do so; otherwise we change it to an ordinary
730 * jump.
731 *
732 * plus is coded as
733 *
734 * jump 2
735 * 1: failure_jump 3
736 * 2: ... code for operand of plus
737 * star_jump 1
738 * 3: ... code after plus
739 *
740 * For star_jump considerations this is processed identically to star.
741 *
742 */
743
ReOptimizeStarJump(regex_t * bufp,unsigned char * code)744 static int ReOptimizeStarJump(regex_t * bufp, unsigned char *code)
745 {
746 unsigned char map[256];
747 unsigned char can_be_null;
748 unsigned char *p1;
749 unsigned char *p2;
750 unsigned char ch;
751 int a;
752 int b;
753 int num_instructions = 0;
754
755 a = (unsigned char)*code++;
756 a |= (unsigned char)*code++ << 8;
757 a = (int)SHORT(a);
758
759 p1 = code + a + 3; /* skip the failure_jump */
760 /* Check that the jump is within the pattern */
761 if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
762 set_error("Regex VM jump out of bounds (failure_jump opt)");
763 return 0;
764 }
765 // assert(p1[-3] == Cfailure_jump);
766 p2 = code;
767 /* p1 points inside loop, p2 points to after loop */
768 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
769 (int)(p2 - bufp->buffer), &can_be_null, map))
770 goto make_normal_jump;
771
772 /* If we might introduce a new update point inside the
773 * loop, we can't optimize because then update_jump would
774 * update a wrong failure point. Thus we have to be
775 * quite careful here.
776 */
777
778 /* loop until we find something that consumes a character */
779 for (;;) {
780 num_instructions++;
781 switch (*p1++) {
782 case Cbol:
783 case Ceol:
784 case Cbegbuf:
785 case Cendbuf:
786 case Cwordbeg:
787 case Cwordend:
788 case Cwordbound:
789 case Cnotwordbound:
790 continue;
791 case Cstart_memory:
792 case Cend_memory:
793 p1++;
794 continue;
795 case Cexact:
796 ch = (unsigned char)*p1++;
797 if (map[(int)ch])
798 goto make_normal_jump;
799 break;
800 case Canychar:
801 for (b = 0; b < 256; b++)
802 if (b != '\n' && map[b])
803 goto make_normal_jump;
804 break;
805 case Cset:
806 for (b = 0; b < 256; b++)
807 if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
808 goto make_normal_jump;
809 p1 += 256 / 8;
810 break;
811 default:
812 goto make_normal_jump;
813 }
814 break;
815 }
816 /* now we know that we can't backtrack. */
817 while (p1 != p2 - 3) {
818 num_instructions++;
819 switch (*p1++) {
820 case Cend:
821 return 0;
822 case Cbol:
823 case Ceol:
824 case Canychar:
825 case Cbegbuf:
826 case Cendbuf:
827 case Cwordbeg:
828 case Cwordend:
829 case Cwordbound:
830 case Cnotwordbound:
831 break;
832 case Cset:
833 p1 += 256 / 8;
834 break;
835 case Cexact:
836 case Cstart_memory:
837 case Cend_memory:
838 case Cmatch_memory:
839 case Csyntaxspec:
840 case Cnotsyntaxspec:
841 p1++;
842 break;
843 case Cjump:
844 case Cstar_jump:
845 case Cfailure_jump:
846 case Cupdate_failure_jump:
847 case Cdummy_failure_jump:
848 goto make_normal_jump;
849 default:
850 return 0;
851 }
852 }
853
854 /* make_update_jump: */
855 code -= 3;
856 a += 3; /* jump to after the Cfailure_jump */
857 code[0] = Cupdate_failure_jump;
858 code[1] = a & 0xff;
859 code[2] = a >> 8;
860 if (num_instructions > 1)
861 return 1;
862 // assert(num_instructions == 1);
863 /* if the only instruction matches a single character, we can do
864 * better */
865 p1 = code + 3 + a; /* start of sole instruction */
866 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
867 *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
868 code[0] = Crepeat1;
869 return 1;
870
871 make_normal_jump:
872 code -= 3;
873 *code = Cjump;
874 return 1;
875 }
876
ReOptimize(regex_t * bufp)877 static int ReOptimize(regex_t * bufp)
878 {
879 unsigned char *code;
880
881 code = bufp->buffer;
882
883 while (1) {
884 switch (*code++) {
885 case Cend:
886 return 1;
887 case Canychar:
888 case Cbol:
889 case Ceol:
890 case Cbegbuf:
891 case Cendbuf:
892 case Cwordbeg:
893 case Cwordend:
894 case Cwordbound:
895 case Cnotwordbound:
896 break;
897 case Cset:
898 code += 256 / 8;
899 break;
900 case Cexact:
901 case Cstart_memory:
902 case Cend_memory:
903 case Cmatch_memory:
904 case Csyntaxspec:
905 case Cnotsyntaxspec:
906 code++;
907 break;
908 case Cstar_jump:
909 if (!ReOptimizeStarJump(bufp, code)) {
910 return 0;
911 }
912 /* fall through */
913 case Cupdate_failure_jump:
914 case Cjump:
915 case Cdummy_failure_jump:
916 case Cfailure_jump:
917 case Crepeat1:
918 code += 2;
919 break;
920 default:
921 return 0;
922 }
923 }
924 }
925
926 #define NEXTCHAR(var) \
927 { \
928 if (pos >= size) \
929 goto ends_prematurely; \
930 (var) = regex[pos]; \
931 pos++; \
932 }
933
934 #define ALLOC(amount) \
935 { \
936 if (pattern_offset+(amount) > alloc) \
937 { \
938 alloc += 256 + (amount); \
939 pattern = (unsigned char *)realloc(pattern, alloc); \
940 if (!pattern) \
941 goto out_of_memory; \
942 } \
943 }
944
945 #define STORE(ch) pattern[pattern_offset++] = (ch)
946
947 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
948
949 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
950
951 #define PUSH_LEVEL_STARTS \
952 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
953 starts_base += NUM_LEVELS; \
954 else \
955 goto too_complex \
956
957 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
958
959 #define PUT_ADDR(offset,addr) \
960 { \
961 int disp = (addr) - (offset) - 2; \
962 pattern[(offset)] = disp & 0xff; \
963 pattern[(offset)+1] = (disp>>8) & 0xff; \
964 }
965
966 #define INSERT_JUMP(pos,type,addr) \
967 { \
968 int a, p = (pos), t = (type), ad = (addr); \
969 for (a = pattern_offset - 1; a >= p; a--) \
970 pattern[a + 3] = pattern[a]; \
971 pattern[p] = t; \
972 PUT_ADDR(p+1,ad); \
973 pattern_offset += 3; \
974 }
975
976 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
977
978 #define SET_FIELDS \
979 { \
980 bufp->allocated = alloc; \
981 bufp->buffer = pattern; \
982 bufp->used = pattern_offset; \
983 }
984
985 #define GETHEX(var) \
986 { \
987 unsigned char gethex_ch, gethex_value; \
988 NEXTCHAR(gethex_ch); \
989 gethex_value = HexCharToDecimal(gethex_ch); \
990 if (gethex_value == 16) \
991 goto hex_error; \
992 NEXTCHAR(gethex_ch); \
993 gethex_ch = HexCharToDecimal(gethex_ch); \
994 if (gethex_ch == 16) \
995 goto hex_error; \
996 (var) = gethex_value * 16 + gethex_ch; \
997 }
998
999 #define ANSI_TRANSLATE(ch) \
1000 { \
1001 switch (ch) \
1002 { \
1003 case 'a': \
1004 case 'A': \
1005 { \
1006 ch = 7; /* audible bell */ \
1007 break; \
1008 } \
1009 case 'b': \
1010 case 'B': \
1011 { \
1012 ch = 8; /* backspace */ \
1013 break; \
1014 } \
1015 case 'f': \
1016 case 'F': \
1017 { \
1018 ch = 12; /* form feed */ \
1019 break; \
1020 } \
1021 case 'n': \
1022 case 'N': \
1023 { \
1024 ch = 10; /* line feed */ \
1025 break; \
1026 } \
1027 case 'r': \
1028 case 'R': \
1029 { \
1030 ch = 13; /* carriage return */ \
1031 break; \
1032 } \
1033 case 't': \
1034 case 'T': \
1035 { \
1036 ch = 9; /* tab */ \
1037 break; \
1038 } \
1039 case 'v': \
1040 case 'V': \
1041 { \
1042 ch = 11; /* vertical tab */ \
1043 break; \
1044 } \
1045 case 'x': /* hex code */ \
1046 case 'X': \
1047 { \
1048 GETHEX(ch); \
1049 break; \
1050 } \
1051 default: \
1052 { \
1053 /* other characters passed through */ \
1054 if (translate) \
1055 ch = translate[(unsigned char)ch]; \
1056 break; \
1057 } \
1058 } \
1059 }
1060
re_compile_pattern(regex_t * bufp,unsigned char * regex)1061 const char *re_compile_pattern(regex_t * bufp, unsigned char *regex)
1062 {
1063 int a;
1064 int pos;
1065 int op;
1066 int current_level;
1067 int level;
1068 int opcode;
1069 int pattern_offset = 0, alloc;
1070 int starts[NUM_LEVELS * MAX_NESTING];
1071 int starts_base;
1072 int future_jumps[MAX_NESTING];
1073 int num_jumps;
1074 unsigned char ch = '\0';
1075 unsigned char *pattern;
1076 unsigned char *translate;
1077 int next_register;
1078 int paren_depth;
1079 int num_open_registers;
1080 int open_registers[RE_NREGS];
1081 int beginning_context;
1082 int size = strlen((char *)regex);
1083
1084 if (!re_compile_initialized)
1085 ReCompileInitialize();
1086 bufp->used = 0;
1087 bufp->fastmap_accurate = 0;
1088 bufp->uses_registers = 1;
1089 bufp->num_registers = 1;
1090 translate = bufp->translate;
1091 pattern = bufp->buffer;
1092 alloc = bufp->allocated;
1093 if (alloc == 0 || pattern == NULL) {
1094 alloc = 256;
1095 bufp->buffer = pattern = (unsigned char *)malloc(alloc);
1096 if (!pattern)
1097 goto out_of_memory;
1098 }
1099 pattern_offset = 0;
1100 starts_base = 0;
1101 num_jumps = 0;
1102 current_level = 0;
1103 SET_LEVEL_START;
1104 num_open_registers = 0;
1105 next_register = 1;
1106 paren_depth = 0;
1107 beginning_context = 1;
1108 op = -1;
1109 /* we use Rend dummy to ensure that pending jumps are updated
1110 (due to low priority of Rend) before exiting the loop. */
1111 pos = 0;
1112 while (op != Rend) {
1113 if (pos >= size)
1114 op = Rend;
1115 else {
1116 NEXTCHAR(ch);
1117 if (translate)
1118 ch = translate[(unsigned char)ch];
1119 op = plain_ops[(unsigned char)ch];
1120 if (op == Rquote) {
1121 NEXTCHAR(ch);
1122 op = quoted_ops[(unsigned char)ch];
1123 if (op == Rnormal && regexp_ansi_sequences)
1124 ANSI_TRANSLATE(ch);
1125 }
1126 }
1127 level = precedences[op];
1128 /* printf("ch='%c' op=%d level=%d current_level=%d
1129 curlevstart=%d\n", ch, op, level, current_level,
1130 CURRENT_LEVEL_START); */
1131 if (level > current_level) {
1132 for (current_level++; current_level < level; current_level++)
1133 SET_LEVEL_START;
1134 SET_LEVEL_START;
1135 } else if (level < current_level) {
1136 current_level = level;
1137 for (; num_jumps > 0 &&
1138 future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START; num_jumps--)
1139 PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1140 }
1141 switch (op) {
1142 case Rend:
1143 break;
1144 case Rnormal:
1145 normal_char:
1146 opcode = Cexact;
1147 store_opcode_and_arg: /* opcode & ch must be set */
1148 SET_LEVEL_START;
1149 ALLOC(2);
1150 STORE(opcode);
1151 STORE(ch);
1152 break;
1153 case Ranychar:
1154 opcode = Canychar;
1155 store_opcode:
1156 SET_LEVEL_START;
1157 ALLOC(1);
1158 STORE(opcode);
1159 break;
1160 case Rquote:
1161 set_error("Rquote");
1162 /* NOTREACHED */
1163 break;
1164 case Rbol:
1165 if (!beginning_context) {
1166 if (regexp_context_indep_ops)
1167 goto op_error;
1168 else
1169 goto normal_char;
1170 }
1171 opcode = Cbol;
1172 goto store_opcode;
1173 case Reol:
1174 if (!((pos >= size) ||
1175 ((regexp_syntax & RE_NO_BK_VBAR) ?
1176 (regex[pos] == '\174') :
1177 (pos + 1 < size && regex[pos] == '\134' &&
1178 regex[pos + 1] == '\174')) ||
1179 ((regexp_syntax & RE_NO_BK_PARENS) ?
1180 (regex[pos] == ')') :
1181 (pos + 1 < size && regex[pos] == '\134' &&
1182 regex[pos + 1] == ')')))) {
1183 if (regexp_context_indep_ops)
1184 goto op_error;
1185 else
1186 goto normal_char;
1187 }
1188 opcode = Ceol;
1189 goto store_opcode;
1190 /* NOTREACHED */
1191 break;
1192 case Roptional:
1193 if (beginning_context) {
1194 if (regexp_context_indep_ops)
1195 goto op_error;
1196 else
1197 goto normal_char;
1198 }
1199 if (CURRENT_LEVEL_START == pattern_offset)
1200 break; /* ignore empty patterns for ? */
1201 ALLOC(3);
1202 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1203 break;
1204 case Rstar:
1205 case Rplus:
1206 if (beginning_context) {
1207 if (regexp_context_indep_ops)
1208 goto op_error;
1209 else
1210 goto normal_char;
1211 }
1212 if (CURRENT_LEVEL_START == pattern_offset)
1213 break; /* ignore empty patterns for + and * */
1214 ALLOC(9);
1215 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1216 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1217 if (op == Rplus) /* jump over initial failure_jump */
1218 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1219 CURRENT_LEVEL_START + 6);
1220 break;
1221 case Ror:
1222 ALLOC(6);
1223 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1224 if (num_jumps >= MAX_NESTING)
1225 goto too_complex;
1226 STORE(Cjump);
1227 future_jumps[num_jumps++] = pattern_offset;
1228 STORE(0);
1229 STORE(0);
1230 SET_LEVEL_START;
1231 break;
1232 case Ropenpar:
1233 {
1234 SET_LEVEL_START;
1235 if (next_register < RE_NREGS) {
1236 bufp->uses_registers = 1;
1237 ALLOC(2);
1238 STORE(Cstart_memory);
1239 STORE(next_register);
1240 open_registers[num_open_registers++] = next_register;
1241 bufp->num_registers++;
1242 next_register++;
1243 }
1244 paren_depth++;
1245 PUSH_LEVEL_STARTS;
1246 current_level = 0;
1247 SET_LEVEL_START;
1248 break;
1249 }
1250 case Rclosepar:
1251 if (paren_depth <= 0)
1252 goto parenthesIsError;
1253 POP_LEVEL_STARTS;
1254 current_level = precedences[Ropenpar];
1255 paren_depth--;
1256 if (paren_depth < num_open_registers) {
1257 bufp->uses_registers = 1;
1258 ALLOC(2);
1259 STORE(Cend_memory);
1260 num_open_registers--;
1261 STORE(open_registers[num_open_registers]);
1262 }
1263 break;
1264 case Rmemory:
1265 if (ch == '0')
1266 goto bad_match_register;
1267 if (!(ch >= '0' && ch <= '9')) {
1268 goto bad_match_register;
1269 }
1270 bufp->uses_registers = 1;
1271 opcode = Cmatch_memory;
1272 ch -= '0';
1273 goto store_opcode_and_arg;
1274 case Rextended_memory:
1275 NEXTCHAR(ch);
1276 if (ch < '0' || ch > '9')
1277 goto bad_match_register;
1278 NEXTCHAR(a);
1279 if (a < '0' || a > '9')
1280 goto bad_match_register;
1281 ch = 10 * (a - '0') + ch - '0';
1282 if (ch == 0 || ch >= RE_NREGS)
1283 goto bad_match_register;
1284 bufp->uses_registers = 1;
1285 opcode = Cmatch_memory;
1286 goto store_opcode_and_arg;
1287 case Ropenset:
1288 {
1289 int complement;
1290 int prev;
1291 int offset;
1292 int range;
1293 int firstchar;
1294
1295 SET_LEVEL_START;
1296 ALLOC(1 + 256 / 8);
1297 STORE(Cset);
1298 offset = pattern_offset;
1299 for (a = 0; a < 256 / 8; a++)
1300 STORE(0);
1301 NEXTCHAR(ch);
1302 if (translate)
1303 ch = translate[(unsigned char)ch];
1304 if (ch == '\136') {
1305 complement = 1;
1306 NEXTCHAR(ch);
1307 if (translate)
1308 ch = translate[(unsigned char)ch];
1309 } else
1310 complement = 0;
1311 prev = -1;
1312 range = 0;
1313 firstchar = 1;
1314 while (ch != '\135' || firstchar) {
1315 firstchar = 0;
1316 if (regexp_ansi_sequences && ch == '\134') {
1317 NEXTCHAR(ch);
1318 ANSI_TRANSLATE(ch);
1319 }
1320 if (range) {
1321 for (a = prev; a <= (int)ch; a++)
1322 SETBIT(pattern, offset, a);
1323 prev = -1;
1324 range = 0;
1325 } else if (prev != -1 && ch == '-')
1326 range = 1;
1327 else {
1328 SETBIT(pattern, offset, ch);
1329 prev = ch;
1330 }
1331 NEXTCHAR(ch);
1332 if (translate)
1333 ch = translate[(unsigned char)ch];
1334 }
1335 if (range)
1336 SETBIT(pattern, offset, '-');
1337 if (complement) {
1338 for (a = 0; a < 256 / 8; a++)
1339 pattern[offset + a] ^= 0xff;
1340 }
1341 break;
1342 }
1343 case Rbegbuf:
1344 {
1345 opcode = Cbegbuf;
1346 goto store_opcode;
1347 }
1348 case Rendbuf:
1349 {
1350 opcode = Cendbuf;
1351 goto store_opcode;
1352 }
1353 case Rwordchar:
1354 {
1355 opcode = Csyntaxspec;
1356 ch = Sword;
1357 goto store_opcode_and_arg;
1358 }
1359 case Rnotwordchar:
1360 {
1361 opcode = Cnotsyntaxspec;
1362 ch = Sword;
1363 goto store_opcode_and_arg;
1364 }
1365 case Rwordbeg:
1366 {
1367 opcode = Cwordbeg;
1368 goto store_opcode;
1369 }
1370 case Rwordend:
1371 {
1372 opcode = Cwordend;
1373 goto store_opcode;
1374 }
1375 case Rwordbound:
1376 {
1377 opcode = Cwordbound;
1378 goto store_opcode;
1379 }
1380 case Rnotwordbound:
1381 {
1382 opcode = Cnotwordbound;
1383 goto store_opcode;
1384 }
1385 default:
1386 {
1387 abort();
1388 }
1389 }
1390 beginning_context = (op == Ropenpar || op == Ror);
1391 }
1392 if (starts_base != 0)
1393 goto parenthesIsError;
1394 // assert(num_jumps == 0);
1395 ALLOC(1);
1396 STORE(Cend);
1397 SET_FIELDS;
1398 if (!ReOptimize(bufp))
1399 return "Optimization error";
1400 return NULL;
1401
1402 op_error:
1403 SET_FIELDS;
1404 return "Badly placed special character";
1405
1406 bad_match_register:
1407 SET_FIELDS;
1408 return "Bad match register number";
1409
1410 hex_error:
1411 SET_FIELDS;
1412 return "Bad hexadecimal number";
1413
1414 parenthesIsError:
1415 SET_FIELDS;
1416 return "Badly placed parenthesis";
1417
1418 out_of_memory:
1419 SET_FIELDS;
1420 return "Out of memory";
1421
1422 ends_prematurely:
1423 SET_FIELDS;
1424 return "Regular expression ends prematurely";
1425
1426 too_complex:
1427 SET_FIELDS;
1428 return "Regular expression too complex";
1429 }
1430
1431 #undef CHARAT
1432 #undef NEXTCHAR
1433 #undef GETHEX
1434 #undef ALLOC
1435 #undef STORE
1436 #undef CURRENT_LEVEL_START
1437 #undef SET_LEVEL_START
1438 #undef PUSH_LEVEL_STARTS
1439 #undef POP_LEVEL_STARTS
1440 #undef PUT_ADDR
1441 #undef INSERT_JUMP
1442 #undef SETBIT
1443 #undef SET_FIELDS
1444
1445 #define PREFETCH if (text == textend) goto fail
1446
1447 #define NEXTCHAR(var) \
1448 PREFETCH; \
1449 var = (unsigned char)*text++; \
1450 if (translate) \
1451 var = translate[var]
1452
regcomp(regex_t * bufp,const char * regex,int cflags)1453 int regcomp(regex_t * bufp, const char *regex, int cflags)
1454 {
1455 memset(bufp, 0, sizeof(regex_t));
1456 bufp->cflags = cflags;
1457 if (bufp->cflags & REG_ICASE) {
1458 char *p, *lcase = bstrdup(regex);
1459 for( p = lcase; *p ; p++) {
1460 *p = tolower(*p);
1461 }
1462 re_compile_pattern(bufp, (unsigned char *)lcase);
1463 bfree(lcase);
1464 } else {
1465 re_compile_pattern(bufp, (unsigned char *)regex);
1466 }
1467 if (got_error) {
1468 return -1;
1469 }
1470 return 0;
1471 }
1472
re_registers_to_regmatch(regexp_registers_t old_regs,regmatch_t pmatch[],size_t nmatch)1473 void re_registers_to_regmatch(regexp_registers_t old_regs,
1474 regmatch_t pmatch[],
1475 size_t nmatch)
1476 {
1477 if (!(nmatch == 0 && pmatch == NULL)) {
1478 size_t i = 0;
1479
1480 /*
1481 * We have to set the last entry to -1
1482 */
1483 nmatch = nmatch - 1;
1484 for (i = 0; (i < nmatch) && (old_regs->start[i] > -1) ; i++) {
1485 pmatch[i].rm_so = old_regs->start[i];
1486 pmatch[i].rm_eo = old_regs->end[i];
1487 }
1488
1489 pmatch[i].rm_eo = pmatch[i].rm_so = -1;
1490 }
1491 }
1492
regexec(regex_t * preg,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)1493 int regexec(regex_t *preg, const char *string, size_t nmatch,
1494 regmatch_t pmatch[], int eflags)
1495 {
1496 int status;
1497 int len = strlen(string);
1498 struct re_registers regs;
1499
1500 status = ReSearch(preg, (unsigned char *)string, len, 0, len, ®s);
1501 if (status >= 0) {
1502 re_registers_to_regmatch(®s, pmatch, nmatch);
1503 }
1504
1505 /*
1506 * status is the start position in the string base 0 where
1507 * the pattern was found or negative if not found.
1508 */
1509 return status < 0 ? -1 : 0;
1510 }
1511
regerror(int errcode,regex_t * preg,char * errbuf,size_t errbuf_size)1512 size_t regerror(int errcode, regex_t * preg, char *errbuf, size_t errbuf_size)
1513 {
1514 bstrncpy(errbuf, preg->errmsg, errbuf_size);
1515 return 0;
1516 }
1517
regfree(regex_t * preg)1518 void regfree(regex_t * preg)
1519 {
1520 if (preg->lcase) {
1521 FreePoolMemory(preg->lcase);
1522 preg->lcase = NULL;
1523 }
1524 if (preg->buffer) {
1525 free(preg->buffer);
1526 preg->buffer = NULL;
1527 }
1528 }
1529
ReMatch(regex_t * bufp,unsigned char * string,int size,int pos,regexp_registers_t old_regs)1530 int ReMatch(regex_t * bufp, unsigned char *string, int size, int pos,
1531 regexp_registers_t old_regs)
1532 {
1533 unsigned char *code;
1534 unsigned char *translate;
1535 unsigned char *text;
1536 unsigned char *textstart;
1537 unsigned char *textend;
1538 int a;
1539 int b;
1540 int ch;
1541 int reg;
1542 int match_end;
1543 unsigned char *regstart;
1544 unsigned char *regend;
1545 int regsize;
1546 match_state state;
1547
1548 // assert(pos >= 0 && size >= 0);
1549 // assert(pos <= size);
1550
1551 text = string + pos;
1552 textstart = string;
1553 textend = string + size;
1554
1555 code = bufp->buffer;
1556
1557 translate = bufp->translate;
1558
1559 NEW_STATE(state, bufp->num_registers);
1560
1561 continue_matching:
1562 switch (*code++) {
1563 case Cend:
1564 {
1565 match_end = text - textstart;
1566 if (old_regs) {
1567 old_regs->start[0] = pos;
1568 old_regs->end[0] = match_end;
1569 if (!bufp->uses_registers) {
1570 for (a = 1; a < RE_NREGS; a++) {
1571 old_regs->start[a] = -1;
1572 old_regs->end[a] = -1;
1573 }
1574 } else {
1575 for (a = 1; a < bufp->num_registers; a++) {
1576 if ((GET_REG_START(state, a) == NULL) ||
1577 (GET_REG_END(state, a) == NULL)) {
1578 old_regs->start[a] = -1;
1579 old_regs->end[a] = -1;
1580 continue;
1581 }
1582 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1583 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1584 }
1585 for (; a < RE_NREGS; a++) {
1586 old_regs->start[a] = -1;
1587 old_regs->end[a] = -1;
1588 }
1589 }
1590 }
1591 FREE_STATE(state);
1592 return match_end - pos;
1593 }
1594 case Cbol:
1595 {
1596 if (text == textstart || text[-1] == '\n')
1597 goto continue_matching;
1598 goto fail;
1599 }
1600 case Ceol:
1601 {
1602 if (text == textend || *text == '\n')
1603 goto continue_matching;
1604 goto fail;
1605 }
1606 case Cset:
1607 {
1608 NEXTCHAR(ch);
1609 if (code[ch / 8] & (1 << (ch & 7))) {
1610 code += 256 / 8;
1611 goto continue_matching;
1612 }
1613 goto fail;
1614 }
1615 case Cexact:
1616 {
1617 NEXTCHAR(ch);
1618 if (ch != (unsigned char)*code++)
1619 goto fail;
1620 goto continue_matching;
1621 }
1622 case Canychar:
1623 {
1624 NEXTCHAR(ch);
1625 if (ch == '\n')
1626 goto fail;
1627 goto continue_matching;
1628 }
1629 case Cstart_memory:
1630 {
1631 reg = *code++;
1632 SET_REG_START(state, reg, text, goto error);
1633 goto continue_matching;
1634 }
1635 case Cend_memory:
1636 {
1637 reg = *code++;
1638 SET_REG_END(state, reg, text, goto error);
1639 goto continue_matching;
1640 }
1641 case Cmatch_memory:
1642 {
1643 reg = *code++;
1644 regstart = GET_REG_START(state, reg);
1645 regend = GET_REG_END(state, reg);
1646 if ((regstart == NULL) || (regend == NULL))
1647 goto fail; /* or should we just match nothing? */
1648 regsize = regend - regstart;
1649
1650 if (regsize > (textend - text))
1651 goto fail;
1652 if (translate) {
1653 for (; regstart < regend; regstart++, text++)
1654 if (translate[*regstart] != translate[*text])
1655 goto fail;
1656 } else
1657 for (; regstart < regend; regstart++, text++)
1658 if (*regstart != *text)
1659 goto fail;
1660 goto continue_matching;
1661 }
1662 case Cupdate_failure_jump:
1663 {
1664 UPDATE_FAILURE(state, text, goto error);
1665 /* fall to next case */
1666 }
1667 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1668 case Cstar_jump:
1669 case Cjump:
1670 {
1671 a = (unsigned char)*code++;
1672 a |= (unsigned char)*code++ << 8;
1673 code += (int)SHORT(a);
1674 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1675 set_error("Regex VM jump out of bounds (Cjump)");
1676 FREE_STATE(state);
1677 return -2;
1678 }
1679 goto continue_matching;
1680 }
1681 case Cdummy_failure_jump:
1682 {
1683 unsigned char *failuredest;
1684
1685 a = (unsigned char)*code++;
1686 a |= (unsigned char)*code++ << 8;
1687 a = (int)SHORT(a);
1688 // assert(*code == Cfailure_jump);
1689 b = (unsigned char)code[1];
1690 b |= (unsigned char)code[2] << 8;
1691 failuredest = code + (int)SHORT(b) + 3;
1692 if (failuredest < bufp->buffer || bufp->buffer + bufp->used < failuredest) {
1693 set_error
1694 ("Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1695 FREE_STATE(state);
1696 return -2;
1697 }
1698 PUSH_FAILURE(state, failuredest, NULL, goto error);
1699 code += a;
1700 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1701 set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1702 FREE_STATE(state);
1703 return -2;
1704 }
1705 goto continue_matching;
1706 }
1707 case Cfailure_jump:
1708 {
1709 a = (unsigned char)*code++;
1710 a |= (unsigned char)*code++ << 8;
1711 a = (int)SHORT(a);
1712 if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1713 set_error("Regex VM jump out of bounds (Cfailure_jump)");
1714 FREE_STATE(state);
1715 return -2;
1716 }
1717 PUSH_FAILURE(state, code + a, text, goto error);
1718 goto continue_matching;
1719 }
1720 case Crepeat1:
1721 {
1722 unsigned char *pinst;
1723 a = (unsigned char)*code++;
1724 a |= (unsigned char)*code++ << 8;
1725 a = (int)SHORT(a);
1726 pinst = code + a;
1727 if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1728 set_error("Regex VM jump out of bounds (Crepeat1)");
1729 FREE_STATE(state);
1730 return -2;
1731 }
1732 /* pinst is sole instruction in loop, and it matches a
1733 * single character. Since Crepeat1 was originally a
1734 * Cupdate_failure_jump, we also know that backtracking
1735 * is useless: so long as the single-character
1736 * expression matches, it must be used. Also, in the
1737 * case of +, we've already matched one character, so +
1738 * can't fail: nothing here can cause a failure. */
1739 switch (*pinst++) {
1740 case Cset:
1741 {
1742 if (translate) {
1743 while (text < textend) {
1744 ch = translate[(unsigned char)*text];
1745 if (pinst[ch / 8] & (1 << (ch & 7)))
1746 text++;
1747 else
1748 break;
1749 }
1750 } else {
1751 while (text < textend) {
1752 ch = (unsigned char)*text;
1753 if (pinst[ch / 8] & (1 << (ch & 7)))
1754 text++;
1755 else
1756 break;
1757 }
1758 }
1759 break;
1760 }
1761 case Cexact:
1762 {
1763 ch = (unsigned char)*pinst;
1764 if (translate) {
1765 while (text < textend && translate[(unsigned char)*text] == ch)
1766 text++;
1767 } else {
1768 while (text < textend && (unsigned char)*text == ch)
1769 text++;
1770 }
1771 break;
1772 }
1773 case Canychar:
1774 {
1775 while (text < textend && (unsigned char)*text != '\n')
1776 text++;
1777 break;
1778 }
1779 case Csyntaxspec:
1780 {
1781 a = (unsigned char)*pinst;
1782 if (translate) {
1783 while (text < textend && (SYNTAX(translate[*text]) & a))
1784 text++;
1785 } else {
1786 while (text < textend && (SYNTAX(*text) & a))
1787 text++;
1788 }
1789 break;
1790 }
1791 case Cnotsyntaxspec:
1792 {
1793 a = (unsigned char)*pinst;
1794 if (translate) {
1795 while (text < textend && !(SYNTAX(translate[*text]) & a))
1796 text++;
1797 } else {
1798 while (text < textend && !(SYNTAX(*text) & a))
1799 text++;
1800 }
1801 break;
1802 }
1803 default:
1804 {
1805 FREE_STATE(state);
1806 set_error("Unknown regex opcode: memory corrupted?");
1807 return -2;
1808 /*NOTREACHED*/}
1809 }
1810 /* due to the funky way + and * are compiled, the top
1811 * failure- stack entry at this point is actually a
1812 * success entry -- update it & pop it */
1813 UPDATE_FAILURE(state, text, goto error);
1814 goto fail; /* i.e., succeed <wink/sigh> */
1815 }
1816 case Cbegbuf:
1817 {
1818 if (text == textstart)
1819 goto continue_matching;
1820 goto fail;
1821 }
1822 case Cendbuf:
1823 {
1824 if (text == textend)
1825 goto continue_matching;
1826 goto fail;
1827 }
1828 case Cwordbeg:
1829 {
1830 if (text == textend)
1831 goto fail;
1832 if (!(SYNTAX(*text) & Sword))
1833 goto fail;
1834 if (text == textstart)
1835 goto continue_matching;
1836 if (!(SYNTAX(text[-1]) & Sword))
1837 goto continue_matching;
1838 goto fail;
1839 }
1840 case Cwordend:
1841 {
1842 if (text == textstart)
1843 goto fail;
1844 if (!(SYNTAX(text[-1]) & Sword))
1845 goto fail;
1846 if (text == textend)
1847 goto continue_matching;
1848 if (!(SYNTAX(*text) & Sword))
1849 goto continue_matching;
1850 goto fail;
1851 }
1852 case Cwordbound:
1853 {
1854 /* Note: as in gnu regexp, this also matches at the
1855 * beginning and end of buffer. */
1856
1857 if (text == textstart || text == textend)
1858 goto continue_matching;
1859 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1860 goto continue_matching;
1861 goto fail;
1862 }
1863 case Cnotwordbound:
1864 {
1865 /* Note: as in gnu regexp, this never matches at the
1866 * beginning and end of buffer. */
1867 if (text == textstart || text == textend)
1868 goto fail;
1869 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1870 goto continue_matching;
1871 goto fail;
1872 }
1873 case Csyntaxspec:
1874 {
1875 NEXTCHAR(ch);
1876 if (!(SYNTAX(ch) & (unsigned char)*code++))
1877 goto fail;
1878 goto continue_matching;
1879 }
1880 case Cnotsyntaxspec:
1881 {
1882 NEXTCHAR(ch);
1883 if (SYNTAX(ch) & (unsigned char)*code++)
1884 goto fail;
1885 goto continue_matching;
1886 }
1887 default:
1888 {
1889 FREE_STATE(state);
1890 set_error("Unknown regex opcode: memory corrupted?");
1891 return -2;
1892 /*NOTREACHED*/}
1893 }
1894
1895
1896
1897 #if 0 /* This line is never reached --Guido */
1898 abort();
1899 #endif
1900 /*
1901 *NOTREACHED
1902 */
1903
1904 /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1905 fail:
1906 POP_FAILURE(state, code, text, goto done_matching, goto error);
1907 goto continue_matching;
1908
1909 done_matching:
1910 /* if(translated != NULL) */
1911 /* free(translated); */
1912 FREE_STATE(state);
1913 return -1;
1914
1915 error:
1916 /* if (translated != NULL) */
1917 /* free(translated); */
1918 FREE_STATE(state);
1919 return -2;
1920 }
1921
1922
1923 #undef PREFETCH
1924 #undef NEXTCHAR
1925
ReSearch(regex_t * bufp,unsigned char * str,int size,int pos,int range,regexp_registers_t regs)1926 int ReSearch(regex_t * bufp, unsigned char *str, int size, int pos,
1927 int range, regexp_registers_t regs)
1928 {
1929 unsigned char *fastmap;
1930 unsigned char *translate;
1931 unsigned char *text;
1932 unsigned char *partstart;
1933 unsigned char *partend;
1934 int dir;
1935 int ret;
1936 unsigned char anchor;
1937 unsigned char *string = str;
1938
1939 if (bufp->cflags & REG_ICASE) { /* we must use string in lowercase */
1940 int len = strlen((const char *)str);
1941 if (!bufp->lcase) {
1942 bufp->lcase = GetPoolMemory(PM_FNAME);
1943 }
1944 bufp->lcase = CheckPoolMemorySize(bufp->lcase, len+1);
1945 unsigned char *dst = (unsigned char *)bufp->lcase;
1946 while (*string) {
1947 *dst++ = tolower(*string++);
1948 }
1949 *dst = '\0';
1950 string = (unsigned char *)bufp->lcase;
1951 }
1952
1953 // assert(size >= 0 && pos >= 0);
1954 // assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
1955
1956 fastmap = bufp->fastmap;
1957 translate = bufp->translate;
1958 if (fastmap && !bufp->fastmap_accurate) {
1959 ReCompileFastmap(bufp);
1960 if (got_error)
1961 return -2;
1962 }
1963
1964 anchor = bufp->anchor;
1965 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1966 fastmap = NULL;
1967
1968 if (range < 0) {
1969 dir = -1;
1970 range = -range;
1971 } else
1972 dir = 1;
1973
1974 if (anchor == 2) {
1975 if (pos != 0)
1976 return -1;
1977 else
1978 range = 0;
1979 }
1980
1981 for (; range >= 0; range--, pos += dir) {
1982 if (fastmap) {
1983 if (dir == 1) { /* searching forwards */
1984
1985 text = string + pos;
1986 partend = string + size;
1987 partstart = text;
1988 if (translate)
1989 while (text != partend &&
1990 !fastmap[(unsigned char)translate[(unsigned char)*text]])
1991 text++;
1992 else
1993 while (text != partend && !fastmap[(unsigned char)*text])
1994 text++;
1995 pos += text - partstart;
1996 range -= text - partstart;
1997 if (pos == size && bufp->can_be_null == 0)
1998 return -1;
1999 } else { /* searching backwards */
2000 text = string + pos;
2001 partstart = string + pos - range;
2002 partend = text;
2003 if (translate)
2004 while (text != partstart && !fastmap[(unsigned char)
2005 translate[(unsigned char)*text]])
2006 text--;
2007 else
2008 while (text != partstart && !fastmap[(unsigned char)*text])
2009 text--;
2010 pos -= partend - text;
2011 range -= partend - text;
2012 }
2013 }
2014 if (anchor == 1) { /* anchored to begline */
2015 if (pos > 0 && (string[pos - 1] != '\n'))
2016 continue;
2017 }
2018 // assert(pos >= 0 && pos <= size);
2019 ret = ReMatch(bufp, string, size, pos, regs);
2020 if (ret >= 0)
2021 return pos;
2022 if (ret == -2)
2023 return -2;
2024 }
2025 return -1;
2026 }
2027
2028 /*
2029 ** Local Variables:
2030 ** mode: c
2031 ** c-file-style: "python"
2032 ** End:
2033 */
2034