1 /*
2 BAREOS® - Backup Archiving REcovery Open Sourced
3
4 Copyright (C) 2006-2010 Free Software Foundation Europe e.V.
5
6 This program is Free Software; you can redistribute it and/or
7 modify it under the terms of version three of the GNU Affero General Public
8 License as published by the Free Software Foundation and included
9 in the file LICENSE.
10
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Affero General Public License for more details.
15
16 You should have received a copy of the GNU Affero General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301, USA.
20 */
21 /* regexpr.c
22 *
23 * Author: Tatu Ylonen <ylo@ngs.fi>
24 *
25 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
26 *
27 * Permission to use, copy, modify, distribute, and sell this software
28 * and its documentation for any purpose is hereby granted without
29 * fee, provided that the above copyright notice appear in all copies.
30 * This software is provided "as is" without express or implied
31 * warranty.
32 *
33 * Created: Thu Sep 26 17:14:05 1991 ylo
34 * Last modified: Mon Nov 4 17:06:48 1991 ylo
35 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
36 *
37 * This code draws many ideas from the regular expression packages by
38 * Henry Spencer of the University of Toronto and Richard Stallman of
39 * the Free Software Foundation.
40 *
41 * Emacs-specific code and syntax table code is almost directly borrowed
42 * from GNU regexp.
43 *
44 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
45 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
46 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
47 * probably one or two others that I'm forgetting.
48 *
49 * This file modified to work with BAREOS and C++ by
50 * Kern Sibbald, April 2006
51 *
52 * This file modified to work with REG_ICASE and BAREOS by
53 * Eric Bollengier April 2007
54 */
55
56 #include "include/bareos.h"
57 #include "bregex.h"
58
59 #define set_error(x) bufp->errmsg = ((char*)(x))
60 #define got_error bufp->errmsg != NULL
61
62 /* The original code blithely assumed that sizeof(short) == 2. Not
63 * always true. Original instances of "(short)x" were replaced by
64 * SHORT(x), where SHORT is #defined below. */
65
66 #define SHORT(x) ((x)&0x8000 ? (x)-0x10000 : (x))
67
68 /* The stack implementation is taken from an idea by Andrew Kuchling.
69 * It's a doubly linked list of arrays. The advantages of this over a
70 * simple linked list are that the number of mallocs required are
71 * reduced. It also makes it possible to statically allocate enough
72 * space so that small patterns don't ever need to call malloc.
73 *
74 * The advantages over a single array is that is periodically
75 * realloced when more space is needed is that we avoid ever copying
76 * the stack. */
77
78 /* item_t is the basic stack element. Defined as a union of
79 * structures so that both registers, failure points, and counters can
80 * be pushed/popped from the stack. There's nothing built into the
81 * item to keep track of whether a certain stack item is a register, a
82 * failure point, or a counter. */
83
84 typedef union item_t {
85 struct {
86 int num;
87 int level;
88 unsigned char* start;
89 unsigned char* end;
90 } reg;
91 struct {
92 int count;
93 int level;
94 int phantom;
95 unsigned char* code;
96 unsigned char* text;
97 } fail;
98 struct {
99 int num;
100 int level;
101 int count;
102 } cntr;
103 } item_t;
104
105 #define B_STACK_PAGE_SIZE 256
106 #define NUM_REGISTERS 256
107
108 /* A 'page' of stack items. */
109
110 typedef struct item_page_t {
111 item_t items[B_STACK_PAGE_SIZE];
112 struct item_page_t* prev;
113 struct item_page_t* next;
114 } item_page_t;
115
116
117 typedef struct match_state {
118 /* The number of registers that have been pushed onto the stack
119 * since the last failure point. */
120
121 int count;
122
123 /* Used to control when registers need to be pushed onto the
124 * stack. */
125
126 int level;
127
128 /* The number of failure points on the stack. */
129
130 int point;
131
132 /* Storage for the registers. Each register consists of two
133 * pointers to characters. So register N is represented as
134 * start[N] and end[N]. The pointers must be converted to
135 * offsets from the beginning of the string before returning the
136 * registers to the calling program. */
137
138 unsigned char* start[NUM_REGISTERS];
139 unsigned char* end[NUM_REGISTERS];
140
141 /* Keeps track of whether a register has changed recently. */
142
143 int changed[NUM_REGISTERS];
144
145 /* Structure to encapsulate the stack. */
146 struct {
147 /* index into the current page. If index == 0 and you need
148 * to pop an item, move to the previous page and set index
149 * = B_STACK_PAGE_SIZE - 1. Otherwise decrement index to
150 * push a page. If index == B_STACK_PAGE_SIZE and you need
151 * to push a page move to the next page and set index =
152 * 0. If there is no new next page, allocate a new page
153 * and link it in. Otherwise, increment index to push a
154 * page. */
155
156 int index;
157 item_page_t* current; /* Pointer to the current page. */
158 item_page_t first; /* First page is statically allocated. */
159 } stack;
160 } match_state;
161
162 /* Initialize a state object */
163
164 /* #define NEW_STATE(state) \ */
165 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
166 /* state.stack.current = &state.stack.first; \ */
167 /* state.stack.first.prev = NULL; \ */
168 /* state.stack.first.next = NULL; \ */
169 /* state.stack.index = 0; \ */
170 /* state.level = 1 */
171
172 #define NEW_STATE(state, nregs) \
173 { \
174 int i; \
175 for (i = 0; i < nregs; i++) { \
176 state.start[i] = NULL; \
177 state.end[i] = NULL; \
178 state.changed[i] = 0; \
179 } \
180 state.stack.current = &state.stack.first; \
181 state.stack.first.prev = NULL; \
182 state.stack.first.next = NULL; \
183 state.stack.index = 0; \
184 state.level = 1; \
185 state.count = 0; \
186 state.level = 0; \
187 state.point = 0; \
188 }
189
190 /* Free any memory that might have been malloc'd */
191
192 #define FREE_STATE(state) \
193 while (state.stack.first.next != NULL) { \
194 state.stack.current = state.stack.first.next; \
195 state.stack.first.next = state.stack.current->next; \
196 free(state.stack.current); \
197 }
198
199 /* Discard the top 'count' stack items. */
200
201 #define STACK_DISCARD(stack, count, on_error) \
202 stack.index -= count; \
203 while (stack.index < 0) { \
204 if (stack.current->prev == NULL) on_error; \
205 stack.current = stack.current->prev; \
206 stack.index += B_STACK_PAGE_SIZE; \
207 }
208
209 /* Store a pointer to the previous item on the stack. Used to pop an
210 * item off of the stack. */
211
212 #define STACK_PREV(stack, top, on_error) \
213 if (stack.index == 0) { \
214 if (stack.current->prev == NULL) on_error; \
215 stack.current = stack.current->prev; \
216 stack.index = B_STACK_PAGE_SIZE - 1; \
217 } else { \
218 stack.index--; \
219 } \
220 top = &(stack.current->items[stack.index])
221
222 /* Store a pointer to the next item on the stack. Used to push an item
223 * on to the stack. */
224
225 #define STACK_NEXT(stack, top, on_error) \
226 if (stack.index == B_STACK_PAGE_SIZE) { \
227 if (stack.current->next == NULL) { \
228 stack.current->next = (item_page_t*)malloc(sizeof(item_page_t)); \
229 if (stack.current->next == NULL) on_error; \
230 stack.current->next->prev = stack.current; \
231 stack.current->next->next = NULL; \
232 } \
233 stack.current = stack.current->next; \
234 stack.index = 0; \
235 } \
236 top = &(stack.current->items[stack.index++])
237
238 /* Store a pointer to the item that is 'count' items back in the
239 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
240 * STACK_TOP(stack, top, on_error). */
241
242 #define STACK_BACK(stack, top, count, on_error) \
243 { \
244 int index; \
245 item_page_t* current; \
246 current = stack.current; \
247 index = stack.index - (count); \
248 while (index < 0) { \
249 if (current->prev == NULL) on_error; \
250 current = current->prev; \
251 index += B_STACK_PAGE_SIZE; \
252 } \
253 top = &(current->items[index]); \
254 }
255
256 /* Store a pointer to the top item on the stack. Execute the
257 * 'on_error' code if there are no items on the stack. */
258
259 #define STACK_TOP(stack, top, on_error) \
260 if (stack.index == 0) { \
261 if (stack.current->prev == NULL) on_error; \
262 top = &(stack.current->prev->items[B_STACK_PAGE_SIZE - 1]); \
263 } else { \
264 top = &(stack.current->items[stack.index - 1]); \
265 }
266
267 /* Test to see if the stack is empty */
268
269 #define STACK_EMPTY(stack) ((stack.index == 0) && (stack.current->prev == NULL))
270
271 /* Return the start of register 'reg' */
272
273 #define GET_REG_START(state, reg) (state.start[reg])
274
275 /* Return the end of register 'reg' */
276
277 #define GET_REG_END(state, reg) (state.end[reg])
278
279 /* Set the start of register 'reg'. If the state of the register needs
280 * saving, push it on the stack. */
281
282 #define SET_REG_START(state, reg, text, on_error) \
283 if (state.changed[reg] < state.level) { \
284 item_t* item; \
285 STACK_NEXT(state.stack, item, on_error); \
286 item->reg.num = reg; \
287 item->reg.start = state.start[reg]; \
288 item->reg.end = state.end[reg]; \
289 item->reg.level = state.changed[reg]; \
290 state.changed[reg] = state.level; \
291 state.count++; \
292 } \
293 state.start[reg] = text
294
295 /* Set the end of register 'reg'. If the state of the register needs
296 * saving, push it on the stack. */
297
298 #define SET_REG_END(state, reg, text, on_error) \
299 if (state.changed[reg] < state.level) { \
300 item_t* item; \
301 STACK_NEXT(state.stack, item, on_error); \
302 item->reg.num = reg; \
303 item->reg.start = state.start[reg]; \
304 item->reg.end = state.end[reg]; \
305 item->reg.level = state.changed[reg]; \
306 state.changed[reg] = state.level; \
307 state.count++; \
308 } \
309 state.end[reg] = text
310
311 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
312 { \
313 item_t* item; \
314 STACK_NEXT(state.stack, item, on_error); \
315 item->fail.code = xcode; \
316 item->fail.text = xtext; \
317 item->fail.count = state.count; \
318 item->fail.level = state.level; \
319 item->fail.phantom = 0; \
320 state.count = 0; \
321 state.level++; \
322 state.point++; \
323 }
324
325 /* Update the last failure point with a new position in the text. */
326
327 #define UPDATE_FAILURE(state, xtext, on_error) \
328 { \
329 item_t* item; \
330 STACK_BACK(state.stack, item, state.count + 1, on_error); \
331 if (!item->fail.phantom) { \
332 item_t* item2; \
333 STACK_NEXT(state.stack, item2, on_error); \
334 item2->fail.code = item->fail.code; \
335 item2->fail.text = xtext; \
336 item2->fail.count = state.count; \
337 item2->fail.level = state.level; \
338 item2->fail.phantom = 1; \
339 state.count = 0; \
340 state.level++; \
341 state.point++; \
342 } else { \
343 STACK_DISCARD(state.stack, state.count, on_error); \
344 STACK_TOP(state.stack, item, on_error); \
345 item->fail.text = xtext; \
346 state.count = 0; \
347 state.level++; \
348 } \
349 }
350
351 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
352 { \
353 item_t* item; \
354 do { \
355 while (state.count > 0) { \
356 STACK_PREV(state.stack, item, on_error); \
357 state.start[item->reg.num] = item->reg.start; \
358 state.end[item->reg.num] = item->reg.end; \
359 state.changed[item->reg.num] = item->reg.level; \
360 state.count--; \
361 } \
362 STACK_PREV(state.stack, item, on_empty); \
363 xcode = item->fail.code; \
364 xtext = item->fail.text; \
365 state.count = item->fail.count; \
366 state.level = item->fail.level; \
367 state.point--; \
368 } while (item->fail.text == NULL); \
369 }
370
371 enum regexp_compiled_ops
372 { /* opcodes for compiled regexp */
373 Cend, /* end of pattern reached */
374 Cbol, /* beginning of line */
375 Ceol, /* end of line */
376 Cset, /* character set. Followed by 32 bytes of set. */
377 Cexact, /* followed by a byte to match */
378 Canychar, /* matches any character except newline */
379 Cstart_memory, /* set register start addr (followed by reg number) */
380 Cend_memory, /* set register end addr (followed by reg number) */
381 Cmatch_memory, /* match a duplicate of reg contents (regnum follows) */
382 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
383 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
384 Cfailure_jump, /* jump to addr on failure */
385 Cupdate_failure_jump, /* update topmost failure point and jump */
386 Cdummy_failure_jump, /* push a dummy failure point and jump */
387 Cbegbuf, /* match at beginning of buffer */
388 Cendbuf, /* match at end of buffer */
389 Cwordbeg, /* match at beginning of word */
390 Cwordend, /* match at end of word */
391 Cwordbound, /* match if at word boundary */
392 Cnotwordbound, /* match if not at word boundary */
393 Csyntaxspec, /* matches syntax code (1 byte follows) */
394 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
395 Crepeat1
396 };
397
398 enum regexp_syntax_op
399 { /* syntax codes for plain and quoted characters */
400 Rend, /* special code for end of regexp */
401 Rnormal, /* normal character */
402 Ranychar, /* any character except newline */
403 Rquote, /* the quote character */
404 Rbol, /* match beginning of line */
405 Reol, /* match end of line */
406 Roptional, /* match preceding expression optionally */
407 Rstar, /* match preceding expr zero or more times */
408 Rplus, /* match preceding expr one or more times */
409 Ror, /* match either of alternatives */
410 Ropenpar, /* opening parenthesis */
411 Rclosepar, /* closing parenthesis */
412 Rmemory, /* match memory register */
413 Rextended_memory, /* \vnn to match registers 10-99 */
414 Ropenset, /* open set. Internal syntax hard-coded below. */
415 /* the following are gnu extensions to "normal" regexp syntax */
416 Rbegbuf, /* beginning of buffer */
417 Rendbuf, /* end of buffer */
418 Rwordchar, /* word character */
419 Rnotwordchar, /* not word character */
420 Rwordbeg, /* beginning of word */
421 Rwordend, /* end of word */
422 Rwordbound, /* word bound */
423 Rnotwordbound, /* not word bound */
424 Rnum_ops
425 };
426
427 static int re_compile_initialized = 0;
428 static int regexp_syntax = RE_SYNTAX_EGREP;
429 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
430 static unsigned char plain_ops[256];
431 static unsigned char quoted_ops[256];
432 static unsigned char precedences[Rnum_ops];
433 static int regexp_context_indep_ops;
434 static int regexp_ansi_sequences;
435
436 #define NUM_LEVELS 5 /* number of precedence levels in use */
437 #define MAX_NESTING 100 /* max nesting level of operators */
438
439 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
440
441 unsigned char re_syntax_table[256];
442
ReCompileInitialize(void)443 void ReCompileInitialize(void)
444 {
445 int a;
446
447 static int syntax_table_inited = 0;
448
449 if (!syntax_table_inited) {
450 syntax_table_inited = 1;
451 memset(re_syntax_table, 0, 256);
452 for (a = 'a'; a <= 'z'; a++) re_syntax_table[a] = Sword;
453 for (a = 'A'; a <= 'Z'; a++) re_syntax_table[a] = Sword;
454 for (a = '0'; a <= '9'; a++)
455 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
456 for (a = '0'; a <= '7'; a++) re_syntax_table[a] |= Soctaldigit;
457 for (a = 'A'; a <= 'F'; a++) re_syntax_table[a] |= Shexdigit;
458 for (a = 'a'; a <= 'f'; a++) re_syntax_table[a] |= Shexdigit;
459 re_syntax_table[(int)'_'] = Sword;
460 for (a = 9; a <= 13; a++) re_syntax_table[a] = Swhitespace;
461 re_syntax_table[(int)' '] = Swhitespace;
462 }
463 re_compile_initialized = 1;
464 for (a = 0; a < 256; a++) {
465 plain_ops[a] = Rnormal;
466 quoted_ops[a] = Rnormal;
467 }
468 for (a = '0'; a <= '9'; a++) quoted_ops[a] = Rmemory;
469 plain_ops[(int)'\134'] = Rquote;
470 if (regexp_syntax & RE_NO_BK_PARENS) {
471 plain_ops[(int)'('] = Ropenpar;
472 plain_ops[(int)')'] = Rclosepar;
473 } else {
474 quoted_ops[(int)'('] = Ropenpar;
475 quoted_ops[(int)')'] = Rclosepar;
476 }
477 if (regexp_syntax & RE_NO_BK_VBAR) {
478 plain_ops[(int)'\174'] = Ror;
479 } else {
480 quoted_ops[(int)'\174'] = Ror;
481 }
482 plain_ops[(int)'*'] = Rstar;
483 if (regexp_syntax & RE_BK_PLUS_QM) {
484 quoted_ops[(int)'+'] = Rplus;
485 quoted_ops[(int)'?'] = Roptional;
486 } else {
487 plain_ops[(int)'+'] = Rplus;
488 plain_ops[(int)'?'] = Roptional;
489 }
490 if (regexp_syntax & RE_NEWLINE_OR) { plain_ops[(int)'\n'] = Ror; }
491 plain_ops[(int)'\133'] = Ropenset;
492 plain_ops[(int)'\136'] = Rbol;
493 plain_ops[(int)'$'] = Reol;
494 plain_ops[(int)'.'] = Ranychar;
495 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
496 quoted_ops[(int)'w'] = Rwordchar;
497 quoted_ops[(int)'W'] = Rnotwordchar;
498 quoted_ops[(int)'<'] = Rwordbeg;
499 quoted_ops[(int)'>'] = Rwordend;
500 quoted_ops[(int)'b'] = Rwordbound;
501 quoted_ops[(int)'B'] = Rnotwordbound;
502 quoted_ops[(int)'`'] = Rbegbuf;
503 quoted_ops[(int)'\''] = Rendbuf;
504 }
505 if (regexp_syntax & RE_ANSI_HEX) { quoted_ops[(int)'v'] = Rextended_memory; }
506 for (a = 0; a < Rnum_ops; a++) { precedences[a] = 4; }
507 if (regexp_syntax & RE_TIGHT_VBAR) {
508 precedences[Ror] = 3;
509 precedences[Rbol] = 2;
510 precedences[Reol] = 2;
511 } else {
512 precedences[Ror] = 2;
513 precedences[Rbol] = 3;
514 precedences[Reol] = 3;
515 }
516 precedences[Rclosepar] = 1;
517 precedences[Rend] = 0;
518 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
519 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
520 }
521
ReSetSyntax(int syntax)522 int ReSetSyntax(int syntax)
523 {
524 int ret;
525
526 ret = regexp_syntax;
527 regexp_syntax = syntax;
528 re_syntax = syntax; /* Exported copy */
529 ReCompileInitialize();
530 return ret;
531 }
532
HexCharToDecimal(int ch)533 static int HexCharToDecimal(int ch)
534 {
535 if (ch >= '0' && ch <= '9') return ch - '0';
536 if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
537 if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
538 return 16;
539 }
540
re_compile_fastmap_aux(regex_t * bufp,unsigned char * code,int pos,unsigned char * visited,unsigned char * can_be_null,unsigned char * fastmap)541 static void re_compile_fastmap_aux(regex_t* bufp,
542 unsigned char* code,
543 int pos,
544 unsigned char* visited,
545 unsigned char* can_be_null,
546 unsigned char* fastmap)
547 {
548 int a;
549 int b;
550 int syntaxcode;
551
552 if (visited[pos]) return; /* we have already been here */
553 visited[pos] = 1;
554 for (;;) {
555 switch (code[pos++]) {
556 case Cend:
557 *can_be_null = 1;
558 return;
559 case Cbol:
560 case Cbegbuf:
561 case Cendbuf:
562 case Cwordbeg:
563 case Cwordend:
564 case Cwordbound:
565 case Cnotwordbound:
566 for (a = 0; a < 256; a++) fastmap[a] = 1;
567 break;
568 case Csyntaxspec:
569 syntaxcode = code[pos++];
570 for (a = 0; a < 256; a++)
571 if (SYNTAX(a) & syntaxcode) fastmap[a] = 1;
572 return;
573 case Cnotsyntaxspec:
574 syntaxcode = code[pos++];
575 for (a = 0; a < 256; a++)
576 if (!(SYNTAX(a) & syntaxcode)) fastmap[a] = 1;
577 return;
578 case Ceol:
579 fastmap[(int)'\n'] = 1;
580 if (*can_be_null == 0)
581 *can_be_null = 2; /* can match null, but only at end of buffer */
582 return;
583 case Cset:
584 for (a = 0; a < 256 / 8; a++)
585 if (code[pos + a] != 0)
586 for (b = 0; b < 8; b++)
587 if (code[pos + a] & (1 << b)) fastmap[(a << 3) + b] = 1;
588 pos += 256 / 8;
589 return;
590 case Cexact:
591 fastmap[(unsigned char)code[pos]] = 1;
592 return;
593 case Canychar:
594 for (a = 0; a < 256; a++)
595 if (a != '\n') fastmap[a] = 1;
596 return;
597 case Cstart_memory:
598 case Cend_memory:
599 pos++;
600 break;
601 case Cmatch_memory:
602 for (a = 0; a < 256; a++) fastmap[a] = 1;
603 *can_be_null = 1;
604 return;
605 case Cjump:
606 case Cdummy_failure_jump:
607 case Cupdate_failure_jump:
608 case Cstar_jump:
609 a = (unsigned char)code[pos++];
610 a |= (unsigned char)code[pos++] << 8;
611 pos += (int)SHORT(a);
612 if (visited[pos]) {
613 /* argh... the regexp contains empty loops. This is not
614 good, as this may cause a failure stack overflow when
615 matching. Oh well. */
616 /* this path leads nowhere; pursue other paths. */
617 return;
618 }
619 visited[pos] = 1;
620 break;
621 case Cfailure_jump:
622 a = (unsigned char)code[pos++];
623 a |= (unsigned char)code[pos++] << 8;
624 a = pos + (int)SHORT(a);
625 re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
626 break;
627 case Crepeat1:
628 pos += 2;
629 break;
630 default:
631 set_error("Unknown regex opcode: memory corrupted?");
632 return;
633 }
634 }
635 }
636
re_do_compile_fastmap(regex_t * bufp,unsigned char * buffer,int used,int pos,unsigned char * can_be_null,unsigned char * fastmap)637 static int re_do_compile_fastmap(regex_t* bufp,
638 unsigned char* buffer,
639 int used,
640 int pos,
641 unsigned char* can_be_null,
642 unsigned char* fastmap)
643 {
644 unsigned char small_visited[512], *visited;
645
646 if (used <= (int)sizeof(small_visited))
647 visited = small_visited;
648 else {
649 visited = (unsigned char*)malloc(used);
650 if (!visited) return 0;
651 }
652 *can_be_null = 0;
653 memset(fastmap, 0, 256);
654 memset(visited, 0, used);
655 re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
656 if (visited != small_visited) free(visited);
657 return 1;
658 }
659
ReCompileFastmap(regex_t * bufp)660 void ReCompileFastmap(regex_t* bufp)
661 {
662 if (!bufp->fastmap || bufp->fastmap_accurate) return;
663 // assert(bufp->used > 0);
664 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used, 0,
665 &bufp->can_be_null, bufp->fastmap))
666 return;
667 if (got_error) return;
668 if (bufp->buffer[0] == Cbol)
669 bufp->anchor = 1; /* begline */
670 else if (bufp->buffer[0] == Cbegbuf)
671 bufp->anchor = 2; /* begbuf */
672 else
673 bufp->anchor = 0; /* none */
674 bufp->fastmap_accurate = 1;
675 }
676
677 /*
678 * star is coded as:
679 * 1: failure_jump 2
680 * ... code for operand of star
681 * star_jump 1
682 * 2: ... code after star
683 *
684 * We change the star_jump to update_failure_jump if we can determine
685 * that it is safe to do so; otherwise we change it to an ordinary
686 * jump.
687 *
688 * plus is coded as
689 *
690 * jump 2
691 * 1: failure_jump 3
692 * 2: ... code for operand of plus
693 * star_jump 1
694 * 3: ... code after plus
695 *
696 * For star_jump considerations this is processed identically to star.
697 *
698 */
699
ReOptimizeStarJump(regex_t * bufp,unsigned char * code)700 static int ReOptimizeStarJump(regex_t* bufp, unsigned char* code)
701 {
702 unsigned char map[256];
703 unsigned char can_be_null;
704 unsigned char* p1;
705 unsigned char* p2;
706 unsigned char ch;
707 int a;
708 int b;
709 int num_instructions = 0;
710
711 a = (unsigned char)*code++;
712 a |= (unsigned char)*code++ << 8;
713 a = (int)SHORT(a);
714
715 p1 = code + a + 3; /* skip the failure_jump */
716 /* Check that the jump is within the pattern */
717 if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
718 set_error("Regex VM jump out of bounds (failure_jump opt)");
719 return 0;
720 }
721 // assert(p1[-3] == Cfailure_jump);
722 p2 = code;
723 /* p1 points inside loop, p2 points to after loop */
724 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
725 (int)(p2 - bufp->buffer), &can_be_null, map))
726 goto make_normal_jump;
727
728 /* If we might introduce a new update point inside the
729 * loop, we can't optimize because then update_jump would
730 * update a wrong failure point. Thus we have to be
731 * quite careful here.
732 */
733
734 /* loop until we find something that consumes a character */
735 for (;;) {
736 num_instructions++;
737 switch (*p1++) {
738 case Cbol:
739 case Ceol:
740 case Cbegbuf:
741 case Cendbuf:
742 case Cwordbeg:
743 case Cwordend:
744 case Cwordbound:
745 case Cnotwordbound:
746 continue;
747 case Cstart_memory:
748 case Cend_memory:
749 p1++;
750 continue;
751 case Cexact:
752 ch = (unsigned char)*p1++;
753 if (map[(int)ch]) goto make_normal_jump;
754 break;
755 case Canychar:
756 for (b = 0; b < 256; b++)
757 if (b != '\n' && map[b]) goto make_normal_jump;
758 break;
759 case Cset:
760 for (b = 0; b < 256; b++)
761 if ((p1[b >> 3] & (1 << (b & 7))) && map[b]) goto make_normal_jump;
762 p1 += 256 / 8;
763 break;
764 default:
765 goto make_normal_jump;
766 }
767 break;
768 }
769 /* now we know that we can't backtrack. */
770 while (p1 != p2 - 3) {
771 num_instructions++;
772 switch (*p1++) {
773 case Cend:
774 return 0;
775 case Cbol:
776 case Ceol:
777 case Canychar:
778 case Cbegbuf:
779 case Cendbuf:
780 case Cwordbeg:
781 case Cwordend:
782 case Cwordbound:
783 case Cnotwordbound:
784 break;
785 case Cset:
786 p1 += 256 / 8;
787 break;
788 case Cexact:
789 case Cstart_memory:
790 case Cend_memory:
791 case Cmatch_memory:
792 case Csyntaxspec:
793 case Cnotsyntaxspec:
794 p1++;
795 break;
796 case Cjump:
797 case Cstar_jump:
798 case Cfailure_jump:
799 case Cupdate_failure_jump:
800 case Cdummy_failure_jump:
801 goto make_normal_jump;
802 default:
803 return 0;
804 }
805 }
806
807 /* make_update_jump: */
808 code -= 3;
809 a += 3; /* jump to after the Cfailure_jump */
810 code[0] = Cupdate_failure_jump;
811 code[1] = a & 0xff;
812 code[2] = a >> 8;
813 if (num_instructions > 1) return 1;
814 // assert(num_instructions == 1);
815 /* if the only instruction matches a single character, we can do
816 * better */
817 p1 = code + 3 + a; /* start of sole instruction */
818 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar || *p1 == Csyntaxspec ||
819 *p1 == Cnotsyntaxspec)
820 code[0] = Crepeat1;
821 return 1;
822
823 make_normal_jump:
824 code -= 3;
825 *code = Cjump;
826 return 1;
827 }
828
ReOptimize(regex_t * bufp)829 static int ReOptimize(regex_t* bufp)
830 {
831 unsigned char* code;
832
833 code = bufp->buffer;
834
835 while (1) {
836 switch (*code++) {
837 case Cend:
838 return 1;
839 case Canychar:
840 case Cbol:
841 case Ceol:
842 case Cbegbuf:
843 case Cendbuf:
844 case Cwordbeg:
845 case Cwordend:
846 case Cwordbound:
847 case Cnotwordbound:
848 break;
849 case Cset:
850 code += 256 / 8;
851 break;
852 case Cexact:
853 case Cstart_memory:
854 case Cend_memory:
855 case Cmatch_memory:
856 case Csyntaxspec:
857 case Cnotsyntaxspec:
858 code++;
859 break;
860 case Cstar_jump:
861 if (!ReOptimizeStarJump(bufp, code)) { return 0; }
862 /* fall through */
863 case Cupdate_failure_jump:
864 case Cjump:
865 case Cdummy_failure_jump:
866 case Cfailure_jump:
867 case Crepeat1:
868 code += 2;
869 break;
870 default:
871 return 0;
872 }
873 }
874 }
875
876 #define NEXTCHAR(var) \
877 { \
878 if (pos >= size) goto ends_prematurely; \
879 (var) = regex[pos]; \
880 pos++; \
881 }
882
883 #define ALLOC(amount) \
884 { \
885 if (pattern_offset + (amount) > alloc) { \
886 alloc += 256 + (amount); \
887 pattern = (unsigned char*)realloc(pattern, alloc); \
888 if (!pattern) goto out_of_memory; \
889 } \
890 }
891
892 #define STORE(ch) pattern[pattern_offset++] = (ch)
893
894 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
895
896 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
897
898 #define PUSH_LEVEL_STARTS \
899 if (starts_base < (MAX_NESTING - 1) * NUM_LEVELS) \
900 starts_base += NUM_LEVELS; \
901 else \
902 goto too_complex
903
904 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
905
906 #define PUT_ADDR(offset, addr) \
907 { \
908 int disp = (addr) - (offset)-2; \
909 pattern[(offset)] = disp & 0xff; \
910 pattern[(offset) + 1] = (disp >> 8) & 0xff; \
911 }
912
913 #define INSERT_JUMP(pos, type, addr) \
914 { \
915 int a, p = (pos), t = (type), ad = (addr); \
916 for (a = pattern_offset - 1; a >= p; a--) pattern[a + 3] = pattern[a]; \
917 pattern[p] = t; \
918 PUT_ADDR(p + 1, ad); \
919 pattern_offset += 3; \
920 }
921
922 #define SETBIT(buf, offset, bit) (buf)[(offset) + (bit) / 8] |= (1 << ((bit)&7))
923
924 #define SET_FIELDS \
925 { \
926 bufp->allocated = alloc; \
927 bufp->buffer = pattern; \
928 bufp->used = pattern_offset; \
929 }
930
931 #define GETHEX(var) \
932 { \
933 unsigned char gethex_ch, gethex_value; \
934 NEXTCHAR(gethex_ch); \
935 gethex_value = HexCharToDecimal(gethex_ch); \
936 if (gethex_value == 16) goto hex_error; \
937 NEXTCHAR(gethex_ch); \
938 gethex_ch = HexCharToDecimal(gethex_ch); \
939 if (gethex_ch == 16) goto hex_error; \
940 (var) = gethex_value * 16 + gethex_ch; \
941 }
942
943 #define ANSI_TRANSLATE(ch) \
944 { \
945 switch (ch) { \
946 case 'a': \
947 case 'A': { \
948 ch = 7; /* audible bell */ \
949 break; \
950 } \
951 case 'b': \
952 case 'B': { \
953 ch = 8; /* backspace */ \
954 break; \
955 } \
956 case 'f': \
957 case 'F': { \
958 ch = 12; /* form feed */ \
959 break; \
960 } \
961 case 'n': \
962 case 'N': { \
963 ch = 10; /* line feed */ \
964 break; \
965 } \
966 case 'r': \
967 case 'R': { \
968 ch = 13; /* carriage return */ \
969 break; \
970 } \
971 case 't': \
972 case 'T': { \
973 ch = 9; /* tab */ \
974 break; \
975 } \
976 case 'v': \
977 case 'V': { \
978 ch = 11; /* vertical tab */ \
979 break; \
980 } \
981 case 'x': /* hex code */ \
982 case 'X': { \
983 GETHEX(ch); \
984 break; \
985 } \
986 default: { \
987 /* other characters passed through */ \
988 if (translate) ch = translate[(unsigned char)ch]; \
989 break; \
990 } \
991 } \
992 }
993
re_compile_pattern(regex_t * bufp,unsigned char * regex)994 const char* re_compile_pattern(regex_t* bufp, unsigned char* regex)
995 {
996 int a;
997 int pos;
998 int op;
999 int current_level;
1000 int level;
1001 int opcode;
1002 int pattern_offset = 0, alloc;
1003 int starts[NUM_LEVELS * MAX_NESTING];
1004 int starts_base;
1005 int future_jumps[MAX_NESTING];
1006 int num_jumps;
1007 unsigned char ch = '\0';
1008 unsigned char* pattern;
1009 unsigned char* translate;
1010 int next_register;
1011 int paren_depth;
1012 int num_open_registers;
1013 int open_registers[RE_NREGS];
1014 int beginning_context;
1015 int size = strlen((char*)regex);
1016
1017 if (!re_compile_initialized) ReCompileInitialize();
1018 bufp->used = 0;
1019 bufp->fastmap_accurate = 0;
1020 bufp->uses_registers = 1;
1021 bufp->num_registers = 1;
1022 translate = bufp->translate;
1023 pattern = bufp->buffer;
1024 alloc = bufp->allocated;
1025 if (alloc == 0 || pattern == NULL) {
1026 alloc = 256;
1027 bufp->buffer = pattern = (unsigned char*)malloc(alloc);
1028 if (!pattern) goto out_of_memory;
1029 }
1030 pattern_offset = 0;
1031 starts_base = 0;
1032 num_jumps = 0;
1033 current_level = 0;
1034 SET_LEVEL_START;
1035 num_open_registers = 0;
1036 next_register = 1;
1037 paren_depth = 0;
1038 beginning_context = 1;
1039 op = -1;
1040 /* we use Rend dummy to ensure that pending jumps are updated
1041 (due to low priority of Rend) before exiting the loop. */
1042 pos = 0;
1043 while (op != Rend) {
1044 if (pos >= size)
1045 op = Rend;
1046 else {
1047 NEXTCHAR(ch);
1048 if (translate) ch = translate[(unsigned char)ch];
1049 op = plain_ops[(unsigned char)ch];
1050 if (op == Rquote) {
1051 NEXTCHAR(ch);
1052 op = quoted_ops[(unsigned char)ch];
1053 if (op == Rnormal && regexp_ansi_sequences) ANSI_TRANSLATE(ch);
1054 }
1055 }
1056 level = precedences[op];
1057 /* printf("ch='%c' op=%d level=%d current_level=%d
1058 curlevstart=%d\n", ch, op, level, current_level,
1059 CURRENT_LEVEL_START); */
1060 if (level > current_level) {
1061 for (current_level++; current_level < level; current_level++)
1062 SET_LEVEL_START;
1063 SET_LEVEL_START;
1064 } else if (level < current_level) {
1065 current_level = level;
1066 for (;
1067 num_jumps > 0 && future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START;
1068 num_jumps--)
1069 PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1070 }
1071 switch (op) {
1072 case Rend:
1073 break;
1074 case Rnormal:
1075 normal_char:
1076 opcode = Cexact;
1077 store_opcode_and_arg: /* opcode & ch must be set */
1078 SET_LEVEL_START;
1079 ALLOC(2);
1080 STORE(opcode);
1081 STORE(ch);
1082 break;
1083 case Ranychar:
1084 opcode = Canychar;
1085 store_opcode:
1086 SET_LEVEL_START;
1087 ALLOC(1);
1088 STORE(opcode);
1089 break;
1090 case Rquote:
1091 set_error("Rquote");
1092 /* NOTREACHED */
1093 break;
1094 case Rbol:
1095 if (!beginning_context) {
1096 if (regexp_context_indep_ops)
1097 goto op_error;
1098 else
1099 goto normal_char;
1100 }
1101 opcode = Cbol;
1102 goto store_opcode;
1103 case Reol:
1104 if (!((pos >= size) ||
1105 ((regexp_syntax & RE_NO_BK_VBAR)
1106 ? (regex[pos] == '\174')
1107 : (pos + 1 < size && regex[pos] == '\134' &&
1108 regex[pos + 1] == '\174')) ||
1109 ((regexp_syntax & RE_NO_BK_PARENS)
1110 ? (regex[pos] == ')')
1111 : (pos + 1 < size && regex[pos] == '\134' &&
1112 regex[pos + 1] == ')')))) {
1113 if (regexp_context_indep_ops)
1114 goto op_error;
1115 else
1116 goto normal_char;
1117 }
1118 opcode = Ceol;
1119 goto store_opcode;
1120 /* NOTREACHED */
1121 break;
1122 case Roptional:
1123 if (beginning_context) {
1124 if (regexp_context_indep_ops)
1125 goto op_error;
1126 else
1127 goto normal_char;
1128 }
1129 if (CURRENT_LEVEL_START == pattern_offset)
1130 break; /* ignore empty patterns for ? */
1131 ALLOC(3);
1132 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1133 break;
1134 case Rstar:
1135 case Rplus:
1136 if (beginning_context) {
1137 if (regexp_context_indep_ops)
1138 goto op_error;
1139 else
1140 goto normal_char;
1141 }
1142 if (CURRENT_LEVEL_START == pattern_offset)
1143 break; /* ignore empty patterns for + and * */
1144 ALLOC(9);
1145 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1146 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1147 if (op == Rplus) /* jump over initial failure_jump */
1148 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1149 CURRENT_LEVEL_START + 6);
1150 break;
1151 case Ror:
1152 ALLOC(6);
1153 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1154 if (num_jumps >= MAX_NESTING) goto too_complex;
1155 STORE(Cjump);
1156 future_jumps[num_jumps++] = pattern_offset;
1157 STORE(0);
1158 STORE(0);
1159 SET_LEVEL_START;
1160 break;
1161 case Ropenpar: {
1162 SET_LEVEL_START;
1163 if (next_register < RE_NREGS) {
1164 bufp->uses_registers = 1;
1165 ALLOC(2);
1166 STORE(Cstart_memory);
1167 STORE(next_register);
1168 open_registers[num_open_registers++] = next_register;
1169 bufp->num_registers++;
1170 next_register++;
1171 }
1172 paren_depth++;
1173 PUSH_LEVEL_STARTS;
1174 current_level = 0;
1175 SET_LEVEL_START;
1176 break;
1177 }
1178 case Rclosepar:
1179 if (paren_depth <= 0) goto parenthesIsError;
1180 POP_LEVEL_STARTS;
1181 current_level = precedences[Ropenpar];
1182 paren_depth--;
1183 if (paren_depth < num_open_registers) {
1184 bufp->uses_registers = 1;
1185 ALLOC(2);
1186 STORE(Cend_memory);
1187 num_open_registers--;
1188 STORE(open_registers[num_open_registers]);
1189 }
1190 break;
1191 case Rmemory:
1192 if (ch == '0') goto bad_match_register;
1193 if (!(ch >= '0' && ch <= '9')) { goto bad_match_register; }
1194 bufp->uses_registers = 1;
1195 opcode = Cmatch_memory;
1196 ch -= '0';
1197 goto store_opcode_and_arg;
1198 case Rextended_memory:
1199 NEXTCHAR(ch);
1200 if (ch < '0' || ch > '9') goto bad_match_register;
1201 NEXTCHAR(a);
1202 if (a < '0' || a > '9') goto bad_match_register;
1203 ch = 10 * (a - '0') + ch - '0';
1204 if (ch == 0 || ch >= RE_NREGS) goto bad_match_register;
1205 bufp->uses_registers = 1;
1206 opcode = Cmatch_memory;
1207 goto store_opcode_and_arg;
1208 case Ropenset: {
1209 int complement;
1210 int prev;
1211 int offset;
1212 int range;
1213 int firstchar;
1214
1215 SET_LEVEL_START;
1216 ALLOC(1 + 256 / 8);
1217 STORE(Cset);
1218 offset = pattern_offset;
1219 for (a = 0; a < 256 / 8; a++) STORE(0);
1220 NEXTCHAR(ch);
1221 if (translate) ch = translate[(unsigned char)ch];
1222 if (ch == '\136') {
1223 complement = 1;
1224 NEXTCHAR(ch);
1225 if (translate) ch = translate[(unsigned char)ch];
1226 } else
1227 complement = 0;
1228 prev = -1;
1229 range = 0;
1230 firstchar = 1;
1231 while (ch != '\135' || firstchar) {
1232 firstchar = 0;
1233 if (regexp_ansi_sequences && ch == '\134') {
1234 NEXTCHAR(ch);
1235 ANSI_TRANSLATE(ch);
1236 }
1237 if (range) {
1238 for (a = prev; a <= (int)ch; a++) SETBIT(pattern, offset, a);
1239 prev = -1;
1240 range = 0;
1241 } else if (prev != -1 && ch == '-')
1242 range = 1;
1243 else {
1244 SETBIT(pattern, offset, ch);
1245 prev = ch;
1246 }
1247 NEXTCHAR(ch);
1248 if (translate) ch = translate[(unsigned char)ch];
1249 }
1250 if (range) SETBIT(pattern, offset, '-');
1251 if (complement) {
1252 for (a = 0; a < 256 / 8; a++) pattern[offset + a] ^= 0xff;
1253 }
1254 break;
1255 }
1256 case Rbegbuf: {
1257 opcode = Cbegbuf;
1258 goto store_opcode;
1259 }
1260 case Rendbuf: {
1261 opcode = Cendbuf;
1262 goto store_opcode;
1263 }
1264 case Rwordchar: {
1265 opcode = Csyntaxspec;
1266 ch = Sword;
1267 goto store_opcode_and_arg;
1268 }
1269 case Rnotwordchar: {
1270 opcode = Cnotsyntaxspec;
1271 ch = Sword;
1272 goto store_opcode_and_arg;
1273 }
1274 case Rwordbeg: {
1275 opcode = Cwordbeg;
1276 goto store_opcode;
1277 }
1278 case Rwordend: {
1279 opcode = Cwordend;
1280 goto store_opcode;
1281 }
1282 case Rwordbound: {
1283 opcode = Cwordbound;
1284 goto store_opcode;
1285 }
1286 case Rnotwordbound: {
1287 opcode = Cnotwordbound;
1288 goto store_opcode;
1289 }
1290 default: {
1291 abort();
1292 }
1293 }
1294 beginning_context = (op == Ropenpar || op == Ror);
1295 }
1296 if (starts_base != 0) goto parenthesIsError;
1297 // assert(num_jumps == 0);
1298 ALLOC(1);
1299 STORE(Cend);
1300 SET_FIELDS;
1301 if (!ReOptimize(bufp)) return "Optimization error";
1302 return NULL;
1303
1304 op_error:
1305 SET_FIELDS;
1306 return "Badly placed special character";
1307
1308 bad_match_register:
1309 SET_FIELDS;
1310 return "Bad match register number";
1311
1312 hex_error:
1313 SET_FIELDS;
1314 return "Bad hexadecimal number";
1315
1316 parenthesIsError:
1317 SET_FIELDS;
1318 return "Badly placed parenthesis";
1319
1320 out_of_memory:
1321 SET_FIELDS;
1322 return "Out of memory";
1323
1324 ends_prematurely:
1325 SET_FIELDS;
1326 return "Regular expression ends prematurely";
1327
1328 too_complex:
1329 SET_FIELDS;
1330 return "Regular expression too complex";
1331 }
1332
1333 #undef CHARAT
1334 #undef NEXTCHAR
1335 #undef GETHEX
1336 #undef ALLOC
1337 #undef STORE
1338 #undef CURRENT_LEVEL_START
1339 #undef SET_LEVEL_START
1340 #undef PUSH_LEVEL_STARTS
1341 #undef POP_LEVEL_STARTS
1342 #undef PUT_ADDR
1343 #undef INSERT_JUMP
1344 #undef SETBIT
1345 #undef SET_FIELDS
1346
1347 #define PREFETCH \
1348 if (text == textend) goto fail
1349
1350 #define NEXTCHAR(var) \
1351 PREFETCH; \
1352 var = (unsigned char)*text++; \
1353 if (translate) var = translate[var]
1354
regcomp(regex_t * bufp,const char * regex,int cflags)1355 int regcomp(regex_t* bufp, const char* regex, int cflags)
1356 {
1357 memset(bufp, 0, sizeof(regex_t));
1358 bufp->cflags = cflags;
1359 if (bufp->cflags & REG_ICASE) {
1360 char *p, *lcase = strdup(regex);
1361 for (p = lcase; *p; p++) { *p = tolower(*p); }
1362 re_compile_pattern(bufp, (unsigned char*)lcase);
1363 free(lcase);
1364 } else {
1365 re_compile_pattern(bufp, (unsigned char*)regex);
1366 }
1367 if (got_error) { return -1; }
1368 return 0;
1369 }
1370
re_registers_to_regmatch(regexp_registers_t old_regs,regmatch_t pmatch[],size_t nmatch)1371 void re_registers_to_regmatch(regexp_registers_t old_regs,
1372 regmatch_t pmatch[],
1373 size_t nmatch)
1374 {
1375 if (!(nmatch == 0 && pmatch == NULL)) {
1376 size_t i = 0;
1377
1378 /*
1379 * We have to set the last entry to -1
1380 */
1381 nmatch = nmatch - 1;
1382 for (i = 0; (i < nmatch) && (old_regs->start[i] > -1); i++) {
1383 pmatch[i].rm_so = old_regs->start[i];
1384 pmatch[i].rm_eo = old_regs->end[i];
1385 }
1386
1387 pmatch[i].rm_eo = pmatch[i].rm_so = -1;
1388 }
1389 }
1390
regexec(regex_t * preg,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)1391 int regexec(regex_t* preg,
1392 const char* string,
1393 size_t nmatch,
1394 regmatch_t pmatch[],
1395 int eflags)
1396 {
1397 int status;
1398 int len = strlen(string);
1399 re_registers regs{};
1400
1401 status = ReSearch(preg, (unsigned char*)string, len, 0, len, ®s);
1402 if (status >= 0) { re_registers_to_regmatch(®s, pmatch, nmatch); }
1403
1404 /*
1405 * status is the start position in the string base 0 where
1406 * the pattern was found or negative if not found.
1407 */
1408 return status < 0 ? -1 : 0;
1409 }
1410
regerror(int errcode,regex_t * preg,char * errbuf,size_t errbuf_size)1411 size_t regerror(int errcode, regex_t* preg, char* errbuf, size_t errbuf_size)
1412 {
1413 bstrncpy(errbuf, preg->errmsg, errbuf_size);
1414 return 0;
1415 }
1416
regfree(regex_t * preg)1417 void regfree(regex_t* preg)
1418 {
1419 if (preg->lcase) {
1420 FreePoolMemory(preg->lcase);
1421 preg->lcase = NULL;
1422 }
1423 if (preg->buffer) {
1424 free(preg->buffer);
1425 preg->buffer = NULL;
1426 }
1427 }
1428
ReMatch(regex_t * bufp,unsigned char * string,int size,int pos,regexp_registers_t old_regs)1429 int ReMatch(regex_t* bufp,
1430 unsigned char* string,
1431 int size,
1432 int pos,
1433 regexp_registers_t old_regs)
1434 {
1435 unsigned char* code;
1436 unsigned char* translate;
1437 unsigned char* text;
1438 unsigned char* textstart;
1439 unsigned char* textend;
1440 int a;
1441 int b;
1442 int ch;
1443 int reg;
1444 int match_end;
1445 unsigned char* regstart;
1446 unsigned char* regend;
1447 int regsize;
1448 match_state state;
1449
1450 // assert(pos >= 0 && size >= 0);
1451 // assert(pos <= size);
1452
1453 text = string + pos;
1454 textstart = string;
1455 textend = string + size;
1456
1457 code = bufp->buffer;
1458
1459 translate = bufp->translate;
1460
1461 NEW_STATE(state, bufp->num_registers);
1462
1463 continue_matching:
1464 switch (*code++) {
1465 case Cend: {
1466 match_end = text - textstart;
1467 if (old_regs) {
1468 old_regs->start[0] = pos;
1469 old_regs->end[0] = match_end;
1470 if (!bufp->uses_registers) {
1471 for (a = 1; a < RE_NREGS; a++) {
1472 old_regs->start[a] = -1;
1473 old_regs->end[a] = -1;
1474 }
1475 } else {
1476 for (a = 1; a < bufp->num_registers; a++) {
1477 if ((GET_REG_START(state, a) == NULL) ||
1478 (GET_REG_END(state, a) == NULL)) {
1479 old_regs->start[a] = -1;
1480 old_regs->end[a] = -1;
1481 continue;
1482 }
1483 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1484 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1485 }
1486 for (; a < RE_NREGS; a++) {
1487 old_regs->start[a] = -1;
1488 old_regs->end[a] = -1;
1489 }
1490 }
1491 }
1492 FREE_STATE(state);
1493 return match_end - pos;
1494 }
1495 case Cbol: {
1496 if (text == textstart || text[-1] == '\n') goto continue_matching;
1497 goto fail;
1498 }
1499 case Ceol: {
1500 if (text == textend || *text == '\n') goto continue_matching;
1501 goto fail;
1502 }
1503 case Cset: {
1504 NEXTCHAR(ch);
1505 if (code[ch / 8] & (1 << (ch & 7))) {
1506 code += 256 / 8;
1507 goto continue_matching;
1508 }
1509 goto fail;
1510 }
1511 case Cexact: {
1512 NEXTCHAR(ch);
1513 if (ch != (unsigned char)*code++) goto fail;
1514 goto continue_matching;
1515 }
1516 case Canychar: {
1517 NEXTCHAR(ch);
1518 if (ch == '\n') goto fail;
1519 goto continue_matching;
1520 }
1521 case Cstart_memory: {
1522 reg = *code++;
1523 SET_REG_START(state, reg, text, goto error);
1524 goto continue_matching;
1525 }
1526 case Cend_memory: {
1527 reg = *code++;
1528 SET_REG_END(state, reg, text, goto error);
1529 goto continue_matching;
1530 }
1531 case Cmatch_memory: {
1532 reg = *code++;
1533 regstart = GET_REG_START(state, reg);
1534 regend = GET_REG_END(state, reg);
1535 if ((regstart == NULL) || (regend == NULL))
1536 goto fail; /* or should we just match nothing? */
1537 regsize = regend - regstart;
1538
1539 if (regsize > (textend - text)) goto fail;
1540 if (translate) {
1541 for (; regstart < regend; regstart++, text++)
1542 if (translate[*regstart] != translate[*text]) goto fail;
1543 } else
1544 for (; regstart < regend; regstart++, text++)
1545 if (*regstart != *text) goto fail;
1546 goto continue_matching;
1547 }
1548 case Cupdate_failure_jump: {
1549 UPDATE_FAILURE(state, text, goto error);
1550 /* fall to next case */
1551 }
1552 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1553 case Cstar_jump:
1554 case Cjump: {
1555 a = (unsigned char)*code++;
1556 a |= (unsigned char)*code++ << 8;
1557 code += (int)SHORT(a);
1558 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1559 set_error("Regex VM jump out of bounds (Cjump)");
1560 FREE_STATE(state);
1561 return -2;
1562 }
1563 goto continue_matching;
1564 }
1565 case Cdummy_failure_jump: {
1566 unsigned char* failuredest;
1567
1568 a = (unsigned char)*code++;
1569 a |= (unsigned char)*code++ << 8;
1570 a = (int)SHORT(a);
1571 // assert(*code == Cfailure_jump);
1572 b = (unsigned char)code[1];
1573 b |= (unsigned char)code[2] << 8;
1574 failuredest = code + (int)SHORT(b) + 3;
1575 if (failuredest < bufp->buffer ||
1576 bufp->buffer + bufp->used < failuredest) {
1577 set_error(
1578 "Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1579 FREE_STATE(state);
1580 return -2;
1581 }
1582 PUSH_FAILURE(state, failuredest, NULL, goto error);
1583 code += a;
1584 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1585 set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1586 FREE_STATE(state);
1587 return -2;
1588 }
1589 goto continue_matching;
1590 }
1591 case Cfailure_jump: {
1592 a = (unsigned char)*code++;
1593 a |= (unsigned char)*code++ << 8;
1594 a = (int)SHORT(a);
1595 if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1596 set_error("Regex VM jump out of bounds (Cfailure_jump)");
1597 FREE_STATE(state);
1598 return -2;
1599 }
1600 PUSH_FAILURE(state, code + a, text, goto error);
1601 goto continue_matching;
1602 }
1603 case Crepeat1: {
1604 unsigned char* pinst;
1605 a = (unsigned char)*code++;
1606 a |= (unsigned char)*code++ << 8;
1607 a = (int)SHORT(a);
1608 pinst = code + a;
1609 if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1610 set_error("Regex VM jump out of bounds (Crepeat1)");
1611 FREE_STATE(state);
1612 return -2;
1613 }
1614 /* pinst is sole instruction in loop, and it matches a
1615 * single character. Since Crepeat1 was originally a
1616 * Cupdate_failure_jump, we also know that backtracking
1617 * is useless: so long as the single-character
1618 * expression matches, it must be used. Also, in the
1619 * case of +, we've already matched one character, so +
1620 * can't fail: nothing here can cause a failure. */
1621 switch (*pinst++) {
1622 case Cset: {
1623 if (translate) {
1624 while (text < textend) {
1625 ch = translate[(unsigned char)*text];
1626 if (pinst[ch / 8] & (1 << (ch & 7)))
1627 text++;
1628 else
1629 break;
1630 }
1631 } else {
1632 while (text < textend) {
1633 ch = (unsigned char)*text;
1634 if (pinst[ch / 8] & (1 << (ch & 7)))
1635 text++;
1636 else
1637 break;
1638 }
1639 }
1640 break;
1641 }
1642 case Cexact: {
1643 ch = (unsigned char)*pinst;
1644 if (translate) {
1645 while (text < textend && translate[(unsigned char)*text] == ch)
1646 text++;
1647 } else {
1648 while (text < textend && (unsigned char)*text == ch) text++;
1649 }
1650 break;
1651 }
1652 case Canychar: {
1653 while (text < textend && (unsigned char)*text != '\n') text++;
1654 break;
1655 }
1656 case Csyntaxspec: {
1657 a = (unsigned char)*pinst;
1658 if (translate) {
1659 while (text < textend && (SYNTAX(translate[*text]) & a)) text++;
1660 } else {
1661 while (text < textend && (SYNTAX(*text) & a)) text++;
1662 }
1663 break;
1664 }
1665 case Cnotsyntaxspec: {
1666 a = (unsigned char)*pinst;
1667 if (translate) {
1668 while (text < textend && !(SYNTAX(translate[*text]) & a)) text++;
1669 } else {
1670 while (text < textend && !(SYNTAX(*text) & a)) text++;
1671 }
1672 break;
1673 }
1674 default: {
1675 FREE_STATE(state);
1676 set_error("Unknown regex opcode: memory corrupted?");
1677 return -2;
1678 /*NOTREACHED*/}
1679 }
1680 /* due to the funky way + and * are compiled, the top
1681 * failure- stack entry at this point is actually a
1682 * success entry -- update it & pop it */
1683 UPDATE_FAILURE(state, text, goto error);
1684 goto fail; /* i.e., succeed <wink/sigh> */
1685 }
1686 case Cbegbuf: {
1687 if (text == textstart) goto continue_matching;
1688 goto fail;
1689 }
1690 case Cendbuf: {
1691 if (text == textend) goto continue_matching;
1692 goto fail;
1693 }
1694 case Cwordbeg: {
1695 if (text == textend) goto fail;
1696 if (!(SYNTAX(*text) & Sword)) goto fail;
1697 if (text == textstart) goto continue_matching;
1698 if (!(SYNTAX(text[-1]) & Sword)) goto continue_matching;
1699 goto fail;
1700 }
1701 case Cwordend: {
1702 if (text == textstart) goto fail;
1703 if (!(SYNTAX(text[-1]) & Sword)) goto fail;
1704 if (text == textend) goto continue_matching;
1705 if (!(SYNTAX(*text) & Sword)) goto continue_matching;
1706 goto fail;
1707 }
1708 case Cwordbound: {
1709 /* Note: as in gnu regexp, this also matches at the
1710 * beginning and end of buffer. */
1711
1712 if (text == textstart || text == textend) goto continue_matching;
1713 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1714 goto continue_matching;
1715 goto fail;
1716 }
1717 case Cnotwordbound: {
1718 /* Note: as in gnu regexp, this never matches at the
1719 * beginning and end of buffer. */
1720 if (text == textstart || text == textend) goto fail;
1721 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1722 goto continue_matching;
1723 goto fail;
1724 }
1725 case Csyntaxspec: {
1726 NEXTCHAR(ch);
1727 if (!(SYNTAX(ch) & (unsigned char)*code++)) goto fail;
1728 goto continue_matching;
1729 }
1730 case Cnotsyntaxspec: {
1731 NEXTCHAR(ch);
1732 if (SYNTAX(ch) & (unsigned char)*code++) goto fail;
1733 goto continue_matching;
1734 }
1735 default: {
1736 FREE_STATE(state);
1737 set_error("Unknown regex opcode: memory corrupted?");
1738 return -2;
1739 /*NOTREACHED*/}
1740 }
1741
1742 /*
1743 *NOTREACHED
1744 */
1745
1746 /* Using "break;" in the above switch statement is equivalent to "goto fail;"
1747 */
1748 fail:
1749 POP_FAILURE(state, code, text, goto done_matching, goto error);
1750 goto continue_matching;
1751
1752 done_matching:
1753 /* if(translated != NULL) */
1754 /* free(translated); */
1755 FREE_STATE(state);
1756 return -1;
1757
1758 error:
1759 /* if (translated != NULL) */
1760 /* free(translated); */
1761 FREE_STATE(state);
1762 return -2;
1763 }
1764
1765
1766 #undef PREFETCH
1767 #undef NEXTCHAR
1768
ReSearch(regex_t * bufp,unsigned char * str,int size,int pos,int range,regexp_registers_t regs)1769 int ReSearch(regex_t* bufp,
1770 unsigned char* str,
1771 int size,
1772 int pos,
1773 int range,
1774 regexp_registers_t regs)
1775 {
1776 unsigned char* fastmap;
1777 unsigned char* translate;
1778 unsigned char* text;
1779 unsigned char* partstart;
1780 unsigned char* partend;
1781 int dir;
1782 int ret;
1783 unsigned char anchor;
1784 unsigned char* string = str;
1785
1786 if (bufp->cflags & REG_ICASE) { /* we must use string in lowercase */
1787 int len = strlen((const char*)str);
1788 if (!bufp->lcase) { bufp->lcase = GetPoolMemory(PM_FNAME); }
1789 bufp->lcase = CheckPoolMemorySize(bufp->lcase, len + 1);
1790 unsigned char* dst = (unsigned char*)bufp->lcase;
1791 while (*string) { *dst++ = tolower(*string++); }
1792 *dst = '\0';
1793 string = (unsigned char*)bufp->lcase;
1794 }
1795
1796 // assert(size >= 0 && pos >= 0);
1797 // assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
1798
1799 fastmap = bufp->fastmap;
1800 translate = bufp->translate;
1801 if (fastmap && !bufp->fastmap_accurate) {
1802 ReCompileFastmap(bufp);
1803 if (got_error) return -2;
1804 }
1805
1806 anchor = bufp->anchor;
1807 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1808 fastmap = NULL;
1809
1810 if (range < 0) {
1811 dir = -1;
1812 range = -range;
1813 } else
1814 dir = 1;
1815
1816 if (anchor == 2) {
1817 if (pos != 0)
1818 return -1;
1819 else
1820 range = 0;
1821 }
1822
1823 for (; range >= 0; range--, pos += dir) {
1824 if (fastmap) {
1825 if (dir == 1) { /* searching forwards */
1826
1827 text = string + pos;
1828 partend = string + size;
1829 partstart = text;
1830 if (translate)
1831 while (text != partend &&
1832 !fastmap[(unsigned char)translate[(unsigned char)*text]])
1833 text++;
1834 else
1835 while (text != partend && !fastmap[(unsigned char)*text]) text++;
1836 pos += text - partstart;
1837 range -= text - partstart;
1838 if (pos == size && bufp->can_be_null == 0) return -1;
1839 } else { /* searching backwards */
1840 text = string + pos;
1841 partstart = string + pos - range;
1842 partend = text;
1843 if (translate)
1844 while (text != partstart &&
1845 !fastmap[(unsigned char)translate[(unsigned char)*text]])
1846 text--;
1847 else
1848 while (text != partstart && !fastmap[(unsigned char)*text]) text--;
1849 pos -= partend - text;
1850 range -= partend - text;
1851 }
1852 }
1853 if (anchor == 1) { /* anchored to begline */
1854 if (pos > 0 && (string[pos - 1] != '\n')) continue;
1855 }
1856 // assert(pos >= 0 && pos <= size);
1857 ret = ReMatch(bufp, string, size, pos, regs);
1858 if (ret >= 0) return pos;
1859 if (ret == -2) return -2;
1860 }
1861 return -1;
1862 }
1863
1864 /*
1865 ** Local Variables:
1866 ** mode: c
1867 ** c-file-style: "python"
1868 ** End:
1869 */
1870