1 /*
2 BAREOS® - Backup Archiving REcovery Open Sourced
3
4 Copyright (C) 2006-2010 Free Software Foundation Europe e.V.
5 Copyright (C) 2013-2020 Bareos GmbH & Co. KG
6
7 This program is Free Software; you can redistribute it and/or
8 modify it under the terms of version three of the GNU Affero General Public
9 License as published by the Free Software Foundation and included
10 in the file LICENSE.
11
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Affero General Public License for more details.
16
17 You should have received a copy of the GNU Affero General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20 02110-1301, USA.
21 */
22 /* regexpr.c
23 *
24 * Author: Tatu Ylonen <ylo@ngs.fi>
25 *
26 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
27 *
28 * Permission to use, copy, modify, distribute, and sell this software
29 * and its documentation for any purpose is hereby granted without
30 * fee, provided that the above copyright notice appear in all copies.
31 * This software is provided "as is" without express or implied
32 * warranty.
33 *
34 * Created: Thu Sep 26 17:14:05 1991 ylo
35 * Last modified: Mon Nov 4 17:06:48 1991 ylo
36 * Ported to Think C: 19 Jan 1992 guido@cwi.nl
37 *
38 * This code draws many ideas from the regular expression packages by
39 * Henry Spencer of the University of Toronto and Richard Stallman of
40 * the Free Software Foundation.
41 *
42 * Emacs-specific code and syntax table code is almost directly borrowed
43 * from GNU regexp.
44 *
45 * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
46 * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
47 * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
48 * probably one or two others that I'm forgetting.
49 *
50 * This file modified to work with BAREOS and C++ by
51 * Kern Sibbald, April 2006
52 *
53 * This file modified to work with REG_ICASE and BAREOS by
54 * Eric Bollengier April 2007
55 */
56
57 #include "include/bareos.h"
58 #include "bregex.h"
59
60 #define set_error(x) bufp->errmsg = ((char*)(x))
61 #define got_error bufp->errmsg != NULL
62
63 /* The original code blithely assumed that sizeof(short) == 2. Not
64 * always true. Original instances of "(short)x" were replaced by
65 * SHORT(x), where SHORT is #defined below. */
66
67 #define SHORT(x) ((x)&0x8000 ? (x)-0x10000 : (x))
68
69 /* The stack implementation is taken from an idea by Andrew Kuchling.
70 * It's a doubly linked list of arrays. The advantages of this over a
71 * simple linked list are that the number of mallocs required are
72 * reduced. It also makes it possible to statically allocate enough
73 * space so that small patterns don't ever need to call malloc.
74 *
75 * The advantages over a single array is that is periodically
76 * realloced when more space is needed is that we avoid ever copying
77 * the stack. */
78
79 /* item_t is the basic stack element. Defined as a union of
80 * structures so that both registers, failure points, and counters can
81 * be pushed/popped from the stack. There's nothing built into the
82 * item to keep track of whether a certain stack item is a register, a
83 * failure point, or a counter. */
84
85 typedef union item_t {
86 struct {
87 int num;
88 int level;
89 unsigned char* start;
90 unsigned char* end;
91 } reg;
92 struct {
93 int count;
94 int level;
95 int phantom;
96 unsigned char* code;
97 unsigned char* text;
98 } fail;
99 struct {
100 int num;
101 int level;
102 int count;
103 } cntr;
104 } item_t;
105
106 #define B_STACK_PAGE_SIZE 256
107 #define NUM_REGISTERS 256
108
109 /* A 'page' of stack items. */
110
111 typedef struct item_page_t {
112 item_t items[B_STACK_PAGE_SIZE];
113 struct item_page_t* prev;
114 struct item_page_t* next;
115 } item_page_t;
116
117
118 typedef struct match_state {
119 /* The number of registers that have been pushed onto the stack
120 * since the last failure point. */
121
122 int count;
123
124 /* Used to control when registers need to be pushed onto the
125 * stack. */
126
127 int level;
128
129 /* The number of failure points on the stack. */
130
131 int point;
132
133 /* Storage for the registers. Each register consists of two
134 * pointers to characters. So register N is represented as
135 * start[N] and end[N]. The pointers must be converted to
136 * offsets from the beginning of the string before returning the
137 * registers to the calling program. */
138
139 unsigned char* start[NUM_REGISTERS];
140 unsigned char* end[NUM_REGISTERS];
141
142 /* Keeps track of whether a register has changed recently. */
143
144 int changed[NUM_REGISTERS];
145
146 /* Structure to encapsulate the stack. */
147 struct {
148 /* index into the current page. If index == 0 and you need
149 * to pop an item, move to the previous page and set index
150 * = B_STACK_PAGE_SIZE - 1. Otherwise decrement index to
151 * push a page. If index == B_STACK_PAGE_SIZE and you need
152 * to push a page move to the next page and set index =
153 * 0. If there is no new next page, allocate a new page
154 * and link it in. Otherwise, increment index to push a
155 * page. */
156
157 int index;
158 item_page_t* current; /* Pointer to the current page. */
159 item_page_t first; /* First page is statically allocated. */
160 } stack;
161 } match_state;
162
163 /* Initialize a state object */
164
165 /* #define NEW_STATE(state) \ */
166 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
167 /* state.stack.current = &state.stack.first; \ */
168 /* state.stack.first.prev = NULL; \ */
169 /* state.stack.first.next = NULL; \ */
170 /* state.stack.index = 0; \ */
171 /* state.level = 1 */
172
173 #define NEW_STATE(state, nregs) \
174 { \
175 int i; \
176 for (i = 0; i < nregs; i++) { \
177 state.start[i] = NULL; \
178 state.end[i] = NULL; \
179 state.changed[i] = 0; \
180 } \
181 state.stack.current = &state.stack.first; \
182 state.stack.first.prev = NULL; \
183 state.stack.first.next = NULL; \
184 state.stack.index = 0; \
185 state.level = 1; \
186 state.count = 0; \
187 state.level = 0; \
188 state.point = 0; \
189 }
190
191 /* Free any memory that might have been malloc'd */
192
193 #define FREE_STATE(state) \
194 while (state.stack.first.next != NULL) { \
195 state.stack.current = state.stack.first.next; \
196 state.stack.first.next = state.stack.current->next; \
197 free(state.stack.current); \
198 }
199
200 /* Discard the top 'count' stack items. */
201
202 #define STACK_DISCARD(stack, count, on_error) \
203 stack.index -= count; \
204 while (stack.index < 0) { \
205 if (stack.current->prev == NULL) on_error; \
206 stack.current = stack.current->prev; \
207 stack.index += B_STACK_PAGE_SIZE; \
208 }
209
210 /* Store a pointer to the previous item on the stack. Used to pop an
211 * item off of the stack. */
212
213 #define STACK_PREV(stack, top, on_error) \
214 if (stack.index == 0) { \
215 if (stack.current->prev == NULL) on_error; \
216 stack.current = stack.current->prev; \
217 stack.index = B_STACK_PAGE_SIZE - 1; \
218 } else { \
219 stack.index--; \
220 } \
221 top = &(stack.current->items[stack.index])
222
223 /* Store a pointer to the next item on the stack. Used to push an item
224 * on to the stack. */
225
226 #define STACK_NEXT(stack, top, on_error) \
227 if (stack.index == B_STACK_PAGE_SIZE) { \
228 if (stack.current->next == NULL) { \
229 stack.current->next = (item_page_t*)malloc(sizeof(item_page_t)); \
230 if (stack.current->next == NULL) on_error; \
231 stack.current->next->prev = stack.current; \
232 stack.current->next->next = NULL; \
233 } \
234 stack.current = stack.current->next; \
235 stack.index = 0; \
236 } \
237 top = &(stack.current->items[stack.index++])
238
239 /* Store a pointer to the item that is 'count' items back in the
240 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
241 * STACK_TOP(stack, top, on_error). */
242
243 #define STACK_BACK(stack, top, count, on_error) \
244 { \
245 int index; \
246 item_page_t* current; \
247 current = stack.current; \
248 index = stack.index - (count); \
249 while (index < 0) { \
250 if (current->prev == NULL) on_error; \
251 current = current->prev; \
252 index += B_STACK_PAGE_SIZE; \
253 } \
254 top = &(current->items[index]); \
255 }
256
257 /* Store a pointer to the top item on the stack. Execute the
258 * 'on_error' code if there are no items on the stack. */
259
260 #define STACK_TOP(stack, top, on_error) \
261 if (stack.index == 0) { \
262 if (stack.current->prev == NULL) on_error; \
263 top = &(stack.current->prev->items[B_STACK_PAGE_SIZE - 1]); \
264 } else { \
265 top = &(stack.current->items[stack.index - 1]); \
266 }
267
268 /* Test to see if the stack is empty */
269
270 #define STACK_EMPTY(stack) ((stack.index == 0) && (stack.current->prev == NULL))
271
272 /* Return the start of register 'reg' */
273
274 #define GET_REG_START(state, reg) (state.start[reg])
275
276 /* Return the end of register 'reg' */
277
278 #define GET_REG_END(state, reg) (state.end[reg])
279
280 /* Set the start of register 'reg'. If the state of the register needs
281 * saving, push it on the stack. */
282
283 #define SET_REG_START(state, reg, text, on_error) \
284 if (state.changed[reg] < state.level) { \
285 item_t* item; \
286 STACK_NEXT(state.stack, item, on_error); \
287 item->reg.num = reg; \
288 item->reg.start = state.start[reg]; \
289 item->reg.end = state.end[reg]; \
290 item->reg.level = state.changed[reg]; \
291 state.changed[reg] = state.level; \
292 state.count++; \
293 } \
294 state.start[reg] = text
295
296 /* Set the end of register 'reg'. If the state of the register needs
297 * saving, push it on the stack. */
298
299 #define SET_REG_END(state, reg, text, on_error) \
300 if (state.changed[reg] < state.level) { \
301 item_t* item; \
302 STACK_NEXT(state.stack, item, on_error); \
303 item->reg.num = reg; \
304 item->reg.start = state.start[reg]; \
305 item->reg.end = state.end[reg]; \
306 item->reg.level = state.changed[reg]; \
307 state.changed[reg] = state.level; \
308 state.count++; \
309 } \
310 state.end[reg] = text
311
312 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
313 { \
314 item_t* item; \
315 STACK_NEXT(state.stack, item, on_error); \
316 item->fail.code = xcode; \
317 item->fail.text = xtext; \
318 item->fail.count = state.count; \
319 item->fail.level = state.level; \
320 item->fail.phantom = 0; \
321 state.count = 0; \
322 state.level++; \
323 state.point++; \
324 }
325
326 /* Update the last failure point with a new position in the text. */
327
328 #define UPDATE_FAILURE(state, xtext, on_error) \
329 { \
330 item_t* item; \
331 STACK_BACK(state.stack, item, state.count + 1, on_error); \
332 if (!item->fail.phantom) { \
333 item_t* item2; \
334 STACK_NEXT(state.stack, item2, on_error); \
335 item2->fail.code = item->fail.code; \
336 item2->fail.text = xtext; \
337 item2->fail.count = state.count; \
338 item2->fail.level = state.level; \
339 item2->fail.phantom = 1; \
340 state.count = 0; \
341 state.level++; \
342 state.point++; \
343 } else { \
344 STACK_DISCARD(state.stack, state.count, on_error); \
345 STACK_TOP(state.stack, item, on_error); \
346 item->fail.text = xtext; \
347 state.count = 0; \
348 state.level++; \
349 } \
350 }
351
352 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
353 { \
354 item_t* item; \
355 do { \
356 while (state.count > 0) { \
357 STACK_PREV(state.stack, item, on_error); \
358 state.start[item->reg.num] = item->reg.start; \
359 state.end[item->reg.num] = item->reg.end; \
360 state.changed[item->reg.num] = item->reg.level; \
361 state.count--; \
362 } \
363 STACK_PREV(state.stack, item, on_empty); \
364 xcode = item->fail.code; \
365 xtext = item->fail.text; \
366 state.count = item->fail.count; \
367 state.level = item->fail.level; \
368 state.point--; \
369 } while (item->fail.text == NULL); \
370 }
371
372 enum regexp_compiled_ops
373 { /* opcodes for compiled regexp */
374 Cend, /* end of pattern reached */
375 Cbol, /* beginning of line */
376 Ceol, /* end of line */
377 Cset, /* character set. Followed by 32 bytes of set. */
378 Cexact, /* followed by a byte to match */
379 Canychar, /* matches any character except newline */
380 Cstart_memory, /* set register start addr (followed by reg number) */
381 Cend_memory, /* set register end addr (followed by reg number) */
382 Cmatch_memory, /* match a duplicate of reg contents (regnum follows) */
383 Cjump, /* followed by two bytes (lsb,msb) of displacement. */
384 Cstar_jump, /* will change to jump/update_failure_jump at runtime */
385 Cfailure_jump, /* jump to addr on failure */
386 Cupdate_failure_jump, /* update topmost failure point and jump */
387 Cdummy_failure_jump, /* push a dummy failure point and jump */
388 Cbegbuf, /* match at beginning of buffer */
389 Cendbuf, /* match at end of buffer */
390 Cwordbeg, /* match at beginning of word */
391 Cwordend, /* match at end of word */
392 Cwordbound, /* match if at word boundary */
393 Cnotwordbound, /* match if not at word boundary */
394 Csyntaxspec, /* matches syntax code (1 byte follows) */
395 Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
396 Crepeat1
397 };
398
399 enum regexp_syntax_op
400 { /* syntax codes for plain and quoted characters */
401 Rend, /* special code for end of regexp */
402 Rnormal, /* normal character */
403 Ranychar, /* any character except newline */
404 Rquote, /* the quote character */
405 Rbol, /* match beginning of line */
406 Reol, /* match end of line */
407 Roptional, /* match preceding expression optionally */
408 Rstar, /* match preceding expr zero or more times */
409 Rplus, /* match preceding expr one or more times */
410 Ror, /* match either of alternatives */
411 Ropenpar, /* opening parenthesis */
412 Rclosepar, /* closing parenthesis */
413 Rmemory, /* match memory register */
414 Rextended_memory, /* \vnn to match registers 10-99 */
415 Ropenset, /* open set. Internal syntax hard-coded below. */
416 /* the following are gnu extensions to "normal" regexp syntax */
417 Rbegbuf, /* beginning of buffer */
418 Rendbuf, /* end of buffer */
419 Rwordchar, /* word character */
420 Rnotwordchar, /* not word character */
421 Rwordbeg, /* beginning of word */
422 Rwordend, /* end of word */
423 Rwordbound, /* word bound */
424 Rnotwordbound, /* not word bound */
425 Rnum_ops
426 };
427
428 static int re_compile_initialized = 0;
429 static int regexp_syntax = RE_SYNTAX_EGREP;
430 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
431 static unsigned char plain_ops[256];
432 static unsigned char quoted_ops[256];
433 static unsigned char precedences[Rnum_ops];
434 static int regexp_context_indep_ops;
435 static int regexp_ansi_sequences;
436
437 #define NUM_LEVELS 5 /* number of precedence levels in use */
438 #define MAX_NESTING 100 /* max nesting level of operators */
439
440 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
441
442 unsigned char re_syntax_table[256];
443
ReCompileInitialize(void)444 void ReCompileInitialize(void)
445 {
446 int a;
447
448 static int syntax_table_inited = 0;
449
450 if (!syntax_table_inited) {
451 syntax_table_inited = 1;
452 memset(re_syntax_table, 0, 256);
453 for (a = 'a'; a <= 'z'; a++) re_syntax_table[a] = Sword;
454 for (a = 'A'; a <= 'Z'; a++) re_syntax_table[a] = Sword;
455 for (a = '0'; a <= '9'; a++)
456 re_syntax_table[a] = Sword | Sdigit | Shexdigit;
457 for (a = '0'; a <= '7'; a++) re_syntax_table[a] |= Soctaldigit;
458 for (a = 'A'; a <= 'F'; a++) re_syntax_table[a] |= Shexdigit;
459 for (a = 'a'; a <= 'f'; a++) re_syntax_table[a] |= Shexdigit;
460 re_syntax_table[(int)'_'] = Sword;
461 for (a = 9; a <= 13; a++) re_syntax_table[a] = Swhitespace;
462 re_syntax_table[(int)' '] = Swhitespace;
463 }
464 re_compile_initialized = 1;
465 for (a = 0; a < 256; a++) {
466 plain_ops[a] = Rnormal;
467 quoted_ops[a] = Rnormal;
468 }
469 for (a = '0'; a <= '9'; a++) quoted_ops[a] = Rmemory;
470 plain_ops[(int)'\134'] = Rquote;
471 if (regexp_syntax & RE_NO_BK_PARENS) {
472 plain_ops[(int)'('] = Ropenpar;
473 plain_ops[(int)')'] = Rclosepar;
474 } else {
475 quoted_ops[(int)'('] = Ropenpar;
476 quoted_ops[(int)')'] = Rclosepar;
477 }
478 if (regexp_syntax & RE_NO_BK_VBAR) {
479 plain_ops[(int)'\174'] = Ror;
480 } else {
481 quoted_ops[(int)'\174'] = Ror;
482 }
483 plain_ops[(int)'*'] = Rstar;
484 if (regexp_syntax & RE_BK_PLUS_QM) {
485 quoted_ops[(int)'+'] = Rplus;
486 quoted_ops[(int)'?'] = Roptional;
487 } else {
488 plain_ops[(int)'+'] = Rplus;
489 plain_ops[(int)'?'] = Roptional;
490 }
491 if (regexp_syntax & RE_NEWLINE_OR) { plain_ops[(int)'\n'] = Ror; }
492 plain_ops[(int)'\133'] = Ropenset;
493 plain_ops[(int)'\136'] = Rbol;
494 plain_ops[(int)'$'] = Reol;
495 plain_ops[(int)'.'] = Ranychar;
496 if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
497 quoted_ops[(int)'w'] = Rwordchar;
498 quoted_ops[(int)'W'] = Rnotwordchar;
499 quoted_ops[(int)'<'] = Rwordbeg;
500 quoted_ops[(int)'>'] = Rwordend;
501 quoted_ops[(int)'b'] = Rwordbound;
502 quoted_ops[(int)'B'] = Rnotwordbound;
503 quoted_ops[(int)'`'] = Rbegbuf;
504 quoted_ops[(int)'\''] = Rendbuf;
505 }
506 if (regexp_syntax & RE_ANSI_HEX) { quoted_ops[(int)'v'] = Rextended_memory; }
507 for (a = 0; a < Rnum_ops; a++) { precedences[a] = 4; }
508 if (regexp_syntax & RE_TIGHT_VBAR) {
509 precedences[Ror] = 3;
510 precedences[Rbol] = 2;
511 precedences[Reol] = 2;
512 } else {
513 precedences[Ror] = 2;
514 precedences[Rbol] = 3;
515 precedences[Reol] = 3;
516 }
517 precedences[Rclosepar] = 1;
518 precedences[Rend] = 0;
519 regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
520 regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
521 }
522
ReSetSyntax(int syntax)523 int ReSetSyntax(int syntax)
524 {
525 int ret;
526
527 ret = regexp_syntax;
528 regexp_syntax = syntax;
529 re_syntax = syntax; /* Exported copy */
530 ReCompileInitialize();
531 return ret;
532 }
533
HexCharToDecimal(int ch)534 static int HexCharToDecimal(int ch)
535 {
536 if (ch >= '0' && ch <= '9') return ch - '0';
537 if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
538 if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
539 return 16;
540 }
541
re_compile_fastmap_aux(regex_t * bufp,unsigned char * code,int pos,unsigned char * visited,unsigned char * can_be_null,unsigned char * fastmap)542 static void re_compile_fastmap_aux(regex_t* bufp,
543 unsigned char* code,
544 int pos,
545 unsigned char* visited,
546 unsigned char* can_be_null,
547 unsigned char* fastmap)
548 {
549 int a;
550 int b;
551 int syntaxcode;
552
553 if (visited[pos]) return; /* we have already been here */
554 visited[pos] = 1;
555 for (;;) {
556 switch (code[pos++]) {
557 case Cend:
558 *can_be_null = 1;
559 return;
560 case Cbol:
561 case Cbegbuf:
562 case Cendbuf:
563 case Cwordbeg:
564 case Cwordend:
565 case Cwordbound:
566 case Cnotwordbound:
567 for (a = 0; a < 256; a++) fastmap[a] = 1;
568 break;
569 case Csyntaxspec:
570 syntaxcode = code[pos++];
571 for (a = 0; a < 256; a++)
572 if (SYNTAX(a) & syntaxcode) fastmap[a] = 1;
573 return;
574 case Cnotsyntaxspec:
575 syntaxcode = code[pos++];
576 for (a = 0; a < 256; a++)
577 if (!(SYNTAX(a) & syntaxcode)) fastmap[a] = 1;
578 return;
579 case Ceol:
580 fastmap[(int)'\n'] = 1;
581 if (*can_be_null == 0)
582 *can_be_null = 2; /* can match null, but only at end of buffer */
583 return;
584 case Cset:
585 for (a = 0; a < 256 / 8; a++)
586 if (code[pos + a] != 0)
587 for (b = 0; b < 8; b++)
588 if (code[pos + a] & (1 << b)) fastmap[(a << 3) + b] = 1;
589 pos += 256 / 8;
590 return;
591 case Cexact:
592 fastmap[(unsigned char)code[pos]] = 1;
593 return;
594 case Canychar:
595 for (a = 0; a < 256; a++)
596 if (a != '\n') fastmap[a] = 1;
597 return;
598 case Cstart_memory:
599 case Cend_memory:
600 pos++;
601 break;
602 case Cmatch_memory:
603 for (a = 0; a < 256; a++) fastmap[a] = 1;
604 *can_be_null = 1;
605 return;
606 case Cjump:
607 case Cdummy_failure_jump:
608 case Cupdate_failure_jump:
609 case Cstar_jump:
610 a = (unsigned char)code[pos++];
611 a |= (unsigned char)code[pos++] << 8;
612 pos += (int)SHORT(a);
613 if (visited[pos]) {
614 /* argh... the regexp contains empty loops. This is not
615 good, as this may cause a failure stack overflow when
616 matching. Oh well. */
617 /* this path leads nowhere; pursue other paths. */
618 return;
619 }
620 visited[pos] = 1;
621 break;
622 case Cfailure_jump:
623 a = (unsigned char)code[pos++];
624 a |= (unsigned char)code[pos++] << 8;
625 a = pos + (int)SHORT(a);
626 re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
627 break;
628 case Crepeat1:
629 pos += 2;
630 break;
631 default:
632 set_error("Unknown regex opcode: memory corrupted?");
633 return;
634 }
635 }
636 }
637
re_do_compile_fastmap(regex_t * bufp,unsigned char * buffer,int used,int pos,unsigned char * can_be_null,unsigned char * fastmap)638 static int re_do_compile_fastmap(regex_t* bufp,
639 unsigned char* buffer,
640 int used,
641 int pos,
642 unsigned char* can_be_null,
643 unsigned char* fastmap)
644 {
645 unsigned char small_visited[512], *visited;
646
647 if (used <= (int)sizeof(small_visited))
648 visited = small_visited;
649 else {
650 visited = (unsigned char*)malloc(used);
651 if (!visited) return 0;
652 }
653 *can_be_null = 0;
654 memset(fastmap, 0, 256);
655 memset(visited, 0, used);
656 re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
657 if (visited != small_visited) free(visited);
658 return 1;
659 }
660
ReCompileFastmap(regex_t * bufp)661 void ReCompileFastmap(regex_t* bufp)
662 {
663 if (!bufp->fastmap || bufp->fastmap_accurate) return;
664 // assert(bufp->used > 0);
665 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used, 0,
666 &bufp->can_be_null, bufp->fastmap))
667 return;
668 if (got_error) return;
669 if (bufp->buffer[0] == Cbol)
670 bufp->anchor = 1; /* begline */
671 else if (bufp->buffer[0] == Cbegbuf)
672 bufp->anchor = 2; /* begbuf */
673 else
674 bufp->anchor = 0; /* none */
675 bufp->fastmap_accurate = 1;
676 }
677
678 /*
679 * star is coded as:
680 * 1: failure_jump 2
681 * ... code for operand of star
682 * star_jump 1
683 * 2: ... code after star
684 *
685 * We change the star_jump to update_failure_jump if we can determine
686 * that it is safe to do so; otherwise we change it to an ordinary
687 * jump.
688 *
689 * plus is coded as
690 *
691 * jump 2
692 * 1: failure_jump 3
693 * 2: ... code for operand of plus
694 * star_jump 1
695 * 3: ... code after plus
696 *
697 * For star_jump considerations this is processed identically to star.
698 *
699 */
700
ReOptimizeStarJump(regex_t * bufp,unsigned char * code)701 static int ReOptimizeStarJump(regex_t* bufp, unsigned char* code)
702 {
703 unsigned char map[256];
704 unsigned char can_be_null;
705 unsigned char* p1;
706 unsigned char* p2;
707 unsigned char ch;
708 int a;
709 int b;
710 int num_instructions = 0;
711
712 a = (unsigned char)*code++;
713 a |= (unsigned char)*code++ << 8;
714 a = (int)SHORT(a);
715
716 p1 = code + a + 3; /* skip the failure_jump */
717 /* Check that the jump is within the pattern */
718 if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
719 set_error("Regex VM jump out of bounds (failure_jump opt)");
720 return 0;
721 }
722 // assert(p1[-3] == Cfailure_jump);
723 p2 = code;
724 /* p1 points inside loop, p2 points to after loop */
725 if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
726 (int)(p2 - bufp->buffer), &can_be_null, map))
727 goto make_normal_jump;
728
729 /* If we might introduce a new update point inside the
730 * loop, we can't optimize because then update_jump would
731 * update a wrong failure point. Thus we have to be
732 * quite careful here.
733 */
734
735 /* loop until we find something that consumes a character */
736 for (;;) {
737 num_instructions++;
738 switch (*p1++) {
739 case Cbol:
740 case Ceol:
741 case Cbegbuf:
742 case Cendbuf:
743 case Cwordbeg:
744 case Cwordend:
745 case Cwordbound:
746 case Cnotwordbound:
747 continue;
748 case Cstart_memory:
749 case Cend_memory:
750 p1++;
751 continue;
752 case Cexact:
753 ch = (unsigned char)*p1++;
754 if (map[(int)ch]) goto make_normal_jump;
755 break;
756 case Canychar:
757 for (b = 0; b < 256; b++)
758 if (b != '\n' && map[b]) goto make_normal_jump;
759 break;
760 case Cset:
761 for (b = 0; b < 256; b++)
762 if ((p1[b >> 3] & (1 << (b & 7))) && map[b]) goto make_normal_jump;
763 p1 += 256 / 8;
764 break;
765 default:
766 goto make_normal_jump;
767 }
768 break;
769 }
770 /* now we know that we can't backtrack. */
771 while (p1 != p2 - 3) {
772 num_instructions++;
773 switch (*p1++) {
774 case Cend:
775 return 0;
776 case Cbol:
777 case Ceol:
778 case Canychar:
779 case Cbegbuf:
780 case Cendbuf:
781 case Cwordbeg:
782 case Cwordend:
783 case Cwordbound:
784 case Cnotwordbound:
785 break;
786 case Cset:
787 p1 += 256 / 8;
788 break;
789 case Cexact:
790 case Cstart_memory:
791 case Cend_memory:
792 case Cmatch_memory:
793 case Csyntaxspec:
794 case Cnotsyntaxspec:
795 p1++;
796 break;
797 case Cjump:
798 case Cstar_jump:
799 case Cfailure_jump:
800 case Cupdate_failure_jump:
801 case Cdummy_failure_jump:
802 goto make_normal_jump;
803 default:
804 return 0;
805 }
806 }
807
808 /* make_update_jump: */
809 code -= 3;
810 a += 3; /* jump to after the Cfailure_jump */
811 code[0] = Cupdate_failure_jump;
812 code[1] = a & 0xff;
813 code[2] = a >> 8;
814 if (num_instructions > 1) return 1;
815 // assert(num_instructions == 1);
816 /* if the only instruction matches a single character, we can do
817 * better */
818 p1 = code + 3 + a; /* start of sole instruction */
819 if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar || *p1 == Csyntaxspec
820 || *p1 == Cnotsyntaxspec)
821 code[0] = Crepeat1;
822 return 1;
823
824 make_normal_jump:
825 code -= 3;
826 *code = Cjump;
827 return 1;
828 }
829
ReOptimize(regex_t * bufp)830 static int ReOptimize(regex_t* bufp)
831 {
832 unsigned char* code;
833
834 code = bufp->buffer;
835
836 while (1) {
837 switch (*code++) {
838 case Cend:
839 return 1;
840 case Canychar:
841 case Cbol:
842 case Ceol:
843 case Cbegbuf:
844 case Cendbuf:
845 case Cwordbeg:
846 case Cwordend:
847 case Cwordbound:
848 case Cnotwordbound:
849 break;
850 case Cset:
851 code += 256 / 8;
852 break;
853 case Cexact:
854 case Cstart_memory:
855 case Cend_memory:
856 case Cmatch_memory:
857 case Csyntaxspec:
858 case Cnotsyntaxspec:
859 code++;
860 break;
861 case Cstar_jump:
862 if (!ReOptimizeStarJump(bufp, code)) { return 0; }
863 /* fall through */
864 case Cupdate_failure_jump:
865 case Cjump:
866 case Cdummy_failure_jump:
867 case Cfailure_jump:
868 case Crepeat1:
869 code += 2;
870 break;
871 default:
872 return 0;
873 }
874 }
875 }
876
877 #define NEXTCHAR(var) \
878 { \
879 if (pos >= size) goto ends_prematurely; \
880 (var) = regex[pos]; \
881 pos++; \
882 }
883
884 #define ALLOC(amount) \
885 { \
886 if (pattern_offset + (amount) > alloc) { \
887 alloc += 256 + (amount); \
888 pattern = (unsigned char*)realloc(pattern, alloc); \
889 if (!pattern) goto out_of_memory; \
890 } \
891 }
892
893 #define STORE(ch) pattern[pattern_offset++] = (ch)
894
895 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
896
897 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
898
899 #define PUSH_LEVEL_STARTS \
900 if (starts_base < (MAX_NESTING - 1) * NUM_LEVELS) \
901 starts_base += NUM_LEVELS; \
902 else \
903 goto too_complex
904
905 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
906
907 #define PUT_ADDR(offset, addr) \
908 { \
909 int disp = (addr) - (offset)-2; \
910 pattern[(offset)] = disp & 0xff; \
911 pattern[(offset) + 1] = (disp >> 8) & 0xff; \
912 }
913
914 #define INSERT_JUMP(pos, type, addr) \
915 { \
916 int a, p = (pos), t = (type), ad = (addr); \
917 for (a = pattern_offset - 1; a >= p; a--) pattern[a + 3] = pattern[a]; \
918 pattern[p] = t; \
919 PUT_ADDR(p + 1, ad); \
920 pattern_offset += 3; \
921 }
922
923 #define SETBIT(buf, offset, bit) (buf)[(offset) + (bit) / 8] |= (1 << ((bit)&7))
924
925 #define SET_FIELDS \
926 { \
927 bufp->allocated = alloc; \
928 bufp->buffer = pattern; \
929 bufp->used = pattern_offset; \
930 }
931
932 #define GETHEX(var) \
933 { \
934 unsigned char gethex_ch, gethex_value; \
935 NEXTCHAR(gethex_ch); \
936 gethex_value = HexCharToDecimal(gethex_ch); \
937 if (gethex_value == 16) goto hex_error; \
938 NEXTCHAR(gethex_ch); \
939 gethex_ch = HexCharToDecimal(gethex_ch); \
940 if (gethex_ch == 16) goto hex_error; \
941 (var) = gethex_value * 16 + gethex_ch; \
942 }
943
944 #define ANSI_TRANSLATE(ch) \
945 { \
946 switch (ch) { \
947 case 'a': \
948 case 'A': { \
949 ch = 7; /* audible bell */ \
950 break; \
951 } \
952 case 'b': \
953 case 'B': { \
954 ch = 8; /* backspace */ \
955 break; \
956 } \
957 case 'f': \
958 case 'F': { \
959 ch = 12; /* form feed */ \
960 break; \
961 } \
962 case 'n': \
963 case 'N': { \
964 ch = 10; /* line feed */ \
965 break; \
966 } \
967 case 'r': \
968 case 'R': { \
969 ch = 13; /* carriage return */ \
970 break; \
971 } \
972 case 't': \
973 case 'T': { \
974 ch = 9; /* tab */ \
975 break; \
976 } \
977 case 'v': \
978 case 'V': { \
979 ch = 11; /* vertical tab */ \
980 break; \
981 } \
982 case 'x': /* hex code */ \
983 case 'X': { \
984 GETHEX(ch); \
985 break; \
986 } \
987 default: { \
988 /* other characters passed through */ \
989 if (translate) ch = translate[(unsigned char)ch]; \
990 break; \
991 } \
992 } \
993 }
994
re_compile_pattern(regex_t * bufp,unsigned char * regex)995 const char* re_compile_pattern(regex_t* bufp, unsigned char* regex)
996 {
997 int a;
998 int pos;
999 int op;
1000 int current_level;
1001 int level;
1002 int opcode;
1003 int pattern_offset = 0, alloc;
1004 int starts[NUM_LEVELS * MAX_NESTING];
1005 int starts_base;
1006 int future_jumps[MAX_NESTING];
1007 int num_jumps;
1008 unsigned char ch = '\0';
1009 unsigned char* pattern;
1010 unsigned char* translate;
1011 int next_register;
1012 int paren_depth;
1013 int num_open_registers;
1014 int open_registers[RE_NREGS];
1015 int beginning_context;
1016 int size = strlen((char*)regex);
1017
1018 if (!re_compile_initialized) ReCompileInitialize();
1019 bufp->used = 0;
1020 bufp->fastmap_accurate = 0;
1021 bufp->uses_registers = 1;
1022 bufp->num_registers = 1;
1023 translate = bufp->translate;
1024 pattern = bufp->buffer;
1025 alloc = bufp->allocated;
1026 if (alloc == 0 || pattern == NULL) {
1027 alloc = 256;
1028 bufp->buffer = pattern = (unsigned char*)malloc(alloc);
1029 if (!pattern) goto out_of_memory;
1030 }
1031 pattern_offset = 0;
1032 starts_base = 0;
1033 num_jumps = 0;
1034 current_level = 0;
1035 SET_LEVEL_START;
1036 num_open_registers = 0;
1037 next_register = 1;
1038 paren_depth = 0;
1039 beginning_context = 1;
1040 op = -1;
1041 /* we use Rend dummy to ensure that pending jumps are updated
1042 (due to low priority of Rend) before exiting the loop. */
1043 pos = 0;
1044 while (op != Rend) {
1045 if (pos >= size)
1046 op = Rend;
1047 else {
1048 NEXTCHAR(ch);
1049 if (translate) ch = translate[(unsigned char)ch];
1050 op = plain_ops[(unsigned char)ch];
1051 if (op == Rquote) {
1052 NEXTCHAR(ch);
1053 op = quoted_ops[(unsigned char)ch];
1054 if (op == Rnormal && regexp_ansi_sequences) ANSI_TRANSLATE(ch);
1055 }
1056 }
1057 level = precedences[op];
1058 /* printf("ch='%c' op=%d level=%d current_level=%d
1059 curlevstart=%d\n", ch, op, level, current_level,
1060 CURRENT_LEVEL_START); */
1061 if (level > current_level) {
1062 for (current_level++; current_level < level; current_level++)
1063 SET_LEVEL_START;
1064 SET_LEVEL_START;
1065 } else if (level < current_level) {
1066 current_level = level;
1067 for (;
1068 num_jumps > 0 && future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START;
1069 num_jumps--)
1070 PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1071 }
1072 switch (op) {
1073 case Rend:
1074 break;
1075 case Rnormal:
1076 normal_char:
1077 opcode = Cexact;
1078 store_opcode_and_arg: /* opcode & ch must be set */
1079 SET_LEVEL_START;
1080 ALLOC(2);
1081 STORE(opcode);
1082 STORE(ch);
1083 break;
1084 case Ranychar:
1085 opcode = Canychar;
1086 store_opcode:
1087 SET_LEVEL_START;
1088 ALLOC(1);
1089 STORE(opcode);
1090 break;
1091 case Rquote:
1092 set_error("Rquote");
1093 /* NOTREACHED */
1094 break;
1095 case Rbol:
1096 if (!beginning_context) {
1097 if (regexp_context_indep_ops)
1098 goto op_error;
1099 else
1100 goto normal_char;
1101 }
1102 opcode = Cbol;
1103 goto store_opcode;
1104 case Reol:
1105 if (!((pos >= size)
1106 || ((regexp_syntax & RE_NO_BK_VBAR)
1107 ? (regex[pos] == '\174')
1108 : (pos + 1 < size && regex[pos] == '\134'
1109 && regex[pos + 1] == '\174'))
1110 || ((regexp_syntax & RE_NO_BK_PARENS)
1111 ? (regex[pos] == ')')
1112 : (pos + 1 < size && regex[pos] == '\134'
1113 && regex[pos + 1] == ')')))) {
1114 if (regexp_context_indep_ops)
1115 goto op_error;
1116 else
1117 goto normal_char;
1118 }
1119 opcode = Ceol;
1120 goto store_opcode;
1121 /* NOTREACHED */
1122 break;
1123 case Roptional:
1124 if (beginning_context) {
1125 if (regexp_context_indep_ops)
1126 goto op_error;
1127 else
1128 goto normal_char;
1129 }
1130 if (CURRENT_LEVEL_START == pattern_offset)
1131 break; /* ignore empty patterns for ? */
1132 ALLOC(3);
1133 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1134 break;
1135 case Rstar:
1136 case Rplus:
1137 if (beginning_context) {
1138 if (regexp_context_indep_ops)
1139 goto op_error;
1140 else
1141 goto normal_char;
1142 }
1143 if (CURRENT_LEVEL_START == pattern_offset)
1144 break; /* ignore empty patterns for + and * */
1145 ALLOC(9);
1146 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1147 INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1148 if (op == Rplus) /* jump over initial failure_jump */
1149 INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1150 CURRENT_LEVEL_START + 6);
1151 break;
1152 case Ror:
1153 ALLOC(6);
1154 INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1155 if (num_jumps >= MAX_NESTING) goto too_complex;
1156 STORE(Cjump);
1157 future_jumps[num_jumps++] = pattern_offset;
1158 STORE(0);
1159 STORE(0);
1160 SET_LEVEL_START;
1161 break;
1162 case Ropenpar: {
1163 SET_LEVEL_START;
1164 if (next_register < RE_NREGS) {
1165 bufp->uses_registers = 1;
1166 ALLOC(2);
1167 STORE(Cstart_memory);
1168 STORE(next_register);
1169 open_registers[num_open_registers++] = next_register;
1170 bufp->num_registers++;
1171 next_register++;
1172 }
1173 paren_depth++;
1174 PUSH_LEVEL_STARTS;
1175 current_level = 0;
1176 SET_LEVEL_START;
1177 break;
1178 }
1179 case Rclosepar:
1180 if (paren_depth <= 0) goto parenthesIsError;
1181 POP_LEVEL_STARTS;
1182 current_level = precedences[Ropenpar];
1183 paren_depth--;
1184 if (paren_depth < num_open_registers) {
1185 bufp->uses_registers = 1;
1186 ALLOC(2);
1187 STORE(Cend_memory);
1188 num_open_registers--;
1189 STORE(open_registers[num_open_registers]);
1190 }
1191 break;
1192 case Rmemory:
1193 if (ch == '0') goto bad_match_register;
1194 if (!(ch >= '0' && ch <= '9')) { goto bad_match_register; }
1195 bufp->uses_registers = 1;
1196 opcode = Cmatch_memory;
1197 ch -= '0';
1198 goto store_opcode_and_arg;
1199 case Rextended_memory:
1200 NEXTCHAR(ch);
1201 if (ch < '0' || ch > '9') goto bad_match_register;
1202 NEXTCHAR(a);
1203 if (a < '0' || a > '9') goto bad_match_register;
1204 ch = 10 * (a - '0') + ch - '0';
1205 if (ch == 0 || ch >= RE_NREGS) goto bad_match_register;
1206 bufp->uses_registers = 1;
1207 opcode = Cmatch_memory;
1208 goto store_opcode_and_arg;
1209 case Ropenset: {
1210 int complement;
1211 int prev;
1212 int offset;
1213 int range;
1214 int firstchar;
1215
1216 SET_LEVEL_START;
1217 ALLOC(1 + 256 / 8);
1218 STORE(Cset);
1219 offset = pattern_offset;
1220 for (a = 0; a < 256 / 8; a++) STORE(0);
1221 NEXTCHAR(ch);
1222 if (translate) ch = translate[(unsigned char)ch];
1223 if (ch == '\136') {
1224 complement = 1;
1225 NEXTCHAR(ch);
1226 if (translate) ch = translate[(unsigned char)ch];
1227 } else
1228 complement = 0;
1229 prev = -1;
1230 range = 0;
1231 firstchar = 1;
1232 while (ch != '\135' || firstchar) {
1233 firstchar = 0;
1234 if (regexp_ansi_sequences && ch == '\134') {
1235 NEXTCHAR(ch);
1236 ANSI_TRANSLATE(ch);
1237 }
1238 if (range) {
1239 for (a = prev; a <= (int)ch; a++) SETBIT(pattern, offset, a);
1240 prev = -1;
1241 range = 0;
1242 } else if (prev != -1 && ch == '-')
1243 range = 1;
1244 else {
1245 SETBIT(pattern, offset, ch);
1246 prev = ch;
1247 }
1248 NEXTCHAR(ch);
1249 if (translate) ch = translate[(unsigned char)ch];
1250 }
1251 if (range) SETBIT(pattern, offset, '-');
1252 if (complement) {
1253 for (a = 0; a < 256 / 8; a++) pattern[offset + a] ^= 0xff;
1254 }
1255 break;
1256 }
1257 case Rbegbuf: {
1258 opcode = Cbegbuf;
1259 goto store_opcode;
1260 }
1261 case Rendbuf: {
1262 opcode = Cendbuf;
1263 goto store_opcode;
1264 }
1265 case Rwordchar: {
1266 opcode = Csyntaxspec;
1267 ch = Sword;
1268 goto store_opcode_and_arg;
1269 }
1270 case Rnotwordchar: {
1271 opcode = Cnotsyntaxspec;
1272 ch = Sword;
1273 goto store_opcode_and_arg;
1274 }
1275 case Rwordbeg: {
1276 opcode = Cwordbeg;
1277 goto store_opcode;
1278 }
1279 case Rwordend: {
1280 opcode = Cwordend;
1281 goto store_opcode;
1282 }
1283 case Rwordbound: {
1284 opcode = Cwordbound;
1285 goto store_opcode;
1286 }
1287 case Rnotwordbound: {
1288 opcode = Cnotwordbound;
1289 goto store_opcode;
1290 }
1291 default: {
1292 abort();
1293 }
1294 }
1295 beginning_context = (op == Ropenpar || op == Ror);
1296 }
1297 if (starts_base != 0) goto parenthesIsError;
1298 // assert(num_jumps == 0);
1299 ALLOC(1);
1300 STORE(Cend);
1301 SET_FIELDS;
1302 if (!ReOptimize(bufp)) return "Optimization error";
1303 return NULL;
1304
1305 op_error:
1306 SET_FIELDS;
1307 return "Badly placed special character";
1308
1309 bad_match_register:
1310 SET_FIELDS;
1311 return "Bad match register number";
1312
1313 hex_error:
1314 SET_FIELDS;
1315 return "Bad hexadecimal number";
1316
1317 parenthesIsError:
1318 SET_FIELDS;
1319 return "Badly placed parenthesis";
1320
1321 out_of_memory:
1322 SET_FIELDS;
1323 return "Out of memory";
1324
1325 ends_prematurely:
1326 SET_FIELDS;
1327 return "Regular expression ends prematurely";
1328
1329 too_complex:
1330 SET_FIELDS;
1331 return "Regular expression too complex";
1332 }
1333
1334 #undef CHARAT
1335 #undef NEXTCHAR
1336 #undef GETHEX
1337 #undef ALLOC
1338 #undef STORE
1339 #undef CURRENT_LEVEL_START
1340 #undef SET_LEVEL_START
1341 #undef PUSH_LEVEL_STARTS
1342 #undef POP_LEVEL_STARTS
1343 #undef PUT_ADDR
1344 #undef INSERT_JUMP
1345 #undef SETBIT
1346 #undef SET_FIELDS
1347
1348 #define PREFETCH \
1349 if (text == textend) goto fail
1350
1351 #define NEXTCHAR(var) \
1352 PREFETCH; \
1353 var = (unsigned char)*text++; \
1354 if (translate) var = translate[var]
1355
regcomp(regex_t * bufp,const char * regex,int cflags)1356 int regcomp(regex_t* bufp, const char* regex, int cflags)
1357 {
1358 memset(bufp, 0, sizeof(regex_t));
1359 bufp->cflags = cflags;
1360 if (bufp->cflags & REG_ICASE) {
1361 char *p, *lcase = strdup(regex);
1362 for (p = lcase; *p; p++) { *p = tolower(*p); }
1363 re_compile_pattern(bufp, (unsigned char*)lcase);
1364 free(lcase);
1365 } else {
1366 re_compile_pattern(bufp, (unsigned char*)regex);
1367 }
1368 if (got_error) { return -1; }
1369 return 0;
1370 }
1371
re_registers_to_regmatch(regexp_registers_t old_regs,regmatch_t pmatch[],size_t nmatch)1372 void re_registers_to_regmatch(regexp_registers_t old_regs,
1373 regmatch_t pmatch[],
1374 size_t nmatch)
1375 {
1376 if (!(nmatch == 0 && pmatch == NULL)) {
1377 size_t i = 0;
1378
1379 /*
1380 * We have to set the last entry to -1
1381 */
1382 nmatch = nmatch - 1;
1383 for (i = 0; (i < nmatch) && (old_regs->start[i] > -1); i++) {
1384 pmatch[i].rm_so = old_regs->start[i];
1385 pmatch[i].rm_eo = old_regs->end[i];
1386 }
1387
1388 pmatch[i].rm_eo = pmatch[i].rm_so = -1;
1389 }
1390 }
1391
regexec(regex_t * preg,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)1392 int regexec(regex_t* preg,
1393 const char* string,
1394 size_t nmatch,
1395 regmatch_t pmatch[],
1396 int eflags)
1397 {
1398 int status;
1399 int len = strlen(string);
1400 re_registers regs{};
1401
1402 status = ReSearch(preg, (unsigned char*)string, len, 0, len, ®s);
1403 if (status >= 0) { re_registers_to_regmatch(®s, pmatch, nmatch); }
1404
1405 /*
1406 * status is the start position in the string base 0 where
1407 * the pattern was found or negative if not found.
1408 */
1409 return status < 0 ? -1 : 0;
1410 }
1411
regerror(int errcode,regex_t * preg,char * errbuf,size_t errbuf_size)1412 size_t regerror(int errcode, regex_t* preg, char* errbuf, size_t errbuf_size)
1413 {
1414 bstrncpy(errbuf, preg->errmsg, errbuf_size);
1415 return 0;
1416 }
1417
regfree(regex_t * preg)1418 void regfree(regex_t* preg)
1419 {
1420 if (preg->lcase) {
1421 FreePoolMemory(preg->lcase);
1422 preg->lcase = NULL;
1423 }
1424 if (preg->buffer) {
1425 free(preg->buffer);
1426 preg->buffer = NULL;
1427 }
1428 }
1429
ReMatch(regex_t * bufp,unsigned char * string,int size,int pos,regexp_registers_t old_regs)1430 int ReMatch(regex_t* bufp,
1431 unsigned char* string,
1432 int size,
1433 int pos,
1434 regexp_registers_t old_regs)
1435 {
1436 unsigned char* code;
1437 unsigned char* translate;
1438 unsigned char* text;
1439 unsigned char* textstart;
1440 unsigned char* textend;
1441 int a;
1442 int b;
1443 int ch;
1444 int reg;
1445 int match_end;
1446 unsigned char* regstart;
1447 unsigned char* regend;
1448 int regsize;
1449 match_state state;
1450
1451 // assert(pos >= 0 && size >= 0);
1452 // assert(pos <= size);
1453
1454 text = string + pos;
1455 textstart = string;
1456 textend = string + size;
1457
1458 code = bufp->buffer;
1459
1460 translate = bufp->translate;
1461
1462 NEW_STATE(state, bufp->num_registers);
1463
1464 continue_matching:
1465 switch (*code++) {
1466 case Cend: {
1467 match_end = text - textstart;
1468 if (old_regs) {
1469 old_regs->start[0] = pos;
1470 old_regs->end[0] = match_end;
1471 if (!bufp->uses_registers) {
1472 for (a = 1; a < RE_NREGS; a++) {
1473 old_regs->start[a] = -1;
1474 old_regs->end[a] = -1;
1475 }
1476 } else {
1477 for (a = 1; a < bufp->num_registers; a++) {
1478 if ((GET_REG_START(state, a) == NULL)
1479 || (GET_REG_END(state, a) == NULL)) {
1480 old_regs->start[a] = -1;
1481 old_regs->end[a] = -1;
1482 continue;
1483 }
1484 old_regs->start[a] = GET_REG_START(state, a) - textstart;
1485 old_regs->end[a] = GET_REG_END(state, a) - textstart;
1486 }
1487 for (; a < RE_NREGS; a++) {
1488 old_regs->start[a] = -1;
1489 old_regs->end[a] = -1;
1490 }
1491 }
1492 }
1493 FREE_STATE(state);
1494 return match_end - pos;
1495 }
1496 case Cbol: {
1497 if (text == textstart || text[-1] == '\n') goto continue_matching;
1498 goto fail;
1499 }
1500 case Ceol: {
1501 if (text == textend || *text == '\n') goto continue_matching;
1502 goto fail;
1503 }
1504 case Cset: {
1505 NEXTCHAR(ch);
1506 if (code[ch / 8] & (1 << (ch & 7))) {
1507 code += 256 / 8;
1508 goto continue_matching;
1509 }
1510 goto fail;
1511 }
1512 case Cexact: {
1513 NEXTCHAR(ch);
1514 if (ch != (unsigned char)*code++) goto fail;
1515 goto continue_matching;
1516 }
1517 case Canychar: {
1518 NEXTCHAR(ch);
1519 if (ch == '\n') goto fail;
1520 goto continue_matching;
1521 }
1522 case Cstart_memory: {
1523 reg = *code++;
1524 SET_REG_START(state, reg, text, goto error);
1525 goto continue_matching;
1526 }
1527 case Cend_memory: {
1528 reg = *code++;
1529 SET_REG_END(state, reg, text, goto error);
1530 goto continue_matching;
1531 }
1532 case Cmatch_memory: {
1533 reg = *code++;
1534 regstart = GET_REG_START(state, reg);
1535 regend = GET_REG_END(state, reg);
1536 if ((regstart == NULL) || (regend == NULL))
1537 goto fail; /* or should we just match nothing? */
1538 regsize = regend - regstart;
1539
1540 if (regsize > (textend - text)) goto fail;
1541 if (translate) {
1542 for (; regstart < regend; regstart++, text++)
1543 if (translate[*regstart] != translate[*text]) goto fail;
1544 } else
1545 for (; regstart < regend; regstart++, text++)
1546 if (*regstart != *text) goto fail;
1547 goto continue_matching;
1548 }
1549 case Cupdate_failure_jump: {
1550 UPDATE_FAILURE(state, text, goto error);
1551 /* fall to next case */
1552 }
1553 /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1554 case Cstar_jump:
1555 case Cjump: {
1556 a = (unsigned char)*code++;
1557 a |= (unsigned char)*code++ << 8;
1558 code += (int)SHORT(a);
1559 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1560 set_error("Regex VM jump out of bounds (Cjump)");
1561 FREE_STATE(state);
1562 return -2;
1563 }
1564 goto continue_matching;
1565 }
1566 case Cdummy_failure_jump: {
1567 unsigned char* failuredest;
1568
1569 a = (unsigned char)*code++;
1570 a |= (unsigned char)*code++ << 8;
1571 a = (int)SHORT(a);
1572 // assert(*code == Cfailure_jump);
1573 b = (unsigned char)code[1];
1574 b |= (unsigned char)code[2] << 8;
1575 failuredest = code + (int)SHORT(b) + 3;
1576 if (failuredest < bufp->buffer
1577 || bufp->buffer + bufp->used < failuredest) {
1578 set_error(
1579 "Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1580 FREE_STATE(state);
1581 return -2;
1582 }
1583 PUSH_FAILURE(state, failuredest, NULL, goto error);
1584 code += a;
1585 if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1586 set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1587 FREE_STATE(state);
1588 return -2;
1589 }
1590 goto continue_matching;
1591 }
1592 case Cfailure_jump: {
1593 a = (unsigned char)*code++;
1594 a |= (unsigned char)*code++ << 8;
1595 a = (int)SHORT(a);
1596 if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1597 set_error("Regex VM jump out of bounds (Cfailure_jump)");
1598 FREE_STATE(state);
1599 return -2;
1600 }
1601 PUSH_FAILURE(state, code + a, text, goto error);
1602 goto continue_matching;
1603 }
1604 case Crepeat1: {
1605 unsigned char* pinst;
1606 a = (unsigned char)*code++;
1607 a |= (unsigned char)*code++ << 8;
1608 a = (int)SHORT(a);
1609 pinst = code + a;
1610 if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1611 set_error("Regex VM jump out of bounds (Crepeat1)");
1612 FREE_STATE(state);
1613 return -2;
1614 }
1615 /* pinst is sole instruction in loop, and it matches a
1616 * single character. Since Crepeat1 was originally a
1617 * Cupdate_failure_jump, we also know that backtracking
1618 * is useless: so long as the single-character
1619 * expression matches, it must be used. Also, in the
1620 * case of +, we've already matched one character, so +
1621 * can't fail: nothing here can cause a failure. */
1622 switch (*pinst++) {
1623 case Cset: {
1624 if (translate) {
1625 while (text < textend) {
1626 ch = translate[(unsigned char)*text];
1627 if (pinst[ch / 8] & (1 << (ch & 7)))
1628 text++;
1629 else
1630 break;
1631 }
1632 } else {
1633 while (text < textend) {
1634 ch = (unsigned char)*text;
1635 if (pinst[ch / 8] & (1 << (ch & 7)))
1636 text++;
1637 else
1638 break;
1639 }
1640 }
1641 break;
1642 }
1643 case Cexact: {
1644 ch = (unsigned char)*pinst;
1645 if (translate) {
1646 while (text < textend && translate[(unsigned char)*text] == ch)
1647 text++;
1648 } else {
1649 while (text < textend && (unsigned char)*text == ch) text++;
1650 }
1651 break;
1652 }
1653 case Canychar: {
1654 while (text < textend && (unsigned char)*text != '\n') text++;
1655 break;
1656 }
1657 case Csyntaxspec: {
1658 a = (unsigned char)*pinst;
1659 if (translate) {
1660 while (text < textend && (SYNTAX(translate[*text]) & a)) text++;
1661 } else {
1662 while (text < textend && (SYNTAX(*text) & a)) text++;
1663 }
1664 break;
1665 }
1666 case Cnotsyntaxspec: {
1667 a = (unsigned char)*pinst;
1668 if (translate) {
1669 while (text < textend && !(SYNTAX(translate[*text]) & a)) text++;
1670 } else {
1671 while (text < textend && !(SYNTAX(*text) & a)) text++;
1672 }
1673 break;
1674 }
1675 default: {
1676 FREE_STATE(state);
1677 set_error("Unknown regex opcode: memory corrupted?");
1678 return -2;
1679 /*NOTREACHED*/}
1680 }
1681 /* due to the funky way + and * are compiled, the top
1682 * failure- stack entry at this point is actually a
1683 * success entry -- update it & pop it */
1684 UPDATE_FAILURE(state, text, goto error);
1685 goto fail; /* i.e., succeed <wink/sigh> */
1686 }
1687 case Cbegbuf: {
1688 if (text == textstart) goto continue_matching;
1689 goto fail;
1690 }
1691 case Cendbuf: {
1692 if (text == textend) goto continue_matching;
1693 goto fail;
1694 }
1695 case Cwordbeg: {
1696 if (text == textend) goto fail;
1697 if (!(SYNTAX(*text) & Sword)) goto fail;
1698 if (text == textstart) goto continue_matching;
1699 if (!(SYNTAX(text[-1]) & Sword)) goto continue_matching;
1700 goto fail;
1701 }
1702 case Cwordend: {
1703 if (text == textstart) goto fail;
1704 if (!(SYNTAX(text[-1]) & Sword)) goto fail;
1705 if (text == textend) goto continue_matching;
1706 if (!(SYNTAX(*text) & Sword)) goto continue_matching;
1707 goto fail;
1708 }
1709 case Cwordbound: {
1710 /* Note: as in gnu regexp, this also matches at the
1711 * beginning and end of buffer. */
1712
1713 if (text == textstart || text == textend) goto continue_matching;
1714 if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1715 goto continue_matching;
1716 goto fail;
1717 }
1718 case Cnotwordbound: {
1719 /* Note: as in gnu regexp, this never matches at the
1720 * beginning and end of buffer. */
1721 if (text == textstart || text == textend) goto fail;
1722 if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1723 goto continue_matching;
1724 goto fail;
1725 }
1726 case Csyntaxspec: {
1727 NEXTCHAR(ch);
1728 if (!(SYNTAX(ch) & (unsigned char)*code++)) goto fail;
1729 goto continue_matching;
1730 }
1731 case Cnotsyntaxspec: {
1732 NEXTCHAR(ch);
1733 if (SYNTAX(ch) & (unsigned char)*code++) goto fail;
1734 goto continue_matching;
1735 }
1736 default: {
1737 FREE_STATE(state);
1738 set_error("Unknown regex opcode: memory corrupted?");
1739 return -2;
1740 /*NOTREACHED*/}
1741 }
1742
1743 /*
1744 *NOTREACHED
1745 */
1746
1747 /* Using "break;" in the above switch statement is equivalent to "goto fail;"
1748 */
1749 fail:
1750 POP_FAILURE(state, code, text, goto done_matching, goto error);
1751 goto continue_matching;
1752
1753 done_matching:
1754 /* if(translated != NULL) */
1755 /* free(translated); */
1756 FREE_STATE(state);
1757 return -1;
1758
1759 error:
1760 /* if (translated != NULL) */
1761 /* free(translated); */
1762 FREE_STATE(state);
1763 return -2;
1764 }
1765
1766
1767 #undef PREFETCH
1768 #undef NEXTCHAR
1769
ReSearch(regex_t * bufp,unsigned char * str,int size,int pos,int range,regexp_registers_t regs)1770 int ReSearch(regex_t* bufp,
1771 unsigned char* str,
1772 int size,
1773 int pos,
1774 int range,
1775 regexp_registers_t regs)
1776 {
1777 unsigned char* fastmap;
1778 unsigned char* translate;
1779 unsigned char* text;
1780 unsigned char* partstart;
1781 unsigned char* partend;
1782 int dir;
1783 int ret;
1784 unsigned char anchor;
1785 unsigned char* string = str;
1786
1787 if (bufp->cflags & REG_ICASE) { /* we must use string in lowercase */
1788 int len = strlen((const char*)str);
1789 if (!bufp->lcase) { bufp->lcase = GetPoolMemory(PM_FNAME); }
1790 bufp->lcase = CheckPoolMemorySize(bufp->lcase, len + 1);
1791 unsigned char* dst = (unsigned char*)bufp->lcase;
1792 while (*string) { *dst++ = tolower(*string++); }
1793 *dst = '\0';
1794 string = (unsigned char*)bufp->lcase;
1795 }
1796
1797 // assert(size >= 0 && pos >= 0);
1798 // assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
1799
1800 fastmap = bufp->fastmap;
1801 translate = bufp->translate;
1802 if (fastmap && !bufp->fastmap_accurate) {
1803 ReCompileFastmap(bufp);
1804 if (got_error) return -2;
1805 }
1806
1807 anchor = bufp->anchor;
1808 if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1809 fastmap = NULL;
1810
1811 if (range < 0) {
1812 dir = -1;
1813 range = -range;
1814 } else
1815 dir = 1;
1816
1817 if (anchor == 2) {
1818 if (pos != 0)
1819 return -1;
1820 else
1821 range = 0;
1822 }
1823
1824 for (; range >= 0; range--, pos += dir) {
1825 if (fastmap) {
1826 if (dir == 1) { /* searching forwards */
1827
1828 text = string + pos;
1829 partend = string + size;
1830 partstart = text;
1831 if (translate)
1832 while (text != partend
1833 && !fastmap[(unsigned char)translate[(unsigned char)*text]])
1834 text++;
1835 else
1836 while (text != partend && !fastmap[(unsigned char)*text]) text++;
1837 pos += text - partstart;
1838 range -= text - partstart;
1839 if (pos == size && bufp->can_be_null == 0) return -1;
1840 } else { /* searching backwards */
1841 text = string + pos;
1842 partstart = string + pos - range;
1843 partend = text;
1844 if (translate)
1845 while (text != partstart
1846 && !fastmap[(unsigned char)translate[(unsigned char)*text]])
1847 text--;
1848 else
1849 while (text != partstart && !fastmap[(unsigned char)*text]) text--;
1850 pos -= partend - text;
1851 range -= partend - text;
1852 }
1853 }
1854 if (anchor == 1) { /* anchored to begline */
1855 if (pos > 0 && (string[pos - 1] != '\n')) continue;
1856 }
1857 // assert(pos >= 0 && pos <= size);
1858 ret = ReMatch(bufp, string, size, pos, regs);
1859 if (ret >= 0) return pos;
1860 if (ret == -2) return -2;
1861 }
1862 return -1;
1863 }
1864
1865 /*
1866 ** Local Variables:
1867 ** mode: c
1868 ** c-file-style: "python"
1869 ** End:
1870 */
1871