1 /*
2    BAREOS® - Backup Archiving REcovery Open Sourced
3 
4    Copyright (C) 2006-2010 Free Software Foundation Europe e.V.
5 
6    This program is Free Software; you can redistribute it and/or
7    modify it under the terms of version three of the GNU Affero General Public
8    License as published by the Free Software Foundation and included
9    in the file LICENSE.
10 
11    This program is distributed in the hope that it will be useful, but
12    WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14    Affero General Public License for more details.
15 
16    You should have received a copy of the GNU Affero General Public License
17    along with this program; if not, write to the Free Software
18    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19    02110-1301, USA.
20 */
21 /* regexpr.c
22  *
23  * Author: Tatu Ylonen <ylo@ngs.fi>
24  *
25  * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
26  *
27  * Permission to use, copy, modify, distribute, and sell this software
28  * and its documentation for any purpose is hereby granted without
29  * fee, provided that the above copyright notice appear in all copies.
30  * This software is provided "as is" without express or implied
31  * warranty.
32  *
33  * Created: Thu Sep 26 17:14:05 1991 ylo
34  * Last modified: Mon Nov  4 17:06:48 1991 ylo
35  * Ported to Think C: 19 Jan 1992 guido@cwi.nl
36  *
37  * This code draws many ideas from the regular expression packages by
38  * Henry Spencer of the University of Toronto and Richard Stallman of
39  * the Free Software Foundation.
40  *
41  * Emacs-specific code and syntax table code is almost directly borrowed
42  * from GNU regexp.
43  *
44  * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
45  * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
46  * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
47  * probably one or two others that I'm forgetting.
48  *
49  * This file modified to work with BAREOS and C++ by
50  *    Kern Sibbald, April 2006
51  *
52  * This file modified to work with REG_ICASE and BAREOS by
53  *    Eric Bollengier April 2007
54  */
55 
56 #include "include/bareos.h"
57 #include "bregex.h"
58 
59 #define set_error(x) bufp->errmsg = ((char*)(x))
60 #define got_error bufp->errmsg != NULL
61 
62 /* The original code blithely assumed that sizeof(short) == 2.  Not
63  * always true.  Original instances of "(short)x" were replaced by
64  * SHORT(x), where SHORT is #defined below.  */
65 
66 #define SHORT(x) ((x)&0x8000 ? (x)-0x10000 : (x))
67 
68 /* The stack implementation is taken from an idea by Andrew Kuchling.
69  * It's a doubly linked list of arrays. The advantages of this over a
70  * simple linked list are that the number of mallocs required are
71  * reduced. It also makes it possible to statically allocate enough
72  * space so that small patterns don't ever need to call malloc.
73  *
74  * The advantages over a single array is that is periodically
75  * realloced when more space is needed is that we avoid ever copying
76  * the stack. */
77 
78 /* item_t is the basic stack element.  Defined as a union of
79  * structures so that both registers, failure points, and counters can
80  * be pushed/popped from the stack.  There's nothing built into the
81  * item to keep track of whether a certain stack item is a register, a
82  * failure point, or a counter. */
83 
84 typedef union item_t {
85   struct {
86     int num;
87     int level;
88     unsigned char* start;
89     unsigned char* end;
90   } reg;
91   struct {
92     int count;
93     int level;
94     int phantom;
95     unsigned char* code;
96     unsigned char* text;
97   } fail;
98   struct {
99     int num;
100     int level;
101     int count;
102   } cntr;
103 } item_t;
104 
105 #define B_STACK_PAGE_SIZE 256
106 #define NUM_REGISTERS 256
107 
108 /* A 'page' of stack items. */
109 
110 typedef struct item_page_t {
111   item_t items[B_STACK_PAGE_SIZE];
112   struct item_page_t* prev;
113   struct item_page_t* next;
114 } item_page_t;
115 
116 
117 typedef struct match_state {
118   /* The number of registers that have been pushed onto the stack
119    * since the last failure point. */
120 
121   int count;
122 
123   /* Used to control when registers need to be pushed onto the
124    * stack. */
125 
126   int level;
127 
128   /* The number of failure points on the stack. */
129 
130   int point;
131 
132   /* Storage for the registers.  Each register consists of two
133    * pointers to characters.  So register N is represented as
134    * start[N] and end[N].  The pointers must be converted to
135    * offsets from the beginning of the string before returning the
136    * registers to the calling program. */
137 
138   unsigned char* start[NUM_REGISTERS];
139   unsigned char* end[NUM_REGISTERS];
140 
141   /* Keeps track of whether a register has changed recently. */
142 
143   int changed[NUM_REGISTERS];
144 
145   /* Structure to encapsulate the stack. */
146   struct {
147     /* index into the current page.  If index == 0 and you need
148      * to pop an item, move to the previous page and set index
149      * = B_STACK_PAGE_SIZE - 1.  Otherwise decrement index to
150      * push a page. If index == B_STACK_PAGE_SIZE and you need
151      * to push a page move to the next page and set index =
152      * 0. If there is no new next page, allocate a new page
153      * and link it in. Otherwise, increment index to push a
154      * page. */
155 
156     int index;
157     item_page_t* current; /* Pointer to the current page. */
158     item_page_t first;    /* First page is statically allocated. */
159   } stack;
160 } match_state;
161 
162 /* Initialize a state object */
163 
164 /* #define NEW_STATE(state) \ */
165 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
166 /* state.stack.current = &state.stack.first; \ */
167 /* state.stack.first.prev = NULL; \ */
168 /* state.stack.first.next = NULL; \ */
169 /* state.stack.index = 0; \ */
170 /* state.level = 1 */
171 
172 #define NEW_STATE(state, nregs)               \
173   {                                           \
174     int i;                                    \
175     for (i = 0; i < nregs; i++) {             \
176       state.start[i] = NULL;                  \
177       state.end[i] = NULL;                    \
178       state.changed[i] = 0;                   \
179     }                                         \
180     state.stack.current = &state.stack.first; \
181     state.stack.first.prev = NULL;            \
182     state.stack.first.next = NULL;            \
183     state.stack.index = 0;                    \
184     state.level = 1;                          \
185     state.count = 0;                          \
186     state.level = 0;                          \
187     state.point = 0;                          \
188   }
189 
190 /* Free any memory that might have been malloc'd */
191 
192 #define FREE_STATE(state)                               \
193   while (state.stack.first.next != NULL) {              \
194     state.stack.current = state.stack.first.next;       \
195     state.stack.first.next = state.stack.current->next; \
196     free(state.stack.current);                          \
197   }
198 
199 /* Discard the top 'count' stack items. */
200 
201 #define STACK_DISCARD(stack, count, on_error)  \
202   stack.index -= count;                        \
203   while (stack.index < 0) {                    \
204     if (stack.current->prev == NULL) on_error; \
205     stack.current = stack.current->prev;       \
206     stack.index += B_STACK_PAGE_SIZE;          \
207   }
208 
209 /* Store a pointer to the previous item on the stack. Used to pop an
210  * item off of the stack. */
211 
212 #define STACK_PREV(stack, top, on_error)       \
213   if (stack.index == 0) {                      \
214     if (stack.current->prev == NULL) on_error; \
215     stack.current = stack.current->prev;       \
216     stack.index = B_STACK_PAGE_SIZE - 1;       \
217   } else {                                     \
218     stack.index--;                             \
219   }                                            \
220   top = &(stack.current->items[stack.index])
221 
222 /* Store a pointer to the next item on the stack. Used to push an item
223  * on to the stack. */
224 
225 #define STACK_NEXT(stack, top, on_error)                               \
226   if (stack.index == B_STACK_PAGE_SIZE) {                              \
227     if (stack.current->next == NULL) {                                 \
228       stack.current->next = (item_page_t*)malloc(sizeof(item_page_t)); \
229       if (stack.current->next == NULL) on_error;                       \
230       stack.current->next->prev = stack.current;                       \
231       stack.current->next->next = NULL;                                \
232     }                                                                  \
233     stack.current = stack.current->next;                               \
234     stack.index = 0;                                                   \
235   }                                                                    \
236   top = &(stack.current->items[stack.index++])
237 
238 /* Store a pointer to the item that is 'count' items back in the
239  * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
240  * STACK_TOP(stack, top, on_error).  */
241 
242 #define STACK_BACK(stack, top, count, on_error) \
243   {                                             \
244     int index;                                  \
245     item_page_t* current;                       \
246     current = stack.current;                    \
247     index = stack.index - (count);              \
248     while (index < 0) {                         \
249       if (current->prev == NULL) on_error;      \
250       current = current->prev;                  \
251       index += B_STACK_PAGE_SIZE;               \
252     }                                           \
253     top = &(current->items[index]);             \
254   }
255 
256 /* Store a pointer to the top item on the stack. Execute the
257  * 'on_error' code if there are no items on the stack. */
258 
259 #define STACK_TOP(stack, top, on_error)                         \
260   if (stack.index == 0) {                                       \
261     if (stack.current->prev == NULL) on_error;                  \
262     top = &(stack.current->prev->items[B_STACK_PAGE_SIZE - 1]); \
263   } else {                                                      \
264     top = &(stack.current->items[stack.index - 1]);             \
265   }
266 
267 /* Test to see if the stack is empty */
268 
269 #define STACK_EMPTY(stack) ((stack.index == 0) && (stack.current->prev == NULL))
270 
271 /* Return the start of register 'reg' */
272 
273 #define GET_REG_START(state, reg) (state.start[reg])
274 
275 /* Return the end of register 'reg' */
276 
277 #define GET_REG_END(state, reg) (state.end[reg])
278 
279 /* Set the start of register 'reg'. If the state of the register needs
280  * saving, push it on the stack. */
281 
282 #define SET_REG_START(state, reg, text, on_error) \
283   if (state.changed[reg] < state.level) {         \
284     item_t* item;                                 \
285     STACK_NEXT(state.stack, item, on_error);      \
286     item->reg.num = reg;                          \
287     item->reg.start = state.start[reg];           \
288     item->reg.end = state.end[reg];               \
289     item->reg.level = state.changed[reg];         \
290     state.changed[reg] = state.level;             \
291     state.count++;                                \
292   }                                               \
293   state.start[reg] = text
294 
295 /* Set the end of register 'reg'. If the state of the register needs
296  * saving, push it on the stack. */
297 
298 #define SET_REG_END(state, reg, text, on_error) \
299   if (state.changed[reg] < state.level) {       \
300     item_t* item;                               \
301     STACK_NEXT(state.stack, item, on_error);    \
302     item->reg.num = reg;                        \
303     item->reg.start = state.start[reg];         \
304     item->reg.end = state.end[reg];             \
305     item->reg.level = state.changed[reg];       \
306     state.changed[reg] = state.level;           \
307     state.count++;                              \
308   }                                             \
309   state.end[reg] = text
310 
311 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
312   {                                                 \
313     item_t* item;                                   \
314     STACK_NEXT(state.stack, item, on_error);        \
315     item->fail.code = xcode;                        \
316     item->fail.text = xtext;                        \
317     item->fail.count = state.count;                 \
318     item->fail.level = state.level;                 \
319     item->fail.phantom = 0;                         \
320     state.count = 0;                                \
321     state.level++;                                  \
322     state.point++;                                  \
323   }
324 
325 /* Update the last failure point with a new position in the text. */
326 
327 #define UPDATE_FAILURE(state, xtext, on_error)                \
328   {                                                           \
329     item_t* item;                                             \
330     STACK_BACK(state.stack, item, state.count + 1, on_error); \
331     if (!item->fail.phantom) {                                \
332       item_t* item2;                                          \
333       STACK_NEXT(state.stack, item2, on_error);               \
334       item2->fail.code = item->fail.code;                     \
335       item2->fail.text = xtext;                               \
336       item2->fail.count = state.count;                        \
337       item2->fail.level = state.level;                        \
338       item2->fail.phantom = 1;                                \
339       state.count = 0;                                        \
340       state.level++;                                          \
341       state.point++;                                          \
342     } else {                                                  \
343       STACK_DISCARD(state.stack, state.count, on_error);      \
344       STACK_TOP(state.stack, item, on_error);                 \
345       item->fail.text = xtext;                                \
346       state.count = 0;                                        \
347       state.level++;                                          \
348     }                                                         \
349   }
350 
351 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
352   {                                                          \
353     item_t* item;                                            \
354     do {                                                     \
355       while (state.count > 0) {                              \
356         STACK_PREV(state.stack, item, on_error);             \
357         state.start[item->reg.num] = item->reg.start;        \
358         state.end[item->reg.num] = item->reg.end;            \
359         state.changed[item->reg.num] = item->reg.level;      \
360         state.count--;                                       \
361       }                                                      \
362       STACK_PREV(state.stack, item, on_empty);               \
363       xcode = item->fail.code;                               \
364       xtext = item->fail.text;                               \
365       state.count = item->fail.count;                        \
366       state.level = item->fail.level;                        \
367       state.point--;                                         \
368     } while (item->fail.text == NULL);                       \
369   }
370 
371 enum regexp_compiled_ops
372 {                       /* opcodes for compiled regexp */
373   Cend,                 /* end of pattern reached */
374   Cbol,                 /* beginning of line */
375   Ceol,                 /* end of line */
376   Cset,                 /* character set.  Followed by 32 bytes of set. */
377   Cexact,               /* followed by a byte to match */
378   Canychar,             /* matches any character except newline */
379   Cstart_memory,        /* set register start addr (followed by reg number) */
380   Cend_memory,          /* set register end addr (followed by reg number) */
381   Cmatch_memory,        /* match a duplicate of reg contents (regnum follows) */
382   Cjump,                /* followed by two bytes (lsb,msb) of displacement. */
383   Cstar_jump,           /* will change to jump/update_failure_jump at runtime */
384   Cfailure_jump,        /* jump to addr on failure */
385   Cupdate_failure_jump, /* update topmost failure point and jump */
386   Cdummy_failure_jump,  /* push a dummy failure point and jump */
387   Cbegbuf,              /* match at beginning of buffer */
388   Cendbuf,              /* match at end of buffer */
389   Cwordbeg,             /* match at beginning of word */
390   Cwordend,             /* match at end of word */
391   Cwordbound,           /* match if at word boundary */
392   Cnotwordbound,        /* match if not at word boundary */
393   Csyntaxspec,          /* matches syntax code (1 byte follows) */
394   Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
395   Crepeat1
396 };
397 
398 enum regexp_syntax_op
399 {                   /* syntax codes for plain and quoted characters */
400   Rend,             /* special code for end of regexp */
401   Rnormal,          /* normal character */
402   Ranychar,         /* any character except newline */
403   Rquote,           /* the quote character */
404   Rbol,             /* match beginning of line */
405   Reol,             /* match end of line */
406   Roptional,        /* match preceding expression optionally */
407   Rstar,            /* match preceding expr zero or more times */
408   Rplus,            /* match preceding expr one or more times */
409   Ror,              /* match either of alternatives */
410   Ropenpar,         /* opening parenthesis */
411   Rclosepar,        /* closing parenthesis */
412   Rmemory,          /* match memory register */
413   Rextended_memory, /* \vnn to match registers 10-99 */
414   Ropenset,         /* open set.  Internal syntax hard-coded below. */
415   /* the following are gnu extensions to "normal" regexp syntax */
416   Rbegbuf,       /* beginning of buffer */
417   Rendbuf,       /* end of buffer */
418   Rwordchar,     /* word character */
419   Rnotwordchar,  /* not word character */
420   Rwordbeg,      /* beginning of word */
421   Rwordend,      /* end of word */
422   Rwordbound,    /* word bound */
423   Rnotwordbound, /* not word bound */
424   Rnum_ops
425 };
426 
427 static int re_compile_initialized = 0;
428 static int regexp_syntax = RE_SYNTAX_EGREP;
429 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
430 static unsigned char plain_ops[256];
431 static unsigned char quoted_ops[256];
432 static unsigned char precedences[Rnum_ops];
433 static int regexp_context_indep_ops;
434 static int regexp_ansi_sequences;
435 
436 #define NUM_LEVELS 5    /* number of precedence levels in use */
437 #define MAX_NESTING 100 /* max nesting level of operators */
438 
439 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
440 
441 unsigned char re_syntax_table[256];
442 
ReCompileInitialize(void)443 void ReCompileInitialize(void)
444 {
445   int a;
446 
447   static int syntax_table_inited = 0;
448 
449   if (!syntax_table_inited) {
450     syntax_table_inited = 1;
451     memset(re_syntax_table, 0, 256);
452     for (a = 'a'; a <= 'z'; a++) re_syntax_table[a] = Sword;
453     for (a = 'A'; a <= 'Z'; a++) re_syntax_table[a] = Sword;
454     for (a = '0'; a <= '9'; a++)
455       re_syntax_table[a] = Sword | Sdigit | Shexdigit;
456     for (a = '0'; a <= '7'; a++) re_syntax_table[a] |= Soctaldigit;
457     for (a = 'A'; a <= 'F'; a++) re_syntax_table[a] |= Shexdigit;
458     for (a = 'a'; a <= 'f'; a++) re_syntax_table[a] |= Shexdigit;
459     re_syntax_table[(int)'_'] = Sword;
460     for (a = 9; a <= 13; a++) re_syntax_table[a] = Swhitespace;
461     re_syntax_table[(int)' '] = Swhitespace;
462   }
463   re_compile_initialized = 1;
464   for (a = 0; a < 256; a++) {
465     plain_ops[a] = Rnormal;
466     quoted_ops[a] = Rnormal;
467   }
468   for (a = '0'; a <= '9'; a++) quoted_ops[a] = Rmemory;
469   plain_ops[(int)'\134'] = Rquote;
470   if (regexp_syntax & RE_NO_BK_PARENS) {
471     plain_ops[(int)'('] = Ropenpar;
472     plain_ops[(int)')'] = Rclosepar;
473   } else {
474     quoted_ops[(int)'('] = Ropenpar;
475     quoted_ops[(int)')'] = Rclosepar;
476   }
477   if (regexp_syntax & RE_NO_BK_VBAR) {
478     plain_ops[(int)'\174'] = Ror;
479   } else {
480     quoted_ops[(int)'\174'] = Ror;
481   }
482   plain_ops[(int)'*'] = Rstar;
483   if (regexp_syntax & RE_BK_PLUS_QM) {
484     quoted_ops[(int)'+'] = Rplus;
485     quoted_ops[(int)'?'] = Roptional;
486   } else {
487     plain_ops[(int)'+'] = Rplus;
488     plain_ops[(int)'?'] = Roptional;
489   }
490   if (regexp_syntax & RE_NEWLINE_OR) { plain_ops[(int)'\n'] = Ror; }
491   plain_ops[(int)'\133'] = Ropenset;
492   plain_ops[(int)'\136'] = Rbol;
493   plain_ops[(int)'$'] = Reol;
494   plain_ops[(int)'.'] = Ranychar;
495   if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
496     quoted_ops[(int)'w'] = Rwordchar;
497     quoted_ops[(int)'W'] = Rnotwordchar;
498     quoted_ops[(int)'<'] = Rwordbeg;
499     quoted_ops[(int)'>'] = Rwordend;
500     quoted_ops[(int)'b'] = Rwordbound;
501     quoted_ops[(int)'B'] = Rnotwordbound;
502     quoted_ops[(int)'`'] = Rbegbuf;
503     quoted_ops[(int)'\''] = Rendbuf;
504   }
505   if (regexp_syntax & RE_ANSI_HEX) { quoted_ops[(int)'v'] = Rextended_memory; }
506   for (a = 0; a < Rnum_ops; a++) { precedences[a] = 4; }
507   if (regexp_syntax & RE_TIGHT_VBAR) {
508     precedences[Ror] = 3;
509     precedences[Rbol] = 2;
510     precedences[Reol] = 2;
511   } else {
512     precedences[Ror] = 2;
513     precedences[Rbol] = 3;
514     precedences[Reol] = 3;
515   }
516   precedences[Rclosepar] = 1;
517   precedences[Rend] = 0;
518   regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
519   regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
520 }
521 
ReSetSyntax(int syntax)522 int ReSetSyntax(int syntax)
523 {
524   int ret;
525 
526   ret = regexp_syntax;
527   regexp_syntax = syntax;
528   re_syntax = syntax; /* Exported copy */
529   ReCompileInitialize();
530   return ret;
531 }
532 
HexCharToDecimal(int ch)533 static int HexCharToDecimal(int ch)
534 {
535   if (ch >= '0' && ch <= '9') return ch - '0';
536   if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
537   if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
538   return 16;
539 }
540 
re_compile_fastmap_aux(regex_t * bufp,unsigned char * code,int pos,unsigned char * visited,unsigned char * can_be_null,unsigned char * fastmap)541 static void re_compile_fastmap_aux(regex_t* bufp,
542                                    unsigned char* code,
543                                    int pos,
544                                    unsigned char* visited,
545                                    unsigned char* can_be_null,
546                                    unsigned char* fastmap)
547 {
548   int a;
549   int b;
550   int syntaxcode;
551 
552   if (visited[pos]) return; /* we have already been here */
553   visited[pos] = 1;
554   for (;;) {
555     switch (code[pos++]) {
556       case Cend:
557         *can_be_null = 1;
558         return;
559       case Cbol:
560       case Cbegbuf:
561       case Cendbuf:
562       case Cwordbeg:
563       case Cwordend:
564       case Cwordbound:
565       case Cnotwordbound:
566         for (a = 0; a < 256; a++) fastmap[a] = 1;
567         break;
568       case Csyntaxspec:
569         syntaxcode = code[pos++];
570         for (a = 0; a < 256; a++)
571           if (SYNTAX(a) & syntaxcode) fastmap[a] = 1;
572         return;
573       case Cnotsyntaxspec:
574         syntaxcode = code[pos++];
575         for (a = 0; a < 256; a++)
576           if (!(SYNTAX(a) & syntaxcode)) fastmap[a] = 1;
577         return;
578       case Ceol:
579         fastmap[(int)'\n'] = 1;
580         if (*can_be_null == 0)
581           *can_be_null = 2; /* can match null, but only at end of buffer */
582         return;
583       case Cset:
584         for (a = 0; a < 256 / 8; a++)
585           if (code[pos + a] != 0)
586             for (b = 0; b < 8; b++)
587               if (code[pos + a] & (1 << b)) fastmap[(a << 3) + b] = 1;
588         pos += 256 / 8;
589         return;
590       case Cexact:
591         fastmap[(unsigned char)code[pos]] = 1;
592         return;
593       case Canychar:
594         for (a = 0; a < 256; a++)
595           if (a != '\n') fastmap[a] = 1;
596         return;
597       case Cstart_memory:
598       case Cend_memory:
599         pos++;
600         break;
601       case Cmatch_memory:
602         for (a = 0; a < 256; a++) fastmap[a] = 1;
603         *can_be_null = 1;
604         return;
605       case Cjump:
606       case Cdummy_failure_jump:
607       case Cupdate_failure_jump:
608       case Cstar_jump:
609         a = (unsigned char)code[pos++];
610         a |= (unsigned char)code[pos++] << 8;
611         pos += (int)SHORT(a);
612         if (visited[pos]) {
613           /* argh... the regexp contains empty loops.  This is not
614              good, as this may cause a failure stack overflow when
615              matching.  Oh well. */
616           /* this path leads nowhere; pursue other paths. */
617           return;
618         }
619         visited[pos] = 1;
620         break;
621       case Cfailure_jump:
622         a = (unsigned char)code[pos++];
623         a |= (unsigned char)code[pos++] << 8;
624         a = pos + (int)SHORT(a);
625         re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
626         break;
627       case Crepeat1:
628         pos += 2;
629         break;
630       default:
631         set_error("Unknown regex opcode: memory corrupted?");
632         return;
633     }
634   }
635 }
636 
re_do_compile_fastmap(regex_t * bufp,unsigned char * buffer,int used,int pos,unsigned char * can_be_null,unsigned char * fastmap)637 static int re_do_compile_fastmap(regex_t* bufp,
638                                  unsigned char* buffer,
639                                  int used,
640                                  int pos,
641                                  unsigned char* can_be_null,
642                                  unsigned char* fastmap)
643 {
644   unsigned char small_visited[512], *visited;
645 
646   if (used <= (int)sizeof(small_visited))
647     visited = small_visited;
648   else {
649     visited = (unsigned char*)malloc(used);
650     if (!visited) return 0;
651   }
652   *can_be_null = 0;
653   memset(fastmap, 0, 256);
654   memset(visited, 0, used);
655   re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
656   if (visited != small_visited) free(visited);
657   return 1;
658 }
659 
ReCompileFastmap(regex_t * bufp)660 void ReCompileFastmap(regex_t* bufp)
661 {
662   if (!bufp->fastmap || bufp->fastmap_accurate) return;
663   // assert(bufp->used > 0);
664   if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used, 0,
665                              &bufp->can_be_null, bufp->fastmap))
666     return;
667   if (got_error) return;
668   if (bufp->buffer[0] == Cbol)
669     bufp->anchor = 1; /* begline */
670   else if (bufp->buffer[0] == Cbegbuf)
671     bufp->anchor = 2; /* begbuf */
672   else
673     bufp->anchor = 0; /* none */
674   bufp->fastmap_accurate = 1;
675 }
676 
677 /*
678  * star is coded as:
679  * 1: failure_jump 2
680  *    ... code for operand of star
681  *    star_jump 1
682  * 2: ... code after star
683  *
684  * We change the star_jump to update_failure_jump if we can determine
685  * that it is safe to do so; otherwise we change it to an ordinary
686  * jump.
687  *
688  * plus is coded as
689  *
690  *    jump 2
691  * 1: failure_jump 3
692  * 2: ... code for operand of plus
693  *    star_jump 1
694  * 3: ... code after plus
695  *
696  * For star_jump considerations this is processed identically to star.
697  *
698  */
699 
ReOptimizeStarJump(regex_t * bufp,unsigned char * code)700 static int ReOptimizeStarJump(regex_t* bufp, unsigned char* code)
701 {
702   unsigned char map[256];
703   unsigned char can_be_null;
704   unsigned char* p1;
705   unsigned char* p2;
706   unsigned char ch;
707   int a;
708   int b;
709   int num_instructions = 0;
710 
711   a = (unsigned char)*code++;
712   a |= (unsigned char)*code++ << 8;
713   a = (int)SHORT(a);
714 
715   p1 = code + a + 3; /* skip the failure_jump */
716   /* Check that the jump is within the pattern */
717   if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
718     set_error("Regex VM jump out of bounds (failure_jump opt)");
719     return 0;
720   }
721   // assert(p1[-3] == Cfailure_jump);
722   p2 = code;
723   /* p1 points inside loop, p2 points to after loop */
724   if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
725                              (int)(p2 - bufp->buffer), &can_be_null, map))
726     goto make_normal_jump;
727 
728   /* If we might introduce a new update point inside the
729    * loop, we can't optimize because then update_jump would
730    * update a wrong failure point.  Thus we have to be
731    * quite careful here.
732    */
733 
734   /* loop until we find something that consumes a character */
735   for (;;) {
736     num_instructions++;
737     switch (*p1++) {
738       case Cbol:
739       case Ceol:
740       case Cbegbuf:
741       case Cendbuf:
742       case Cwordbeg:
743       case Cwordend:
744       case Cwordbound:
745       case Cnotwordbound:
746         continue;
747       case Cstart_memory:
748       case Cend_memory:
749         p1++;
750         continue;
751       case Cexact:
752         ch = (unsigned char)*p1++;
753         if (map[(int)ch]) goto make_normal_jump;
754         break;
755       case Canychar:
756         for (b = 0; b < 256; b++)
757           if (b != '\n' && map[b]) goto make_normal_jump;
758         break;
759       case Cset:
760         for (b = 0; b < 256; b++)
761           if ((p1[b >> 3] & (1 << (b & 7))) && map[b]) goto make_normal_jump;
762         p1 += 256 / 8;
763         break;
764       default:
765         goto make_normal_jump;
766     }
767     break;
768   }
769   /* now we know that we can't backtrack. */
770   while (p1 != p2 - 3) {
771     num_instructions++;
772     switch (*p1++) {
773       case Cend:
774         return 0;
775       case Cbol:
776       case Ceol:
777       case Canychar:
778       case Cbegbuf:
779       case Cendbuf:
780       case Cwordbeg:
781       case Cwordend:
782       case Cwordbound:
783       case Cnotwordbound:
784         break;
785       case Cset:
786         p1 += 256 / 8;
787         break;
788       case Cexact:
789       case Cstart_memory:
790       case Cend_memory:
791       case Cmatch_memory:
792       case Csyntaxspec:
793       case Cnotsyntaxspec:
794         p1++;
795         break;
796       case Cjump:
797       case Cstar_jump:
798       case Cfailure_jump:
799       case Cupdate_failure_jump:
800       case Cdummy_failure_jump:
801         goto make_normal_jump;
802       default:
803         return 0;
804     }
805   }
806 
807   /* make_update_jump: */
808   code -= 3;
809   a += 3; /* jump to after the Cfailure_jump */
810   code[0] = Cupdate_failure_jump;
811   code[1] = a & 0xff;
812   code[2] = a >> 8;
813   if (num_instructions > 1) return 1;
814   // assert(num_instructions == 1);
815   /* if the only instruction matches a single character, we can do
816    * better */
817   p1 = code + 3 + a; /* start of sole instruction */
818   if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar || *p1 == Csyntaxspec ||
819       *p1 == Cnotsyntaxspec)
820     code[0] = Crepeat1;
821   return 1;
822 
823 make_normal_jump:
824   code -= 3;
825   *code = Cjump;
826   return 1;
827 }
828 
ReOptimize(regex_t * bufp)829 static int ReOptimize(regex_t* bufp)
830 {
831   unsigned char* code;
832 
833   code = bufp->buffer;
834 
835   while (1) {
836     switch (*code++) {
837       case Cend:
838         return 1;
839       case Canychar:
840       case Cbol:
841       case Ceol:
842       case Cbegbuf:
843       case Cendbuf:
844       case Cwordbeg:
845       case Cwordend:
846       case Cwordbound:
847       case Cnotwordbound:
848         break;
849       case Cset:
850         code += 256 / 8;
851         break;
852       case Cexact:
853       case Cstart_memory:
854       case Cend_memory:
855       case Cmatch_memory:
856       case Csyntaxspec:
857       case Cnotsyntaxspec:
858         code++;
859         break;
860       case Cstar_jump:
861         if (!ReOptimizeStarJump(bufp, code)) { return 0; }
862         /* fall through */
863       case Cupdate_failure_jump:
864       case Cjump:
865       case Cdummy_failure_jump:
866       case Cfailure_jump:
867       case Crepeat1:
868         code += 2;
869         break;
870       default:
871         return 0;
872     }
873   }
874 }
875 
876 #define NEXTCHAR(var)                       \
877   {                                         \
878     if (pos >= size) goto ends_prematurely; \
879     (var) = regex[pos];                     \
880     pos++;                                  \
881   }
882 
883 #define ALLOC(amount)                                    \
884   {                                                      \
885     if (pattern_offset + (amount) > alloc) {             \
886       alloc += 256 + (amount);                           \
887       pattern = (unsigned char*)realloc(pattern, alloc); \
888       if (!pattern) goto out_of_memory;                  \
889     }                                                    \
890   }
891 
892 #define STORE(ch) pattern[pattern_offset++] = (ch)
893 
894 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
895 
896 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
897 
898 #define PUSH_LEVEL_STARTS                           \
899   if (starts_base < (MAX_NESTING - 1) * NUM_LEVELS) \
900     starts_base += NUM_LEVELS;                      \
901   else                                              \
902     goto too_complex
903 
904 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
905 
906 #define PUT_ADDR(offset, addr)                  \
907   {                                             \
908     int disp = (addr) - (offset)-2;             \
909     pattern[(offset)] = disp & 0xff;            \
910     pattern[(offset) + 1] = (disp >> 8) & 0xff; \
911   }
912 
913 #define INSERT_JUMP(pos, type, addr)                                       \
914   {                                                                        \
915     int a, p = (pos), t = (type), ad = (addr);                             \
916     for (a = pattern_offset - 1; a >= p; a--) pattern[a + 3] = pattern[a]; \
917     pattern[p] = t;                                                        \
918     PUT_ADDR(p + 1, ad);                                                   \
919     pattern_offset += 3;                                                   \
920   }
921 
922 #define SETBIT(buf, offset, bit) (buf)[(offset) + (bit) / 8] |= (1 << ((bit)&7))
923 
924 #define SET_FIELDS               \
925   {                              \
926     bufp->allocated = alloc;     \
927     bufp->buffer = pattern;      \
928     bufp->used = pattern_offset; \
929   }
930 
931 #define GETHEX(var)                             \
932   {                                             \
933     unsigned char gethex_ch, gethex_value;      \
934     NEXTCHAR(gethex_ch);                        \
935     gethex_value = HexCharToDecimal(gethex_ch); \
936     if (gethex_value == 16) goto hex_error;     \
937     NEXTCHAR(gethex_ch);                        \
938     gethex_ch = HexCharToDecimal(gethex_ch);    \
939     if (gethex_ch == 16) goto hex_error;        \
940     (var) = gethex_value * 16 + gethex_ch;      \
941   }
942 
943 #define ANSI_TRANSLATE(ch)                                \
944   {                                                       \
945     switch (ch) {                                         \
946       case 'a':                                           \
947       case 'A': {                                         \
948         ch = 7; /* audible bell */                        \
949         break;                                            \
950       }                                                   \
951       case 'b':                                           \
952       case 'B': {                                         \
953         ch = 8; /* backspace */                           \
954         break;                                            \
955       }                                                   \
956       case 'f':                                           \
957       case 'F': {                                         \
958         ch = 12; /* form feed */                          \
959         break;                                            \
960       }                                                   \
961       case 'n':                                           \
962       case 'N': {                                         \
963         ch = 10; /* line feed */                          \
964         break;                                            \
965       }                                                   \
966       case 'r':                                           \
967       case 'R': {                                         \
968         ch = 13; /* carriage return */                    \
969         break;                                            \
970       }                                                   \
971       case 't':                                           \
972       case 'T': {                                         \
973         ch = 9; /* tab */                                 \
974         break;                                            \
975       }                                                   \
976       case 'v':                                           \
977       case 'V': {                                         \
978         ch = 11; /* vertical tab */                       \
979         break;                                            \
980       }                                                   \
981       case 'x': /* hex code */                            \
982       case 'X': {                                         \
983         GETHEX(ch);                                       \
984         break;                                            \
985       }                                                   \
986       default: {                                          \
987         /* other characters passed through */             \
988         if (translate) ch = translate[(unsigned char)ch]; \
989         break;                                            \
990       }                                                   \
991     }                                                     \
992   }
993 
re_compile_pattern(regex_t * bufp,unsigned char * regex)994 const char* re_compile_pattern(regex_t* bufp, unsigned char* regex)
995 {
996   int a;
997   int pos;
998   int op;
999   int current_level;
1000   int level;
1001   int opcode;
1002   int pattern_offset = 0, alloc;
1003   int starts[NUM_LEVELS * MAX_NESTING];
1004   int starts_base;
1005   int future_jumps[MAX_NESTING];
1006   int num_jumps;
1007   unsigned char ch = '\0';
1008   unsigned char* pattern;
1009   unsigned char* translate;
1010   int next_register;
1011   int paren_depth;
1012   int num_open_registers;
1013   int open_registers[RE_NREGS];
1014   int beginning_context;
1015   int size = strlen((char*)regex);
1016 
1017   if (!re_compile_initialized) ReCompileInitialize();
1018   bufp->used = 0;
1019   bufp->fastmap_accurate = 0;
1020   bufp->uses_registers = 1;
1021   bufp->num_registers = 1;
1022   translate = bufp->translate;
1023   pattern = bufp->buffer;
1024   alloc = bufp->allocated;
1025   if (alloc == 0 || pattern == NULL) {
1026     alloc = 256;
1027     bufp->buffer = pattern = (unsigned char*)malloc(alloc);
1028     if (!pattern) goto out_of_memory;
1029   }
1030   pattern_offset = 0;
1031   starts_base = 0;
1032   num_jumps = 0;
1033   current_level = 0;
1034   SET_LEVEL_START;
1035   num_open_registers = 0;
1036   next_register = 1;
1037   paren_depth = 0;
1038   beginning_context = 1;
1039   op = -1;
1040   /* we use Rend dummy to ensure that pending jumps are updated
1041      (due to low priority of Rend) before exiting the loop. */
1042   pos = 0;
1043   while (op != Rend) {
1044     if (pos >= size)
1045       op = Rend;
1046     else {
1047       NEXTCHAR(ch);
1048       if (translate) ch = translate[(unsigned char)ch];
1049       op = plain_ops[(unsigned char)ch];
1050       if (op == Rquote) {
1051         NEXTCHAR(ch);
1052         op = quoted_ops[(unsigned char)ch];
1053         if (op == Rnormal && regexp_ansi_sequences) ANSI_TRANSLATE(ch);
1054       }
1055     }
1056     level = precedences[op];
1057     /* printf("ch='%c' op=%d level=%d current_level=%d
1058        curlevstart=%d\n", ch, op, level, current_level,
1059        CURRENT_LEVEL_START); */
1060     if (level > current_level) {
1061       for (current_level++; current_level < level; current_level++)
1062         SET_LEVEL_START;
1063       SET_LEVEL_START;
1064     } else if (level < current_level) {
1065       current_level = level;
1066       for (;
1067            num_jumps > 0 && future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START;
1068            num_jumps--)
1069         PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1070     }
1071     switch (op) {
1072       case Rend:
1073         break;
1074       case Rnormal:
1075       normal_char:
1076         opcode = Cexact;
1077       store_opcode_and_arg: /* opcode & ch must be set */
1078         SET_LEVEL_START;
1079         ALLOC(2);
1080         STORE(opcode);
1081         STORE(ch);
1082         break;
1083       case Ranychar:
1084         opcode = Canychar;
1085       store_opcode:
1086         SET_LEVEL_START;
1087         ALLOC(1);
1088         STORE(opcode);
1089         break;
1090       case Rquote:
1091         set_error("Rquote");
1092         /* NOTREACHED */
1093         break;
1094       case Rbol:
1095         if (!beginning_context) {
1096           if (regexp_context_indep_ops)
1097             goto op_error;
1098           else
1099             goto normal_char;
1100         }
1101         opcode = Cbol;
1102         goto store_opcode;
1103       case Reol:
1104         if (!((pos >= size) ||
1105               ((regexp_syntax & RE_NO_BK_VBAR)
1106                    ? (regex[pos] == '\174')
1107                    : (pos + 1 < size && regex[pos] == '\134' &&
1108                       regex[pos + 1] == '\174')) ||
1109               ((regexp_syntax & RE_NO_BK_PARENS)
1110                    ? (regex[pos] == ')')
1111                    : (pos + 1 < size && regex[pos] == '\134' &&
1112                       regex[pos + 1] == ')')))) {
1113           if (regexp_context_indep_ops)
1114             goto op_error;
1115           else
1116             goto normal_char;
1117         }
1118         opcode = Ceol;
1119         goto store_opcode;
1120         /* NOTREACHED */
1121         break;
1122       case Roptional:
1123         if (beginning_context) {
1124           if (regexp_context_indep_ops)
1125             goto op_error;
1126           else
1127             goto normal_char;
1128         }
1129         if (CURRENT_LEVEL_START == pattern_offset)
1130           break; /* ignore empty patterns for ? */
1131         ALLOC(3);
1132         INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1133         break;
1134       case Rstar:
1135       case Rplus:
1136         if (beginning_context) {
1137           if (regexp_context_indep_ops)
1138             goto op_error;
1139           else
1140             goto normal_char;
1141         }
1142         if (CURRENT_LEVEL_START == pattern_offset)
1143           break; /* ignore empty patterns for + and * */
1144         ALLOC(9);
1145         INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1146         INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1147         if (op == Rplus) /* jump over initial failure_jump */
1148           INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1149                       CURRENT_LEVEL_START + 6);
1150         break;
1151       case Ror:
1152         ALLOC(6);
1153         INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1154         if (num_jumps >= MAX_NESTING) goto too_complex;
1155         STORE(Cjump);
1156         future_jumps[num_jumps++] = pattern_offset;
1157         STORE(0);
1158         STORE(0);
1159         SET_LEVEL_START;
1160         break;
1161       case Ropenpar: {
1162         SET_LEVEL_START;
1163         if (next_register < RE_NREGS) {
1164           bufp->uses_registers = 1;
1165           ALLOC(2);
1166           STORE(Cstart_memory);
1167           STORE(next_register);
1168           open_registers[num_open_registers++] = next_register;
1169           bufp->num_registers++;
1170           next_register++;
1171         }
1172         paren_depth++;
1173         PUSH_LEVEL_STARTS;
1174         current_level = 0;
1175         SET_LEVEL_START;
1176         break;
1177       }
1178       case Rclosepar:
1179         if (paren_depth <= 0) goto parenthesIsError;
1180         POP_LEVEL_STARTS;
1181         current_level = precedences[Ropenpar];
1182         paren_depth--;
1183         if (paren_depth < num_open_registers) {
1184           bufp->uses_registers = 1;
1185           ALLOC(2);
1186           STORE(Cend_memory);
1187           num_open_registers--;
1188           STORE(open_registers[num_open_registers]);
1189         }
1190         break;
1191       case Rmemory:
1192         if (ch == '0') goto bad_match_register;
1193         if (!(ch >= '0' && ch <= '9')) { goto bad_match_register; }
1194         bufp->uses_registers = 1;
1195         opcode = Cmatch_memory;
1196         ch -= '0';
1197         goto store_opcode_and_arg;
1198       case Rextended_memory:
1199         NEXTCHAR(ch);
1200         if (ch < '0' || ch > '9') goto bad_match_register;
1201         NEXTCHAR(a);
1202         if (a < '0' || a > '9') goto bad_match_register;
1203         ch = 10 * (a - '0') + ch - '0';
1204         if (ch == 0 || ch >= RE_NREGS) goto bad_match_register;
1205         bufp->uses_registers = 1;
1206         opcode = Cmatch_memory;
1207         goto store_opcode_and_arg;
1208       case Ropenset: {
1209         int complement;
1210         int prev;
1211         int offset;
1212         int range;
1213         int firstchar;
1214 
1215         SET_LEVEL_START;
1216         ALLOC(1 + 256 / 8);
1217         STORE(Cset);
1218         offset = pattern_offset;
1219         for (a = 0; a < 256 / 8; a++) STORE(0);
1220         NEXTCHAR(ch);
1221         if (translate) ch = translate[(unsigned char)ch];
1222         if (ch == '\136') {
1223           complement = 1;
1224           NEXTCHAR(ch);
1225           if (translate) ch = translate[(unsigned char)ch];
1226         } else
1227           complement = 0;
1228         prev = -1;
1229         range = 0;
1230         firstchar = 1;
1231         while (ch != '\135' || firstchar) {
1232           firstchar = 0;
1233           if (regexp_ansi_sequences && ch == '\134') {
1234             NEXTCHAR(ch);
1235             ANSI_TRANSLATE(ch);
1236           }
1237           if (range) {
1238             for (a = prev; a <= (int)ch; a++) SETBIT(pattern, offset, a);
1239             prev = -1;
1240             range = 0;
1241           } else if (prev != -1 && ch == '-')
1242             range = 1;
1243           else {
1244             SETBIT(pattern, offset, ch);
1245             prev = ch;
1246           }
1247           NEXTCHAR(ch);
1248           if (translate) ch = translate[(unsigned char)ch];
1249         }
1250         if (range) SETBIT(pattern, offset, '-');
1251         if (complement) {
1252           for (a = 0; a < 256 / 8; a++) pattern[offset + a] ^= 0xff;
1253         }
1254         break;
1255       }
1256       case Rbegbuf: {
1257         opcode = Cbegbuf;
1258         goto store_opcode;
1259       }
1260       case Rendbuf: {
1261         opcode = Cendbuf;
1262         goto store_opcode;
1263       }
1264       case Rwordchar: {
1265         opcode = Csyntaxspec;
1266         ch = Sword;
1267         goto store_opcode_and_arg;
1268       }
1269       case Rnotwordchar: {
1270         opcode = Cnotsyntaxspec;
1271         ch = Sword;
1272         goto store_opcode_and_arg;
1273       }
1274       case Rwordbeg: {
1275         opcode = Cwordbeg;
1276         goto store_opcode;
1277       }
1278       case Rwordend: {
1279         opcode = Cwordend;
1280         goto store_opcode;
1281       }
1282       case Rwordbound: {
1283         opcode = Cwordbound;
1284         goto store_opcode;
1285       }
1286       case Rnotwordbound: {
1287         opcode = Cnotwordbound;
1288         goto store_opcode;
1289       }
1290       default: {
1291         abort();
1292       }
1293     }
1294     beginning_context = (op == Ropenpar || op == Ror);
1295   }
1296   if (starts_base != 0) goto parenthesIsError;
1297   // assert(num_jumps == 0);
1298   ALLOC(1);
1299   STORE(Cend);
1300   SET_FIELDS;
1301   if (!ReOptimize(bufp)) return "Optimization error";
1302   return NULL;
1303 
1304 op_error:
1305   SET_FIELDS;
1306   return "Badly placed special character";
1307 
1308 bad_match_register:
1309   SET_FIELDS;
1310   return "Bad match register number";
1311 
1312 hex_error:
1313   SET_FIELDS;
1314   return "Bad hexadecimal number";
1315 
1316 parenthesIsError:
1317   SET_FIELDS;
1318   return "Badly placed parenthesis";
1319 
1320 out_of_memory:
1321   SET_FIELDS;
1322   return "Out of memory";
1323 
1324 ends_prematurely:
1325   SET_FIELDS;
1326   return "Regular expression ends prematurely";
1327 
1328 too_complex:
1329   SET_FIELDS;
1330   return "Regular expression too complex";
1331 }
1332 
1333 #undef CHARAT
1334 #undef NEXTCHAR
1335 #undef GETHEX
1336 #undef ALLOC
1337 #undef STORE
1338 #undef CURRENT_LEVEL_START
1339 #undef SET_LEVEL_START
1340 #undef PUSH_LEVEL_STARTS
1341 #undef POP_LEVEL_STARTS
1342 #undef PUT_ADDR
1343 #undef INSERT_JUMP
1344 #undef SETBIT
1345 #undef SET_FIELDS
1346 
1347 #define PREFETCH \
1348   if (text == textend) goto fail
1349 
1350 #define NEXTCHAR(var)           \
1351   PREFETCH;                     \
1352   var = (unsigned char)*text++; \
1353   if (translate) var = translate[var]
1354 
regcomp(regex_t * bufp,const char * regex,int cflags)1355 int regcomp(regex_t* bufp, const char* regex, int cflags)
1356 {
1357   memset(bufp, 0, sizeof(regex_t));
1358   bufp->cflags = cflags;
1359   if (bufp->cflags & REG_ICASE) {
1360     char *p, *lcase = strdup(regex);
1361     for (p = lcase; *p; p++) { *p = tolower(*p); }
1362     re_compile_pattern(bufp, (unsigned char*)lcase);
1363     free(lcase);
1364   } else {
1365     re_compile_pattern(bufp, (unsigned char*)regex);
1366   }
1367   if (got_error) { return -1; }
1368   return 0;
1369 }
1370 
re_registers_to_regmatch(regexp_registers_t old_regs,regmatch_t pmatch[],size_t nmatch)1371 void re_registers_to_regmatch(regexp_registers_t old_regs,
1372                               regmatch_t pmatch[],
1373                               size_t nmatch)
1374 {
1375   if (!(nmatch == 0 && pmatch == NULL)) {
1376     size_t i = 0;
1377 
1378     /*
1379      * We have to set the last entry to -1
1380      */
1381     nmatch = nmatch - 1;
1382     for (i = 0; (i < nmatch) && (old_regs->start[i] > -1); i++) {
1383       pmatch[i].rm_so = old_regs->start[i];
1384       pmatch[i].rm_eo = old_regs->end[i];
1385     }
1386 
1387     pmatch[i].rm_eo = pmatch[i].rm_so = -1;
1388   }
1389 }
1390 
regexec(regex_t * preg,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)1391 int regexec(regex_t* preg,
1392             const char* string,
1393             size_t nmatch,
1394             regmatch_t pmatch[],
1395             int eflags)
1396 {
1397   int status;
1398   int len = strlen(string);
1399   re_registers regs{};
1400 
1401   status = ReSearch(preg, (unsigned char*)string, len, 0, len, &regs);
1402   if (status >= 0) { re_registers_to_regmatch(&regs, pmatch, nmatch); }
1403 
1404   /*
1405    * status is the start position in the string base 0 where
1406    * the pattern was found or negative if not found.
1407    */
1408   return status < 0 ? -1 : 0;
1409 }
1410 
regerror(int errcode,regex_t * preg,char * errbuf,size_t errbuf_size)1411 size_t regerror(int errcode, regex_t* preg, char* errbuf, size_t errbuf_size)
1412 {
1413   bstrncpy(errbuf, preg->errmsg, errbuf_size);
1414   return 0;
1415 }
1416 
regfree(regex_t * preg)1417 void regfree(regex_t* preg)
1418 {
1419   if (preg->lcase) {
1420     FreePoolMemory(preg->lcase);
1421     preg->lcase = NULL;
1422   }
1423   if (preg->buffer) {
1424     free(preg->buffer);
1425     preg->buffer = NULL;
1426   }
1427 }
1428 
ReMatch(regex_t * bufp,unsigned char * string,int size,int pos,regexp_registers_t old_regs)1429 int ReMatch(regex_t* bufp,
1430             unsigned char* string,
1431             int size,
1432             int pos,
1433             regexp_registers_t old_regs)
1434 {
1435   unsigned char* code;
1436   unsigned char* translate;
1437   unsigned char* text;
1438   unsigned char* textstart;
1439   unsigned char* textend;
1440   int a;
1441   int b;
1442   int ch;
1443   int reg;
1444   int match_end;
1445   unsigned char* regstart;
1446   unsigned char* regend;
1447   int regsize;
1448   match_state state;
1449 
1450   // assert(pos >= 0 && size >= 0);
1451   // assert(pos <= size);
1452 
1453   text = string + pos;
1454   textstart = string;
1455   textend = string + size;
1456 
1457   code = bufp->buffer;
1458 
1459   translate = bufp->translate;
1460 
1461   NEW_STATE(state, bufp->num_registers);
1462 
1463 continue_matching:
1464   switch (*code++) {
1465     case Cend: {
1466       match_end = text - textstart;
1467       if (old_regs) {
1468         old_regs->start[0] = pos;
1469         old_regs->end[0] = match_end;
1470         if (!bufp->uses_registers) {
1471           for (a = 1; a < RE_NREGS; a++) {
1472             old_regs->start[a] = -1;
1473             old_regs->end[a] = -1;
1474           }
1475         } else {
1476           for (a = 1; a < bufp->num_registers; a++) {
1477             if ((GET_REG_START(state, a) == NULL) ||
1478                 (GET_REG_END(state, a) == NULL)) {
1479               old_regs->start[a] = -1;
1480               old_regs->end[a] = -1;
1481               continue;
1482             }
1483             old_regs->start[a] = GET_REG_START(state, a) - textstart;
1484             old_regs->end[a] = GET_REG_END(state, a) - textstart;
1485           }
1486           for (; a < RE_NREGS; a++) {
1487             old_regs->start[a] = -1;
1488             old_regs->end[a] = -1;
1489           }
1490         }
1491       }
1492       FREE_STATE(state);
1493       return match_end - pos;
1494     }
1495     case Cbol: {
1496       if (text == textstart || text[-1] == '\n') goto continue_matching;
1497       goto fail;
1498     }
1499     case Ceol: {
1500       if (text == textend || *text == '\n') goto continue_matching;
1501       goto fail;
1502     }
1503     case Cset: {
1504       NEXTCHAR(ch);
1505       if (code[ch / 8] & (1 << (ch & 7))) {
1506         code += 256 / 8;
1507         goto continue_matching;
1508       }
1509       goto fail;
1510     }
1511     case Cexact: {
1512       NEXTCHAR(ch);
1513       if (ch != (unsigned char)*code++) goto fail;
1514       goto continue_matching;
1515     }
1516     case Canychar: {
1517       NEXTCHAR(ch);
1518       if (ch == '\n') goto fail;
1519       goto continue_matching;
1520     }
1521     case Cstart_memory: {
1522       reg = *code++;
1523       SET_REG_START(state, reg, text, goto error);
1524       goto continue_matching;
1525     }
1526     case Cend_memory: {
1527       reg = *code++;
1528       SET_REG_END(state, reg, text, goto error);
1529       goto continue_matching;
1530     }
1531     case Cmatch_memory: {
1532       reg = *code++;
1533       regstart = GET_REG_START(state, reg);
1534       regend = GET_REG_END(state, reg);
1535       if ((regstart == NULL) || (regend == NULL))
1536         goto fail; /* or should we just match nothing? */
1537       regsize = regend - regstart;
1538 
1539       if (regsize > (textend - text)) goto fail;
1540       if (translate) {
1541         for (; regstart < regend; regstart++, text++)
1542           if (translate[*regstart] != translate[*text]) goto fail;
1543       } else
1544         for (; regstart < regend; regstart++, text++)
1545           if (*regstart != *text) goto fail;
1546       goto continue_matching;
1547     }
1548     case Cupdate_failure_jump: {
1549       UPDATE_FAILURE(state, text, goto error);
1550       /* fall to next case */
1551     }
1552       /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1553     case Cstar_jump:
1554     case Cjump: {
1555       a = (unsigned char)*code++;
1556       a |= (unsigned char)*code++ << 8;
1557       code += (int)SHORT(a);
1558       if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1559         set_error("Regex VM jump out of bounds (Cjump)");
1560         FREE_STATE(state);
1561         return -2;
1562       }
1563       goto continue_matching;
1564     }
1565     case Cdummy_failure_jump: {
1566       unsigned char* failuredest;
1567 
1568       a = (unsigned char)*code++;
1569       a |= (unsigned char)*code++ << 8;
1570       a = (int)SHORT(a);
1571       //       assert(*code == Cfailure_jump);
1572       b = (unsigned char)code[1];
1573       b |= (unsigned char)code[2] << 8;
1574       failuredest = code + (int)SHORT(b) + 3;
1575       if (failuredest < bufp->buffer ||
1576           bufp->buffer + bufp->used < failuredest) {
1577         set_error(
1578             "Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1579         FREE_STATE(state);
1580         return -2;
1581       }
1582       PUSH_FAILURE(state, failuredest, NULL, goto error);
1583       code += a;
1584       if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1585         set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1586         FREE_STATE(state);
1587         return -2;
1588       }
1589       goto continue_matching;
1590     }
1591     case Cfailure_jump: {
1592       a = (unsigned char)*code++;
1593       a |= (unsigned char)*code++ << 8;
1594       a = (int)SHORT(a);
1595       if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1596         set_error("Regex VM jump out of bounds (Cfailure_jump)");
1597         FREE_STATE(state);
1598         return -2;
1599       }
1600       PUSH_FAILURE(state, code + a, text, goto error);
1601       goto continue_matching;
1602     }
1603     case Crepeat1: {
1604       unsigned char* pinst;
1605       a = (unsigned char)*code++;
1606       a |= (unsigned char)*code++ << 8;
1607       a = (int)SHORT(a);
1608       pinst = code + a;
1609       if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1610         set_error("Regex VM jump out of bounds (Crepeat1)");
1611         FREE_STATE(state);
1612         return -2;
1613       }
1614       /* pinst is sole instruction in loop, and it matches a
1615        * single character.  Since Crepeat1 was originally a
1616        * Cupdate_failure_jump, we also know that backtracking
1617        * is useless: so long as the single-character
1618        * expression matches, it must be used.  Also, in the
1619        * case of +, we've already matched one character, so +
1620        * can't fail: nothing here can cause a failure.  */
1621       switch (*pinst++) {
1622         case Cset: {
1623           if (translate) {
1624             while (text < textend) {
1625               ch = translate[(unsigned char)*text];
1626               if (pinst[ch / 8] & (1 << (ch & 7)))
1627                 text++;
1628               else
1629                 break;
1630             }
1631           } else {
1632             while (text < textend) {
1633               ch = (unsigned char)*text;
1634               if (pinst[ch / 8] & (1 << (ch & 7)))
1635                 text++;
1636               else
1637                 break;
1638             }
1639           }
1640           break;
1641         }
1642         case Cexact: {
1643           ch = (unsigned char)*pinst;
1644           if (translate) {
1645             while (text < textend && translate[(unsigned char)*text] == ch)
1646               text++;
1647           } else {
1648             while (text < textend && (unsigned char)*text == ch) text++;
1649           }
1650           break;
1651         }
1652         case Canychar: {
1653           while (text < textend && (unsigned char)*text != '\n') text++;
1654           break;
1655         }
1656         case Csyntaxspec: {
1657           a = (unsigned char)*pinst;
1658           if (translate) {
1659             while (text < textend && (SYNTAX(translate[*text]) & a)) text++;
1660           } else {
1661             while (text < textend && (SYNTAX(*text) & a)) text++;
1662           }
1663           break;
1664         }
1665         case Cnotsyntaxspec: {
1666           a = (unsigned char)*pinst;
1667           if (translate) {
1668             while (text < textend && !(SYNTAX(translate[*text]) & a)) text++;
1669           } else {
1670             while (text < textend && !(SYNTAX(*text) & a)) text++;
1671           }
1672           break;
1673         }
1674         default: {
1675           FREE_STATE(state);
1676           set_error("Unknown regex opcode: memory corrupted?");
1677           return -2;
1678              /*NOTREACHED*/}
1679       }
1680       /* due to the funky way + and * are compiled, the top
1681        * failure- stack entry at this point is actually a
1682        * success entry -- update it & pop it */
1683       UPDATE_FAILURE(state, text, goto error);
1684       goto fail; /* i.e., succeed <wink/sigh> */
1685     }
1686     case Cbegbuf: {
1687       if (text == textstart) goto continue_matching;
1688       goto fail;
1689     }
1690     case Cendbuf: {
1691       if (text == textend) goto continue_matching;
1692       goto fail;
1693     }
1694     case Cwordbeg: {
1695       if (text == textend) goto fail;
1696       if (!(SYNTAX(*text) & Sword)) goto fail;
1697       if (text == textstart) goto continue_matching;
1698       if (!(SYNTAX(text[-1]) & Sword)) goto continue_matching;
1699       goto fail;
1700     }
1701     case Cwordend: {
1702       if (text == textstart) goto fail;
1703       if (!(SYNTAX(text[-1]) & Sword)) goto fail;
1704       if (text == textend) goto continue_matching;
1705       if (!(SYNTAX(*text) & Sword)) goto continue_matching;
1706       goto fail;
1707     }
1708     case Cwordbound: {
1709       /* Note: as in gnu regexp, this also matches at the
1710        * beginning and end of buffer.  */
1711 
1712       if (text == textstart || text == textend) goto continue_matching;
1713       if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1714         goto continue_matching;
1715       goto fail;
1716     }
1717     case Cnotwordbound: {
1718       /* Note: as in gnu regexp, this never matches at the
1719        * beginning and end of buffer.  */
1720       if (text == textstart || text == textend) goto fail;
1721       if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1722         goto continue_matching;
1723       goto fail;
1724     }
1725     case Csyntaxspec: {
1726       NEXTCHAR(ch);
1727       if (!(SYNTAX(ch) & (unsigned char)*code++)) goto fail;
1728       goto continue_matching;
1729     }
1730     case Cnotsyntaxspec: {
1731       NEXTCHAR(ch);
1732       if (SYNTAX(ch) & (unsigned char)*code++) goto fail;
1733       goto continue_matching;
1734     }
1735     default: {
1736       FREE_STATE(state);
1737       set_error("Unknown regex opcode: memory corrupted?");
1738       return -2;
1739        /*NOTREACHED*/}
1740   }
1741 
1742   /*
1743    *NOTREACHED
1744    */
1745 
1746   /* Using "break;" in the above switch statement is equivalent to "goto fail;"
1747    */
1748 fail:
1749   POP_FAILURE(state, code, text, goto done_matching, goto error);
1750   goto continue_matching;
1751 
1752 done_matching:
1753   /*   if(translated != NULL) */
1754   /*      free(translated); */
1755   FREE_STATE(state);
1756   return -1;
1757 
1758 error:
1759   /*   if (translated != NULL) */
1760   /*      free(translated); */
1761   FREE_STATE(state);
1762   return -2;
1763 }
1764 
1765 
1766 #undef PREFETCH
1767 #undef NEXTCHAR
1768 
ReSearch(regex_t * bufp,unsigned char * str,int size,int pos,int range,regexp_registers_t regs)1769 int ReSearch(regex_t* bufp,
1770              unsigned char* str,
1771              int size,
1772              int pos,
1773              int range,
1774              regexp_registers_t regs)
1775 {
1776   unsigned char* fastmap;
1777   unsigned char* translate;
1778   unsigned char* text;
1779   unsigned char* partstart;
1780   unsigned char* partend;
1781   int dir;
1782   int ret;
1783   unsigned char anchor;
1784   unsigned char* string = str;
1785 
1786   if (bufp->cflags & REG_ICASE) { /* we must use string in lowercase */
1787     int len = strlen((const char*)str);
1788     if (!bufp->lcase) { bufp->lcase = GetPoolMemory(PM_FNAME); }
1789     bufp->lcase = CheckPoolMemorySize(bufp->lcase, len + 1);
1790     unsigned char* dst = (unsigned char*)bufp->lcase;
1791     while (*string) { *dst++ = tolower(*string++); }
1792     *dst = '\0';
1793     string = (unsigned char*)bufp->lcase;
1794   }
1795 
1796   // assert(size >= 0 && pos >= 0);
1797   // assert(pos + range >= 0 && pos + range <= size);     /* Bugfix by ylo */
1798 
1799   fastmap = bufp->fastmap;
1800   translate = bufp->translate;
1801   if (fastmap && !bufp->fastmap_accurate) {
1802     ReCompileFastmap(bufp);
1803     if (got_error) return -2;
1804   }
1805 
1806   anchor = bufp->anchor;
1807   if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1808     fastmap = NULL;
1809 
1810   if (range < 0) {
1811     dir = -1;
1812     range = -range;
1813   } else
1814     dir = 1;
1815 
1816   if (anchor == 2) {
1817     if (pos != 0)
1818       return -1;
1819     else
1820       range = 0;
1821   }
1822 
1823   for (; range >= 0; range--, pos += dir) {
1824     if (fastmap) {
1825       if (dir == 1) { /* searching forwards */
1826 
1827         text = string + pos;
1828         partend = string + size;
1829         partstart = text;
1830         if (translate)
1831           while (text != partend &&
1832                  !fastmap[(unsigned char)translate[(unsigned char)*text]])
1833             text++;
1834         else
1835           while (text != partend && !fastmap[(unsigned char)*text]) text++;
1836         pos += text - partstart;
1837         range -= text - partstart;
1838         if (pos == size && bufp->can_be_null == 0) return -1;
1839       } else { /* searching backwards */
1840         text = string + pos;
1841         partstart = string + pos - range;
1842         partend = text;
1843         if (translate)
1844           while (text != partstart &&
1845                  !fastmap[(unsigned char)translate[(unsigned char)*text]])
1846             text--;
1847         else
1848           while (text != partstart && !fastmap[(unsigned char)*text]) text--;
1849         pos -= partend - text;
1850         range -= partend - text;
1851       }
1852     }
1853     if (anchor == 1) { /* anchored to begline */
1854       if (pos > 0 && (string[pos - 1] != '\n')) continue;
1855     }
1856     //    assert(pos >= 0 && pos <= size);
1857     ret = ReMatch(bufp, string, size, pos, regs);
1858     if (ret >= 0) return pos;
1859     if (ret == -2) return -2;
1860   }
1861   return -1;
1862 }
1863 
1864 /*
1865 ** Local Variables:
1866 ** mode: c
1867 ** c-file-style: "python"
1868 ** End:
1869 */
1870