1 /*
2    BAREOS® - Backup Archiving REcovery Open Sourced
3 
4    Copyright (C) 2006-2010 Free Software Foundation Europe e.V.
5    Copyright (C) 2013-2020 Bareos GmbH & Co. KG
6 
7    This program is Free Software; you can redistribute it and/or
8    modify it under the terms of version three of the GNU Affero General Public
9    License as published by the Free Software Foundation and included
10    in the file LICENSE.
11 
12    This program is distributed in the hope that it will be useful, but
13    WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15    Affero General Public License for more details.
16 
17    You should have received a copy of the GNU Affero General Public License
18    along with this program; if not, write to the Free Software
19    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20    02110-1301, USA.
21 */
22 /* regexpr.c
23  *
24  * Author: Tatu Ylonen <ylo@ngs.fi>
25  *
26  * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
27  *
28  * Permission to use, copy, modify, distribute, and sell this software
29  * and its documentation for any purpose is hereby granted without
30  * fee, provided that the above copyright notice appear in all copies.
31  * This software is provided "as is" without express or implied
32  * warranty.
33  *
34  * Created: Thu Sep 26 17:14:05 1991 ylo
35  * Last modified: Mon Nov  4 17:06:48 1991 ylo
36  * Ported to Think C: 19 Jan 1992 guido@cwi.nl
37  *
38  * This code draws many ideas from the regular expression packages by
39  * Henry Spencer of the University of Toronto and Richard Stallman of
40  * the Free Software Foundation.
41  *
42  * Emacs-specific code and syntax table code is almost directly borrowed
43  * from GNU regexp.
44  *
45  * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
46  * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
47  * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
48  * probably one or two others that I'm forgetting.
49  *
50  * This file modified to work with BAREOS and C++ by
51  *    Kern Sibbald, April 2006
52  *
53  * This file modified to work with REG_ICASE and BAREOS by
54  *    Eric Bollengier April 2007
55  */
56 
57 #include "include/bareos.h"
58 #include "bregex.h"
59 
60 #define set_error(x) bufp->errmsg = ((char*)(x))
61 #define got_error bufp->errmsg != NULL
62 
63 /* The original code blithely assumed that sizeof(short) == 2.  Not
64  * always true.  Original instances of "(short)x" were replaced by
65  * SHORT(x), where SHORT is #defined below.  */
66 
67 #define SHORT(x) ((x)&0x8000 ? (x)-0x10000 : (x))
68 
69 /* The stack implementation is taken from an idea by Andrew Kuchling.
70  * It's a doubly linked list of arrays. The advantages of this over a
71  * simple linked list are that the number of mallocs required are
72  * reduced. It also makes it possible to statically allocate enough
73  * space so that small patterns don't ever need to call malloc.
74  *
75  * The advantages over a single array is that is periodically
76  * realloced when more space is needed is that we avoid ever copying
77  * the stack. */
78 
79 /* item_t is the basic stack element.  Defined as a union of
80  * structures so that both registers, failure points, and counters can
81  * be pushed/popped from the stack.  There's nothing built into the
82  * item to keep track of whether a certain stack item is a register, a
83  * failure point, or a counter. */
84 
85 typedef union item_t {
86   struct {
87     int num;
88     int level;
89     unsigned char* start;
90     unsigned char* end;
91   } reg;
92   struct {
93     int count;
94     int level;
95     int phantom;
96     unsigned char* code;
97     unsigned char* text;
98   } fail;
99   struct {
100     int num;
101     int level;
102     int count;
103   } cntr;
104 } item_t;
105 
106 #define B_STACK_PAGE_SIZE 256
107 #define NUM_REGISTERS 256
108 
109 /* A 'page' of stack items. */
110 
111 typedef struct item_page_t {
112   item_t items[B_STACK_PAGE_SIZE];
113   struct item_page_t* prev;
114   struct item_page_t* next;
115 } item_page_t;
116 
117 
118 typedef struct match_state {
119   /* The number of registers that have been pushed onto the stack
120    * since the last failure point. */
121 
122   int count;
123 
124   /* Used to control when registers need to be pushed onto the
125    * stack. */
126 
127   int level;
128 
129   /* The number of failure points on the stack. */
130 
131   int point;
132 
133   /* Storage for the registers.  Each register consists of two
134    * pointers to characters.  So register N is represented as
135    * start[N] and end[N].  The pointers must be converted to
136    * offsets from the beginning of the string before returning the
137    * registers to the calling program. */
138 
139   unsigned char* start[NUM_REGISTERS];
140   unsigned char* end[NUM_REGISTERS];
141 
142   /* Keeps track of whether a register has changed recently. */
143 
144   int changed[NUM_REGISTERS];
145 
146   /* Structure to encapsulate the stack. */
147   struct {
148     /* index into the current page.  If index == 0 and you need
149      * to pop an item, move to the previous page and set index
150      * = B_STACK_PAGE_SIZE - 1.  Otherwise decrement index to
151      * push a page. If index == B_STACK_PAGE_SIZE and you need
152      * to push a page move to the next page and set index =
153      * 0. If there is no new next page, allocate a new page
154      * and link it in. Otherwise, increment index to push a
155      * page. */
156 
157     int index;
158     item_page_t* current; /* Pointer to the current page. */
159     item_page_t first;    /* First page is statically allocated. */
160   } stack;
161 } match_state;
162 
163 /* Initialize a state object */
164 
165 /* #define NEW_STATE(state) \ */
166 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
167 /* state.stack.current = &state.stack.first; \ */
168 /* state.stack.first.prev = NULL; \ */
169 /* state.stack.first.next = NULL; \ */
170 /* state.stack.index = 0; \ */
171 /* state.level = 1 */
172 
173 #define NEW_STATE(state, nregs)               \
174   {                                           \
175     int i;                                    \
176     for (i = 0; i < nregs; i++) {             \
177       state.start[i] = NULL;                  \
178       state.end[i] = NULL;                    \
179       state.changed[i] = 0;                   \
180     }                                         \
181     state.stack.current = &state.stack.first; \
182     state.stack.first.prev = NULL;            \
183     state.stack.first.next = NULL;            \
184     state.stack.index = 0;                    \
185     state.level = 1;                          \
186     state.count = 0;                          \
187     state.level = 0;                          \
188     state.point = 0;                          \
189   }
190 
191 /* Free any memory that might have been malloc'd */
192 
193 #define FREE_STATE(state)                               \
194   while (state.stack.first.next != NULL) {              \
195     state.stack.current = state.stack.first.next;       \
196     state.stack.first.next = state.stack.current->next; \
197     free(state.stack.current);                          \
198   }
199 
200 /* Discard the top 'count' stack items. */
201 
202 #define STACK_DISCARD(stack, count, on_error)  \
203   stack.index -= count;                        \
204   while (stack.index < 0) {                    \
205     if (stack.current->prev == NULL) on_error; \
206     stack.current = stack.current->prev;       \
207     stack.index += B_STACK_PAGE_SIZE;          \
208   }
209 
210 /* Store a pointer to the previous item on the stack. Used to pop an
211  * item off of the stack. */
212 
213 #define STACK_PREV(stack, top, on_error)       \
214   if (stack.index == 0) {                      \
215     if (stack.current->prev == NULL) on_error; \
216     stack.current = stack.current->prev;       \
217     stack.index = B_STACK_PAGE_SIZE - 1;       \
218   } else {                                     \
219     stack.index--;                             \
220   }                                            \
221   top = &(stack.current->items[stack.index])
222 
223 /* Store a pointer to the next item on the stack. Used to push an item
224  * on to the stack. */
225 
226 #define STACK_NEXT(stack, top, on_error)                               \
227   if (stack.index == B_STACK_PAGE_SIZE) {                              \
228     if (stack.current->next == NULL) {                                 \
229       stack.current->next = (item_page_t*)malloc(sizeof(item_page_t)); \
230       if (stack.current->next == NULL) on_error;                       \
231       stack.current->next->prev = stack.current;                       \
232       stack.current->next->next = NULL;                                \
233     }                                                                  \
234     stack.current = stack.current->next;                               \
235     stack.index = 0;                                                   \
236   }                                                                    \
237   top = &(stack.current->items[stack.index++])
238 
239 /* Store a pointer to the item that is 'count' items back in the
240  * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
241  * STACK_TOP(stack, top, on_error).  */
242 
243 #define STACK_BACK(stack, top, count, on_error) \
244   {                                             \
245     int index;                                  \
246     item_page_t* current;                       \
247     current = stack.current;                    \
248     index = stack.index - (count);              \
249     while (index < 0) {                         \
250       if (current->prev == NULL) on_error;      \
251       current = current->prev;                  \
252       index += B_STACK_PAGE_SIZE;               \
253     }                                           \
254     top = &(current->items[index]);             \
255   }
256 
257 /* Store a pointer to the top item on the stack. Execute the
258  * 'on_error' code if there are no items on the stack. */
259 
260 #define STACK_TOP(stack, top, on_error)                         \
261   if (stack.index == 0) {                                       \
262     if (stack.current->prev == NULL) on_error;                  \
263     top = &(stack.current->prev->items[B_STACK_PAGE_SIZE - 1]); \
264   } else {                                                      \
265     top = &(stack.current->items[stack.index - 1]);             \
266   }
267 
268 /* Test to see if the stack is empty */
269 
270 #define STACK_EMPTY(stack) ((stack.index == 0) && (stack.current->prev == NULL))
271 
272 /* Return the start of register 'reg' */
273 
274 #define GET_REG_START(state, reg) (state.start[reg])
275 
276 /* Return the end of register 'reg' */
277 
278 #define GET_REG_END(state, reg) (state.end[reg])
279 
280 /* Set the start of register 'reg'. If the state of the register needs
281  * saving, push it on the stack. */
282 
283 #define SET_REG_START(state, reg, text, on_error) \
284   if (state.changed[reg] < state.level) {         \
285     item_t* item;                                 \
286     STACK_NEXT(state.stack, item, on_error);      \
287     item->reg.num = reg;                          \
288     item->reg.start = state.start[reg];           \
289     item->reg.end = state.end[reg];               \
290     item->reg.level = state.changed[reg];         \
291     state.changed[reg] = state.level;             \
292     state.count++;                                \
293   }                                               \
294   state.start[reg] = text
295 
296 /* Set the end of register 'reg'. If the state of the register needs
297  * saving, push it on the stack. */
298 
299 #define SET_REG_END(state, reg, text, on_error) \
300   if (state.changed[reg] < state.level) {       \
301     item_t* item;                               \
302     STACK_NEXT(state.stack, item, on_error);    \
303     item->reg.num = reg;                        \
304     item->reg.start = state.start[reg];         \
305     item->reg.end = state.end[reg];             \
306     item->reg.level = state.changed[reg];       \
307     state.changed[reg] = state.level;           \
308     state.count++;                              \
309   }                                             \
310   state.end[reg] = text
311 
312 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
313   {                                                 \
314     item_t* item;                                   \
315     STACK_NEXT(state.stack, item, on_error);        \
316     item->fail.code = xcode;                        \
317     item->fail.text = xtext;                        \
318     item->fail.count = state.count;                 \
319     item->fail.level = state.level;                 \
320     item->fail.phantom = 0;                         \
321     state.count = 0;                                \
322     state.level++;                                  \
323     state.point++;                                  \
324   }
325 
326 /* Update the last failure point with a new position in the text. */
327 
328 #define UPDATE_FAILURE(state, xtext, on_error)                \
329   {                                                           \
330     item_t* item;                                             \
331     STACK_BACK(state.stack, item, state.count + 1, on_error); \
332     if (!item->fail.phantom) {                                \
333       item_t* item2;                                          \
334       STACK_NEXT(state.stack, item2, on_error);               \
335       item2->fail.code = item->fail.code;                     \
336       item2->fail.text = xtext;                               \
337       item2->fail.count = state.count;                        \
338       item2->fail.level = state.level;                        \
339       item2->fail.phantom = 1;                                \
340       state.count = 0;                                        \
341       state.level++;                                          \
342       state.point++;                                          \
343     } else {                                                  \
344       STACK_DISCARD(state.stack, state.count, on_error);      \
345       STACK_TOP(state.stack, item, on_error);                 \
346       item->fail.text = xtext;                                \
347       state.count = 0;                                        \
348       state.level++;                                          \
349     }                                                         \
350   }
351 
352 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
353   {                                                          \
354     item_t* item;                                            \
355     do {                                                     \
356       while (state.count > 0) {                              \
357         STACK_PREV(state.stack, item, on_error);             \
358         state.start[item->reg.num] = item->reg.start;        \
359         state.end[item->reg.num] = item->reg.end;            \
360         state.changed[item->reg.num] = item->reg.level;      \
361         state.count--;                                       \
362       }                                                      \
363       STACK_PREV(state.stack, item, on_empty);               \
364       xcode = item->fail.code;                               \
365       xtext = item->fail.text;                               \
366       state.count = item->fail.count;                        \
367       state.level = item->fail.level;                        \
368       state.point--;                                         \
369     } while (item->fail.text == NULL);                       \
370   }
371 
372 enum regexp_compiled_ops
373 {                       /* opcodes for compiled regexp */
374   Cend,                 /* end of pattern reached */
375   Cbol,                 /* beginning of line */
376   Ceol,                 /* end of line */
377   Cset,                 /* character set.  Followed by 32 bytes of set. */
378   Cexact,               /* followed by a byte to match */
379   Canychar,             /* matches any character except newline */
380   Cstart_memory,        /* set register start addr (followed by reg number) */
381   Cend_memory,          /* set register end addr (followed by reg number) */
382   Cmatch_memory,        /* match a duplicate of reg contents (regnum follows) */
383   Cjump,                /* followed by two bytes (lsb,msb) of displacement. */
384   Cstar_jump,           /* will change to jump/update_failure_jump at runtime */
385   Cfailure_jump,        /* jump to addr on failure */
386   Cupdate_failure_jump, /* update topmost failure point and jump */
387   Cdummy_failure_jump,  /* push a dummy failure point and jump */
388   Cbegbuf,              /* match at beginning of buffer */
389   Cendbuf,              /* match at end of buffer */
390   Cwordbeg,             /* match at beginning of word */
391   Cwordend,             /* match at end of word */
392   Cwordbound,           /* match if at word boundary */
393   Cnotwordbound,        /* match if not at word boundary */
394   Csyntaxspec,          /* matches syntax code (1 byte follows) */
395   Cnotsyntaxspec, /* matches if syntax code does not match (1 byte follows) */
396   Crepeat1
397 };
398 
399 enum regexp_syntax_op
400 {                   /* syntax codes for plain and quoted characters */
401   Rend,             /* special code for end of regexp */
402   Rnormal,          /* normal character */
403   Ranychar,         /* any character except newline */
404   Rquote,           /* the quote character */
405   Rbol,             /* match beginning of line */
406   Reol,             /* match end of line */
407   Roptional,        /* match preceding expression optionally */
408   Rstar,            /* match preceding expr zero or more times */
409   Rplus,            /* match preceding expr one or more times */
410   Ror,              /* match either of alternatives */
411   Ropenpar,         /* opening parenthesis */
412   Rclosepar,        /* closing parenthesis */
413   Rmemory,          /* match memory register */
414   Rextended_memory, /* \vnn to match registers 10-99 */
415   Ropenset,         /* open set.  Internal syntax hard-coded below. */
416   /* the following are gnu extensions to "normal" regexp syntax */
417   Rbegbuf,       /* beginning of buffer */
418   Rendbuf,       /* end of buffer */
419   Rwordchar,     /* word character */
420   Rnotwordchar,  /* not word character */
421   Rwordbeg,      /* beginning of word */
422   Rwordend,      /* end of word */
423   Rwordbound,    /* word bound */
424   Rnotwordbound, /* not word bound */
425   Rnum_ops
426 };
427 
428 static int re_compile_initialized = 0;
429 static int regexp_syntax = RE_SYNTAX_EGREP;
430 int re_syntax = RE_SYNTAX_EGREP; /* Exported copy of regexp_syntax */
431 static unsigned char plain_ops[256];
432 static unsigned char quoted_ops[256];
433 static unsigned char precedences[Rnum_ops];
434 static int regexp_context_indep_ops;
435 static int regexp_ansi_sequences;
436 
437 #define NUM_LEVELS 5    /* number of precedence levels in use */
438 #define MAX_NESTING 100 /* max nesting level of operators */
439 
440 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
441 
442 unsigned char re_syntax_table[256];
443 
ReCompileInitialize(void)444 void ReCompileInitialize(void)
445 {
446   int a;
447 
448   static int syntax_table_inited = 0;
449 
450   if (!syntax_table_inited) {
451     syntax_table_inited = 1;
452     memset(re_syntax_table, 0, 256);
453     for (a = 'a'; a <= 'z'; a++) re_syntax_table[a] = Sword;
454     for (a = 'A'; a <= 'Z'; a++) re_syntax_table[a] = Sword;
455     for (a = '0'; a <= '9'; a++)
456       re_syntax_table[a] = Sword | Sdigit | Shexdigit;
457     for (a = '0'; a <= '7'; a++) re_syntax_table[a] |= Soctaldigit;
458     for (a = 'A'; a <= 'F'; a++) re_syntax_table[a] |= Shexdigit;
459     for (a = 'a'; a <= 'f'; a++) re_syntax_table[a] |= Shexdigit;
460     re_syntax_table[(int)'_'] = Sword;
461     for (a = 9; a <= 13; a++) re_syntax_table[a] = Swhitespace;
462     re_syntax_table[(int)' '] = Swhitespace;
463   }
464   re_compile_initialized = 1;
465   for (a = 0; a < 256; a++) {
466     plain_ops[a] = Rnormal;
467     quoted_ops[a] = Rnormal;
468   }
469   for (a = '0'; a <= '9'; a++) quoted_ops[a] = Rmemory;
470   plain_ops[(int)'\134'] = Rquote;
471   if (regexp_syntax & RE_NO_BK_PARENS) {
472     plain_ops[(int)'('] = Ropenpar;
473     plain_ops[(int)')'] = Rclosepar;
474   } else {
475     quoted_ops[(int)'('] = Ropenpar;
476     quoted_ops[(int)')'] = Rclosepar;
477   }
478   if (regexp_syntax & RE_NO_BK_VBAR) {
479     plain_ops[(int)'\174'] = Ror;
480   } else {
481     quoted_ops[(int)'\174'] = Ror;
482   }
483   plain_ops[(int)'*'] = Rstar;
484   if (regexp_syntax & RE_BK_PLUS_QM) {
485     quoted_ops[(int)'+'] = Rplus;
486     quoted_ops[(int)'?'] = Roptional;
487   } else {
488     plain_ops[(int)'+'] = Rplus;
489     plain_ops[(int)'?'] = Roptional;
490   }
491   if (regexp_syntax & RE_NEWLINE_OR) { plain_ops[(int)'\n'] = Ror; }
492   plain_ops[(int)'\133'] = Ropenset;
493   plain_ops[(int)'\136'] = Rbol;
494   plain_ops[(int)'$'] = Reol;
495   plain_ops[(int)'.'] = Ranychar;
496   if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
497     quoted_ops[(int)'w'] = Rwordchar;
498     quoted_ops[(int)'W'] = Rnotwordchar;
499     quoted_ops[(int)'<'] = Rwordbeg;
500     quoted_ops[(int)'>'] = Rwordend;
501     quoted_ops[(int)'b'] = Rwordbound;
502     quoted_ops[(int)'B'] = Rnotwordbound;
503     quoted_ops[(int)'`'] = Rbegbuf;
504     quoted_ops[(int)'\''] = Rendbuf;
505   }
506   if (regexp_syntax & RE_ANSI_HEX) { quoted_ops[(int)'v'] = Rextended_memory; }
507   for (a = 0; a < Rnum_ops; a++) { precedences[a] = 4; }
508   if (regexp_syntax & RE_TIGHT_VBAR) {
509     precedences[Ror] = 3;
510     precedences[Rbol] = 2;
511     precedences[Reol] = 2;
512   } else {
513     precedences[Ror] = 2;
514     precedences[Rbol] = 3;
515     precedences[Reol] = 3;
516   }
517   precedences[Rclosepar] = 1;
518   precedences[Rend] = 0;
519   regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
520   regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
521 }
522 
ReSetSyntax(int syntax)523 int ReSetSyntax(int syntax)
524 {
525   int ret;
526 
527   ret = regexp_syntax;
528   regexp_syntax = syntax;
529   re_syntax = syntax; /* Exported copy */
530   ReCompileInitialize();
531   return ret;
532 }
533 
HexCharToDecimal(int ch)534 static int HexCharToDecimal(int ch)
535 {
536   if (ch >= '0' && ch <= '9') return ch - '0';
537   if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
538   if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
539   return 16;
540 }
541 
re_compile_fastmap_aux(regex_t * bufp,unsigned char * code,int pos,unsigned char * visited,unsigned char * can_be_null,unsigned char * fastmap)542 static void re_compile_fastmap_aux(regex_t* bufp,
543                                    unsigned char* code,
544                                    int pos,
545                                    unsigned char* visited,
546                                    unsigned char* can_be_null,
547                                    unsigned char* fastmap)
548 {
549   int a;
550   int b;
551   int syntaxcode;
552 
553   if (visited[pos]) return; /* we have already been here */
554   visited[pos] = 1;
555   for (;;) {
556     switch (code[pos++]) {
557       case Cend:
558         *can_be_null = 1;
559         return;
560       case Cbol:
561       case Cbegbuf:
562       case Cendbuf:
563       case Cwordbeg:
564       case Cwordend:
565       case Cwordbound:
566       case Cnotwordbound:
567         for (a = 0; a < 256; a++) fastmap[a] = 1;
568         break;
569       case Csyntaxspec:
570         syntaxcode = code[pos++];
571         for (a = 0; a < 256; a++)
572           if (SYNTAX(a) & syntaxcode) fastmap[a] = 1;
573         return;
574       case Cnotsyntaxspec:
575         syntaxcode = code[pos++];
576         for (a = 0; a < 256; a++)
577           if (!(SYNTAX(a) & syntaxcode)) fastmap[a] = 1;
578         return;
579       case Ceol:
580         fastmap[(int)'\n'] = 1;
581         if (*can_be_null == 0)
582           *can_be_null = 2; /* can match null, but only at end of buffer */
583         return;
584       case Cset:
585         for (a = 0; a < 256 / 8; a++)
586           if (code[pos + a] != 0)
587             for (b = 0; b < 8; b++)
588               if (code[pos + a] & (1 << b)) fastmap[(a << 3) + b] = 1;
589         pos += 256 / 8;
590         return;
591       case Cexact:
592         fastmap[(unsigned char)code[pos]] = 1;
593         return;
594       case Canychar:
595         for (a = 0; a < 256; a++)
596           if (a != '\n') fastmap[a] = 1;
597         return;
598       case Cstart_memory:
599       case Cend_memory:
600         pos++;
601         break;
602       case Cmatch_memory:
603         for (a = 0; a < 256; a++) fastmap[a] = 1;
604         *can_be_null = 1;
605         return;
606       case Cjump:
607       case Cdummy_failure_jump:
608       case Cupdate_failure_jump:
609       case Cstar_jump:
610         a = (unsigned char)code[pos++];
611         a |= (unsigned char)code[pos++] << 8;
612         pos += (int)SHORT(a);
613         if (visited[pos]) {
614           /* argh... the regexp contains empty loops.  This is not
615              good, as this may cause a failure stack overflow when
616              matching.  Oh well. */
617           /* this path leads nowhere; pursue other paths. */
618           return;
619         }
620         visited[pos] = 1;
621         break;
622       case Cfailure_jump:
623         a = (unsigned char)code[pos++];
624         a |= (unsigned char)code[pos++] << 8;
625         a = pos + (int)SHORT(a);
626         re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
627         break;
628       case Crepeat1:
629         pos += 2;
630         break;
631       default:
632         set_error("Unknown regex opcode: memory corrupted?");
633         return;
634     }
635   }
636 }
637 
re_do_compile_fastmap(regex_t * bufp,unsigned char * buffer,int used,int pos,unsigned char * can_be_null,unsigned char * fastmap)638 static int re_do_compile_fastmap(regex_t* bufp,
639                                  unsigned char* buffer,
640                                  int used,
641                                  int pos,
642                                  unsigned char* can_be_null,
643                                  unsigned char* fastmap)
644 {
645   unsigned char small_visited[512], *visited;
646 
647   if (used <= (int)sizeof(small_visited))
648     visited = small_visited;
649   else {
650     visited = (unsigned char*)malloc(used);
651     if (!visited) return 0;
652   }
653   *can_be_null = 0;
654   memset(fastmap, 0, 256);
655   memset(visited, 0, used);
656   re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
657   if (visited != small_visited) free(visited);
658   return 1;
659 }
660 
ReCompileFastmap(regex_t * bufp)661 void ReCompileFastmap(regex_t* bufp)
662 {
663   if (!bufp->fastmap || bufp->fastmap_accurate) return;
664   // assert(bufp->used > 0);
665   if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used, 0,
666                              &bufp->can_be_null, bufp->fastmap))
667     return;
668   if (got_error) return;
669   if (bufp->buffer[0] == Cbol)
670     bufp->anchor = 1; /* begline */
671   else if (bufp->buffer[0] == Cbegbuf)
672     bufp->anchor = 2; /* begbuf */
673   else
674     bufp->anchor = 0; /* none */
675   bufp->fastmap_accurate = 1;
676 }
677 
678 /*
679  * star is coded as:
680  * 1: failure_jump 2
681  *    ... code for operand of star
682  *    star_jump 1
683  * 2: ... code after star
684  *
685  * We change the star_jump to update_failure_jump if we can determine
686  * that it is safe to do so; otherwise we change it to an ordinary
687  * jump.
688  *
689  * plus is coded as
690  *
691  *    jump 2
692  * 1: failure_jump 3
693  * 2: ... code for operand of plus
694  *    star_jump 1
695  * 3: ... code after plus
696  *
697  * For star_jump considerations this is processed identically to star.
698  *
699  */
700 
ReOptimizeStarJump(regex_t * bufp,unsigned char * code)701 static int ReOptimizeStarJump(regex_t* bufp, unsigned char* code)
702 {
703   unsigned char map[256];
704   unsigned char can_be_null;
705   unsigned char* p1;
706   unsigned char* p2;
707   unsigned char ch;
708   int a;
709   int b;
710   int num_instructions = 0;
711 
712   a = (unsigned char)*code++;
713   a |= (unsigned char)*code++ << 8;
714   a = (int)SHORT(a);
715 
716   p1 = code + a + 3; /* skip the failure_jump */
717   /* Check that the jump is within the pattern */
718   if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
719     set_error("Regex VM jump out of bounds (failure_jump opt)");
720     return 0;
721   }
722   // assert(p1[-3] == Cfailure_jump);
723   p2 = code;
724   /* p1 points inside loop, p2 points to after loop */
725   if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
726                              (int)(p2 - bufp->buffer), &can_be_null, map))
727     goto make_normal_jump;
728 
729   /* If we might introduce a new update point inside the
730    * loop, we can't optimize because then update_jump would
731    * update a wrong failure point.  Thus we have to be
732    * quite careful here.
733    */
734 
735   /* loop until we find something that consumes a character */
736   for (;;) {
737     num_instructions++;
738     switch (*p1++) {
739       case Cbol:
740       case Ceol:
741       case Cbegbuf:
742       case Cendbuf:
743       case Cwordbeg:
744       case Cwordend:
745       case Cwordbound:
746       case Cnotwordbound:
747         continue;
748       case Cstart_memory:
749       case Cend_memory:
750         p1++;
751         continue;
752       case Cexact:
753         ch = (unsigned char)*p1++;
754         if (map[(int)ch]) goto make_normal_jump;
755         break;
756       case Canychar:
757         for (b = 0; b < 256; b++)
758           if (b != '\n' && map[b]) goto make_normal_jump;
759         break;
760       case Cset:
761         for (b = 0; b < 256; b++)
762           if ((p1[b >> 3] & (1 << (b & 7))) && map[b]) goto make_normal_jump;
763         p1 += 256 / 8;
764         break;
765       default:
766         goto make_normal_jump;
767     }
768     break;
769   }
770   /* now we know that we can't backtrack. */
771   while (p1 != p2 - 3) {
772     num_instructions++;
773     switch (*p1++) {
774       case Cend:
775         return 0;
776       case Cbol:
777       case Ceol:
778       case Canychar:
779       case Cbegbuf:
780       case Cendbuf:
781       case Cwordbeg:
782       case Cwordend:
783       case Cwordbound:
784       case Cnotwordbound:
785         break;
786       case Cset:
787         p1 += 256 / 8;
788         break;
789       case Cexact:
790       case Cstart_memory:
791       case Cend_memory:
792       case Cmatch_memory:
793       case Csyntaxspec:
794       case Cnotsyntaxspec:
795         p1++;
796         break;
797       case Cjump:
798       case Cstar_jump:
799       case Cfailure_jump:
800       case Cupdate_failure_jump:
801       case Cdummy_failure_jump:
802         goto make_normal_jump;
803       default:
804         return 0;
805     }
806   }
807 
808   /* make_update_jump: */
809   code -= 3;
810   a += 3; /* jump to after the Cfailure_jump */
811   code[0] = Cupdate_failure_jump;
812   code[1] = a & 0xff;
813   code[2] = a >> 8;
814   if (num_instructions > 1) return 1;
815   // assert(num_instructions == 1);
816   /* if the only instruction matches a single character, we can do
817    * better */
818   p1 = code + 3 + a; /* start of sole instruction */
819   if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar || *p1 == Csyntaxspec
820       || *p1 == Cnotsyntaxspec)
821     code[0] = Crepeat1;
822   return 1;
823 
824 make_normal_jump:
825   code -= 3;
826   *code = Cjump;
827   return 1;
828 }
829 
ReOptimize(regex_t * bufp)830 static int ReOptimize(regex_t* bufp)
831 {
832   unsigned char* code;
833 
834   code = bufp->buffer;
835 
836   while (1) {
837     switch (*code++) {
838       case Cend:
839         return 1;
840       case Canychar:
841       case Cbol:
842       case Ceol:
843       case Cbegbuf:
844       case Cendbuf:
845       case Cwordbeg:
846       case Cwordend:
847       case Cwordbound:
848       case Cnotwordbound:
849         break;
850       case Cset:
851         code += 256 / 8;
852         break;
853       case Cexact:
854       case Cstart_memory:
855       case Cend_memory:
856       case Cmatch_memory:
857       case Csyntaxspec:
858       case Cnotsyntaxspec:
859         code++;
860         break;
861       case Cstar_jump:
862         if (!ReOptimizeStarJump(bufp, code)) { return 0; }
863         /* fall through */
864       case Cupdate_failure_jump:
865       case Cjump:
866       case Cdummy_failure_jump:
867       case Cfailure_jump:
868       case Crepeat1:
869         code += 2;
870         break;
871       default:
872         return 0;
873     }
874   }
875 }
876 
877 #define NEXTCHAR(var)                       \
878   {                                         \
879     if (pos >= size) goto ends_prematurely; \
880     (var) = regex[pos];                     \
881     pos++;                                  \
882   }
883 
884 #define ALLOC(amount)                                    \
885   {                                                      \
886     if (pattern_offset + (amount) > alloc) {             \
887       alloc += 256 + (amount);                           \
888       pattern = (unsigned char*)realloc(pattern, alloc); \
889       if (!pattern) goto out_of_memory;                  \
890     }                                                    \
891   }
892 
893 #define STORE(ch) pattern[pattern_offset++] = (ch)
894 
895 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
896 
897 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
898 
899 #define PUSH_LEVEL_STARTS                           \
900   if (starts_base < (MAX_NESTING - 1) * NUM_LEVELS) \
901     starts_base += NUM_LEVELS;                      \
902   else                                              \
903     goto too_complex
904 
905 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
906 
907 #define PUT_ADDR(offset, addr)                  \
908   {                                             \
909     int disp = (addr) - (offset)-2;             \
910     pattern[(offset)] = disp & 0xff;            \
911     pattern[(offset) + 1] = (disp >> 8) & 0xff; \
912   }
913 
914 #define INSERT_JUMP(pos, type, addr)                                       \
915   {                                                                        \
916     int a, p = (pos), t = (type), ad = (addr);                             \
917     for (a = pattern_offset - 1; a >= p; a--) pattern[a + 3] = pattern[a]; \
918     pattern[p] = t;                                                        \
919     PUT_ADDR(p + 1, ad);                                                   \
920     pattern_offset += 3;                                                   \
921   }
922 
923 #define SETBIT(buf, offset, bit) (buf)[(offset) + (bit) / 8] |= (1 << ((bit)&7))
924 
925 #define SET_FIELDS               \
926   {                              \
927     bufp->allocated = alloc;     \
928     bufp->buffer = pattern;      \
929     bufp->used = pattern_offset; \
930   }
931 
932 #define GETHEX(var)                             \
933   {                                             \
934     unsigned char gethex_ch, gethex_value;      \
935     NEXTCHAR(gethex_ch);                        \
936     gethex_value = HexCharToDecimal(gethex_ch); \
937     if (gethex_value == 16) goto hex_error;     \
938     NEXTCHAR(gethex_ch);                        \
939     gethex_ch = HexCharToDecimal(gethex_ch);    \
940     if (gethex_ch == 16) goto hex_error;        \
941     (var) = gethex_value * 16 + gethex_ch;      \
942   }
943 
944 #define ANSI_TRANSLATE(ch)                                \
945   {                                                       \
946     switch (ch) {                                         \
947       case 'a':                                           \
948       case 'A': {                                         \
949         ch = 7; /* audible bell */                        \
950         break;                                            \
951       }                                                   \
952       case 'b':                                           \
953       case 'B': {                                         \
954         ch = 8; /* backspace */                           \
955         break;                                            \
956       }                                                   \
957       case 'f':                                           \
958       case 'F': {                                         \
959         ch = 12; /* form feed */                          \
960         break;                                            \
961       }                                                   \
962       case 'n':                                           \
963       case 'N': {                                         \
964         ch = 10; /* line feed */                          \
965         break;                                            \
966       }                                                   \
967       case 'r':                                           \
968       case 'R': {                                         \
969         ch = 13; /* carriage return */                    \
970         break;                                            \
971       }                                                   \
972       case 't':                                           \
973       case 'T': {                                         \
974         ch = 9; /* tab */                                 \
975         break;                                            \
976       }                                                   \
977       case 'v':                                           \
978       case 'V': {                                         \
979         ch = 11; /* vertical tab */                       \
980         break;                                            \
981       }                                                   \
982       case 'x': /* hex code */                            \
983       case 'X': {                                         \
984         GETHEX(ch);                                       \
985         break;                                            \
986       }                                                   \
987       default: {                                          \
988         /* other characters passed through */             \
989         if (translate) ch = translate[(unsigned char)ch]; \
990         break;                                            \
991       }                                                   \
992     }                                                     \
993   }
994 
re_compile_pattern(regex_t * bufp,unsigned char * regex)995 const char* re_compile_pattern(regex_t* bufp, unsigned char* regex)
996 {
997   int a;
998   int pos;
999   int op;
1000   int current_level;
1001   int level;
1002   int opcode;
1003   int pattern_offset = 0, alloc;
1004   int starts[NUM_LEVELS * MAX_NESTING];
1005   int starts_base;
1006   int future_jumps[MAX_NESTING];
1007   int num_jumps;
1008   unsigned char ch = '\0';
1009   unsigned char* pattern;
1010   unsigned char* translate;
1011   int next_register;
1012   int paren_depth;
1013   int num_open_registers;
1014   int open_registers[RE_NREGS];
1015   int beginning_context;
1016   int size = strlen((char*)regex);
1017 
1018   if (!re_compile_initialized) ReCompileInitialize();
1019   bufp->used = 0;
1020   bufp->fastmap_accurate = 0;
1021   bufp->uses_registers = 1;
1022   bufp->num_registers = 1;
1023   translate = bufp->translate;
1024   pattern = bufp->buffer;
1025   alloc = bufp->allocated;
1026   if (alloc == 0 || pattern == NULL) {
1027     alloc = 256;
1028     bufp->buffer = pattern = (unsigned char*)malloc(alloc);
1029     if (!pattern) goto out_of_memory;
1030   }
1031   pattern_offset = 0;
1032   starts_base = 0;
1033   num_jumps = 0;
1034   current_level = 0;
1035   SET_LEVEL_START;
1036   num_open_registers = 0;
1037   next_register = 1;
1038   paren_depth = 0;
1039   beginning_context = 1;
1040   op = -1;
1041   /* we use Rend dummy to ensure that pending jumps are updated
1042      (due to low priority of Rend) before exiting the loop. */
1043   pos = 0;
1044   while (op != Rend) {
1045     if (pos >= size)
1046       op = Rend;
1047     else {
1048       NEXTCHAR(ch);
1049       if (translate) ch = translate[(unsigned char)ch];
1050       op = plain_ops[(unsigned char)ch];
1051       if (op == Rquote) {
1052         NEXTCHAR(ch);
1053         op = quoted_ops[(unsigned char)ch];
1054         if (op == Rnormal && regexp_ansi_sequences) ANSI_TRANSLATE(ch);
1055       }
1056     }
1057     level = precedences[op];
1058     /* printf("ch='%c' op=%d level=%d current_level=%d
1059        curlevstart=%d\n", ch, op, level, current_level,
1060        CURRENT_LEVEL_START); */
1061     if (level > current_level) {
1062       for (current_level++; current_level < level; current_level++)
1063         SET_LEVEL_START;
1064       SET_LEVEL_START;
1065     } else if (level < current_level) {
1066       current_level = level;
1067       for (;
1068            num_jumps > 0 && future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START;
1069            num_jumps--)
1070         PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1071     }
1072     switch (op) {
1073       case Rend:
1074         break;
1075       case Rnormal:
1076       normal_char:
1077         opcode = Cexact;
1078       store_opcode_and_arg: /* opcode & ch must be set */
1079         SET_LEVEL_START;
1080         ALLOC(2);
1081         STORE(opcode);
1082         STORE(ch);
1083         break;
1084       case Ranychar:
1085         opcode = Canychar;
1086       store_opcode:
1087         SET_LEVEL_START;
1088         ALLOC(1);
1089         STORE(opcode);
1090         break;
1091       case Rquote:
1092         set_error("Rquote");
1093         /* NOTREACHED */
1094         break;
1095       case Rbol:
1096         if (!beginning_context) {
1097           if (regexp_context_indep_ops)
1098             goto op_error;
1099           else
1100             goto normal_char;
1101         }
1102         opcode = Cbol;
1103         goto store_opcode;
1104       case Reol:
1105         if (!((pos >= size)
1106               || ((regexp_syntax & RE_NO_BK_VBAR)
1107                       ? (regex[pos] == '\174')
1108                       : (pos + 1 < size && regex[pos] == '\134'
1109                          && regex[pos + 1] == '\174'))
1110               || ((regexp_syntax & RE_NO_BK_PARENS)
1111                       ? (regex[pos] == ')')
1112                       : (pos + 1 < size && regex[pos] == '\134'
1113                          && regex[pos + 1] == ')')))) {
1114           if (regexp_context_indep_ops)
1115             goto op_error;
1116           else
1117             goto normal_char;
1118         }
1119         opcode = Ceol;
1120         goto store_opcode;
1121         /* NOTREACHED */
1122         break;
1123       case Roptional:
1124         if (beginning_context) {
1125           if (regexp_context_indep_ops)
1126             goto op_error;
1127           else
1128             goto normal_char;
1129         }
1130         if (CURRENT_LEVEL_START == pattern_offset)
1131           break; /* ignore empty patterns for ? */
1132         ALLOC(3);
1133         INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1134         break;
1135       case Rstar:
1136       case Rplus:
1137         if (beginning_context) {
1138           if (regexp_context_indep_ops)
1139             goto op_error;
1140           else
1141             goto normal_char;
1142         }
1143         if (CURRENT_LEVEL_START == pattern_offset)
1144           break; /* ignore empty patterns for + and * */
1145         ALLOC(9);
1146         INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1147         INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1148         if (op == Rplus) /* jump over initial failure_jump */
1149           INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1150                       CURRENT_LEVEL_START + 6);
1151         break;
1152       case Ror:
1153         ALLOC(6);
1154         INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1155         if (num_jumps >= MAX_NESTING) goto too_complex;
1156         STORE(Cjump);
1157         future_jumps[num_jumps++] = pattern_offset;
1158         STORE(0);
1159         STORE(0);
1160         SET_LEVEL_START;
1161         break;
1162       case Ropenpar: {
1163         SET_LEVEL_START;
1164         if (next_register < RE_NREGS) {
1165           bufp->uses_registers = 1;
1166           ALLOC(2);
1167           STORE(Cstart_memory);
1168           STORE(next_register);
1169           open_registers[num_open_registers++] = next_register;
1170           bufp->num_registers++;
1171           next_register++;
1172         }
1173         paren_depth++;
1174         PUSH_LEVEL_STARTS;
1175         current_level = 0;
1176         SET_LEVEL_START;
1177         break;
1178       }
1179       case Rclosepar:
1180         if (paren_depth <= 0) goto parenthesIsError;
1181         POP_LEVEL_STARTS;
1182         current_level = precedences[Ropenpar];
1183         paren_depth--;
1184         if (paren_depth < num_open_registers) {
1185           bufp->uses_registers = 1;
1186           ALLOC(2);
1187           STORE(Cend_memory);
1188           num_open_registers--;
1189           STORE(open_registers[num_open_registers]);
1190         }
1191         break;
1192       case Rmemory:
1193         if (ch == '0') goto bad_match_register;
1194         if (!(ch >= '0' && ch <= '9')) { goto bad_match_register; }
1195         bufp->uses_registers = 1;
1196         opcode = Cmatch_memory;
1197         ch -= '0';
1198         goto store_opcode_and_arg;
1199       case Rextended_memory:
1200         NEXTCHAR(ch);
1201         if (ch < '0' || ch > '9') goto bad_match_register;
1202         NEXTCHAR(a);
1203         if (a < '0' || a > '9') goto bad_match_register;
1204         ch = 10 * (a - '0') + ch - '0';
1205         if (ch == 0 || ch >= RE_NREGS) goto bad_match_register;
1206         bufp->uses_registers = 1;
1207         opcode = Cmatch_memory;
1208         goto store_opcode_and_arg;
1209       case Ropenset: {
1210         int complement;
1211         int prev;
1212         int offset;
1213         int range;
1214         int firstchar;
1215 
1216         SET_LEVEL_START;
1217         ALLOC(1 + 256 / 8);
1218         STORE(Cset);
1219         offset = pattern_offset;
1220         for (a = 0; a < 256 / 8; a++) STORE(0);
1221         NEXTCHAR(ch);
1222         if (translate) ch = translate[(unsigned char)ch];
1223         if (ch == '\136') {
1224           complement = 1;
1225           NEXTCHAR(ch);
1226           if (translate) ch = translate[(unsigned char)ch];
1227         } else
1228           complement = 0;
1229         prev = -1;
1230         range = 0;
1231         firstchar = 1;
1232         while (ch != '\135' || firstchar) {
1233           firstchar = 0;
1234           if (regexp_ansi_sequences && ch == '\134') {
1235             NEXTCHAR(ch);
1236             ANSI_TRANSLATE(ch);
1237           }
1238           if (range) {
1239             for (a = prev; a <= (int)ch; a++) SETBIT(pattern, offset, a);
1240             prev = -1;
1241             range = 0;
1242           } else if (prev != -1 && ch == '-')
1243             range = 1;
1244           else {
1245             SETBIT(pattern, offset, ch);
1246             prev = ch;
1247           }
1248           NEXTCHAR(ch);
1249           if (translate) ch = translate[(unsigned char)ch];
1250         }
1251         if (range) SETBIT(pattern, offset, '-');
1252         if (complement) {
1253           for (a = 0; a < 256 / 8; a++) pattern[offset + a] ^= 0xff;
1254         }
1255         break;
1256       }
1257       case Rbegbuf: {
1258         opcode = Cbegbuf;
1259         goto store_opcode;
1260       }
1261       case Rendbuf: {
1262         opcode = Cendbuf;
1263         goto store_opcode;
1264       }
1265       case Rwordchar: {
1266         opcode = Csyntaxspec;
1267         ch = Sword;
1268         goto store_opcode_and_arg;
1269       }
1270       case Rnotwordchar: {
1271         opcode = Cnotsyntaxspec;
1272         ch = Sword;
1273         goto store_opcode_and_arg;
1274       }
1275       case Rwordbeg: {
1276         opcode = Cwordbeg;
1277         goto store_opcode;
1278       }
1279       case Rwordend: {
1280         opcode = Cwordend;
1281         goto store_opcode;
1282       }
1283       case Rwordbound: {
1284         opcode = Cwordbound;
1285         goto store_opcode;
1286       }
1287       case Rnotwordbound: {
1288         opcode = Cnotwordbound;
1289         goto store_opcode;
1290       }
1291       default: {
1292         abort();
1293       }
1294     }
1295     beginning_context = (op == Ropenpar || op == Ror);
1296   }
1297   if (starts_base != 0) goto parenthesIsError;
1298   // assert(num_jumps == 0);
1299   ALLOC(1);
1300   STORE(Cend);
1301   SET_FIELDS;
1302   if (!ReOptimize(bufp)) return "Optimization error";
1303   return NULL;
1304 
1305 op_error:
1306   SET_FIELDS;
1307   return "Badly placed special character";
1308 
1309 bad_match_register:
1310   SET_FIELDS;
1311   return "Bad match register number";
1312 
1313 hex_error:
1314   SET_FIELDS;
1315   return "Bad hexadecimal number";
1316 
1317 parenthesIsError:
1318   SET_FIELDS;
1319   return "Badly placed parenthesis";
1320 
1321 out_of_memory:
1322   SET_FIELDS;
1323   return "Out of memory";
1324 
1325 ends_prematurely:
1326   SET_FIELDS;
1327   return "Regular expression ends prematurely";
1328 
1329 too_complex:
1330   SET_FIELDS;
1331   return "Regular expression too complex";
1332 }
1333 
1334 #undef CHARAT
1335 #undef NEXTCHAR
1336 #undef GETHEX
1337 #undef ALLOC
1338 #undef STORE
1339 #undef CURRENT_LEVEL_START
1340 #undef SET_LEVEL_START
1341 #undef PUSH_LEVEL_STARTS
1342 #undef POP_LEVEL_STARTS
1343 #undef PUT_ADDR
1344 #undef INSERT_JUMP
1345 #undef SETBIT
1346 #undef SET_FIELDS
1347 
1348 #define PREFETCH \
1349   if (text == textend) goto fail
1350 
1351 #define NEXTCHAR(var)           \
1352   PREFETCH;                     \
1353   var = (unsigned char)*text++; \
1354   if (translate) var = translate[var]
1355 
regcomp(regex_t * bufp,const char * regex,int cflags)1356 int regcomp(regex_t* bufp, const char* regex, int cflags)
1357 {
1358   memset(bufp, 0, sizeof(regex_t));
1359   bufp->cflags = cflags;
1360   if (bufp->cflags & REG_ICASE) {
1361     char *p, *lcase = strdup(regex);
1362     for (p = lcase; *p; p++) { *p = tolower(*p); }
1363     re_compile_pattern(bufp, (unsigned char*)lcase);
1364     free(lcase);
1365   } else {
1366     re_compile_pattern(bufp, (unsigned char*)regex);
1367   }
1368   if (got_error) { return -1; }
1369   return 0;
1370 }
1371 
re_registers_to_regmatch(regexp_registers_t old_regs,regmatch_t pmatch[],size_t nmatch)1372 void re_registers_to_regmatch(regexp_registers_t old_regs,
1373                               regmatch_t pmatch[],
1374                               size_t nmatch)
1375 {
1376   if (!(nmatch == 0 && pmatch == NULL)) {
1377     size_t i = 0;
1378 
1379     /*
1380      * We have to set the last entry to -1
1381      */
1382     nmatch = nmatch - 1;
1383     for (i = 0; (i < nmatch) && (old_regs->start[i] > -1); i++) {
1384       pmatch[i].rm_so = old_regs->start[i];
1385       pmatch[i].rm_eo = old_regs->end[i];
1386     }
1387 
1388     pmatch[i].rm_eo = pmatch[i].rm_so = -1;
1389   }
1390 }
1391 
regexec(regex_t * preg,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)1392 int regexec(regex_t* preg,
1393             const char* string,
1394             size_t nmatch,
1395             regmatch_t pmatch[],
1396             int eflags)
1397 {
1398   int status;
1399   int len = strlen(string);
1400   re_registers regs{};
1401 
1402   status = ReSearch(preg, (unsigned char*)string, len, 0, len, &regs);
1403   if (status >= 0) { re_registers_to_regmatch(&regs, pmatch, nmatch); }
1404 
1405   /*
1406    * status is the start position in the string base 0 where
1407    * the pattern was found or negative if not found.
1408    */
1409   return status < 0 ? -1 : 0;
1410 }
1411 
regerror(int errcode,regex_t * preg,char * errbuf,size_t errbuf_size)1412 size_t regerror(int errcode, regex_t* preg, char* errbuf, size_t errbuf_size)
1413 {
1414   bstrncpy(errbuf, preg->errmsg, errbuf_size);
1415   return 0;
1416 }
1417 
regfree(regex_t * preg)1418 void regfree(regex_t* preg)
1419 {
1420   if (preg->lcase) {
1421     FreePoolMemory(preg->lcase);
1422     preg->lcase = NULL;
1423   }
1424   if (preg->buffer) {
1425     free(preg->buffer);
1426     preg->buffer = NULL;
1427   }
1428 }
1429 
ReMatch(regex_t * bufp,unsigned char * string,int size,int pos,regexp_registers_t old_regs)1430 int ReMatch(regex_t* bufp,
1431             unsigned char* string,
1432             int size,
1433             int pos,
1434             regexp_registers_t old_regs)
1435 {
1436   unsigned char* code;
1437   unsigned char* translate;
1438   unsigned char* text;
1439   unsigned char* textstart;
1440   unsigned char* textend;
1441   int a;
1442   int b;
1443   int ch;
1444   int reg;
1445   int match_end;
1446   unsigned char* regstart;
1447   unsigned char* regend;
1448   int regsize;
1449   match_state state;
1450 
1451   // assert(pos >= 0 && size >= 0);
1452   // assert(pos <= size);
1453 
1454   text = string + pos;
1455   textstart = string;
1456   textend = string + size;
1457 
1458   code = bufp->buffer;
1459 
1460   translate = bufp->translate;
1461 
1462   NEW_STATE(state, bufp->num_registers);
1463 
1464 continue_matching:
1465   switch (*code++) {
1466     case Cend: {
1467       match_end = text - textstart;
1468       if (old_regs) {
1469         old_regs->start[0] = pos;
1470         old_regs->end[0] = match_end;
1471         if (!bufp->uses_registers) {
1472           for (a = 1; a < RE_NREGS; a++) {
1473             old_regs->start[a] = -1;
1474             old_regs->end[a] = -1;
1475           }
1476         } else {
1477           for (a = 1; a < bufp->num_registers; a++) {
1478             if ((GET_REG_START(state, a) == NULL)
1479                 || (GET_REG_END(state, a) == NULL)) {
1480               old_regs->start[a] = -1;
1481               old_regs->end[a] = -1;
1482               continue;
1483             }
1484             old_regs->start[a] = GET_REG_START(state, a) - textstart;
1485             old_regs->end[a] = GET_REG_END(state, a) - textstart;
1486           }
1487           for (; a < RE_NREGS; a++) {
1488             old_regs->start[a] = -1;
1489             old_regs->end[a] = -1;
1490           }
1491         }
1492       }
1493       FREE_STATE(state);
1494       return match_end - pos;
1495     }
1496     case Cbol: {
1497       if (text == textstart || text[-1] == '\n') goto continue_matching;
1498       goto fail;
1499     }
1500     case Ceol: {
1501       if (text == textend || *text == '\n') goto continue_matching;
1502       goto fail;
1503     }
1504     case Cset: {
1505       NEXTCHAR(ch);
1506       if (code[ch / 8] & (1 << (ch & 7))) {
1507         code += 256 / 8;
1508         goto continue_matching;
1509       }
1510       goto fail;
1511     }
1512     case Cexact: {
1513       NEXTCHAR(ch);
1514       if (ch != (unsigned char)*code++) goto fail;
1515       goto continue_matching;
1516     }
1517     case Canychar: {
1518       NEXTCHAR(ch);
1519       if (ch == '\n') goto fail;
1520       goto continue_matching;
1521     }
1522     case Cstart_memory: {
1523       reg = *code++;
1524       SET_REG_START(state, reg, text, goto error);
1525       goto continue_matching;
1526     }
1527     case Cend_memory: {
1528       reg = *code++;
1529       SET_REG_END(state, reg, text, goto error);
1530       goto continue_matching;
1531     }
1532     case Cmatch_memory: {
1533       reg = *code++;
1534       regstart = GET_REG_START(state, reg);
1535       regend = GET_REG_END(state, reg);
1536       if ((regstart == NULL) || (regend == NULL))
1537         goto fail; /* or should we just match nothing? */
1538       regsize = regend - regstart;
1539 
1540       if (regsize > (textend - text)) goto fail;
1541       if (translate) {
1542         for (; regstart < regend; regstart++, text++)
1543           if (translate[*regstart] != translate[*text]) goto fail;
1544       } else
1545         for (; regstart < regend; regstart++, text++)
1546           if (*regstart != *text) goto fail;
1547       goto continue_matching;
1548     }
1549     case Cupdate_failure_jump: {
1550       UPDATE_FAILURE(state, text, goto error);
1551       /* fall to next case */
1552     }
1553       /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1554     case Cstar_jump:
1555     case Cjump: {
1556       a = (unsigned char)*code++;
1557       a |= (unsigned char)*code++ << 8;
1558       code += (int)SHORT(a);
1559       if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1560         set_error("Regex VM jump out of bounds (Cjump)");
1561         FREE_STATE(state);
1562         return -2;
1563       }
1564       goto continue_matching;
1565     }
1566     case Cdummy_failure_jump: {
1567       unsigned char* failuredest;
1568 
1569       a = (unsigned char)*code++;
1570       a |= (unsigned char)*code++ << 8;
1571       a = (int)SHORT(a);
1572       //       assert(*code == Cfailure_jump);
1573       b = (unsigned char)code[1];
1574       b |= (unsigned char)code[2] << 8;
1575       failuredest = code + (int)SHORT(b) + 3;
1576       if (failuredest < bufp->buffer
1577           || bufp->buffer + bufp->used < failuredest) {
1578         set_error(
1579             "Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1580         FREE_STATE(state);
1581         return -2;
1582       }
1583       PUSH_FAILURE(state, failuredest, NULL, goto error);
1584       code += a;
1585       if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1586         set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1587         FREE_STATE(state);
1588         return -2;
1589       }
1590       goto continue_matching;
1591     }
1592     case Cfailure_jump: {
1593       a = (unsigned char)*code++;
1594       a |= (unsigned char)*code++ << 8;
1595       a = (int)SHORT(a);
1596       if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1597         set_error("Regex VM jump out of bounds (Cfailure_jump)");
1598         FREE_STATE(state);
1599         return -2;
1600       }
1601       PUSH_FAILURE(state, code + a, text, goto error);
1602       goto continue_matching;
1603     }
1604     case Crepeat1: {
1605       unsigned char* pinst;
1606       a = (unsigned char)*code++;
1607       a |= (unsigned char)*code++ << 8;
1608       a = (int)SHORT(a);
1609       pinst = code + a;
1610       if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1611         set_error("Regex VM jump out of bounds (Crepeat1)");
1612         FREE_STATE(state);
1613         return -2;
1614       }
1615       /* pinst is sole instruction in loop, and it matches a
1616        * single character.  Since Crepeat1 was originally a
1617        * Cupdate_failure_jump, we also know that backtracking
1618        * is useless: so long as the single-character
1619        * expression matches, it must be used.  Also, in the
1620        * case of +, we've already matched one character, so +
1621        * can't fail: nothing here can cause a failure.  */
1622       switch (*pinst++) {
1623         case Cset: {
1624           if (translate) {
1625             while (text < textend) {
1626               ch = translate[(unsigned char)*text];
1627               if (pinst[ch / 8] & (1 << (ch & 7)))
1628                 text++;
1629               else
1630                 break;
1631             }
1632           } else {
1633             while (text < textend) {
1634               ch = (unsigned char)*text;
1635               if (pinst[ch / 8] & (1 << (ch & 7)))
1636                 text++;
1637               else
1638                 break;
1639             }
1640           }
1641           break;
1642         }
1643         case Cexact: {
1644           ch = (unsigned char)*pinst;
1645           if (translate) {
1646             while (text < textend && translate[(unsigned char)*text] == ch)
1647               text++;
1648           } else {
1649             while (text < textend && (unsigned char)*text == ch) text++;
1650           }
1651           break;
1652         }
1653         case Canychar: {
1654           while (text < textend && (unsigned char)*text != '\n') text++;
1655           break;
1656         }
1657         case Csyntaxspec: {
1658           a = (unsigned char)*pinst;
1659           if (translate) {
1660             while (text < textend && (SYNTAX(translate[*text]) & a)) text++;
1661           } else {
1662             while (text < textend && (SYNTAX(*text) & a)) text++;
1663           }
1664           break;
1665         }
1666         case Cnotsyntaxspec: {
1667           a = (unsigned char)*pinst;
1668           if (translate) {
1669             while (text < textend && !(SYNTAX(translate[*text]) & a)) text++;
1670           } else {
1671             while (text < textend && !(SYNTAX(*text) & a)) text++;
1672           }
1673           break;
1674         }
1675         default: {
1676           FREE_STATE(state);
1677           set_error("Unknown regex opcode: memory corrupted?");
1678           return -2;
1679              /*NOTREACHED*/}
1680       }
1681       /* due to the funky way + and * are compiled, the top
1682        * failure- stack entry at this point is actually a
1683        * success entry -- update it & pop it */
1684       UPDATE_FAILURE(state, text, goto error);
1685       goto fail; /* i.e., succeed <wink/sigh> */
1686     }
1687     case Cbegbuf: {
1688       if (text == textstart) goto continue_matching;
1689       goto fail;
1690     }
1691     case Cendbuf: {
1692       if (text == textend) goto continue_matching;
1693       goto fail;
1694     }
1695     case Cwordbeg: {
1696       if (text == textend) goto fail;
1697       if (!(SYNTAX(*text) & Sword)) goto fail;
1698       if (text == textstart) goto continue_matching;
1699       if (!(SYNTAX(text[-1]) & Sword)) goto continue_matching;
1700       goto fail;
1701     }
1702     case Cwordend: {
1703       if (text == textstart) goto fail;
1704       if (!(SYNTAX(text[-1]) & Sword)) goto fail;
1705       if (text == textend) goto continue_matching;
1706       if (!(SYNTAX(*text) & Sword)) goto continue_matching;
1707       goto fail;
1708     }
1709     case Cwordbound: {
1710       /* Note: as in gnu regexp, this also matches at the
1711        * beginning and end of buffer.  */
1712 
1713       if (text == textstart || text == textend) goto continue_matching;
1714       if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1715         goto continue_matching;
1716       goto fail;
1717     }
1718     case Cnotwordbound: {
1719       /* Note: as in gnu regexp, this never matches at the
1720        * beginning and end of buffer.  */
1721       if (text == textstart || text == textend) goto fail;
1722       if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1723         goto continue_matching;
1724       goto fail;
1725     }
1726     case Csyntaxspec: {
1727       NEXTCHAR(ch);
1728       if (!(SYNTAX(ch) & (unsigned char)*code++)) goto fail;
1729       goto continue_matching;
1730     }
1731     case Cnotsyntaxspec: {
1732       NEXTCHAR(ch);
1733       if (SYNTAX(ch) & (unsigned char)*code++) goto fail;
1734       goto continue_matching;
1735     }
1736     default: {
1737       FREE_STATE(state);
1738       set_error("Unknown regex opcode: memory corrupted?");
1739       return -2;
1740        /*NOTREACHED*/}
1741   }
1742 
1743   /*
1744    *NOTREACHED
1745    */
1746 
1747   /* Using "break;" in the above switch statement is equivalent to "goto fail;"
1748    */
1749 fail:
1750   POP_FAILURE(state, code, text, goto done_matching, goto error);
1751   goto continue_matching;
1752 
1753 done_matching:
1754   /*   if(translated != NULL) */
1755   /*      free(translated); */
1756   FREE_STATE(state);
1757   return -1;
1758 
1759 error:
1760   /*   if (translated != NULL) */
1761   /*      free(translated); */
1762   FREE_STATE(state);
1763   return -2;
1764 }
1765 
1766 
1767 #undef PREFETCH
1768 #undef NEXTCHAR
1769 
ReSearch(regex_t * bufp,unsigned char * str,int size,int pos,int range,regexp_registers_t regs)1770 int ReSearch(regex_t* bufp,
1771              unsigned char* str,
1772              int size,
1773              int pos,
1774              int range,
1775              regexp_registers_t regs)
1776 {
1777   unsigned char* fastmap;
1778   unsigned char* translate;
1779   unsigned char* text;
1780   unsigned char* partstart;
1781   unsigned char* partend;
1782   int dir;
1783   int ret;
1784   unsigned char anchor;
1785   unsigned char* string = str;
1786 
1787   if (bufp->cflags & REG_ICASE) { /* we must use string in lowercase */
1788     int len = strlen((const char*)str);
1789     if (!bufp->lcase) { bufp->lcase = GetPoolMemory(PM_FNAME); }
1790     bufp->lcase = CheckPoolMemorySize(bufp->lcase, len + 1);
1791     unsigned char* dst = (unsigned char*)bufp->lcase;
1792     while (*string) { *dst++ = tolower(*string++); }
1793     *dst = '\0';
1794     string = (unsigned char*)bufp->lcase;
1795   }
1796 
1797   // assert(size >= 0 && pos >= 0);
1798   // assert(pos + range >= 0 && pos + range <= size);     /* Bugfix by ylo */
1799 
1800   fastmap = bufp->fastmap;
1801   translate = bufp->translate;
1802   if (fastmap && !bufp->fastmap_accurate) {
1803     ReCompileFastmap(bufp);
1804     if (got_error) return -2;
1805   }
1806 
1807   anchor = bufp->anchor;
1808   if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
1809     fastmap = NULL;
1810 
1811   if (range < 0) {
1812     dir = -1;
1813     range = -range;
1814   } else
1815     dir = 1;
1816 
1817   if (anchor == 2) {
1818     if (pos != 0)
1819       return -1;
1820     else
1821       range = 0;
1822   }
1823 
1824   for (; range >= 0; range--, pos += dir) {
1825     if (fastmap) {
1826       if (dir == 1) { /* searching forwards */
1827 
1828         text = string + pos;
1829         partend = string + size;
1830         partstart = text;
1831         if (translate)
1832           while (text != partend
1833                  && !fastmap[(unsigned char)translate[(unsigned char)*text]])
1834             text++;
1835         else
1836           while (text != partend && !fastmap[(unsigned char)*text]) text++;
1837         pos += text - partstart;
1838         range -= text - partstart;
1839         if (pos == size && bufp->can_be_null == 0) return -1;
1840       } else { /* searching backwards */
1841         text = string + pos;
1842         partstart = string + pos - range;
1843         partend = text;
1844         if (translate)
1845           while (text != partstart
1846                  && !fastmap[(unsigned char)translate[(unsigned char)*text]])
1847             text--;
1848         else
1849           while (text != partstart && !fastmap[(unsigned char)*text]) text--;
1850         pos -= partend - text;
1851         range -= partend - text;
1852       }
1853     }
1854     if (anchor == 1) { /* anchored to begline */
1855       if (pos > 0 && (string[pos - 1] != '\n')) continue;
1856     }
1857     //    assert(pos >= 0 && pos <= size);
1858     ret = ReMatch(bufp, string, size, pos, regs);
1859     if (ret >= 0) return pos;
1860     if (ret == -2) return -2;
1861   }
1862   return -1;
1863 }
1864 
1865 /*
1866 ** Local Variables:
1867 ** mode: c
1868 ** c-file-style: "python"
1869 ** End:
1870 */
1871