1 /*
2    Bacula(R) - The Network Backup Solution
3 
4    Copyright (C) 2000-2020 Kern Sibbald
5 
6    The original author of Bacula is Kern Sibbald, with contributions
7    from many others, a complete list can be found in the file AUTHORS.
8 
9    You may use this file and others of this release according to the
10    license defined in the LICENSE file, which includes the Affero General
11    Public License, v3.0 ("AGPLv3") and some additional permissions and
12    terms pursuant to its AGPLv3 Section 7.
13 
14    This notice must be preserved when any source code is
15    conveyed and/or propagated.
16 
17    Bacula(R) is a registered trademark of Kern Sibbald.
18 */
19 
20 /* regexpr.c
21  *
22  * Author: Tatu Ylonen <ylo@ngs.fi>
23  *
24  * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
25  *
26  * Permission to use, copy, modify, distribute, and sell this software
27  * and its documentation for any purpose is hereby granted without
28  * fee, provided that the above copyright notice appear in all copies.
29  * This software is provided "as is" without express or implied
30  * warranty.
31  *
32  * Created: Thu Sep 26 17:14:05 1991 ylo
33  * Last modified: Mon Nov  4 17:06:48 1991 ylo
34  * Ported to Think C: 19 Jan 1992 guido@cwi.nl
35  *
36  * This code draws many ideas from the regular expression packages by
37  * Henry Spencer of the University of Toronto and Richard Stallman of
38  * the Free Software Foundation.
39  *
40  * Emacs-specific code and syntax table code is almost directly borrowed
41  * from GNU regexp.
42  *
43  * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
44  * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
45  * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
46  * probably one or two others that I'm forgetting.
47  *
48  * This file modified to work with Bacula and C++ by
49  *    Kern Sibbald, April 2006
50  *
51  * This file modified to work with REG_ICASE and Bacula by
52  *    Eric Bollengier April 2007
53  */
54 
55 #include "bacula.h"
56 #include "bregex.h"
57 
58 #define set_error(x) bufp->errmsg=((char *)(x))
59 #define got_error bufp->errmsg!=NULL
60 
61 /* The original code blithely assumed that sizeof(short) == 2.  Not
62  * always true.  Original instances of "(short)x" were replaced by
63  * SHORT(x), where SHORT is #defined below.  */
64 
65 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
66 
67 /* The stack implementation is taken from an idea by Andrew Kuchling.
68  * It's a doubly linked list of arrays. The advantages of this over a
69  * simple linked list are that the number of mallocs required are
70  * reduced. It also makes it possible to statically allocate enough
71  * space so that small patterns don't ever need to call malloc.
72  *
73  * The advantages over a single array is that is periodically
74  * realloced when more space is needed is that we avoid ever copying
75  * the stack. */
76 
77 /* item_t is the basic stack element.  Defined as a union of
78  * structures so that both registers, failure points, and counters can
79  * be pushed/popped from the stack.  There's nothing built into the
80  * item to keep track of whether a certain stack item is a register, a
81  * failure point, or a counter. */
82 
83 typedef union item_t {
84    struct {
85       int num;
86       int level;
87       unsigned char *start;
88       unsigned char *end;
89    } reg;
90    struct {
91       int count;
92       int level;
93       int phantom;
94       unsigned char *code;
95       unsigned char *text;
96    } fail;
97    struct {
98       int num;
99       int level;
100       int count;
101    } cntr;
102 } item_t;
103 
104 #define B_STACK_PAGE_SIZE 256
105 #define NUM_REGISTERS 256
106 
107 /* A 'page' of stack items. */
108 
109 typedef struct item_page_t {
110    item_t items[B_STACK_PAGE_SIZE];
111    struct item_page_t *prev;
112    struct item_page_t *next;
113 } item_page_t;
114 
115 
116 typedef struct match_state {
117    /* The number of registers that have been pushed onto the stack
118     * since the last failure point. */
119 
120    int count;
121 
122    /* Used to control when registers need to be pushed onto the
123     * stack. */
124 
125    int level;
126 
127    /* The number of failure points on the stack. */
128 
129    int point;
130 
131    /* Storage for the registers.  Each register consists of two
132     * pointers to characters.  So register N is represented as
133     * start[N] and end[N].  The pointers must be converted to
134     * offsets from the beginning of the string before returning the
135     * registers to the calling program. */
136 
137    unsigned char *start[NUM_REGISTERS];
138    unsigned char *end[NUM_REGISTERS];
139 
140    /* Keeps track of whether a register has changed recently. */
141 
142    int changed[NUM_REGISTERS];
143 
144    /* Structure to encapsulate the stack. */
145    struct {
146       /* index into the current page.  If index == 0 and you need
147        * to pop an item, move to the previous page and set index
148        * = B_STACK_PAGE_SIZE - 1.  Otherwise decrement index to
149        * push a page. If index == B_STACK_PAGE_SIZE and you need
150        * to push a page move to the next page and set index =
151        * 0. If there is no new next page, allocate a new page
152        * and link it in. Otherwise, increment index to push a
153        * page. */
154 
155       int index;
156       item_page_t *current;        /* Pointer to the current page. */
157       item_page_t first;           /* First page is statically allocated. */
158    } stack;
159 } match_state;
160 
161 /* Initialize a state object */
162 
163 /* #define NEW_STATE(state) \ */
164 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
165 /* state.stack.current = &state.stack.first; \ */
166 /* state.stack.first.prev = NULL; \ */
167 /* state.stack.first.next = NULL; \ */
168 /* state.stack.index = 0; \ */
169 /* state.level = 1 */
170 
171 #define NEW_STATE(state, nregs) \
172 { \
173    int i; \
174    for (i = 0; i < nregs; i++) \
175    { \
176       state.start[i] = NULL; \
177       state.end[i] = NULL; \
178       state.changed[i] = 0; \
179    } \
180    state.stack.current = &state.stack.first; \
181    state.stack.first.prev = NULL; \
182    state.stack.first.next = NULL; \
183    state.stack.index = 0; \
184    state.level = 1; \
185    state.count = 0; \
186    state.level = 0; \
187    state.point = 0; \
188 }
189 
190 /* Free any memory that might have been malloc'd */
191 
192 #define FREE_STATE(state) \
193 while(state.stack.first.next != NULL) \
194 { \
195    state.stack.current = state.stack.first.next; \
196    state.stack.first.next = state.stack.current->next; \
197    free(state.stack.current); \
198 }
199 
200 /* Discard the top 'count' stack items. */
201 
202 #define STACK_DISCARD(stack, count, on_error) \
203 stack.index -= count; \
204 while (stack.index < 0) \
205 { \
206    if (stack.current->prev == NULL) \
207            on_error; \
208    stack.current = stack.current->prev; \
209    stack.index += B_STACK_PAGE_SIZE; \
210 }
211 
212 /* Store a pointer to the previous item on the stack. Used to pop an
213  * item off of the stack. */
214 
215 #define STACK_PREV(stack, top, on_error) \
216 if (stack.index == 0) \
217 { \
218         if (stack.current->prev == NULL) \
219                 on_error; \
220         stack.current = stack.current->prev; \
221         stack.index = B_STACK_PAGE_SIZE - 1; \
222 } \
223 else \
224 { \
225         stack.index--; \
226 } \
227 top = &(stack.current->items[stack.index])
228 
229 /* Store a pointer to the next item on the stack. Used to push an item
230  * on to the stack. */
231 
232 #define STACK_NEXT(stack, top, on_error) \
233 if (stack.index == B_STACK_PAGE_SIZE) \
234 { \
235         if (stack.current->next == NULL) \
236         { \
237                 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
238                 if (stack.current->next == NULL) \
239                         on_error; \
240                 stack.current->next->prev = stack.current; \
241                 stack.current->next->next = NULL; \
242         } \
243         stack.current = stack.current->next; \
244         stack.index = 0; \
245 } \
246 top = &(stack.current->items[stack.index++])
247 
248 /* Store a pointer to the item that is 'count' items back in the
249  * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
250  * STACK_TOP(stack, top, on_error).  */
251 
252 #define STACK_BACK(stack, top, count, on_error) \
253 { \
254         int index; \
255         item_page_t *current; \
256         current = stack.current; \
257         index = stack.index - (count); \
258         while (index < 0) \
259         { \
260                 if (current->prev == NULL) \
261                         on_error; \
262                 current = current->prev; \
263                 index += B_STACK_PAGE_SIZE; \
264         } \
265         top = &(current->items[index]); \
266 }
267 
268 /* Store a pointer to the top item on the stack. Execute the
269  * 'on_error' code if there are no items on the stack. */
270 
271 #define STACK_TOP(stack, top, on_error) \
272 if (stack.index == 0) \
273 { \
274         if (stack.current->prev == NULL) \
275                 on_error; \
276         top = &(stack.current->prev->items[B_STACK_PAGE_SIZE - 1]); \
277 } \
278 else \
279 { \
280         top = &(stack.current->items[stack.index - 1]); \
281 }
282 
283 /* Test to see if the stack is empty */
284 
285 #define STACK_EMPTY(stack) ((stack.index == 0) && \
286                             (stack.current->prev == NULL))
287 
288 /* Return the start of register 'reg' */
289 
290 #define GET_REG_START(state, reg) (state.start[reg])
291 
292 /* Return the end of register 'reg' */
293 
294 #define GET_REG_END(state, reg) (state.end[reg])
295 
296 /* Set the start of register 'reg'. If the state of the register needs
297  * saving, push it on the stack. */
298 
299 #define SET_REG_START(state, reg, text, on_error) \
300 if(state.changed[reg] < state.level) \
301 { \
302         item_t *item; \
303         STACK_NEXT(state.stack, item, on_error); \
304         item->reg.num = reg; \
305         item->reg.start = state.start[reg]; \
306         item->reg.end = state.end[reg]; \
307         item->reg.level = state.changed[reg]; \
308         state.changed[reg] = state.level; \
309         state.count++; \
310 } \
311 state.start[reg] = text
312 
313 /* Set the end of register 'reg'. If the state of the register needs
314  * saving, push it on the stack. */
315 
316 #define SET_REG_END(state, reg, text, on_error) \
317 if(state.changed[reg] < state.level) \
318 { \
319         item_t *item; \
320         STACK_NEXT(state.stack, item, on_error); \
321         item->reg.num = reg; \
322         item->reg.start = state.start[reg]; \
323         item->reg.end = state.end[reg]; \
324         item->reg.level = state.changed[reg]; \
325         state.changed[reg] = state.level; \
326         state.count++; \
327 } \
328 state.end[reg] = text
329 
330 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
331 { \
332         item_t *item; \
333         STACK_NEXT(state.stack, item, on_error); \
334         item->fail.code = xcode; \
335         item->fail.text = xtext; \
336         item->fail.count = state.count; \
337         item->fail.level = state.level; \
338         item->fail.phantom = 0; \
339         state.count = 0; \
340         state.level++; \
341         state.point++; \
342 }
343 
344 /* Update the last failure point with a new position in the text. */
345 
346 #define UPDATE_FAILURE(state, xtext, on_error) \
347 { \
348    item_t *item; \
349    STACK_BACK(state.stack, item, state.count + 1, on_error); \
350    if (!item->fail.phantom) \
351    { \
352            item_t *item2; \
353            STACK_NEXT(state.stack, item2, on_error); \
354            item2->fail.code = item->fail.code; \
355            item2->fail.text = xtext; \
356            item2->fail.count = state.count; \
357            item2->fail.level = state.level; \
358            item2->fail.phantom = 1; \
359            state.count = 0; \
360            state.level++; \
361            state.point++; \
362    } \
363    else \
364    { \
365            STACK_DISCARD(state.stack, state.count, on_error); \
366            STACK_TOP(state.stack, item, on_error); \
367            item->fail.text = xtext; \
368            state.count = 0; \
369            state.level++; \
370    } \
371 }
372 
373 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
374 { \
375    item_t *item; \
376    do \
377    { \
378            while(state.count > 0) \
379            { \
380                    STACK_PREV(state.stack, item, on_error); \
381                    state.start[item->reg.num] = item->reg.start; \
382                    state.end[item->reg.num] = item->reg.end; \
383                    state.changed[item->reg.num] = item->reg.level; \
384                    state.count--; \
385            } \
386            STACK_PREV(state.stack, item, on_empty); \
387            xcode = item->fail.code; \
388            xtext = item->fail.text; \
389            state.count = item->fail.count; \
390            state.level = item->fail.level; \
391            state.point--; \
392    } \
393    while (item->fail.text == NULL); \
394 }
395 
396 enum regexp_compiled_ops {         /* opcodes for compiled regexp */
397    Cend,                           /* end of pattern reached */
398    Cbol,                           /* beginning of line */
399    Ceol,                           /* end of line */
400    Cset,                           /* character set.  Followed by 32 bytes of set. */
401    Cexact,                         /* followed by a byte to match */
402    Canychar,                       /* matches any character except newline */
403    Cstart_memory,                  /* set register start addr (followed by reg number) */
404    Cend_memory,                    /* set register end addr (followed by reg number) */
405    Cmatch_memory,                  /* match a duplicate of reg contents (regnum follows) */
406    Cjump,                          /* followed by two bytes (lsb,msb) of displacement. */
407    Cstar_jump,                     /* will change to jump/update_failure_jump at runtime */
408    Cfailure_jump,                  /* jump to addr on failure */
409    Cupdate_failure_jump,           /* update topmost failure point and jump */
410    Cdummy_failure_jump,            /* push a dummy failure point and jump */
411    Cbegbuf,                        /* match at beginning of buffer */
412    Cendbuf,                        /* match at end of buffer */
413    Cwordbeg,                       /* match at beginning of word */
414    Cwordend,                       /* match at end of word */
415    Cwordbound,                     /* match if at word boundary */
416    Cnotwordbound,                  /* match if not at word boundary */
417    Csyntaxspec,                    /* matches syntax code (1 byte follows) */
418    Cnotsyntaxspec,                 /* matches if syntax code does not match (1 byte follows) */
419    Crepeat1
420 };
421 
422 enum regexp_syntax_op {            /* syntax codes for plain and quoted characters */
423    Rend,                           /* special code for end of regexp */
424    Rnormal,                        /* normal character */
425    Ranychar,                       /* any character except newline */
426    Rquote,                         /* the quote character */
427    Rbol,                           /* match beginning of line */
428    Reol,                           /* match end of line */
429    Roptional,                      /* match preceding expression optionally */
430    Rstar,                          /* match preceding expr zero or more times */
431    Rplus,                          /* match preceding expr one or more times */
432    Ror,                            /* match either of alternatives */
433    Ropenpar,                       /* opening parenthesis */
434    Rclosepar,                      /* closing parenthesis */
435    Rmemory,                        /* match memory register */
436    Rextended_memory,               /* \vnn to match registers 10-99 */
437    Ropenset,                       /* open set.  Internal syntax hard-coded below. */
438    /* the following are gnu extensions to "normal" regexp syntax */
439    Rbegbuf,                        /* beginning of buffer */
440    Rendbuf,                        /* end of buffer */
441    Rwordchar,                      /* word character */
442    Rnotwordchar,                   /* not word character */
443    Rwordbeg,                       /* beginning of word */
444    Rwordend,                       /* end of word */
445    Rwordbound,                     /* word bound */
446    Rnotwordbound,                  /* not word bound */
447    Rnum_ops
448 };
449 
450 static int re_compile_initialized = 0;
451 static int regexp_syntax = RE_SYNTAX_EGREP;
452 int re_syntax = RE_SYNTAX_EGREP;   /* Exported copy of regexp_syntax */
453 static unsigned char plain_ops[256];
454 static unsigned char quoted_ops[256];
455 static unsigned char precedences[Rnum_ops];
456 static int regexp_context_indep_ops;
457 static int regexp_ansi_sequences;
458 
459 #define NUM_LEVELS  5              /* number of precedence levels in use */
460 #define MAX_NESTING 100            /* max nesting level of operators */
461 
462 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
463 
464 unsigned char re_syntax_table[256];
465 
re_compile_initialize(void)466 void re_compile_initialize(void)
467 {
468    int a;
469 
470    static int syntax_table_inited = 0;
471 
472    if (!syntax_table_inited) {
473       syntax_table_inited = 1;
474       memset(re_syntax_table, 0, 256);
475       for (a = 'a'; a <= 'z'; a++)
476          re_syntax_table[a] = Sword;
477       for (a = 'A'; a <= 'Z'; a++)
478          re_syntax_table[a] = Sword;
479       for (a = '0'; a <= '9'; a++)
480          re_syntax_table[a] = Sword | Sdigit | Shexdigit;
481       for (a = '0'; a <= '7'; a++)
482          re_syntax_table[a] |= Soctaldigit;
483       for (a = 'A'; a <= 'F'; a++)
484          re_syntax_table[a] |= Shexdigit;
485       for (a = 'a'; a <= 'f'; a++)
486          re_syntax_table[a] |= Shexdigit;
487       re_syntax_table[(int)'_'] = Sword;
488       for (a = 9; a <= 13; a++)
489          re_syntax_table[a] = Swhitespace;
490       re_syntax_table[(int)' '] = Swhitespace;
491    }
492    re_compile_initialized = 1;
493    for (a = 0; a < 256; a++) {
494       plain_ops[a] = Rnormal;
495       quoted_ops[a] = Rnormal;
496    }
497    for (a = '0'; a <= '9'; a++)
498       quoted_ops[a] = Rmemory;
499    plain_ops[(int)'\134'] = Rquote;
500    if (regexp_syntax & RE_NO_BK_PARENS) {
501       plain_ops[(int)'('] = Ropenpar;
502       plain_ops[(int)')'] = Rclosepar;
503    } else {
504       quoted_ops[(int)'('] = Ropenpar;
505       quoted_ops[(int)')'] = Rclosepar;
506    }
507    if (regexp_syntax & RE_NO_BK_VBAR) {
508       plain_ops[(int)'\174'] = Ror;
509    } else {
510       quoted_ops[(int)'\174'] = Ror;
511    }
512    plain_ops[(int)'*'] = Rstar;
513    if (regexp_syntax & RE_BK_PLUS_QM) {
514       quoted_ops[(int)'+'] = Rplus;
515       quoted_ops[(int)'?'] = Roptional;
516    } else {
517       plain_ops[(int)'+'] = Rplus;
518       plain_ops[(int)'?'] = Roptional;
519    }
520    if (regexp_syntax & RE_NEWLINE_OR) {
521       plain_ops[(int)'\n'] = Ror;
522    }
523    plain_ops[(int)'\133'] = Ropenset;
524    plain_ops[(int)'\136'] = Rbol;
525    plain_ops[(int)'$'] = Reol;
526    plain_ops[(int)'.'] = Ranychar;
527    if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
528       quoted_ops[(int)'w'] = Rwordchar;
529       quoted_ops[(int)'W'] = Rnotwordchar;
530       quoted_ops[(int)'<'] = Rwordbeg;
531       quoted_ops[(int)'>'] = Rwordend;
532       quoted_ops[(int)'b'] = Rwordbound;
533       quoted_ops[(int)'B'] = Rnotwordbound;
534       quoted_ops[(int)'`'] = Rbegbuf;
535       quoted_ops[(int)'\''] = Rendbuf;
536    }
537    if (regexp_syntax & RE_ANSI_HEX) {
538       quoted_ops[(int)'v'] = Rextended_memory;
539    }
540    for (a = 0; a < Rnum_ops; a++) {
541       precedences[a] = 4;
542    }
543    if (regexp_syntax & RE_TIGHT_VBAR) {
544       precedences[Ror] = 3;
545       precedences[Rbol] = 2;
546       precedences[Reol] = 2;
547    } else {
548       precedences[Ror] = 2;
549       precedences[Rbol] = 3;
550       precedences[Reol] = 3;
551    }
552    precedences[Rclosepar] = 1;
553    precedences[Rend] = 0;
554    regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
555    regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
556 }
557 
re_set_syntax(int syntax)558 int re_set_syntax(int syntax) {
559    int ret;
560 
561    ret = regexp_syntax;
562    regexp_syntax = syntax;
563    re_syntax = syntax;          /* Exported copy */
564    re_compile_initialize();
565    return ret;
566 }
567 
hex_char_to_decimal(int ch)568 static int hex_char_to_decimal(int ch) {
569    if (ch >= '0' && ch <= '9')
570       return ch - '0';
571    if (ch >= 'a' && ch <= 'f')
572       return ch - 'a' + 10;
573    if (ch >= 'A' && ch <= 'F')
574       return ch - 'A' + 10;
575    return 16;
576 }
577 
re_compile_fastmap_aux(regex_t * bufp,unsigned char * code,int pos,unsigned char * visited,unsigned char * can_be_null,unsigned char * fastmap)578 static void re_compile_fastmap_aux(regex_t * bufp, unsigned char *code, int pos,
579                                    unsigned char *visited,
580                                    unsigned char *can_be_null,
581                                    unsigned char *fastmap)
582 {
583    int a;
584    int b;
585    int syntaxcode;
586 
587    if (visited[pos])
588       return;                   /* we have already been here */
589    visited[pos] = 1;
590    for (;;) {
591       switch (code[pos++]) {
592       case Cend:
593          *can_be_null = 1;
594          return;
595       case Cbol:
596       case Cbegbuf:
597       case Cendbuf:
598       case Cwordbeg:
599       case Cwordend:
600       case Cwordbound:
601       case Cnotwordbound:
602          for (a = 0; a < 256; a++)
603             fastmap[a] = 1;
604          break;
605       case Csyntaxspec:
606          syntaxcode = code[pos++];
607          for (a = 0; a < 256; a++)
608             if (SYNTAX(a) & syntaxcode)
609                fastmap[a] = 1;
610          return;
611       case Cnotsyntaxspec:
612          syntaxcode = code[pos++];
613          for (a = 0; a < 256; a++)
614             if (!(SYNTAX(a) & syntaxcode))
615                fastmap[a] = 1;
616          return;
617       case Ceol:
618          fastmap[(int)'\n'] = 1;
619          if (*can_be_null == 0)
620             *can_be_null = 2;      /* can match null, but only at end of buffer */
621          return;
622       case Cset:
623          for (a = 0; a < 256 / 8; a++)
624             if (code[pos + a] != 0)
625                for (b = 0; b < 8; b++)
626                   if (code[pos + a] & (1 << b))
627                      fastmap[(a << 3) + b] = 1;
628          pos += 256 / 8;
629          return;
630       case Cexact:
631          fastmap[(unsigned char)code[pos]] = 1;
632          return;
633       case Canychar:
634          for (a = 0; a < 256; a++)
635             if (a != '\n')
636                fastmap[a] = 1;
637          return;
638       case Cstart_memory:
639       case Cend_memory:
640          pos++;
641          break;
642       case Cmatch_memory:
643          for (a = 0; a < 256; a++)
644             fastmap[a] = 1;
645          *can_be_null = 1;
646          return;
647       case Cjump:
648       case Cdummy_failure_jump:
649       case Cupdate_failure_jump:
650       case Cstar_jump:
651          a = (unsigned char)code[pos++];
652          a |= (unsigned char)code[pos++] << 8;
653          pos += (int)SHORT(a);
654          if (visited[pos]) {
655             /* argh... the regexp contains empty loops.  This is not
656                good, as this may cause a failure stack overflow when
657                matching.  Oh well. */
658             /* this path leads nowhere; pursue other paths. */
659             return;
660          }
661          visited[pos] = 1;
662          break;
663       case Cfailure_jump:
664          a = (unsigned char)code[pos++];
665          a |= (unsigned char)code[pos++] << 8;
666          a = pos + (int)SHORT(a);
667          re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
668          break;
669       case Crepeat1:
670          pos += 2;
671          break;
672       default:
673          set_error("Unknown regex opcode: memory corrupted?");
674          return;
675       }
676    }
677 }
678 
re_do_compile_fastmap(regex_t * bufp,unsigned char * buffer,int used,int pos,unsigned char * can_be_null,unsigned char * fastmap)679 static int re_do_compile_fastmap(regex_t * bufp, unsigned char *buffer, int used,
680                                  int pos, unsigned char *can_be_null,
681                                  unsigned char *fastmap)
682 {
683    unsigned char small_visited[512], *visited;
684 
685    if (used <= (int)sizeof(small_visited))
686       visited = small_visited;
687    else {
688       visited = (unsigned char *)malloc(used);
689       if (!visited)
690          return 0;
691    }
692    *can_be_null = 0;
693    memset(fastmap, 0, 256);
694    memset(visited, 0, used);
695    re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
696    if (visited != small_visited)
697       free(visited);
698    return 1;
699 }
700 
re_compile_fastmap(regex_t * bufp)701 void re_compile_fastmap(regex_t * bufp)
702 {
703    if (!bufp->fastmap || bufp->fastmap_accurate)
704       return;
705 // assert(bufp->used > 0);
706    if (!re_do_compile_fastmap(bufp, bufp->buffer,
707                               bufp->used, 0, &bufp->can_be_null, bufp->fastmap))
708       return;
709    if (got_error)
710       return;
711    if (bufp->buffer[0] == Cbol)
712       bufp->anchor = 1;            /* begline */
713    else if (bufp->buffer[0] == Cbegbuf)
714       bufp->anchor = 2;            /* begbuf */
715    else
716       bufp->anchor = 0;            /* none */
717    bufp->fastmap_accurate = 1;
718 }
719 
720 /*
721  * star is coded as:
722  * 1: failure_jump 2
723  *    ... code for operand of star
724  *    star_jump 1
725  * 2: ... code after star
726  *
727  * We change the star_jump to update_failure_jump if we can determine
728  * that it is safe to do so; otherwise we change it to an ordinary
729  * jump.
730  *
731  * plus is coded as
732  *
733  *    jump 2
734  * 1: failure_jump 3
735  * 2: ... code for operand of plus
736  *    star_jump 1
737  * 3: ... code after plus
738  *
739  * For star_jump considerations this is processed identically to star.
740  *
741  */
742 
re_optimize_star_jump(regex_t * bufp,unsigned char * code)743 static int re_optimize_star_jump(regex_t * bufp, unsigned char *code)
744 {
745    unsigned char map[256];
746    unsigned char can_be_null;
747    unsigned char *p1;
748    unsigned char *p2;
749    unsigned char ch;
750    int a;
751    int b;
752    int num_instructions = 0;
753 
754    a = (unsigned char)*code++;
755    a |= (unsigned char)*code++ << 8;
756    a = (int)SHORT(a);
757 
758    p1 = code + a + 3;              /* skip the failure_jump */
759    /* Check that the jump is within the pattern */
760    if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
761       set_error("Regex VM jump out of bounds (failure_jump opt)");
762       return 0;
763    }
764 // assert(p1[-3] == Cfailure_jump);
765    p2 = code;
766    /* p1 points inside loop, p2 points to after loop */
767    if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
768                               (int)(p2 - bufp->buffer), &can_be_null, map))
769       goto make_normal_jump;
770 
771    /* If we might introduce a new update point inside the
772     * loop, we can't optimize because then update_jump would
773     * update a wrong failure point.  Thus we have to be
774     * quite careful here.
775     */
776 
777    /* loop until we find something that consumes a character */
778    for (;;) {
779       num_instructions++;
780       switch (*p1++) {
781       case Cbol:
782       case Ceol:
783       case Cbegbuf:
784       case Cendbuf:
785       case Cwordbeg:
786       case Cwordend:
787       case Cwordbound:
788       case Cnotwordbound:
789          continue;
790       case Cstart_memory:
791       case Cend_memory:
792          p1++;
793          continue;
794       case Cexact:
795          ch = (unsigned char)*p1++;
796          if (map[(int)ch])
797             goto make_normal_jump;
798          break;
799       case Canychar:
800          for (b = 0; b < 256; b++)
801             if (b != '\n' && map[b])
802                goto make_normal_jump;
803          break;
804       case Cset:
805          for (b = 0; b < 256; b++)
806             if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
807                goto make_normal_jump;
808          p1 += 256 / 8;
809          break;
810       default:
811          goto make_normal_jump;
812       }
813       break;
814    }
815    /* now we know that we can't backtrack. */
816    while (p1 != p2 - 3) {
817       num_instructions++;
818       switch (*p1++) {
819       case Cend:
820          return 0;
821       case Cbol:
822       case Ceol:
823       case Canychar:
824       case Cbegbuf:
825       case Cendbuf:
826       case Cwordbeg:
827       case Cwordend:
828       case Cwordbound:
829       case Cnotwordbound:
830          break;
831       case Cset:
832          p1 += 256 / 8;
833          break;
834       case Cexact:
835       case Cstart_memory:
836       case Cend_memory:
837       case Cmatch_memory:
838       case Csyntaxspec:
839       case Cnotsyntaxspec:
840          p1++;
841          break;
842       case Cjump:
843       case Cstar_jump:
844       case Cfailure_jump:
845       case Cupdate_failure_jump:
846       case Cdummy_failure_jump:
847          goto make_normal_jump;
848       default:
849          return 0;
850       }
851    }
852 
853    /* make_update_jump: */
854    code -= 3;
855    a += 3;                         /* jump to after the Cfailure_jump */
856    code[0] = Cupdate_failure_jump;
857    code[1] = a & 0xff;
858    code[2] = a >> 8;
859    if (num_instructions > 1)
860       return 1;
861 // assert(num_instructions == 1);
862    /* if the only instruction matches a single character, we can do
863     * better */
864    p1 = code + 3 + a;              /* start of sole instruction */
865    if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
866        *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
867       code[0] = Crepeat1;
868    return 1;
869 
870  make_normal_jump:
871    code -= 3;
872    *code = Cjump;
873    return 1;
874 }
875 
re_optimize(regex_t * bufp)876 static int re_optimize(regex_t * bufp)
877 {
878    unsigned char *code;
879 
880    code = bufp->buffer;
881 
882    while (1) {
883       switch (*code++) {
884       case Cend:
885          return 1;
886       case Canychar:
887       case Cbol:
888       case Ceol:
889       case Cbegbuf:
890       case Cendbuf:
891       case Cwordbeg:
892       case Cwordend:
893       case Cwordbound:
894       case Cnotwordbound:
895          break;
896       case Cset:
897          code += 256 / 8;
898          break;
899       case Cexact:
900       case Cstart_memory:
901       case Cend_memory:
902       case Cmatch_memory:
903       case Csyntaxspec:
904       case Cnotsyntaxspec:
905          code++;
906          break;
907       case Cstar_jump:
908          if (!re_optimize_star_jump(bufp, code)) {
909             return 0;
910          }
911          /* fall through */
912       case Cupdate_failure_jump:
913       case Cjump:
914       case Cdummy_failure_jump:
915       case Cfailure_jump:
916       case Crepeat1:
917          code += 2;
918          break;
919       default:
920          return 0;
921       }
922    }
923 }
924 
925 #define NEXTCHAR(var) \
926 { \
927         if (pos >= size) \
928                 goto ends_prematurely; \
929         (var) = regex[pos]; \
930         pos++; \
931 }
932 
933 #define ALLOC(amount) \
934 { \
935           if (pattern_offset+(amount) > alloc) \
936           { \
937                   alloc += 256 + (amount); \
938                   pattern = (unsigned char *)realloc(pattern, alloc); \
939                   if (!pattern) \
940                           goto out_of_memory; \
941           } \
942 }
943 
944 #define STORE(ch) pattern[pattern_offset++] = (ch)
945 
946 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
947 
948 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
949 
950 #define PUSH_LEVEL_STARTS \
951 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
952         starts_base += NUM_LEVELS; \
953 else \
954         goto too_complex \
955 
956 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
957 
958 #define PUT_ADDR(offset,addr) \
959 { \
960         int disp = (addr) - (offset) - 2; \
961         pattern[(offset)] = disp & 0xff; \
962         pattern[(offset)+1] = (disp>>8) & 0xff; \
963 }
964 
965 #define INSERT_JUMP(pos,type,addr) \
966 { \
967         int a, p = (pos), t = (type), ad = (addr); \
968         for (a = pattern_offset - 1; a >= p; a--) \
969                 pattern[a + 3] = pattern[a]; \
970         pattern[p] = t; \
971         PUT_ADDR(p+1,ad); \
972         pattern_offset += 3; \
973 }
974 
975 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
976 
977 #define SET_FIELDS \
978 { \
979         bufp->allocated = alloc; \
980         bufp->buffer = pattern; \
981         bufp->used = pattern_offset; \
982 }
983 
984 #define GETHEX(var) \
985 { \
986         unsigned char gethex_ch, gethex_value; \
987         NEXTCHAR(gethex_ch); \
988         gethex_value = hex_char_to_decimal(gethex_ch); \
989         if (gethex_value == 16) \
990                 goto hex_error; \
991         NEXTCHAR(gethex_ch); \
992         gethex_ch = hex_char_to_decimal(gethex_ch); \
993         if (gethex_ch == 16) \
994                 goto hex_error; \
995         (var) = gethex_value * 16 + gethex_ch; \
996 }
997 
998 #define ANSI_TRANSLATE(ch) \
999 { \
1000    switch (ch) \
1001    { \
1002    case 'a': \
1003    case 'A': \
1004    { \
1005            ch = 7; /* audible bell */ \
1006            break; \
1007    } \
1008    case 'b': \
1009    case 'B': \
1010    { \
1011            ch = 8; /* backspace */ \
1012            break; \
1013    } \
1014    case 'f': \
1015    case 'F': \
1016    { \
1017            ch = 12; /* form feed */ \
1018            break; \
1019    } \
1020    case 'n': \
1021    case 'N': \
1022    { \
1023            ch = 10; /* line feed */ \
1024            break; \
1025    } \
1026    case 'r': \
1027    case 'R': \
1028    { \
1029            ch = 13; /* carriage return */ \
1030            break; \
1031    } \
1032    case 't': \
1033    case 'T': \
1034    { \
1035          ch = 9; /* tab */ \
1036          break; \
1037    } \
1038    case 'v': \
1039    case 'V': \
1040    { \
1041            ch = 11; /* vertical tab */ \
1042            break; \
1043    } \
1044    case 'x': /* hex code */ \
1045    case 'X': \
1046    { \
1047            GETHEX(ch); \
1048            break; \
1049    } \
1050    default: \
1051    { \
1052            /* other characters passed through */ \
1053            if (translate) \
1054                    ch = translate[(unsigned char)ch]; \
1055            break; \
1056    } \
1057    } \
1058 }
1059 
re_compile_pattern(regex_t * bufp,unsigned char * regex)1060 const char *re_compile_pattern(regex_t * bufp, unsigned char *regex)
1061 {
1062    int a;
1063    int pos;
1064    int op;
1065    int current_level;
1066    int level;
1067    int opcode;
1068    int pattern_offset = 0, alloc;
1069    int starts[NUM_LEVELS * MAX_NESTING];
1070    int starts_base;
1071    int future_jumps[MAX_NESTING];
1072    int num_jumps;
1073    unsigned char ch = '\0';
1074    unsigned char *pattern;
1075    unsigned char *translate;
1076    int next_register;
1077    int paren_depth;
1078    int num_open_registers;
1079    int open_registers[RE_NREGS];
1080    int beginning_context;
1081    int size = strlen((char *)regex);
1082 
1083    if (!re_compile_initialized)
1084       re_compile_initialize();
1085    bufp->used = 0;
1086    bufp->fastmap_accurate = 0;
1087    bufp->uses_registers = 1;
1088    bufp->num_registers = 1;
1089    translate = bufp->translate;
1090    pattern = bufp->buffer;
1091    alloc = bufp->allocated;
1092    if (alloc == 0 || pattern == NULL) {
1093       alloc = 256;
1094       bufp->buffer = pattern = (unsigned char *)malloc(alloc);
1095       if (!pattern)
1096          goto out_of_memory;
1097    }
1098    pattern_offset = 0;
1099    starts_base = 0;
1100    num_jumps = 0;
1101    current_level = 0;
1102    SET_LEVEL_START;
1103    num_open_registers = 0;
1104    next_register = 1;
1105    paren_depth = 0;
1106    beginning_context = 1;
1107    op = -1;
1108    /* we use Rend dummy to ensure that pending jumps are updated
1109       (due to low priority of Rend) before exiting the loop. */
1110    pos = 0;
1111    while (op != Rend) {
1112       if (pos >= size)
1113          op = Rend;
1114       else {
1115          NEXTCHAR(ch);
1116          if (translate)
1117             ch = translate[(unsigned char)ch];
1118          op = plain_ops[(unsigned char)ch];
1119          if (op == Rquote) {
1120             NEXTCHAR(ch);
1121             op = quoted_ops[(unsigned char)ch];
1122             if (op == Rnormal && regexp_ansi_sequences)
1123                ANSI_TRANSLATE(ch);
1124          }
1125       }
1126       level = precedences[op];
1127       /* printf("ch='%c' op=%d level=%d current_level=%d
1128          curlevstart=%d\n", ch, op, level, current_level,
1129          CURRENT_LEVEL_START); */
1130       if (level > current_level) {
1131          for (current_level++; current_level < level; current_level++)
1132             SET_LEVEL_START;
1133          SET_LEVEL_START;
1134       } else if (level < current_level) {
1135          current_level = level;
1136          for (; num_jumps > 0 &&
1137               future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START; num_jumps--)
1138             PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1139       }
1140       switch (op) {
1141       case Rend:
1142          break;
1143       case Rnormal:
1144        normal_char:
1145          opcode = Cexact;
1146        store_opcode_and_arg:       /* opcode & ch must be set */
1147          SET_LEVEL_START;
1148          ALLOC(2);
1149          STORE(opcode);
1150          STORE(ch);
1151          break;
1152       case Ranychar:
1153          opcode = Canychar;
1154        store_opcode:
1155          SET_LEVEL_START;
1156          ALLOC(1);
1157          STORE(opcode);
1158          break;
1159       case Rquote:
1160          set_error("Rquote");
1161        /*NOTREACHED*/ case Rbol:
1162          if (!beginning_context) {
1163             if (regexp_context_indep_ops)
1164                goto op_error;
1165             else
1166                goto normal_char;
1167          }
1168          opcode = Cbol;
1169          goto store_opcode;
1170       case Reol:
1171          if (!((pos >= size) ||
1172                ((regexp_syntax & RE_NO_BK_VBAR) ?
1173                 (regex[pos] == '\174') :
1174                 (pos + 1 < size && regex[pos] == '\134' &&
1175                  regex[pos + 1] == '\174')) ||
1176                ((regexp_syntax & RE_NO_BK_PARENS) ?
1177                 (regex[pos] == ')') :
1178                 (pos + 1 < size && regex[pos] == '\134' &&
1179                  regex[pos + 1] == ')')))) {
1180             if (regexp_context_indep_ops)
1181                goto op_error;
1182             else
1183                goto normal_char;
1184          }
1185          opcode = Ceol;
1186          goto store_opcode;
1187          /* NOTREACHED */
1188          break;
1189       case Roptional:
1190          if (beginning_context) {
1191             if (regexp_context_indep_ops)
1192                goto op_error;
1193             else
1194                goto normal_char;
1195          }
1196          if (CURRENT_LEVEL_START == pattern_offset)
1197             break;                 /* ignore empty patterns for ? */
1198          ALLOC(3);
1199          INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1200          break;
1201       case Rstar:
1202       case Rplus:
1203          if (beginning_context) {
1204             if (regexp_context_indep_ops)
1205                goto op_error;
1206             else
1207                goto normal_char;
1208          }
1209          if (CURRENT_LEVEL_START == pattern_offset)
1210             break;                 /* ignore empty patterns for + and * */
1211          ALLOC(9);
1212          INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1213          INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1214          if (op == Rplus)          /* jump over initial failure_jump */
1215             INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1216                         CURRENT_LEVEL_START + 6);
1217          break;
1218       case Ror:
1219          ALLOC(6);
1220          INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1221          if (num_jumps >= MAX_NESTING)
1222             goto too_complex;
1223          STORE(Cjump);
1224          future_jumps[num_jumps++] = pattern_offset;
1225          STORE(0);
1226          STORE(0);
1227          SET_LEVEL_START;
1228          break;
1229       case Ropenpar:
1230          {
1231             SET_LEVEL_START;
1232             if (next_register < RE_NREGS) {
1233                bufp->uses_registers = 1;
1234                ALLOC(2);
1235                STORE(Cstart_memory);
1236                STORE(next_register);
1237                open_registers[num_open_registers++] = next_register;
1238                bufp->num_registers++;
1239                next_register++;
1240             }
1241             paren_depth++;
1242             PUSH_LEVEL_STARTS;
1243             current_level = 0;
1244             SET_LEVEL_START;
1245             break;
1246          }
1247       case Rclosepar:
1248          if (paren_depth <= 0)
1249             goto parenthesis_error;
1250          POP_LEVEL_STARTS;
1251          current_level = precedences[Ropenpar];
1252          paren_depth--;
1253          if (paren_depth < num_open_registers) {
1254             bufp->uses_registers = 1;
1255             ALLOC(2);
1256             STORE(Cend_memory);
1257             num_open_registers--;
1258             STORE(open_registers[num_open_registers]);
1259          }
1260          break;
1261       case Rmemory:
1262          if (ch == '0')
1263             goto bad_match_register;
1264          if (!(ch >= '0' && ch <= '9')) {
1265             goto bad_match_register;
1266          }
1267          bufp->uses_registers = 1;
1268          opcode = Cmatch_memory;
1269          ch -= '0';
1270          goto store_opcode_and_arg;
1271       case Rextended_memory:
1272          NEXTCHAR(ch);
1273          if (ch < '0' || ch > '9')
1274             goto bad_match_register;
1275          NEXTCHAR(a);
1276          if (a < '0' || a > '9')
1277             goto bad_match_register;
1278          ch = 10 * (a - '0') + ch - '0';
1279          if (ch == 0 || ch >= RE_NREGS)
1280             goto bad_match_register;
1281          bufp->uses_registers = 1;
1282          opcode = Cmatch_memory;
1283          goto store_opcode_and_arg;
1284       case Ropenset:
1285          {
1286             int complement;
1287             int prev;
1288             int offset;
1289             int range;
1290             int firstchar;
1291 
1292             SET_LEVEL_START;
1293             ALLOC(1 + 256 / 8);
1294             STORE(Cset);
1295             offset = pattern_offset;
1296             for (a = 0; a < 256 / 8; a++)
1297                STORE(0);
1298             NEXTCHAR(ch);
1299             if (translate)
1300                ch = translate[(unsigned char)ch];
1301             if (ch == '\136') {
1302                complement = 1;
1303                NEXTCHAR(ch);
1304                if (translate)
1305                   ch = translate[(unsigned char)ch];
1306             } else
1307                complement = 0;
1308             prev = -1;
1309             range = 0;
1310             firstchar = 1;
1311             while (ch != '\135' || firstchar) {
1312                firstchar = 0;
1313                if (regexp_ansi_sequences && ch == '\134') {
1314                   NEXTCHAR(ch);
1315                   ANSI_TRANSLATE(ch);
1316                }
1317                if (range) {
1318                   for (a = prev; a <= (int)ch; a++)
1319                      SETBIT(pattern, offset, a);
1320                   prev = -1;
1321                   range = 0;
1322                } else if (prev != -1 && ch == '-')
1323                   range = 1;
1324                else {
1325                   SETBIT(pattern, offset, ch);
1326                   prev = ch;
1327                }
1328                NEXTCHAR(ch);
1329                if (translate)
1330                   ch = translate[(unsigned char)ch];
1331             }
1332             if (range)
1333                SETBIT(pattern, offset, '-');
1334             if (complement) {
1335                for (a = 0; a < 256 / 8; a++)
1336                   pattern[offset + a] ^= 0xff;
1337             }
1338             break;
1339          }
1340       case Rbegbuf:
1341          {
1342             opcode = Cbegbuf;
1343             goto store_opcode;
1344          }
1345       case Rendbuf:
1346          {
1347             opcode = Cendbuf;
1348             goto store_opcode;
1349          }
1350       case Rwordchar:
1351          {
1352             opcode = Csyntaxspec;
1353             ch = Sword;
1354             goto store_opcode_and_arg;
1355          }
1356       case Rnotwordchar:
1357          {
1358             opcode = Cnotsyntaxspec;
1359             ch = Sword;
1360             goto store_opcode_and_arg;
1361          }
1362       case Rwordbeg:
1363          {
1364             opcode = Cwordbeg;
1365             goto store_opcode;
1366          }
1367       case Rwordend:
1368          {
1369             opcode = Cwordend;
1370             goto store_opcode;
1371          }
1372       case Rwordbound:
1373          {
1374             opcode = Cwordbound;
1375             goto store_opcode;
1376          }
1377       case Rnotwordbound:
1378          {
1379             opcode = Cnotwordbound;
1380             goto store_opcode;
1381          }
1382       default:
1383          {
1384             abort();
1385          }
1386       }
1387       beginning_context = (op == Ropenpar || op == Ror);
1388    }
1389    if (starts_base != 0)
1390       goto parenthesis_error;
1391 // assert(num_jumps == 0);
1392    ALLOC(1);
1393    STORE(Cend);
1394    SET_FIELDS;
1395    if (!re_optimize(bufp))
1396       return "Optimization error";
1397    return NULL;
1398 
1399  op_error:
1400    SET_FIELDS;
1401    return "Badly placed special character";
1402 
1403  bad_match_register:
1404    SET_FIELDS;
1405    return "Bad match register number";
1406 
1407  hex_error:
1408    SET_FIELDS;
1409    return "Bad hexadecimal number";
1410 
1411  parenthesis_error:
1412    SET_FIELDS;
1413    return "Badly placed parenthesis";
1414 
1415  out_of_memory:
1416    SET_FIELDS;
1417    return "Out of memory";
1418 
1419  ends_prematurely:
1420    SET_FIELDS;
1421    return "Regular expression ends prematurely";
1422 
1423  too_complex:
1424    SET_FIELDS;
1425    return "Regular expression too complex";
1426 }
1427 
1428 #undef CHARAT
1429 #undef NEXTCHAR
1430 #undef GETHEX
1431 #undef ALLOC
1432 #undef STORE
1433 #undef CURRENT_LEVEL_START
1434 #undef SET_LEVEL_START
1435 #undef PUSH_LEVEL_STARTS
1436 #undef POP_LEVEL_STARTS
1437 #undef PUT_ADDR
1438 #undef INSERT_JUMP
1439 #undef SETBIT
1440 #undef SET_FIELDS
1441 
1442 #define PREFETCH if (text == textend) goto fail
1443 
1444 #define NEXTCHAR(var) \
1445 PREFETCH; \
1446 var = (unsigned char)*text++; \
1447 if (translate) \
1448         var = translate[var]
1449 
1450 
regcomp(regex_t * bufp,const char * regex,int cflags)1451 int regcomp(regex_t * bufp, const char *regex, int cflags)
1452 {
1453    memset(bufp, 0, sizeof(regex_t));
1454    bufp->cflags = cflags;
1455    if (bufp->cflags & REG_ICASE) {
1456       char *p, *lcase = bstrdup(regex);
1457       for( p = lcase; *p ; p++) {
1458          *p = tolower(*p);
1459       }
1460       re_compile_pattern(bufp, (unsigned char *)lcase);
1461       bfree(lcase);
1462    } else {
1463       re_compile_pattern(bufp, (unsigned char *)regex);
1464    }
1465    if (got_error) {
1466       return -1;
1467    }
1468    return 0;
1469 }
1470 
re_registers_to_regmatch(regexp_registers_t old_regs,regmatch_t pmatch[],size_t nmatch)1471 void re_registers_to_regmatch(regexp_registers_t old_regs,
1472                               regmatch_t pmatch[],
1473                               size_t nmatch)
1474 {
1475    size_t i=0;
1476 
1477    /* We have to set the last entry to -1 */
1478    nmatch = nmatch - 1;
1479    for (i=0; (i < nmatch) && (old_regs->start[i] > -1) ; i++) {
1480       pmatch[i].rm_so = old_regs->start[i];
1481       pmatch[i].rm_eo = old_regs->end[i];
1482    }
1483 
1484    pmatch[i].rm_eo = pmatch[i].rm_so = -1;
1485 }
1486 
regexec(regex_t * preg,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)1487 int regexec(regex_t * preg, const char *string, size_t nmatch,
1488             regmatch_t pmatch[], int eflags)
1489 {
1490    int stat;
1491    int len = strlen(string);
1492    struct re_registers regs;
1493    stat = re_search(preg, (unsigned char *)string, len, 0, len, &regs);
1494    if (stat >= 0 && nmatch > 0) {
1495       re_registers_to_regmatch(&regs, pmatch, nmatch);
1496    }
1497    /* stat is the start position in the string base 0 where
1498     *  the pattern was found or negative if not found.
1499     */
1500    return stat < 0 ? -1 : 0;
1501 }
1502 
regerror(int errcode,regex_t * preg,char * errbuf,size_t errbuf_size)1503 size_t regerror(int errcode, regex_t * preg, char *errbuf, size_t errbuf_size)
1504 {
1505    bstrncpy(errbuf, preg->errmsg, errbuf_size);
1506    return 0;
1507 }
1508 
regfree(regex_t * preg)1509 void regfree(regex_t * preg)
1510 {
1511    if (preg->lcase) {
1512       free_pool_memory(preg->lcase);
1513       preg->lcase = NULL;
1514    }
1515    if (preg->buffer) {
1516       free(preg->buffer);
1517       preg->buffer = NULL;
1518    }
1519 }
1520 
re_match(regex_t * bufp,unsigned char * string,int size,int pos,regexp_registers_t old_regs)1521 int re_match(regex_t * bufp, unsigned char *string, int size, int pos,
1522              regexp_registers_t old_regs)
1523 {
1524    unsigned char *code;
1525    unsigned char *translate;
1526    unsigned char *text;
1527    unsigned char *textstart;
1528    unsigned char *textend;
1529    int a;
1530    int b;
1531    int ch;
1532    int reg;
1533    int match_end;
1534    unsigned char *regstart;
1535    unsigned char *regend;
1536    int regsize;
1537    match_state state;
1538 
1539 // assert(pos >= 0 && size >= 0);
1540 // assert(pos <= size);
1541 
1542    text = string + pos;
1543    textstart = string;
1544    textend = string + size;
1545 
1546    code = bufp->buffer;
1547 
1548    translate = bufp->translate;
1549 
1550    NEW_STATE(state, bufp->num_registers);
1551 
1552  continue_matching:
1553    switch (*code++) {
1554    case Cend:
1555       {
1556          match_end = text - textstart;
1557          if (old_regs) {
1558             old_regs->start[0] = pos;
1559             old_regs->end[0] = match_end;
1560             if (!bufp->uses_registers) {
1561                for (a = 1; a < RE_NREGS; a++) {
1562                   old_regs->start[a] = -1;
1563                   old_regs->end[a] = -1;
1564                }
1565             } else {
1566                for (a = 1; a < bufp->num_registers; a++) {
1567                   if ((GET_REG_START(state, a) == NULL) ||
1568                       (GET_REG_END(state, a) == NULL)) {
1569                      old_regs->start[a] = -1;
1570                      old_regs->end[a] = -1;
1571                      continue;
1572                   }
1573                   old_regs->start[a] = GET_REG_START(state, a) - textstart;
1574                   old_regs->end[a] = GET_REG_END(state, a) - textstart;
1575                }
1576                for (; a < RE_NREGS; a++) {
1577                   old_regs->start[a] = -1;
1578                   old_regs->end[a] = -1;
1579                }
1580             }
1581          }
1582          FREE_STATE(state);
1583          return match_end - pos;
1584       }
1585    case Cbol:
1586       {
1587          if (text == textstart || text[-1] == '\n')
1588             goto continue_matching;
1589          goto fail;
1590       }
1591    case Ceol:
1592       {
1593          if (text == textend || *text == '\n')
1594             goto continue_matching;
1595          goto fail;
1596       }
1597    case Cset:
1598       {
1599          NEXTCHAR(ch);
1600          if (code[ch / 8] & (1 << (ch & 7))) {
1601             code += 256 / 8;
1602             goto continue_matching;
1603          }
1604          goto fail;
1605       }
1606    case Cexact:
1607       {
1608          NEXTCHAR(ch);
1609          if (ch != (unsigned char)*code++)
1610             goto fail;
1611          goto continue_matching;
1612       }
1613    case Canychar:
1614       {
1615          NEXTCHAR(ch);
1616          if (ch == '\n')
1617             goto fail;
1618          goto continue_matching;
1619       }
1620    case Cstart_memory:
1621       {
1622          reg = *code++;
1623          SET_REG_START(state, reg, text, goto error);
1624          goto continue_matching;
1625       }
1626    case Cend_memory:
1627       {
1628          reg = *code++;
1629          SET_REG_END(state, reg, text, goto error);
1630          goto continue_matching;
1631       }
1632    case Cmatch_memory:
1633       {
1634          reg = *code++;
1635          regstart = GET_REG_START(state, reg);
1636          regend = GET_REG_END(state, reg);
1637          if ((regstart == NULL) || (regend == NULL))
1638             goto fail;             /* or should we just match nothing? */
1639          regsize = regend - regstart;
1640 
1641          if (regsize > (textend - text))
1642             goto fail;
1643          if (translate) {
1644             for (; regstart < regend; regstart++, text++)
1645                if (translate[*regstart] != translate[*text])
1646                   goto fail;
1647          } else
1648             for (; regstart < regend; regstart++, text++)
1649                if (*regstart != *text)
1650                   goto fail;
1651          goto continue_matching;
1652       }
1653    case Cupdate_failure_jump:
1654       {
1655          UPDATE_FAILURE(state, text, goto error);
1656          /* fall to next case */
1657       }
1658       /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1659    case Cstar_jump:
1660    case Cjump:
1661       {
1662          a = (unsigned char)*code++;
1663          a |= (unsigned char)*code++ << 8;
1664          code += (int)SHORT(a);
1665          if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1666             set_error("Regex VM jump out of bounds (Cjump)");
1667             FREE_STATE(state);
1668             return -2;
1669          }
1670          goto continue_matching;
1671       }
1672    case Cdummy_failure_jump:
1673       {
1674          unsigned char *failuredest;
1675 
1676          a = (unsigned char)*code++;
1677          a |= (unsigned char)*code++ << 8;
1678          a = (int)SHORT(a);
1679 //       assert(*code == Cfailure_jump);
1680          b = (unsigned char)code[1];
1681          b |= (unsigned char)code[2] << 8;
1682          failuredest = code + (int)SHORT(b) + 3;
1683          if (failuredest < bufp->buffer || bufp->buffer + bufp->used < failuredest) {
1684             set_error
1685                ("Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1686             FREE_STATE(state);
1687             return -2;
1688          }
1689          PUSH_FAILURE(state, failuredest, NULL, goto error);
1690          code += a;
1691          if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1692             set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1693             FREE_STATE(state);
1694             return -2;
1695          }
1696          goto continue_matching;
1697       }
1698    case Cfailure_jump:
1699       {
1700          a = (unsigned char)*code++;
1701          a |= (unsigned char)*code++ << 8;
1702          a = (int)SHORT(a);
1703          if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1704             set_error("Regex VM jump out of bounds (Cfailure_jump)");
1705             FREE_STATE(state);
1706             return -2;
1707          }
1708          PUSH_FAILURE(state, code + a, text, goto error);
1709          goto continue_matching;
1710       }
1711    case Crepeat1:
1712       {
1713          unsigned char *pinst;
1714          a = (unsigned char)*code++;
1715          a |= (unsigned char)*code++ << 8;
1716          a = (int)SHORT(a);
1717          pinst = code + a;
1718          if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1719             set_error("Regex VM jump out of bounds (Crepeat1)");
1720             FREE_STATE(state);
1721             return -2;
1722          }
1723          /* pinst is sole instruction in loop, and it matches a
1724           * single character.  Since Crepeat1 was originally a
1725           * Cupdate_failure_jump, we also know that backtracking
1726           * is useless: so long as the single-character
1727           * expression matches, it must be used.  Also, in the
1728           * case of +, we've already matched one character, so +
1729           * can't fail: nothing here can cause a failure.  */
1730          switch (*pinst++) {
1731          case Cset:
1732             {
1733                if (translate) {
1734                   while (text < textend) {
1735                      ch = translate[(unsigned char)*text];
1736                      if (pinst[ch / 8] & (1 << (ch & 7)))
1737                         text++;
1738                      else
1739                         break;
1740                   }
1741                } else {
1742                   while (text < textend) {
1743                      ch = (unsigned char)*text;
1744                      if (pinst[ch / 8] & (1 << (ch & 7)))
1745                         text++;
1746                      else
1747                         break;
1748                   }
1749                }
1750                break;
1751             }
1752          case Cexact:
1753             {
1754                ch = (unsigned char)*pinst;
1755                if (translate) {
1756                   while (text < textend && translate[(unsigned char)*text] == ch)
1757                      text++;
1758                } else {
1759                   while (text < textend && (unsigned char)*text == ch)
1760                      text++;
1761                }
1762                break;
1763             }
1764          case Canychar:
1765             {
1766                while (text < textend && (unsigned char)*text != '\n')
1767                   text++;
1768                break;
1769             }
1770          case Csyntaxspec:
1771             {
1772                a = (unsigned char)*pinst;
1773                if (translate) {
1774                   while (text < textend && (SYNTAX(translate[*text]) & a))
1775                      text++;
1776                } else {
1777                   while (text < textend && (SYNTAX(*text) & a))
1778                      text++;
1779                }
1780                break;
1781             }
1782          case Cnotsyntaxspec:
1783             {
1784                a = (unsigned char)*pinst;
1785                if (translate) {
1786                   while (text < textend && !(SYNTAX(translate[*text]) & a))
1787                      text++;
1788                } else {
1789                   while (text < textend && !(SYNTAX(*text) & a))
1790                      text++;
1791                }
1792                break;
1793             }
1794          default:
1795             {
1796                FREE_STATE(state);
1797                set_error("Unknown regex opcode: memory corrupted?");
1798                return -2;
1799              /*NOTREACHED*/}
1800          }
1801          /* due to the funky way + and * are compiled, the top
1802           * failure- stack entry at this point is actually a
1803           * success entry -- update it & pop it */
1804          UPDATE_FAILURE(state, text, goto error);
1805          goto fail;                /* i.e., succeed <wink/sigh> */
1806       }
1807    case Cbegbuf:
1808       {
1809          if (text == textstart)
1810             goto continue_matching;
1811          goto fail;
1812       }
1813    case Cendbuf:
1814       {
1815          if (text == textend)
1816             goto continue_matching;
1817          goto fail;
1818       }
1819    case Cwordbeg:
1820       {
1821          if (text == textend)
1822             goto fail;
1823          if (!(SYNTAX(*text) & Sword))
1824             goto fail;
1825          if (text == textstart)
1826             goto continue_matching;
1827          if (!(SYNTAX(text[-1]) & Sword))
1828             goto continue_matching;
1829          goto fail;
1830       }
1831    case Cwordend:
1832       {
1833          if (text == textstart)
1834             goto fail;
1835          if (!(SYNTAX(text[-1]) & Sword))
1836             goto fail;
1837          if (text == textend)
1838             goto continue_matching;
1839          if (!(SYNTAX(*text) & Sword))
1840             goto continue_matching;
1841          goto fail;
1842       }
1843    case Cwordbound:
1844       {
1845          /* Note: as in gnu regexp, this also matches at the
1846           * beginning and end of buffer.  */
1847 
1848          if (text == textstart || text == textend)
1849             goto continue_matching;
1850          if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1851             goto continue_matching;
1852          goto fail;
1853       }
1854    case Cnotwordbound:
1855       {
1856          /* Note: as in gnu regexp, this never matches at the
1857           * beginning and end of buffer.  */
1858          if (text == textstart || text == textend)
1859             goto fail;
1860          if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1861             goto continue_matching;
1862          goto fail;
1863       }
1864    case Csyntaxspec:
1865       {
1866          NEXTCHAR(ch);
1867          if (!(SYNTAX(ch) & (unsigned char)*code++))
1868             goto fail;
1869          goto continue_matching;
1870       }
1871    case Cnotsyntaxspec:
1872       {
1873          NEXTCHAR(ch);
1874          if (SYNTAX(ch) & (unsigned char)*code++)
1875             goto fail;
1876          goto continue_matching;
1877       }
1878    default:
1879       {
1880          FREE_STATE(state);
1881          set_error("Unknown regex opcode: memory corrupted?");
1882          return -2;
1883        /*NOTREACHED*/}
1884    }
1885 
1886 
1887 
1888 #if 0                              /* This line is never reached --Guido */
1889    abort();
1890 #endif
1891    /*
1892     *NOTREACHED
1893     */
1894 
1895    /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1896  fail:
1897    POP_FAILURE(state, code, text, goto done_matching, goto error);
1898    goto continue_matching;
1899 
1900  done_matching:
1901 /*   if(translated != NULL) */
1902 /*      free(translated); */
1903    FREE_STATE(state);
1904    return -1;
1905 
1906  error:
1907 /*   if (translated != NULL) */
1908 /*      free(translated); */
1909    FREE_STATE(state);
1910    return -2;
1911 }
1912 
1913 
1914 #undef PREFETCH
1915 #undef NEXTCHAR
1916 
re_search(regex_t * bufp,unsigned char * str,int size,int pos,int range,regexp_registers_t regs)1917 int re_search(regex_t * bufp, unsigned char *str, int size, int pos,
1918               int range, regexp_registers_t regs)
1919 {
1920    unsigned char *fastmap;
1921    unsigned char *translate;
1922    unsigned char *text;
1923    unsigned char *partstart;
1924    unsigned char *partend;
1925    int dir;
1926    int ret;
1927    unsigned char anchor;
1928    unsigned char *string = str;
1929 
1930    if (bufp->cflags & REG_ICASE) { /* we must use string in lowercase */
1931       int len = strlen((const char *)str);
1932       if (!bufp->lcase) {
1933          bufp->lcase = get_pool_memory(PM_FNAME);
1934       }
1935       bufp->lcase = check_pool_memory_size(bufp->lcase, len+1);
1936       unsigned char *dst = (unsigned char *)bufp->lcase;
1937       while (*string) {
1938          *dst++ = tolower(*string++);
1939       }
1940       *dst = '\0';
1941       string = (unsigned char *)bufp->lcase;
1942    }
1943 
1944 // assert(size >= 0 && pos >= 0);
1945 // assert(pos + range >= 0 && pos + range <= size);     /* Bugfix by ylo */
1946 
1947    fastmap = bufp->fastmap;
1948    translate = bufp->translate;
1949    if (fastmap && !bufp->fastmap_accurate) {
1950       re_compile_fastmap(bufp);
1951       if (got_error)
1952          return -2;
1953    }
1954 
1955    anchor = bufp->anchor;
1956    if (bufp->can_be_null == 1)     /* can_be_null == 2: can match null at eob */
1957       fastmap = NULL;
1958 
1959    if (range < 0) {
1960       dir = -1;
1961       range = -range;
1962    } else
1963       dir = 1;
1964 
1965    if (anchor == 2) {
1966       if (pos != 0)
1967          return -1;
1968       else
1969          range = 0;
1970    }
1971 
1972    for (; range >= 0; range--, pos += dir) {
1973       if (fastmap) {
1974          if (dir == 1) {           /* searching forwards */
1975 
1976             text = string + pos;
1977             partend = string + size;
1978             partstart = text;
1979             if (translate)
1980                while (text != partend &&
1981                       !fastmap[(unsigned char)translate[(unsigned char)*text]])
1982                   text++;
1983             else
1984                while (text != partend && !fastmap[(unsigned char)*text])
1985                   text++;
1986             pos += text - partstart;
1987             range -= text - partstart;
1988             if (pos == size && bufp->can_be_null == 0)
1989                return -1;
1990          } else {                  /* searching backwards */
1991             text = string + pos;
1992             partstart = string + pos - range;
1993             partend = text;
1994             if (translate)
1995                while (text != partstart && !fastmap[(unsigned char)
1996                                                     translate[(unsigned char)*text]])
1997                   text--;
1998             else
1999                while (text != partstart && !fastmap[(unsigned char)*text])
2000                   text--;
2001             pos -= partend - text;
2002             range -= partend - text;
2003          }
2004       }
2005       if (anchor == 1) {           /* anchored to begline */
2006          if (pos > 0 && (string[pos - 1] != '\n'))
2007             continue;
2008       }
2009 //    assert(pos >= 0 && pos <= size);
2010       ret = re_match(bufp, string, size, pos, regs);
2011       if (ret >= 0)
2012          return pos;
2013       if (ret == -2)
2014          return -2;
2015    }
2016    return -1;
2017 }
2018 
2019 /*
2020 ** Local Variables:
2021 ** mode: c
2022 ** c-file-style: "python"
2023 ** End:
2024 */
2025