1 /*
2    BAREOS® - Backup Archiving REcovery Open Sourced
3 
4    Copyright (C) 2006-2010 Free Software Foundation Europe e.V.
5 
6    This program is Free Software; you can redistribute it and/or
7    modify it under the terms of version three of the GNU Affero General Public
8    License as published by the Free Software Foundation and included
9    in the file LICENSE.
10 
11    This program is distributed in the hope that it will be useful, but
12    WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14    Affero General Public License for more details.
15 
16    You should have received a copy of the GNU Affero General Public License
17    along with this program; if not, write to the Free Software
18    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19    02110-1301, USA.
20 */
21 /* regexpr.c
22  *
23  * Author: Tatu Ylonen <ylo@ngs.fi>
24  *
25  * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
26  *
27  * Permission to use, copy, modify, distribute, and sell this software
28  * and its documentation for any purpose is hereby granted without
29  * fee, provided that the above copyright notice appear in all copies.
30  * This software is provided "as is" without express or implied
31  * warranty.
32  *
33  * Created: Thu Sep 26 17:14:05 1991 ylo
34  * Last modified: Mon Nov  4 17:06:48 1991 ylo
35  * Ported to Think C: 19 Jan 1992 guido@cwi.nl
36  *
37  * This code draws many ideas from the regular expression packages by
38  * Henry Spencer of the University of Toronto and Richard Stallman of
39  * the Free Software Foundation.
40  *
41  * Emacs-specific code and syntax table code is almost directly borrowed
42  * from GNU regexp.
43  *
44  * Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
45  * 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
46  * Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
47  * probably one or two others that I'm forgetting.
48  *
49  * This file modified to work with BAREOS and C++ by
50  *    Kern Sibbald, April 2006
51  *
52  * This file modified to work with REG_ICASE and BAREOS by
53  *    Eric Bollengier April 2007
54  */
55 
56 #include "include/bareos.h"
57 #include "bregex.h"
58 
59 #define set_error(x) bufp->errmsg=((char *)(x))
60 #define got_error bufp->errmsg!=NULL
61 
62 /* The original code blithely assumed that sizeof(short) == 2.  Not
63  * always true.  Original instances of "(short)x" were replaced by
64  * SHORT(x), where SHORT is #defined below.  */
65 
66 #define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
67 
68 /* The stack implementation is taken from an idea by Andrew Kuchling.
69  * It's a doubly linked list of arrays. The advantages of this over a
70  * simple linked list are that the number of mallocs required are
71  * reduced. It also makes it possible to statically allocate enough
72  * space so that small patterns don't ever need to call malloc.
73  *
74  * The advantages over a single array is that is periodically
75  * realloced when more space is needed is that we avoid ever copying
76  * the stack. */
77 
78 /* item_t is the basic stack element.  Defined as a union of
79  * structures so that both registers, failure points, and counters can
80  * be pushed/popped from the stack.  There's nothing built into the
81  * item to keep track of whether a certain stack item is a register, a
82  * failure point, or a counter. */
83 
84 typedef union item_t {
85    struct {
86       int num;
87       int level;
88       unsigned char *start;
89       unsigned char *end;
90    } reg;
91    struct {
92       int count;
93       int level;
94       int phantom;
95       unsigned char *code;
96       unsigned char *text;
97    } fail;
98    struct {
99       int num;
100       int level;
101       int count;
102    } cntr;
103 } item_t;
104 
105 #define B_STACK_PAGE_SIZE 256
106 #define NUM_REGISTERS 256
107 
108 /* A 'page' of stack items. */
109 
110 typedef struct item_page_t {
111    item_t items[B_STACK_PAGE_SIZE];
112    struct item_page_t *prev;
113    struct item_page_t *next;
114 } item_page_t;
115 
116 
117 typedef struct match_state {
118    /* The number of registers that have been pushed onto the stack
119     * since the last failure point. */
120 
121    int count;
122 
123    /* Used to control when registers need to be pushed onto the
124     * stack. */
125 
126    int level;
127 
128    /* The number of failure points on the stack. */
129 
130    int point;
131 
132    /* Storage for the registers.  Each register consists of two
133     * pointers to characters.  So register N is represented as
134     * start[N] and end[N].  The pointers must be converted to
135     * offsets from the beginning of the string before returning the
136     * registers to the calling program. */
137 
138    unsigned char *start[NUM_REGISTERS];
139    unsigned char *end[NUM_REGISTERS];
140 
141    /* Keeps track of whether a register has changed recently. */
142 
143    int changed[NUM_REGISTERS];
144 
145    /* Structure to encapsulate the stack. */
146    struct {
147       /* index into the current page.  If index == 0 and you need
148        * to pop an item, move to the previous page and set index
149        * = B_STACK_PAGE_SIZE - 1.  Otherwise decrement index to
150        * push a page. If index == B_STACK_PAGE_SIZE and you need
151        * to push a page move to the next page and set index =
152        * 0. If there is no new next page, allocate a new page
153        * and link it in. Otherwise, increment index to push a
154        * page. */
155 
156       int index;
157       item_page_t *current;        /* Pointer to the current page. */
158       item_page_t first;           /* First page is statically allocated. */
159    } stack;
160 } match_state;
161 
162 /* Initialize a state object */
163 
164 /* #define NEW_STATE(state) \ */
165 /* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
166 /* state.stack.current = &state.stack.first; \ */
167 /* state.stack.first.prev = NULL; \ */
168 /* state.stack.first.next = NULL; \ */
169 /* state.stack.index = 0; \ */
170 /* state.level = 1 */
171 
172 #define NEW_STATE(state, nregs) \
173 { \
174    int i; \
175    for (i = 0; i < nregs; i++) \
176    { \
177       state.start[i] = NULL; \
178       state.end[i] = NULL; \
179       state.changed[i] = 0; \
180    } \
181    state.stack.current = &state.stack.first; \
182    state.stack.first.prev = NULL; \
183    state.stack.first.next = NULL; \
184    state.stack.index = 0; \
185    state.level = 1; \
186    state.count = 0; \
187    state.level = 0; \
188    state.point = 0; \
189 }
190 
191 /* Free any memory that might have been malloc'd */
192 
193 #define FREE_STATE(state) \
194 while(state.stack.first.next != NULL) \
195 { \
196    state.stack.current = state.stack.first.next; \
197    state.stack.first.next = state.stack.current->next; \
198    free(state.stack.current); \
199 }
200 
201 /* Discard the top 'count' stack items. */
202 
203 #define STACK_DISCARD(stack, count, on_error) \
204 stack.index -= count; \
205 while (stack.index < 0) \
206 { \
207    if (stack.current->prev == NULL) \
208            on_error; \
209    stack.current = stack.current->prev; \
210    stack.index += B_STACK_PAGE_SIZE; \
211 }
212 
213 /* Store a pointer to the previous item on the stack. Used to pop an
214  * item off of the stack. */
215 
216 #define STACK_PREV(stack, top, on_error) \
217 if (stack.index == 0) \
218 { \
219         if (stack.current->prev == NULL) \
220                 on_error; \
221         stack.current = stack.current->prev; \
222         stack.index = B_STACK_PAGE_SIZE - 1; \
223 } \
224 else \
225 { \
226         stack.index--; \
227 } \
228 top = &(stack.current->items[stack.index])
229 
230 /* Store a pointer to the next item on the stack. Used to push an item
231  * on to the stack. */
232 
233 #define STACK_NEXT(stack, top, on_error) \
234 if (stack.index == B_STACK_PAGE_SIZE) \
235 { \
236         if (stack.current->next == NULL) \
237         { \
238                 stack.current->next = (item_page_t *)malloc(sizeof(item_page_t)); \
239                 if (stack.current->next == NULL) \
240                         on_error; \
241                 stack.current->next->prev = stack.current; \
242                 stack.current->next->next = NULL; \
243         } \
244         stack.current = stack.current->next; \
245         stack.index = 0; \
246 } \
247 top = &(stack.current->items[stack.index++])
248 
249 /* Store a pointer to the item that is 'count' items back in the
250  * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
251  * STACK_TOP(stack, top, on_error).  */
252 
253 #define STACK_BACK(stack, top, count, on_error) \
254 { \
255         int index; \
256         item_page_t *current; \
257         current = stack.current; \
258         index = stack.index - (count); \
259         while (index < 0) \
260         { \
261                 if (current->prev == NULL) \
262                         on_error; \
263                 current = current->prev; \
264                 index += B_STACK_PAGE_SIZE; \
265         } \
266         top = &(current->items[index]); \
267 }
268 
269 /* Store a pointer to the top item on the stack. Execute the
270  * 'on_error' code if there are no items on the stack. */
271 
272 #define STACK_TOP(stack, top, on_error) \
273 if (stack.index == 0) \
274 { \
275         if (stack.current->prev == NULL) \
276                 on_error; \
277         top = &(stack.current->prev->items[B_STACK_PAGE_SIZE - 1]); \
278 } \
279 else \
280 { \
281         top = &(stack.current->items[stack.index - 1]); \
282 }
283 
284 /* Test to see if the stack is empty */
285 
286 #define STACK_EMPTY(stack) ((stack.index == 0) && \
287                             (stack.current->prev == NULL))
288 
289 /* Return the start of register 'reg' */
290 
291 #define GET_REG_START(state, reg) (state.start[reg])
292 
293 /* Return the end of register 'reg' */
294 
295 #define GET_REG_END(state, reg) (state.end[reg])
296 
297 /* Set the start of register 'reg'. If the state of the register needs
298  * saving, push it on the stack. */
299 
300 #define SET_REG_START(state, reg, text, on_error) \
301 if(state.changed[reg] < state.level) \
302 { \
303         item_t *item; \
304         STACK_NEXT(state.stack, item, on_error); \
305         item->reg.num = reg; \
306         item->reg.start = state.start[reg]; \
307         item->reg.end = state.end[reg]; \
308         item->reg.level = state.changed[reg]; \
309         state.changed[reg] = state.level; \
310         state.count++; \
311 } \
312 state.start[reg] = text
313 
314 /* Set the end of register 'reg'. If the state of the register needs
315  * saving, push it on the stack. */
316 
317 #define SET_REG_END(state, reg, text, on_error) \
318 if(state.changed[reg] < state.level) \
319 { \
320         item_t *item; \
321         STACK_NEXT(state.stack, item, on_error); \
322         item->reg.num = reg; \
323         item->reg.start = state.start[reg]; \
324         item->reg.end = state.end[reg]; \
325         item->reg.level = state.changed[reg]; \
326         state.changed[reg] = state.level; \
327         state.count++; \
328 } \
329 state.end[reg] = text
330 
331 #define PUSH_FAILURE(state, xcode, xtext, on_error) \
332 { \
333         item_t *item; \
334         STACK_NEXT(state.stack, item, on_error); \
335         item->fail.code = xcode; \
336         item->fail.text = xtext; \
337         item->fail.count = state.count; \
338         item->fail.level = state.level; \
339         item->fail.phantom = 0; \
340         state.count = 0; \
341         state.level++; \
342         state.point++; \
343 }
344 
345 /* Update the last failure point with a new position in the text. */
346 
347 #define UPDATE_FAILURE(state, xtext, on_error) \
348 { \
349    item_t *item; \
350    STACK_BACK(state.stack, item, state.count + 1, on_error); \
351    if (!item->fail.phantom) \
352    { \
353            item_t *item2; \
354            STACK_NEXT(state.stack, item2, on_error); \
355            item2->fail.code = item->fail.code; \
356            item2->fail.text = xtext; \
357            item2->fail.count = state.count; \
358            item2->fail.level = state.level; \
359            item2->fail.phantom = 1; \
360            state.count = 0; \
361            state.level++; \
362            state.point++; \
363    } \
364    else \
365    { \
366            STACK_DISCARD(state.stack, state.count, on_error); \
367            STACK_TOP(state.stack, item, on_error); \
368            item->fail.text = xtext; \
369            state.count = 0; \
370            state.level++; \
371    } \
372 }
373 
374 #define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
375 { \
376    item_t *item; \
377    do \
378    { \
379            while(state.count > 0) \
380            { \
381                    STACK_PREV(state.stack, item, on_error); \
382                    state.start[item->reg.num] = item->reg.start; \
383                    state.end[item->reg.num] = item->reg.end; \
384                    state.changed[item->reg.num] = item->reg.level; \
385                    state.count--; \
386            } \
387            STACK_PREV(state.stack, item, on_empty); \
388            xcode = item->fail.code; \
389            xtext = item->fail.text; \
390            state.count = item->fail.count; \
391            state.level = item->fail.level; \
392            state.point--; \
393    } \
394    while (item->fail.text == NULL); \
395 }
396 
397 enum regexp_compiled_ops {         /* opcodes for compiled regexp */
398    Cend,                           /* end of pattern reached */
399    Cbol,                           /* beginning of line */
400    Ceol,                           /* end of line */
401    Cset,                           /* character set.  Followed by 32 bytes of set. */
402    Cexact,                         /* followed by a byte to match */
403    Canychar,                       /* matches any character except newline */
404    Cstart_memory,                  /* set register start addr (followed by reg number) */
405    Cend_memory,                    /* set register end addr (followed by reg number) */
406    Cmatch_memory,                  /* match a duplicate of reg contents (regnum follows) */
407    Cjump,                          /* followed by two bytes (lsb,msb) of displacement. */
408    Cstar_jump,                     /* will change to jump/update_failure_jump at runtime */
409    Cfailure_jump,                  /* jump to addr on failure */
410    Cupdate_failure_jump,           /* update topmost failure point and jump */
411    Cdummy_failure_jump,            /* push a dummy failure point and jump */
412    Cbegbuf,                        /* match at beginning of buffer */
413    Cendbuf,                        /* match at end of buffer */
414    Cwordbeg,                       /* match at beginning of word */
415    Cwordend,                       /* match at end of word */
416    Cwordbound,                     /* match if at word boundary */
417    Cnotwordbound,                  /* match if not at word boundary */
418    Csyntaxspec,                    /* matches syntax code (1 byte follows) */
419    Cnotsyntaxspec,                 /* matches if syntax code does not match (1 byte follows) */
420    Crepeat1
421 };
422 
423 enum regexp_syntax_op {            /* syntax codes for plain and quoted characters */
424    Rend,                           /* special code for end of regexp */
425    Rnormal,                        /* normal character */
426    Ranychar,                       /* any character except newline */
427    Rquote,                         /* the quote character */
428    Rbol,                           /* match beginning of line */
429    Reol,                           /* match end of line */
430    Roptional,                      /* match preceding expression optionally */
431    Rstar,                          /* match preceding expr zero or more times */
432    Rplus,                          /* match preceding expr one or more times */
433    Ror,                            /* match either of alternatives */
434    Ropenpar,                       /* opening parenthesis */
435    Rclosepar,                      /* closing parenthesis */
436    Rmemory,                        /* match memory register */
437    Rextended_memory,               /* \vnn to match registers 10-99 */
438    Ropenset,                       /* open set.  Internal syntax hard-coded below. */
439    /* the following are gnu extensions to "normal" regexp syntax */
440    Rbegbuf,                        /* beginning of buffer */
441    Rendbuf,                        /* end of buffer */
442    Rwordchar,                      /* word character */
443    Rnotwordchar,                   /* not word character */
444    Rwordbeg,                       /* beginning of word */
445    Rwordend,                       /* end of word */
446    Rwordbound,                     /* word bound */
447    Rnotwordbound,                  /* not word bound */
448    Rnum_ops
449 };
450 
451 static int re_compile_initialized = 0;
452 static int regexp_syntax = RE_SYNTAX_EGREP;
453 int re_syntax = RE_SYNTAX_EGREP;   /* Exported copy of regexp_syntax */
454 static unsigned char plain_ops[256];
455 static unsigned char quoted_ops[256];
456 static unsigned char precedences[Rnum_ops];
457 static int regexp_context_indep_ops;
458 static int regexp_ansi_sequences;
459 
460 #define NUM_LEVELS  5              /* number of precedence levels in use */
461 #define MAX_NESTING 100            /* max nesting level of operators */
462 
463 #define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
464 
465 unsigned char re_syntax_table[256];
466 
ReCompileInitialize(void)467 void ReCompileInitialize(void)
468 {
469    int a;
470 
471    static int syntax_table_inited = 0;
472 
473    if (!syntax_table_inited) {
474       syntax_table_inited = 1;
475       memset(re_syntax_table, 0, 256);
476       for (a = 'a'; a <= 'z'; a++)
477          re_syntax_table[a] = Sword;
478       for (a = 'A'; a <= 'Z'; a++)
479          re_syntax_table[a] = Sword;
480       for (a = '0'; a <= '9'; a++)
481          re_syntax_table[a] = Sword | Sdigit | Shexdigit;
482       for (a = '0'; a <= '7'; a++)
483          re_syntax_table[a] |= Soctaldigit;
484       for (a = 'A'; a <= 'F'; a++)
485          re_syntax_table[a] |= Shexdigit;
486       for (a = 'a'; a <= 'f'; a++)
487          re_syntax_table[a] |= Shexdigit;
488       re_syntax_table[(int)'_'] = Sword;
489       for (a = 9; a <= 13; a++)
490          re_syntax_table[a] = Swhitespace;
491       re_syntax_table[(int)' '] = Swhitespace;
492    }
493    re_compile_initialized = 1;
494    for (a = 0; a < 256; a++) {
495       plain_ops[a] = Rnormal;
496       quoted_ops[a] = Rnormal;
497    }
498    for (a = '0'; a <= '9'; a++)
499       quoted_ops[a] = Rmemory;
500    plain_ops[(int)'\134'] = Rquote;
501    if (regexp_syntax & RE_NO_BK_PARENS) {
502       plain_ops[(int)'('] = Ropenpar;
503       plain_ops[(int)')'] = Rclosepar;
504    } else {
505       quoted_ops[(int)'('] = Ropenpar;
506       quoted_ops[(int)')'] = Rclosepar;
507    }
508    if (regexp_syntax & RE_NO_BK_VBAR) {
509       plain_ops[(int)'\174'] = Ror;
510    } else {
511       quoted_ops[(int)'\174'] = Ror;
512    }
513    plain_ops[(int)'*'] = Rstar;
514    if (regexp_syntax & RE_BK_PLUS_QM) {
515       quoted_ops[(int)'+'] = Rplus;
516       quoted_ops[(int)'?'] = Roptional;
517    } else {
518       plain_ops[(int)'+'] = Rplus;
519       plain_ops[(int)'?'] = Roptional;
520    }
521    if (regexp_syntax & RE_NEWLINE_OR) {
522       plain_ops[(int)'\n'] = Ror;
523    }
524    plain_ops[(int)'\133'] = Ropenset;
525    plain_ops[(int)'\136'] = Rbol;
526    plain_ops[(int)'$'] = Reol;
527    plain_ops[(int)'.'] = Ranychar;
528    if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {
529       quoted_ops[(int)'w'] = Rwordchar;
530       quoted_ops[(int)'W'] = Rnotwordchar;
531       quoted_ops[(int)'<'] = Rwordbeg;
532       quoted_ops[(int)'>'] = Rwordend;
533       quoted_ops[(int)'b'] = Rwordbound;
534       quoted_ops[(int)'B'] = Rnotwordbound;
535       quoted_ops[(int)'`'] = Rbegbuf;
536       quoted_ops[(int)'\''] = Rendbuf;
537    }
538    if (regexp_syntax & RE_ANSI_HEX) {
539       quoted_ops[(int)'v'] = Rextended_memory;
540    }
541    for (a = 0; a < Rnum_ops; a++) {
542       precedences[a] = 4;
543    }
544    if (regexp_syntax & RE_TIGHT_VBAR) {
545       precedences[Ror] = 3;
546       precedences[Rbol] = 2;
547       precedences[Reol] = 2;
548    } else {
549       precedences[Ror] = 2;
550       precedences[Rbol] = 3;
551       precedences[Reol] = 3;
552    }
553    precedences[Rclosepar] = 1;
554    precedences[Rend] = 0;
555    regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
556    regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
557 }
558 
ReSetSyntax(int syntax)559 int ReSetSyntax(int syntax) {
560    int ret;
561 
562    ret = regexp_syntax;
563    regexp_syntax = syntax;
564    re_syntax = syntax;          /* Exported copy */
565    ReCompileInitialize();
566    return ret;
567 }
568 
HexCharToDecimal(int ch)569 static int HexCharToDecimal(int ch) {
570    if (ch >= '0' && ch <= '9')
571       return ch - '0';
572    if (ch >= 'a' && ch <= 'f')
573       return ch - 'a' + 10;
574    if (ch >= 'A' && ch <= 'F')
575       return ch - 'A' + 10;
576    return 16;
577 }
578 
re_compile_fastmap_aux(regex_t * bufp,unsigned char * code,int pos,unsigned char * visited,unsigned char * can_be_null,unsigned char * fastmap)579 static void re_compile_fastmap_aux(regex_t * bufp, unsigned char *code, int pos,
580                                    unsigned char *visited,
581                                    unsigned char *can_be_null,
582                                    unsigned char *fastmap)
583 {
584    int a;
585    int b;
586    int syntaxcode;
587 
588    if (visited[pos])
589       return;                   /* we have already been here */
590    visited[pos] = 1;
591    for (;;) {
592       switch (code[pos++]) {
593       case Cend:
594          *can_be_null = 1;
595          return;
596       case Cbol:
597       case Cbegbuf:
598       case Cendbuf:
599       case Cwordbeg:
600       case Cwordend:
601       case Cwordbound:
602       case Cnotwordbound:
603          for (a = 0; a < 256; a++)
604             fastmap[a] = 1;
605          break;
606       case Csyntaxspec:
607          syntaxcode = code[pos++];
608          for (a = 0; a < 256; a++)
609             if (SYNTAX(a) & syntaxcode)
610                fastmap[a] = 1;
611          return;
612       case Cnotsyntaxspec:
613          syntaxcode = code[pos++];
614          for (a = 0; a < 256; a++)
615             if (!(SYNTAX(a) & syntaxcode))
616                fastmap[a] = 1;
617          return;
618       case Ceol:
619          fastmap[(int)'\n'] = 1;
620          if (*can_be_null == 0)
621             *can_be_null = 2;      /* can match null, but only at end of buffer */
622          return;
623       case Cset:
624          for (a = 0; a < 256 / 8; a++)
625             if (code[pos + a] != 0)
626                for (b = 0; b < 8; b++)
627                   if (code[pos + a] & (1 << b))
628                      fastmap[(a << 3) + b] = 1;
629          pos += 256 / 8;
630          return;
631       case Cexact:
632          fastmap[(unsigned char)code[pos]] = 1;
633          return;
634       case Canychar:
635          for (a = 0; a < 256; a++)
636             if (a != '\n')
637                fastmap[a] = 1;
638          return;
639       case Cstart_memory:
640       case Cend_memory:
641          pos++;
642          break;
643       case Cmatch_memory:
644          for (a = 0; a < 256; a++)
645             fastmap[a] = 1;
646          *can_be_null = 1;
647          return;
648       case Cjump:
649       case Cdummy_failure_jump:
650       case Cupdate_failure_jump:
651       case Cstar_jump:
652          a = (unsigned char)code[pos++];
653          a |= (unsigned char)code[pos++] << 8;
654          pos += (int)SHORT(a);
655          if (visited[pos]) {
656             /* argh... the regexp contains empty loops.  This is not
657                good, as this may cause a failure stack overflow when
658                matching.  Oh well. */
659             /* this path leads nowhere; pursue other paths. */
660             return;
661          }
662          visited[pos] = 1;
663          break;
664       case Cfailure_jump:
665          a = (unsigned char)code[pos++];
666          a |= (unsigned char)code[pos++] << 8;
667          a = pos + (int)SHORT(a);
668          re_compile_fastmap_aux(bufp, code, a, visited, can_be_null, fastmap);
669          break;
670       case Crepeat1:
671          pos += 2;
672          break;
673       default:
674          set_error("Unknown regex opcode: memory corrupted?");
675          return;
676       }
677    }
678 }
679 
re_do_compile_fastmap(regex_t * bufp,unsigned char * buffer,int used,int pos,unsigned char * can_be_null,unsigned char * fastmap)680 static int re_do_compile_fastmap(regex_t * bufp, unsigned char *buffer, int used,
681                                  int pos, unsigned char *can_be_null,
682                                  unsigned char *fastmap)
683 {
684    unsigned char small_visited[512], *visited;
685 
686    if (used <= (int)sizeof(small_visited))
687       visited = small_visited;
688    else {
689       visited = (unsigned char *)malloc(used);
690       if (!visited)
691          return 0;
692    }
693    *can_be_null = 0;
694    memset(fastmap, 0, 256);
695    memset(visited, 0, used);
696    re_compile_fastmap_aux(bufp, buffer, pos, visited, can_be_null, fastmap);
697    if (visited != small_visited)
698       free(visited);
699    return 1;
700 }
701 
ReCompileFastmap(regex_t * bufp)702 void ReCompileFastmap(regex_t * bufp)
703 {
704    if (!bufp->fastmap || bufp->fastmap_accurate)
705       return;
706 // assert(bufp->used > 0);
707    if (!re_do_compile_fastmap(bufp, bufp->buffer,
708                               bufp->used, 0, &bufp->can_be_null, bufp->fastmap))
709       return;
710    if (got_error)
711       return;
712    if (bufp->buffer[0] == Cbol)
713       bufp->anchor = 1;            /* begline */
714    else if (bufp->buffer[0] == Cbegbuf)
715       bufp->anchor = 2;            /* begbuf */
716    else
717       bufp->anchor = 0;            /* none */
718    bufp->fastmap_accurate = 1;
719 }
720 
721 /*
722  * star is coded as:
723  * 1: failure_jump 2
724  *    ... code for operand of star
725  *    star_jump 1
726  * 2: ... code after star
727  *
728  * We change the star_jump to update_failure_jump if we can determine
729  * that it is safe to do so; otherwise we change it to an ordinary
730  * jump.
731  *
732  * plus is coded as
733  *
734  *    jump 2
735  * 1: failure_jump 3
736  * 2: ... code for operand of plus
737  *    star_jump 1
738  * 3: ... code after plus
739  *
740  * For star_jump considerations this is processed identically to star.
741  *
742  */
743 
ReOptimizeStarJump(regex_t * bufp,unsigned char * code)744 static int ReOptimizeStarJump(regex_t * bufp, unsigned char *code)
745 {
746    unsigned char map[256];
747    unsigned char can_be_null;
748    unsigned char *p1;
749    unsigned char *p2;
750    unsigned char ch;
751    int a;
752    int b;
753    int num_instructions = 0;
754 
755    a = (unsigned char)*code++;
756    a |= (unsigned char)*code++ << 8;
757    a = (int)SHORT(a);
758 
759    p1 = code + a + 3;              /* skip the failure_jump */
760    /* Check that the jump is within the pattern */
761    if (p1 < bufp->buffer || bufp->buffer + bufp->used < p1) {
762       set_error("Regex VM jump out of bounds (failure_jump opt)");
763       return 0;
764    }
765 // assert(p1[-3] == Cfailure_jump);
766    p2 = code;
767    /* p1 points inside loop, p2 points to after loop */
768    if (!re_do_compile_fastmap(bufp, bufp->buffer, bufp->used,
769                               (int)(p2 - bufp->buffer), &can_be_null, map))
770       goto make_normal_jump;
771 
772    /* If we might introduce a new update point inside the
773     * loop, we can't optimize because then update_jump would
774     * update a wrong failure point.  Thus we have to be
775     * quite careful here.
776     */
777 
778    /* loop until we find something that consumes a character */
779    for (;;) {
780       num_instructions++;
781       switch (*p1++) {
782       case Cbol:
783       case Ceol:
784       case Cbegbuf:
785       case Cendbuf:
786       case Cwordbeg:
787       case Cwordend:
788       case Cwordbound:
789       case Cnotwordbound:
790          continue;
791       case Cstart_memory:
792       case Cend_memory:
793          p1++;
794          continue;
795       case Cexact:
796          ch = (unsigned char)*p1++;
797          if (map[(int)ch])
798             goto make_normal_jump;
799          break;
800       case Canychar:
801          for (b = 0; b < 256; b++)
802             if (b != '\n' && map[b])
803                goto make_normal_jump;
804          break;
805       case Cset:
806          for (b = 0; b < 256; b++)
807             if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
808                goto make_normal_jump;
809          p1 += 256 / 8;
810          break;
811       default:
812          goto make_normal_jump;
813       }
814       break;
815    }
816    /* now we know that we can't backtrack. */
817    while (p1 != p2 - 3) {
818       num_instructions++;
819       switch (*p1++) {
820       case Cend:
821          return 0;
822       case Cbol:
823       case Ceol:
824       case Canychar:
825       case Cbegbuf:
826       case Cendbuf:
827       case Cwordbeg:
828       case Cwordend:
829       case Cwordbound:
830       case Cnotwordbound:
831          break;
832       case Cset:
833          p1 += 256 / 8;
834          break;
835       case Cexact:
836       case Cstart_memory:
837       case Cend_memory:
838       case Cmatch_memory:
839       case Csyntaxspec:
840       case Cnotsyntaxspec:
841          p1++;
842          break;
843       case Cjump:
844       case Cstar_jump:
845       case Cfailure_jump:
846       case Cupdate_failure_jump:
847       case Cdummy_failure_jump:
848          goto make_normal_jump;
849       default:
850          return 0;
851       }
852    }
853 
854    /* make_update_jump: */
855    code -= 3;
856    a += 3;                         /* jump to after the Cfailure_jump */
857    code[0] = Cupdate_failure_jump;
858    code[1] = a & 0xff;
859    code[2] = a >> 8;
860    if (num_instructions > 1)
861       return 1;
862 // assert(num_instructions == 1);
863    /* if the only instruction matches a single character, we can do
864     * better */
865    p1 = code + 3 + a;              /* start of sole instruction */
866    if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
867        *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
868       code[0] = Crepeat1;
869    return 1;
870 
871  make_normal_jump:
872    code -= 3;
873    *code = Cjump;
874    return 1;
875 }
876 
ReOptimize(regex_t * bufp)877 static int ReOptimize(regex_t * bufp)
878 {
879    unsigned char *code;
880 
881    code = bufp->buffer;
882 
883    while (1) {
884       switch (*code++) {
885       case Cend:
886          return 1;
887       case Canychar:
888       case Cbol:
889       case Ceol:
890       case Cbegbuf:
891       case Cendbuf:
892       case Cwordbeg:
893       case Cwordend:
894       case Cwordbound:
895       case Cnotwordbound:
896          break;
897       case Cset:
898          code += 256 / 8;
899          break;
900       case Cexact:
901       case Cstart_memory:
902       case Cend_memory:
903       case Cmatch_memory:
904       case Csyntaxspec:
905       case Cnotsyntaxspec:
906          code++;
907          break;
908       case Cstar_jump:
909          if (!ReOptimizeStarJump(bufp, code)) {
910             return 0;
911          }
912          /* fall through */
913       case Cupdate_failure_jump:
914       case Cjump:
915       case Cdummy_failure_jump:
916       case Cfailure_jump:
917       case Crepeat1:
918          code += 2;
919          break;
920       default:
921          return 0;
922       }
923    }
924 }
925 
926 #define NEXTCHAR(var) \
927 { \
928         if (pos >= size) \
929                 goto ends_prematurely; \
930         (var) = regex[pos]; \
931         pos++; \
932 }
933 
934 #define ALLOC(amount) \
935 { \
936           if (pattern_offset+(amount) > alloc) \
937           { \
938                   alloc += 256 + (amount); \
939                   pattern = (unsigned char *)realloc(pattern, alloc); \
940                   if (!pattern) \
941                           goto out_of_memory; \
942           } \
943 }
944 
945 #define STORE(ch) pattern[pattern_offset++] = (ch)
946 
947 #define CURRENT_LEVEL_START (starts[starts_base + current_level])
948 
949 #define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
950 
951 #define PUSH_LEVEL_STARTS \
952 if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
953         starts_base += NUM_LEVELS; \
954 else \
955         goto too_complex \
956 
957 #define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
958 
959 #define PUT_ADDR(offset,addr) \
960 { \
961         int disp = (addr) - (offset) - 2; \
962         pattern[(offset)] = disp & 0xff; \
963         pattern[(offset)+1] = (disp>>8) & 0xff; \
964 }
965 
966 #define INSERT_JUMP(pos,type,addr) \
967 { \
968         int a, p = (pos), t = (type), ad = (addr); \
969         for (a = pattern_offset - 1; a >= p; a--) \
970                 pattern[a + 3] = pattern[a]; \
971         pattern[p] = t; \
972         PUT_ADDR(p+1,ad); \
973         pattern_offset += 3; \
974 }
975 
976 #define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
977 
978 #define SET_FIELDS \
979 { \
980         bufp->allocated = alloc; \
981         bufp->buffer = pattern; \
982         bufp->used = pattern_offset; \
983 }
984 
985 #define GETHEX(var) \
986 { \
987         unsigned char gethex_ch, gethex_value; \
988         NEXTCHAR(gethex_ch); \
989         gethex_value = HexCharToDecimal(gethex_ch); \
990         if (gethex_value == 16) \
991                 goto hex_error; \
992         NEXTCHAR(gethex_ch); \
993         gethex_ch = HexCharToDecimal(gethex_ch); \
994         if (gethex_ch == 16) \
995                 goto hex_error; \
996         (var) = gethex_value * 16 + gethex_ch; \
997 }
998 
999 #define ANSI_TRANSLATE(ch) \
1000 { \
1001    switch (ch) \
1002    { \
1003    case 'a': \
1004    case 'A': \
1005    { \
1006            ch = 7; /* audible bell */ \
1007            break; \
1008    } \
1009    case 'b': \
1010    case 'B': \
1011    { \
1012            ch = 8; /* backspace */ \
1013            break; \
1014    } \
1015    case 'f': \
1016    case 'F': \
1017    { \
1018            ch = 12; /* form feed */ \
1019            break; \
1020    } \
1021    case 'n': \
1022    case 'N': \
1023    { \
1024            ch = 10; /* line feed */ \
1025            break; \
1026    } \
1027    case 'r': \
1028    case 'R': \
1029    { \
1030            ch = 13; /* carriage return */ \
1031            break; \
1032    } \
1033    case 't': \
1034    case 'T': \
1035    { \
1036          ch = 9; /* tab */ \
1037          break; \
1038    } \
1039    case 'v': \
1040    case 'V': \
1041    { \
1042            ch = 11; /* vertical tab */ \
1043            break; \
1044    } \
1045    case 'x': /* hex code */ \
1046    case 'X': \
1047    { \
1048            GETHEX(ch); \
1049            break; \
1050    } \
1051    default: \
1052    { \
1053            /* other characters passed through */ \
1054            if (translate) \
1055                    ch = translate[(unsigned char)ch]; \
1056            break; \
1057    } \
1058    } \
1059 }
1060 
re_compile_pattern(regex_t * bufp,unsigned char * regex)1061 const char *re_compile_pattern(regex_t * bufp, unsigned char *regex)
1062 {
1063    int a;
1064    int pos;
1065    int op;
1066    int current_level;
1067    int level;
1068    int opcode;
1069    int pattern_offset = 0, alloc;
1070    int starts[NUM_LEVELS * MAX_NESTING];
1071    int starts_base;
1072    int future_jumps[MAX_NESTING];
1073    int num_jumps;
1074    unsigned char ch = '\0';
1075    unsigned char *pattern;
1076    unsigned char *translate;
1077    int next_register;
1078    int paren_depth;
1079    int num_open_registers;
1080    int open_registers[RE_NREGS];
1081    int beginning_context;
1082    int size = strlen((char *)regex);
1083 
1084    if (!re_compile_initialized)
1085       ReCompileInitialize();
1086    bufp->used = 0;
1087    bufp->fastmap_accurate = 0;
1088    bufp->uses_registers = 1;
1089    bufp->num_registers = 1;
1090    translate = bufp->translate;
1091    pattern = bufp->buffer;
1092    alloc = bufp->allocated;
1093    if (alloc == 0 || pattern == NULL) {
1094       alloc = 256;
1095       bufp->buffer = pattern = (unsigned char *)malloc(alloc);
1096       if (!pattern)
1097          goto out_of_memory;
1098    }
1099    pattern_offset = 0;
1100    starts_base = 0;
1101    num_jumps = 0;
1102    current_level = 0;
1103    SET_LEVEL_START;
1104    num_open_registers = 0;
1105    next_register = 1;
1106    paren_depth = 0;
1107    beginning_context = 1;
1108    op = -1;
1109    /* we use Rend dummy to ensure that pending jumps are updated
1110       (due to low priority of Rend) before exiting the loop. */
1111    pos = 0;
1112    while (op != Rend) {
1113       if (pos >= size)
1114          op = Rend;
1115       else {
1116          NEXTCHAR(ch);
1117          if (translate)
1118             ch = translate[(unsigned char)ch];
1119          op = plain_ops[(unsigned char)ch];
1120          if (op == Rquote) {
1121             NEXTCHAR(ch);
1122             op = quoted_ops[(unsigned char)ch];
1123             if (op == Rnormal && regexp_ansi_sequences)
1124                ANSI_TRANSLATE(ch);
1125          }
1126       }
1127       level = precedences[op];
1128       /* printf("ch='%c' op=%d level=%d current_level=%d
1129          curlevstart=%d\n", ch, op, level, current_level,
1130          CURRENT_LEVEL_START); */
1131       if (level > current_level) {
1132          for (current_level++; current_level < level; current_level++)
1133             SET_LEVEL_START;
1134          SET_LEVEL_START;
1135       } else if (level < current_level) {
1136          current_level = level;
1137          for (; num_jumps > 0 &&
1138               future_jumps[num_jumps - 1] >= CURRENT_LEVEL_START; num_jumps--)
1139             PUT_ADDR(future_jumps[num_jumps - 1], pattern_offset);
1140       }
1141       switch (op) {
1142       case Rend:
1143          break;
1144       case Rnormal:
1145        normal_char:
1146          opcode = Cexact;
1147        store_opcode_and_arg:       /* opcode & ch must be set */
1148          SET_LEVEL_START;
1149          ALLOC(2);
1150          STORE(opcode);
1151          STORE(ch);
1152          break;
1153       case Ranychar:
1154          opcode = Canychar;
1155        store_opcode:
1156          SET_LEVEL_START;
1157          ALLOC(1);
1158          STORE(opcode);
1159          break;
1160       case Rquote:
1161          set_error("Rquote");
1162          /* NOTREACHED */
1163          break;
1164       case Rbol:
1165          if (!beginning_context) {
1166             if (regexp_context_indep_ops)
1167                goto op_error;
1168             else
1169                goto normal_char;
1170          }
1171          opcode = Cbol;
1172          goto store_opcode;
1173       case Reol:
1174          if (!((pos >= size) ||
1175                ((regexp_syntax & RE_NO_BK_VBAR) ?
1176                 (regex[pos] == '\174') :
1177                 (pos + 1 < size && regex[pos] == '\134' &&
1178                  regex[pos + 1] == '\174')) ||
1179                ((regexp_syntax & RE_NO_BK_PARENS) ?
1180                 (regex[pos] == ')') :
1181                 (pos + 1 < size && regex[pos] == '\134' &&
1182                  regex[pos + 1] == ')')))) {
1183             if (regexp_context_indep_ops)
1184                goto op_error;
1185             else
1186                goto normal_char;
1187          }
1188          opcode = Ceol;
1189          goto store_opcode;
1190          /* NOTREACHED */
1191          break;
1192       case Roptional:
1193          if (beginning_context) {
1194             if (regexp_context_indep_ops)
1195                goto op_error;
1196             else
1197                goto normal_char;
1198          }
1199          if (CURRENT_LEVEL_START == pattern_offset)
1200             break;                 /* ignore empty patterns for ? */
1201          ALLOC(3);
1202          INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3);
1203          break;
1204       case Rstar:
1205       case Rplus:
1206          if (beginning_context) {
1207             if (regexp_context_indep_ops)
1208                goto op_error;
1209             else
1210                goto normal_char;
1211          }
1212          if (CURRENT_LEVEL_START == pattern_offset)
1213             break;                 /* ignore empty patterns for + and * */
1214          ALLOC(9);
1215          INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1216          INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
1217          if (op == Rplus)          /* jump over initial failure_jump */
1218             INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
1219                         CURRENT_LEVEL_START + 6);
1220          break;
1221       case Ror:
1222          ALLOC(6);
1223          INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6);
1224          if (num_jumps >= MAX_NESTING)
1225             goto too_complex;
1226          STORE(Cjump);
1227          future_jumps[num_jumps++] = pattern_offset;
1228          STORE(0);
1229          STORE(0);
1230          SET_LEVEL_START;
1231          break;
1232       case Ropenpar:
1233          {
1234             SET_LEVEL_START;
1235             if (next_register < RE_NREGS) {
1236                bufp->uses_registers = 1;
1237                ALLOC(2);
1238                STORE(Cstart_memory);
1239                STORE(next_register);
1240                open_registers[num_open_registers++] = next_register;
1241                bufp->num_registers++;
1242                next_register++;
1243             }
1244             paren_depth++;
1245             PUSH_LEVEL_STARTS;
1246             current_level = 0;
1247             SET_LEVEL_START;
1248             break;
1249          }
1250       case Rclosepar:
1251          if (paren_depth <= 0)
1252             goto parenthesIsError;
1253          POP_LEVEL_STARTS;
1254          current_level = precedences[Ropenpar];
1255          paren_depth--;
1256          if (paren_depth < num_open_registers) {
1257             bufp->uses_registers = 1;
1258             ALLOC(2);
1259             STORE(Cend_memory);
1260             num_open_registers--;
1261             STORE(open_registers[num_open_registers]);
1262          }
1263          break;
1264       case Rmemory:
1265          if (ch == '0')
1266             goto bad_match_register;
1267          if (!(ch >= '0' && ch <= '9')) {
1268             goto bad_match_register;
1269          }
1270          bufp->uses_registers = 1;
1271          opcode = Cmatch_memory;
1272          ch -= '0';
1273          goto store_opcode_and_arg;
1274       case Rextended_memory:
1275          NEXTCHAR(ch);
1276          if (ch < '0' || ch > '9')
1277             goto bad_match_register;
1278          NEXTCHAR(a);
1279          if (a < '0' || a > '9')
1280             goto bad_match_register;
1281          ch = 10 * (a - '0') + ch - '0';
1282          if (ch == 0 || ch >= RE_NREGS)
1283             goto bad_match_register;
1284          bufp->uses_registers = 1;
1285          opcode = Cmatch_memory;
1286          goto store_opcode_and_arg;
1287       case Ropenset:
1288          {
1289             int complement;
1290             int prev;
1291             int offset;
1292             int range;
1293             int firstchar;
1294 
1295             SET_LEVEL_START;
1296             ALLOC(1 + 256 / 8);
1297             STORE(Cset);
1298             offset = pattern_offset;
1299             for (a = 0; a < 256 / 8; a++)
1300                STORE(0);
1301             NEXTCHAR(ch);
1302             if (translate)
1303                ch = translate[(unsigned char)ch];
1304             if (ch == '\136') {
1305                complement = 1;
1306                NEXTCHAR(ch);
1307                if (translate)
1308                   ch = translate[(unsigned char)ch];
1309             } else
1310                complement = 0;
1311             prev = -1;
1312             range = 0;
1313             firstchar = 1;
1314             while (ch != '\135' || firstchar) {
1315                firstchar = 0;
1316                if (regexp_ansi_sequences && ch == '\134') {
1317                   NEXTCHAR(ch);
1318                   ANSI_TRANSLATE(ch);
1319                }
1320                if (range) {
1321                   for (a = prev; a <= (int)ch; a++)
1322                      SETBIT(pattern, offset, a);
1323                   prev = -1;
1324                   range = 0;
1325                } else if (prev != -1 && ch == '-')
1326                   range = 1;
1327                else {
1328                   SETBIT(pattern, offset, ch);
1329                   prev = ch;
1330                }
1331                NEXTCHAR(ch);
1332                if (translate)
1333                   ch = translate[(unsigned char)ch];
1334             }
1335             if (range)
1336                SETBIT(pattern, offset, '-');
1337             if (complement) {
1338                for (a = 0; a < 256 / 8; a++)
1339                   pattern[offset + a] ^= 0xff;
1340             }
1341             break;
1342          }
1343       case Rbegbuf:
1344          {
1345             opcode = Cbegbuf;
1346             goto store_opcode;
1347          }
1348       case Rendbuf:
1349          {
1350             opcode = Cendbuf;
1351             goto store_opcode;
1352          }
1353       case Rwordchar:
1354          {
1355             opcode = Csyntaxspec;
1356             ch = Sword;
1357             goto store_opcode_and_arg;
1358          }
1359       case Rnotwordchar:
1360          {
1361             opcode = Cnotsyntaxspec;
1362             ch = Sword;
1363             goto store_opcode_and_arg;
1364          }
1365       case Rwordbeg:
1366          {
1367             opcode = Cwordbeg;
1368             goto store_opcode;
1369          }
1370       case Rwordend:
1371          {
1372             opcode = Cwordend;
1373             goto store_opcode;
1374          }
1375       case Rwordbound:
1376          {
1377             opcode = Cwordbound;
1378             goto store_opcode;
1379          }
1380       case Rnotwordbound:
1381          {
1382             opcode = Cnotwordbound;
1383             goto store_opcode;
1384          }
1385       default:
1386          {
1387             abort();
1388          }
1389       }
1390       beginning_context = (op == Ropenpar || op == Ror);
1391    }
1392    if (starts_base != 0)
1393       goto parenthesIsError;
1394 // assert(num_jumps == 0);
1395    ALLOC(1);
1396    STORE(Cend);
1397    SET_FIELDS;
1398    if (!ReOptimize(bufp))
1399       return "Optimization error";
1400    return NULL;
1401 
1402  op_error:
1403    SET_FIELDS;
1404    return "Badly placed special character";
1405 
1406  bad_match_register:
1407    SET_FIELDS;
1408    return "Bad match register number";
1409 
1410  hex_error:
1411    SET_FIELDS;
1412    return "Bad hexadecimal number";
1413 
1414  parenthesIsError:
1415    SET_FIELDS;
1416    return "Badly placed parenthesis";
1417 
1418  out_of_memory:
1419    SET_FIELDS;
1420    return "Out of memory";
1421 
1422  ends_prematurely:
1423    SET_FIELDS;
1424    return "Regular expression ends prematurely";
1425 
1426  too_complex:
1427    SET_FIELDS;
1428    return "Regular expression too complex";
1429 }
1430 
1431 #undef CHARAT
1432 #undef NEXTCHAR
1433 #undef GETHEX
1434 #undef ALLOC
1435 #undef STORE
1436 #undef CURRENT_LEVEL_START
1437 #undef SET_LEVEL_START
1438 #undef PUSH_LEVEL_STARTS
1439 #undef POP_LEVEL_STARTS
1440 #undef PUT_ADDR
1441 #undef INSERT_JUMP
1442 #undef SETBIT
1443 #undef SET_FIELDS
1444 
1445 #define PREFETCH if (text == textend) goto fail
1446 
1447 #define NEXTCHAR(var) \
1448 PREFETCH; \
1449 var = (unsigned char)*text++; \
1450 if (translate) \
1451         var = translate[var]
1452 
regcomp(regex_t * bufp,const char * regex,int cflags)1453 int regcomp(regex_t * bufp, const char *regex, int cflags)
1454 {
1455    memset(bufp, 0, sizeof(regex_t));
1456    bufp->cflags = cflags;
1457    if (bufp->cflags & REG_ICASE) {
1458       char *p, *lcase = bstrdup(regex);
1459       for( p = lcase; *p ; p++) {
1460          *p = tolower(*p);
1461       }
1462       re_compile_pattern(bufp, (unsigned char *)lcase);
1463       bfree(lcase);
1464    } else {
1465       re_compile_pattern(bufp, (unsigned char *)regex);
1466    }
1467    if (got_error) {
1468       return -1;
1469    }
1470    return 0;
1471 }
1472 
re_registers_to_regmatch(regexp_registers_t old_regs,regmatch_t pmatch[],size_t nmatch)1473 void re_registers_to_regmatch(regexp_registers_t old_regs,
1474                               regmatch_t pmatch[],
1475                               size_t nmatch)
1476 {
1477    if (!(nmatch == 0 && pmatch == NULL)) {
1478       size_t i = 0;
1479 
1480       /*
1481        * We have to set the last entry to -1
1482        */
1483       nmatch = nmatch - 1;
1484       for (i = 0; (i < nmatch) && (old_regs->start[i] > -1) ; i++) {
1485          pmatch[i].rm_so = old_regs->start[i];
1486          pmatch[i].rm_eo = old_regs->end[i];
1487       }
1488 
1489       pmatch[i].rm_eo = pmatch[i].rm_so = -1;
1490    }
1491 }
1492 
regexec(regex_t * preg,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)1493 int regexec(regex_t *preg, const char *string, size_t nmatch,
1494             regmatch_t pmatch[], int eflags)
1495 {
1496    int status;
1497    int len = strlen(string);
1498    struct re_registers regs;
1499 
1500    status = ReSearch(preg, (unsigned char *)string, len, 0, len, &regs);
1501    if (status >= 0) {
1502       re_registers_to_regmatch(&regs, pmatch, nmatch);
1503    }
1504 
1505    /*
1506     * status is the start position in the string base 0 where
1507     * the pattern was found or negative if not found.
1508     */
1509    return status < 0 ? -1 : 0;
1510 }
1511 
regerror(int errcode,regex_t * preg,char * errbuf,size_t errbuf_size)1512 size_t regerror(int errcode, regex_t * preg, char *errbuf, size_t errbuf_size)
1513 {
1514    bstrncpy(errbuf, preg->errmsg, errbuf_size);
1515    return 0;
1516 }
1517 
regfree(regex_t * preg)1518 void regfree(regex_t * preg)
1519 {
1520    if (preg->lcase) {
1521       FreePoolMemory(preg->lcase);
1522       preg->lcase = NULL;
1523    }
1524    if (preg->buffer) {
1525       free(preg->buffer);
1526       preg->buffer = NULL;
1527    }
1528 }
1529 
ReMatch(regex_t * bufp,unsigned char * string,int size,int pos,regexp_registers_t old_regs)1530 int ReMatch(regex_t * bufp, unsigned char *string, int size, int pos,
1531              regexp_registers_t old_regs)
1532 {
1533    unsigned char *code;
1534    unsigned char *translate;
1535    unsigned char *text;
1536    unsigned char *textstart;
1537    unsigned char *textend;
1538    int a;
1539    int b;
1540    int ch;
1541    int reg;
1542    int match_end;
1543    unsigned char *regstart;
1544    unsigned char *regend;
1545    int regsize;
1546    match_state state;
1547 
1548 // assert(pos >= 0 && size >= 0);
1549 // assert(pos <= size);
1550 
1551    text = string + pos;
1552    textstart = string;
1553    textend = string + size;
1554 
1555    code = bufp->buffer;
1556 
1557    translate = bufp->translate;
1558 
1559    NEW_STATE(state, bufp->num_registers);
1560 
1561  continue_matching:
1562    switch (*code++) {
1563    case Cend:
1564       {
1565          match_end = text - textstart;
1566          if (old_regs) {
1567             old_regs->start[0] = pos;
1568             old_regs->end[0] = match_end;
1569             if (!bufp->uses_registers) {
1570                for (a = 1; a < RE_NREGS; a++) {
1571                   old_regs->start[a] = -1;
1572                   old_regs->end[a] = -1;
1573                }
1574             } else {
1575                for (a = 1; a < bufp->num_registers; a++) {
1576                   if ((GET_REG_START(state, a) == NULL) ||
1577                       (GET_REG_END(state, a) == NULL)) {
1578                      old_regs->start[a] = -1;
1579                      old_regs->end[a] = -1;
1580                      continue;
1581                   }
1582                   old_regs->start[a] = GET_REG_START(state, a) - textstart;
1583                   old_regs->end[a] = GET_REG_END(state, a) - textstart;
1584                }
1585                for (; a < RE_NREGS; a++) {
1586                   old_regs->start[a] = -1;
1587                   old_regs->end[a] = -1;
1588                }
1589             }
1590          }
1591          FREE_STATE(state);
1592          return match_end - pos;
1593       }
1594    case Cbol:
1595       {
1596          if (text == textstart || text[-1] == '\n')
1597             goto continue_matching;
1598          goto fail;
1599       }
1600    case Ceol:
1601       {
1602          if (text == textend || *text == '\n')
1603             goto continue_matching;
1604          goto fail;
1605       }
1606    case Cset:
1607       {
1608          NEXTCHAR(ch);
1609          if (code[ch / 8] & (1 << (ch & 7))) {
1610             code += 256 / 8;
1611             goto continue_matching;
1612          }
1613          goto fail;
1614       }
1615    case Cexact:
1616       {
1617          NEXTCHAR(ch);
1618          if (ch != (unsigned char)*code++)
1619             goto fail;
1620          goto continue_matching;
1621       }
1622    case Canychar:
1623       {
1624          NEXTCHAR(ch);
1625          if (ch == '\n')
1626             goto fail;
1627          goto continue_matching;
1628       }
1629    case Cstart_memory:
1630       {
1631          reg = *code++;
1632          SET_REG_START(state, reg, text, goto error);
1633          goto continue_matching;
1634       }
1635    case Cend_memory:
1636       {
1637          reg = *code++;
1638          SET_REG_END(state, reg, text, goto error);
1639          goto continue_matching;
1640       }
1641    case Cmatch_memory:
1642       {
1643          reg = *code++;
1644          regstart = GET_REG_START(state, reg);
1645          regend = GET_REG_END(state, reg);
1646          if ((regstart == NULL) || (regend == NULL))
1647             goto fail;             /* or should we just match nothing? */
1648          regsize = regend - regstart;
1649 
1650          if (regsize > (textend - text))
1651             goto fail;
1652          if (translate) {
1653             for (; regstart < regend; regstart++, text++)
1654                if (translate[*regstart] != translate[*text])
1655                   goto fail;
1656          } else
1657             for (; regstart < regend; regstart++, text++)
1658                if (*regstart != *text)
1659                   goto fail;
1660          goto continue_matching;
1661       }
1662    case Cupdate_failure_jump:
1663       {
1664          UPDATE_FAILURE(state, text, goto error);
1665          /* fall to next case */
1666       }
1667       /* treat Cstar_jump just like Cjump if it hasn't been optimized */
1668    case Cstar_jump:
1669    case Cjump:
1670       {
1671          a = (unsigned char)*code++;
1672          a |= (unsigned char)*code++ << 8;
1673          code += (int)SHORT(a);
1674          if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1675             set_error("Regex VM jump out of bounds (Cjump)");
1676             FREE_STATE(state);
1677             return -2;
1678          }
1679          goto continue_matching;
1680       }
1681    case Cdummy_failure_jump:
1682       {
1683          unsigned char *failuredest;
1684 
1685          a = (unsigned char)*code++;
1686          a |= (unsigned char)*code++ << 8;
1687          a = (int)SHORT(a);
1688 //       assert(*code == Cfailure_jump);
1689          b = (unsigned char)code[1];
1690          b |= (unsigned char)code[2] << 8;
1691          failuredest = code + (int)SHORT(b) + 3;
1692          if (failuredest < bufp->buffer || bufp->buffer + bufp->used < failuredest) {
1693             set_error
1694                ("Regex VM jump out of bounds (Cdummy_failure_jump failuredest)");
1695             FREE_STATE(state);
1696             return -2;
1697          }
1698          PUSH_FAILURE(state, failuredest, NULL, goto error);
1699          code += a;
1700          if (code < bufp->buffer || bufp->buffer + bufp->used < code) {
1701             set_error("Regex VM jump out of bounds (Cdummy_failure_jump code)");
1702             FREE_STATE(state);
1703             return -2;
1704          }
1705          goto continue_matching;
1706       }
1707    case Cfailure_jump:
1708       {
1709          a = (unsigned char)*code++;
1710          a |= (unsigned char)*code++ << 8;
1711          a = (int)SHORT(a);
1712          if (code + a < bufp->buffer || bufp->buffer + bufp->used < code + a) {
1713             set_error("Regex VM jump out of bounds (Cfailure_jump)");
1714             FREE_STATE(state);
1715             return -2;
1716          }
1717          PUSH_FAILURE(state, code + a, text, goto error);
1718          goto continue_matching;
1719       }
1720    case Crepeat1:
1721       {
1722          unsigned char *pinst;
1723          a = (unsigned char)*code++;
1724          a |= (unsigned char)*code++ << 8;
1725          a = (int)SHORT(a);
1726          pinst = code + a;
1727          if (pinst < bufp->buffer || bufp->buffer + bufp->used < pinst) {
1728             set_error("Regex VM jump out of bounds (Crepeat1)");
1729             FREE_STATE(state);
1730             return -2;
1731          }
1732          /* pinst is sole instruction in loop, and it matches a
1733           * single character.  Since Crepeat1 was originally a
1734           * Cupdate_failure_jump, we also know that backtracking
1735           * is useless: so long as the single-character
1736           * expression matches, it must be used.  Also, in the
1737           * case of +, we've already matched one character, so +
1738           * can't fail: nothing here can cause a failure.  */
1739          switch (*pinst++) {
1740          case Cset:
1741             {
1742                if (translate) {
1743                   while (text < textend) {
1744                      ch = translate[(unsigned char)*text];
1745                      if (pinst[ch / 8] & (1 << (ch & 7)))
1746                         text++;
1747                      else
1748                         break;
1749                   }
1750                } else {
1751                   while (text < textend) {
1752                      ch = (unsigned char)*text;
1753                      if (pinst[ch / 8] & (1 << (ch & 7)))
1754                         text++;
1755                      else
1756                         break;
1757                   }
1758                }
1759                break;
1760             }
1761          case Cexact:
1762             {
1763                ch = (unsigned char)*pinst;
1764                if (translate) {
1765                   while (text < textend && translate[(unsigned char)*text] == ch)
1766                      text++;
1767                } else {
1768                   while (text < textend && (unsigned char)*text == ch)
1769                      text++;
1770                }
1771                break;
1772             }
1773          case Canychar:
1774             {
1775                while (text < textend && (unsigned char)*text != '\n')
1776                   text++;
1777                break;
1778             }
1779          case Csyntaxspec:
1780             {
1781                a = (unsigned char)*pinst;
1782                if (translate) {
1783                   while (text < textend && (SYNTAX(translate[*text]) & a))
1784                      text++;
1785                } else {
1786                   while (text < textend && (SYNTAX(*text) & a))
1787                      text++;
1788                }
1789                break;
1790             }
1791          case Cnotsyntaxspec:
1792             {
1793                a = (unsigned char)*pinst;
1794                if (translate) {
1795                   while (text < textend && !(SYNTAX(translate[*text]) & a))
1796                      text++;
1797                } else {
1798                   while (text < textend && !(SYNTAX(*text) & a))
1799                      text++;
1800                }
1801                break;
1802             }
1803          default:
1804             {
1805                FREE_STATE(state);
1806                set_error("Unknown regex opcode: memory corrupted?");
1807                return -2;
1808              /*NOTREACHED*/}
1809          }
1810          /* due to the funky way + and * are compiled, the top
1811           * failure- stack entry at this point is actually a
1812           * success entry -- update it & pop it */
1813          UPDATE_FAILURE(state, text, goto error);
1814          goto fail;                /* i.e., succeed <wink/sigh> */
1815       }
1816    case Cbegbuf:
1817       {
1818          if (text == textstart)
1819             goto continue_matching;
1820          goto fail;
1821       }
1822    case Cendbuf:
1823       {
1824          if (text == textend)
1825             goto continue_matching;
1826          goto fail;
1827       }
1828    case Cwordbeg:
1829       {
1830          if (text == textend)
1831             goto fail;
1832          if (!(SYNTAX(*text) & Sword))
1833             goto fail;
1834          if (text == textstart)
1835             goto continue_matching;
1836          if (!(SYNTAX(text[-1]) & Sword))
1837             goto continue_matching;
1838          goto fail;
1839       }
1840    case Cwordend:
1841       {
1842          if (text == textstart)
1843             goto fail;
1844          if (!(SYNTAX(text[-1]) & Sword))
1845             goto fail;
1846          if (text == textend)
1847             goto continue_matching;
1848          if (!(SYNTAX(*text) & Sword))
1849             goto continue_matching;
1850          goto fail;
1851       }
1852    case Cwordbound:
1853       {
1854          /* Note: as in gnu regexp, this also matches at the
1855           * beginning and end of buffer.  */
1856 
1857          if (text == textstart || text == textend)
1858             goto continue_matching;
1859          if ((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword))
1860             goto continue_matching;
1861          goto fail;
1862       }
1863    case Cnotwordbound:
1864       {
1865          /* Note: as in gnu regexp, this never matches at the
1866           * beginning and end of buffer.  */
1867          if (text == textstart || text == textend)
1868             goto fail;
1869          if (!((SYNTAX(text[-1]) & Sword) ^ (SYNTAX(*text) & Sword)))
1870             goto continue_matching;
1871          goto fail;
1872       }
1873    case Csyntaxspec:
1874       {
1875          NEXTCHAR(ch);
1876          if (!(SYNTAX(ch) & (unsigned char)*code++))
1877             goto fail;
1878          goto continue_matching;
1879       }
1880    case Cnotsyntaxspec:
1881       {
1882          NEXTCHAR(ch);
1883          if (SYNTAX(ch) & (unsigned char)*code++)
1884             goto fail;
1885          goto continue_matching;
1886       }
1887    default:
1888       {
1889          FREE_STATE(state);
1890          set_error("Unknown regex opcode: memory corrupted?");
1891          return -2;
1892        /*NOTREACHED*/}
1893    }
1894 
1895 
1896 
1897 #if 0                              /* This line is never reached --Guido */
1898    abort();
1899 #endif
1900    /*
1901     *NOTREACHED
1902     */
1903 
1904    /* Using "break;" in the above switch statement is equivalent to "goto fail;" */
1905  fail:
1906    POP_FAILURE(state, code, text, goto done_matching, goto error);
1907    goto continue_matching;
1908 
1909  done_matching:
1910 /*   if(translated != NULL) */
1911 /*      free(translated); */
1912    FREE_STATE(state);
1913    return -1;
1914 
1915  error:
1916 /*   if (translated != NULL) */
1917 /*      free(translated); */
1918    FREE_STATE(state);
1919    return -2;
1920 }
1921 
1922 
1923 #undef PREFETCH
1924 #undef NEXTCHAR
1925 
ReSearch(regex_t * bufp,unsigned char * str,int size,int pos,int range,regexp_registers_t regs)1926 int ReSearch(regex_t * bufp, unsigned char *str, int size, int pos,
1927               int range, regexp_registers_t regs)
1928 {
1929    unsigned char *fastmap;
1930    unsigned char *translate;
1931    unsigned char *text;
1932    unsigned char *partstart;
1933    unsigned char *partend;
1934    int dir;
1935    int ret;
1936    unsigned char anchor;
1937    unsigned char *string = str;
1938 
1939    if (bufp->cflags & REG_ICASE) { /* we must use string in lowercase */
1940       int len = strlen((const char *)str);
1941       if (!bufp->lcase) {
1942          bufp->lcase = GetPoolMemory(PM_FNAME);
1943       }
1944       bufp->lcase = CheckPoolMemorySize(bufp->lcase, len+1);
1945       unsigned char *dst = (unsigned char *)bufp->lcase;
1946       while (*string) {
1947          *dst++ = tolower(*string++);
1948       }
1949       *dst = '\0';
1950       string = (unsigned char *)bufp->lcase;
1951    }
1952 
1953 // assert(size >= 0 && pos >= 0);
1954 // assert(pos + range >= 0 && pos + range <= size);     /* Bugfix by ylo */
1955 
1956    fastmap = bufp->fastmap;
1957    translate = bufp->translate;
1958    if (fastmap && !bufp->fastmap_accurate) {
1959       ReCompileFastmap(bufp);
1960       if (got_error)
1961          return -2;
1962    }
1963 
1964    anchor = bufp->anchor;
1965    if (bufp->can_be_null == 1)     /* can_be_null == 2: can match null at eob */
1966       fastmap = NULL;
1967 
1968    if (range < 0) {
1969       dir = -1;
1970       range = -range;
1971    } else
1972       dir = 1;
1973 
1974    if (anchor == 2) {
1975       if (pos != 0)
1976          return -1;
1977       else
1978          range = 0;
1979    }
1980 
1981    for (; range >= 0; range--, pos += dir) {
1982       if (fastmap) {
1983          if (dir == 1) {           /* searching forwards */
1984 
1985             text = string + pos;
1986             partend = string + size;
1987             partstart = text;
1988             if (translate)
1989                while (text != partend &&
1990                       !fastmap[(unsigned char)translate[(unsigned char)*text]])
1991                   text++;
1992             else
1993                while (text != partend && !fastmap[(unsigned char)*text])
1994                   text++;
1995             pos += text - partstart;
1996             range -= text - partstart;
1997             if (pos == size && bufp->can_be_null == 0)
1998                return -1;
1999          } else {                  /* searching backwards */
2000             text = string + pos;
2001             partstart = string + pos - range;
2002             partend = text;
2003             if (translate)
2004                while (text != partstart && !fastmap[(unsigned char)
2005                                                     translate[(unsigned char)*text]])
2006                   text--;
2007             else
2008                while (text != partstart && !fastmap[(unsigned char)*text])
2009                   text--;
2010             pos -= partend - text;
2011             range -= partend - text;
2012          }
2013       }
2014       if (anchor == 1) {           /* anchored to begline */
2015          if (pos > 0 && (string[pos - 1] != '\n'))
2016             continue;
2017       }
2018 //    assert(pos >= 0 && pos <= size);
2019       ret = ReMatch(bufp, string, size, pos, regs);
2020       if (ret >= 0)
2021          return pos;
2022       if (ret == -2)
2023          return -2;
2024    }
2025    return -1;
2026 }
2027 
2028 /*
2029 ** Local Variables:
2030 ** mode: c
2031 ** c-file-style: "python"
2032 ** End:
2033 */
2034