1 /*------------------------------------------------------------------------*
2  * `CompileRE', `ExecRE', and `substituteRE' -- regular expression parsing
3  *
4  * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp' and
5  * `regexec' code adapted for NEdit.
6  *
7  * .-------------------------------------------------------------------.
8  * | ORIGINAL COPYRIGHT NOTICE:                                        |
9  * |                                                                   |
10  * | Copyright (c) 1986 by University of Toronto.                      |
11  * | Written by Henry Spencer.  Not derived from licensed software.    |
12  * |                                                                   |
13  * | Permission is granted to anyone to use this software for any      |
14  * | purpose on any computer system, and to redistribute it freely,    |
15  * | subject to the following restrictions:                            |
16  * |                                                                   |
17  * | 1. The author is not responsible for the consequences of use of   |
18  * |      this software, no matter how awful, even if they arise       |
19  * |      from defects in it.                                          |
20  * |                                                                   |
21  * | 2. The origin of this software must not be misrepresented, either |
22  * |      by explicit claim or by omission.                            |
23  * |                                                                   |
24  * | 3. Altered versions must be plainly marked as such, and must not  |
25  * |      be misrepresented as being the original software.            |
26  * `-------------------------------------------------------------------'
27  *
28  * This is free software; you can redistribute it and/or modify it under the
29  * terms of the GNU General Public License as published by the Free Software
30  * Foundation; either version 2 of the License, or (at your option) any later
31  * version. In addition, you may distribute version of this program linked to
32  * Motif or Open Motif. See README for details.
33  *
34  * This software is distributed in the hope that it will be useful, but WITHOUT
35  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
36  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
37  * for more details.
38  *
39  * You should have received a copy of the GNU General Public License along with
40  * software; if not, write to the Free Software Foundation, Inc., 59 Temple
41  * Place, Suite 330, Boston, MA  02111-1307 USA
42  *
43  *
44  * BEWARE that some of this code is subtly aware of the way operator
45  * precedence is structured in regular expressions.  Serious changes in
46  * regular-expression syntax might require a total rethink.
47  *   -- Henry Spencer
48  * (Yes, it did!) -- Christopher Conrad, Dec. 1999
49  *
50  * January, 1994, Mark Edel
51  *    Consolidated files, changed names of external functions to avoid
52  *    potential conflicts with native regcomp and regexec functions, changed
53  *    error reporting to NEdit form, added multi-line and reverse searching,
54  *    and added \n \t \u \U \l \L.
55  *
56  * June, 1996, Mark Edel
57  *    Bug in NEXT macro, didn't work for expressions which compiled to over
58  *    256 bytes.
59  *
60  * December, 1999, Christopher Conrad
61  *    Reformatted code for readability, improved error output, added octal and
62  *    hexadecimal escapes, added back-references (\1-\9), added positive look
63  *    ahead: (?=...), added negative lookahead: (?!...),  added non-capturing
64  *    parentheses: (?:...), added case insensitive constructs (?i...) and
65  *    (?I...), added newline matching constructs (?n...) and (?N...), added
66  *    regex comments: (?#...), added shortcut escapes: \d\D\l\L\s\S\w\W\y\Y.
67  *    Added "not a word boundary" anchor \B.
68  *
69  * July, 2002, Eddy De Greef
70  *    Added look behind, both positive (?<=...) and negative (?<!...) for
71  *    bounded-length patterns.
72  *
73  * November, 2004, Eddy De Greef
74  *    Added constrained matching (allowing specification of the logical end
75  *    of the string iso. matching till \0), and fixed several (probably
76  *    very old) string overrun errors that could easily result in crashes,
77  *    especially in client code.
78  */
79 
80 #ifdef HAVE_CONFIG_H
81 #include "../config.h"
82 #endif
83 
84 #include "regularExp.h"
85 #include "../util/nedit_malloc.h"
86 
87 #include <ctype.h>
88 #include <limits.h>
89 #include <stdio.h>
90 #include <stdlib.h>
91 #include <string.h>
92 
93 #ifdef HAVE_DEBUG_H
94 #include "../debug.h"
95 #endif
96 
97 
98 /* The first byte of the regexp internal `program' is a magic number to help
99    gaurd against corrupted data; the compiled regex code really begins in the
100    second byte. */
101 
102 #define MAGIC   0234
103 
104 /* The "internal use only" fields in `regexp.h' are present to pass info from
105  * `CompileRE' to `ExecRE' which permits the execute phase to run lots faster on
106  * simple cases.  They are:
107  *
108  *   match_start     Character that must begin a match; '\0' if none obvious.
109  *   anchor          Is the match anchored (at beginning-of-line only)?
110  *
111  * `match_start' and `anchor' permit very fast decisions on suitable starting
112  * points for a match, considerably reducing the work done by ExecRE. */
113 
114 /* STRUCTURE FOR A REGULAR EXPRESSION (regex) `PROGRAM'.
115  *
116  * This is essentially a linear encoding of a nondeterministic finite-state
117  * machine or NFA (aka syntax charts or `railroad normal form' in parsing
118  * technology).  Each node is an opcode plus a NEXT pointer, possibly
119  * followed by operands.  NEXT pointers of all nodes except BRANCH implement
120  * concatenation; a NEXT pointer with a BRANCH on both ends of it is
121  * connecting two alternatives.  (Here we have one of the subtle syntax
122  * dependencies:  an individual BRANCH (as opposed to a collection of them) is
123  * never concatenated with anything because of operator precedence.)  The
124  * operand of some types of nodes is a literal string; for others, it is a node
125  * leading into a sub-FSM.  In particular, the operand of a BRANCH node is the
126  * first node of the branch. (NB this is _NOT_ a tree structure:  the tail of
127  * the branch connects to the thing following the set of BRANCHes.)
128  *
129  * The opcodes are: */
130 
131 /* DEFINITION            VALUE  MEANING */
132 
133 #define END                1  /* End of program. */
134 
135 /* Zero width positional assertions. */
136 
137 #define BOL                2  /* Match position at beginning of line. */
138 #define EOL                3  /* Match position at end of line. */
139 #define BOWORD             4  /* Match "" representing word delimiter or BOL */
140 #define EOWORD             5  /* Match "" representing word delimiter or EOL */
141 #define NOT_BOUNDARY       6  /* Not word boundary (\B, opposite of < and >) */
142 
143 /* Op codes with null terminated string operands. */
144 
145 #define EXACTLY            7  /* Match this string. */
146 #define SIMILAR            8  /* Match this case insensitive string */
147 #define ANY_OF             9  /* Match any character in the set. */
148 #define ANY_BUT           10  /* Match any character not in the set. */
149 
150 /* Op codes to match any character. */
151 
152 #define ANY               11  /* Match any one character (implements '.') */
153 #define EVERY             12  /* Same as ANY but matches newline. */
154 
155 /* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
156 
157 #define DIGIT             13  /* Match any digit, i.e. [0123456789] */
158 #define NOT_DIGIT         14  /* Match any non-digit, i.e. [^0123456789] */
159 #define LETTER            15  /* Match any letter character [a-zA-Z] */
160 #define NOT_LETTER        16  /* Match any non-letter character [^a-zA-Z] */
161 #define SPACE             17  /* Match any whitespace character EXCEPT \n */
162 #define SPACE_NL          18  /* Match any whitespace character INCLUDING \n */
163 #define NOT_SPACE         19  /* Match any non-whitespace character */
164 #define NOT_SPACE_NL      20  /* Same as NOT_SPACE but matches newline. */
165 #define WORD_CHAR         21  /* Match any word character [a-zA-Z0-9_] */
166 #define NOT_WORD_CHAR     22  /* Match any non-word character [^a-zA-Z0-9_] */
167 #define IS_DELIM          23  /* Match any character that's a word delimiter */
168 #define NOT_DELIM         24  /* Match any character NOT a word delimiter */
169 
170 /* Quantifier nodes. (Only applied to SIMPLE nodes.  Quantifiers applied to non
171    SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
172 
173 #define STAR              25  /* Match this (simple) thing 0 or more times. */
174 #define LAZY_STAR         26  /* Minimal matching STAR */
175 #define QUESTION          27  /* Match this (simple) thing 0 or 1 times. */
176 #define LAZY_QUESTION     28  /* Minimal matching QUESTION */
177 #define PLUS              29  /* Match this (simple) thing 1 or more times. */
178 #define LAZY_PLUS         30  /* Minimal matching PLUS */
179 #define BRACE             31  /* Match this (simple) thing m to n times. */
180 #define LAZY_BRACE        32  /* Minimal matching BRACE */
181 
182 /* Nodes used to build complex constructs. */
183 
184 #define NOTHING           33  /* Match empty string (always matches) */
185 #define BRANCH            34  /* Match this alternative, or the next... */
186 #define BACK              35  /* Always matches, NEXT ptr points backward. */
187 #define INIT_COUNT        36  /* Initialize {m,n} counter to zero */
188 #define INC_COUNT         37  /* Increment {m,n} counter by one */
189 #define TEST_COUNT        38  /* Test {m,n} counter against operand */
190 
191 /* Back Reference nodes. */
192 
193 #define BACK_REF          39  /* Match latest matched parenthesized text */
194 #define BACK_REF_CI       40  /* Case insensitive version of BACK_REF */
195 #define X_REGEX_BR        41  /* Cross-Regex Back-Ref for syntax highlighting */
196 #define X_REGEX_BR_CI     42  /* Case insensitive version of X_REGEX_BR_CI */
197 
198 /* Various nodes used to implement parenthetical constructs. */
199 
200 #define POS_AHEAD_OPEN    43  /* Begin positive look ahead */
201 #define NEG_AHEAD_OPEN    44  /* Begin negative look ahead */
202 #define LOOK_AHEAD_CLOSE  45  /* End positive or negative look ahead */
203 
204 #define POS_BEHIND_OPEN   46  /* Begin positive look behind */
205 #define NEG_BEHIND_OPEN   47  /* Begin negative look behind */
206 #define LOOK_BEHIND_CLOSE 48  /* Close look behind */
207 
208 #define OPEN              49  /* Open for capturing parentheses. */
209 
210                               /*  OPEN+1 is number 1, etc. */
211 #define CLOSE       (OPEN + NSUBEXP)  /* Close for capturing parentheses. */
212 
213 #define LAST_PAREN  (CLOSE + NSUBEXP)
214 
215 #if (LAST_PAREN > UCHAR_MAX)
216 #error "Too many parentheses for storage in an unsigned char (LAST_PAREN too big.)"
217 #endif
218 
219 /* The next_ptr () function can consume up to 30% of the time during matching
220    because it is called an immense number of times (an average of 25
221    next_ptr() calls per match() call was witnessed for Perl syntax
222    highlighting). Therefore it is well worth removing some of the function
223    call overhead by selectively inlining the next_ptr() calls. Moreover,
224    the inlined code can be simplified for matching because one of the tests,
225    only necessary during compilation, can be left out.
226    The net result of using this inlined version at two critical places is
227    a 25% speedup (again, witnesses on Perl syntax highlighting). */
228 
229 #define NEXT_PTR(in_ptr, out_ptr)\
230    next_ptr_offset = GET_OFFSET (in_ptr);\
231    if (next_ptr_offset == 0)\
232        out_ptr = NULL;\
233    else {\
234        if (GET_OP_CODE (in_ptr) == BACK)\
235            out_ptr = in_ptr - next_ptr_offset;\
236        else \
237            out_ptr = in_ptr + next_ptr_offset;\
238    }
239 
240 /* OPCODE NOTES:
241    ------------
242 
243    All nodes consist of an 8 bit op code followed by 2 bytes that make up a 16
244    bit NEXT pointer.  Some nodes have a null terminated character string operand
245    following the NEXT pointer.  Other nodes may have an 8 bit index operand.
246    The TEST_COUNT node has an index operand followed by a 16 bit test value.
247    The BRACE and LAZY_BRACE nodes have two 16 bit values for min and max but no
248    index value.
249 
250    SIMILAR
251       Operand(s): null terminated string
252 
253       Implements a case insensitive match of a string.  Mostly intended for use
254       in syntax highlighting patterns for keywords of languages like FORTRAN
255       and Ada that are case insensitive.  The regex text in this node is
256       converted to lower case during regex compile.
257 
258    DIGIT, NOT_DIGIT, LETTER, NOT_LETTER, SPACE, NOT_SPACE, WORD_CHAR,
259    NOT_WORD_CHAR
260       Operand(s): None
261 
262       Implements shortcut escapes \d, \D, \l, \L, \s, \S, \w, \W.  The locale
263       aware ANSI functions isdigit(), isalpha(), isalnun(), and isspace() are
264       used to implement these in the hopes of increasing portability.
265 
266    NOT_BOUNDARY
267       Operand(s): None
268 
269       Implements \B as a zero width assertion that the current character is
270       NOT on a word boundary.  Word boundaries are defined to be the position
271       between two characters where one of those characters is one of the
272       dynamically defined word delimiters, and the other character is not.
273 
274    IS_DELIM
275       Operand(s): None
276 
277       Implements \y as any character that is one of the dynamically
278       specified word delimiters.
279 
280    NOT_DELIM
281       Operand(s): None
282 
283       Implements \Y as any character that is NOT one of the dynamically
284       specified word delimiters.
285 
286    STAR, PLUS, QUESTION, and complex '*', '+', and '?'
287       Operand(s): None (Note: NEXT pointer is usually zero.  The code that
288                         processes this node skips over it.)
289 
290       Complex (parenthesized) versions implemented as circular BRANCH
291       structures using BACK.  SIMPLE versions (one character per match) are
292       implemented separately for speed and to minimize recursion.
293 
294    BRACE, LAZY_BRACE
295       Operand(s): minimum value (2 bytes), maximum value (2 bytes)
296 
297       Implements the {m,n} construct for atoms that are SIMPLE.
298 
299    BRANCH
300       Operand(s): None
301 
302       The set of branches constituting a single choice are hooked together
303       with their NEXT pointers, since precedence prevents anything being
304       concatenated to any individual branch.  The NEXT pointer of the last
305       BRANCH in a choice points to the thing following the whole choice.  This
306       is also where the final NEXT pointer of each individual branch points;
307       each branch starts with the operand node of a BRANCH node.
308 
309    BACK
310       Operand(s): None
311 
312       Normal NEXT pointers all implicitly point forward.  Back implicitly
313       points backward.  BACK exists to make loop structures possible.
314 
315    INIT_COUNT
316       Operand(s): index (1 byte)
317 
318       Initializes the count array element referenced by the index operand.
319       This node is used to build general (i.e. parenthesized) {m,n} constructs.
320 
321    INC_COUNT
322       Operand(s): index (1 byte)
323 
324       Increments the count array element referenced by the index operand.
325       This node is used to build general (i.e. parenthesized) {m,n} constructs.
326 
327    TEST_COUNT
328       Operand(s): index (1 byte), test value (2 bytes)
329 
330       Tests the current value of the count array element specified by the
331       index operand against the test value.  If the current value is less than
332       the test value, control passes to the node after that TEST_COUNT node.
333       Otherwise control passes to the node referenced by the NEXT pointer for
334       the TEST_COUNT node.  This node is used to build general (i.e.
335       parenthesized) {m,n} constructs.
336 
337    BACK_REF, BACK_REF_CI
338       Operand(s): index (1 byte, value 1-9)
339 
340       Implements back references.  This node will attempt to match whatever text
341       was most recently captured by the index'th set of parentheses.
342       BACK_REF_CI is case insensitive version.
343 
344    X_REGEX_BR, X_REGEX_BR_CI
345       (NOT IMPLEMENTED YET)
346 
347       Operand(s): index (1 byte, value 1-9)
348 
349       Implements back references into a previously matched but separate regular
350       expression.  This is used by syntax highlighting patterns. This node will
351       attempt to match whatever text was most captured by the index'th set of
352       parentheses of the separate regex passed to ExecRE. X_REGEX_BR_CI is case
353       insensitive version.
354 
355    POS_AHEAD_OPEN, NEG_AHEAD_OPEN, LOOK_AHEAD_CLOSE
356 
357       Operand(s): None
358 
359       Implements positive and negative look ahead.  Look ahead is an assertion
360       that something is either there or not there.   Once this is determined the
361       regex engine backtracks to where it was just before the look ahead was
362       encountered, i.e. look ahead is a zero width assertion.
363 
364    POS_BEHIND_OPEN, NEG_BEHIND_OPEN, LOOK_BEHIND_CLOSE
365 
366       Operand(s): 2x2 bytes for OPEN (match boundaries), None for CLOSE
367 
368       Implements positive and negative look behind.  Look behind is an assertion
369       that something is either there or not there in front of the current
370       position.  Look behind is a zero width assertion, with the additional
371       constraint that it must have a bounded length (for complexity and
372       efficiency reasons; note that most other implementation even impose
373       fixed length).
374 
375    OPEN, CLOSE
376 
377       Operand(s): None
378 
379       OPEN  + n = Start of parenthesis 'n', CLOSE + n = Close of parenthesis
380       'n', and are numbered at compile time.
381  */
382 
383 /* A node is one char of opcode followed by two chars of NEXT pointer plus
384  * any operands.  NEXT pointers are stored as two 8-bit pieces, high order
385  * first.  The value is a positive offset from the opcode of the node
386  * containing it.  An operand, if any, simply follows the node.  (Note that
387  * much of the code generation knows about this implicit relationship.)
388  *
389  * Using two bytes for NEXT_PTR_SIZE is vast overkill for most things,
390  * but allows patterns to get big without disasters. */
391 
392 #define OP_CODE_SIZE    1
393 #define NEXT_PTR_SIZE   2
394 #define INDEX_SIZE      1
395 #define LENGTH_SIZE	4
396 #define NODE_SIZE       (NEXT_PTR_SIZE + OP_CODE_SIZE)
397 
398 #define GET_OP_CODE(p)  (*(unsigned char *)(p))
399 #define OPERAND(p)      ((p) + NODE_SIZE)
400 #define GET_OFFSET(p)   ((( *((p) + 1) & 0377) << 8) + (( *((p) + 2)) & 0377))
401 #define PUT_OFFSET_L(v) (unsigned char)(((v) >> 8) & 0377)
402 #define PUT_OFFSET_R(v) (unsigned char) ((v)       & 0377)
403 #define GET_LOWER(p)    ((( *((p) + NODE_SIZE) & 0377) << 8) + \
404                          (( *((p) + NODE_SIZE+1)) & 0377))
405 #define GET_UPPER(p)    ((( *((p) + NODE_SIZE+2) & 0377) << 8) + \
406                          (( *((p) + NODE_SIZE+3)) & 0377))
407 
408 /* Utility definitions. */
409 
410 #define REG_FAIL(m)      {*Error_Ptr = (m); return (NULL);}
411 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || \
412                           (c) == '?' || (c) == Brace_Char)
413 #define SET_BIT(i,n)     ((i) |= (1 << ((n) - 1)))
414 #define TEST_BIT(i,n)    ((i) &  (1 << ((n) - 1)))
415 #define U_CHAR_AT(p)     ((unsigned int) *(unsigned char *)(p))
416 
417 /* Flags to be passed up and down via function parameters during compile. */
418 
419 #define WORST             0  /* Worst case. No assumptions can be made.*/
420 #define HAS_WIDTH         1  /* Known never to match null string. */
421 #define SIMPLE            2  /* Simple enough to be STAR/PLUS operand. */
422 
423 #define NO_PAREN          0  /* Only set by initial call to "chunk". */
424 #define PAREN             1  /* Used for normal capturing parentheses. */
425 #define NO_CAPTURE        2  /* Non-capturing parentheses (grouping only). */
426 #define INSENSITIVE       3  /* Case insensitive parenthetical construct */
427 #define SENSITIVE         4  /* Case sensitive parenthetical construct */
428 #define NEWLINE           5  /* Construct to match newlines in most cases */
429 #define NO_NEWLINE        6  /* Construct to match newlines normally */
430 
431 #define REG_INFINITY    0UL
432 #define REG_ZERO        0UL
433 #define REG_ONE         1UL
434 
435 /* Flags for function shortcut_escape() */
436 
437 #define CHECK_ESCAPE       0  /* Check an escape sequence for validity only. */
438 #define CHECK_CLASS_ESCAPE 1  /* Check the validity of an escape within a
439                                  character class */
440 #define EMIT_CLASS_BYTES   2  /* Emit equivalent character class bytes,
441                                  e.g \d=0123456789 */
442 #define EMIT_NODE          3  /* Emit the appropriate node. */
443 
444 /* Array sizes for arrays used by function init_ansi_classes. */
445 
446 #define WHITE_SPACE_SIZE   16
447 #define ALNUM_CHAR_SIZE   256
448 
449 /* Number of bytes to offset from the beginning of the regex program to the start
450    of the actual compiled regex code, i.e. skipping over the MAGIC number and
451    the two counters at the front.  */
452 
453 #define REGEX_START_OFFSET 3
454 
455 #define MAX_COMPILED_SIZE  32767UL  /* Largest size a compiled regex can be.
456                                        Probably could be 65535UL. */
457 
458 /* Global work variables for `CompileRE'. */
459 
460 static unsigned char *Reg_Parse;       /* Input scan ptr (scans user's regex) */
461 static int            Total_Paren;     /* Parentheses, (),  counter. */
462 static int            Num_Braces;      /* Number of general {m,n} constructs.
463                                           {m,n} quantifiers of SIMPLE atoms are
464                                           not included in this count. */
465 static int            Closed_Parens;   /* Bit flags indicating () closure. */
466 static int            Paren_Has_Width; /* Bit flags indicating ()'s that are
467                                           known to not match the empty string */
468 static unsigned char  Compute_Size;    /* Address of this used as flag. */
469 static unsigned char *Code_Emit_Ptr;   /* When Code_Emit_Ptr is set to
470                                           &Compute_Size no code is emitted.
471                                           Instead, the size of code that WOULD
472                                           have been generated is accumulated in
473                                           Reg_Size.  Otherwise, Code_Emit_Ptr
474                                           points to where compiled regex code is
475                                           to be written. */
476 static unsigned long  Reg_Size;        /* Size of compiled regex code. */
477 static char         **Error_Ptr;       /* Place to store error messages so
478                                           they can be returned by `CompileRE' */
479 static char           Error_Text [128];/* Sting to build error messages in. */
480 
481 static unsigned char  White_Space [WHITE_SPACE_SIZE]; /* Arrays used by       */
482 static unsigned char  Word_Char   [ALNUM_CHAR_SIZE];  /* functions            */
483 static unsigned char  Letter_Char [ALNUM_CHAR_SIZE];  /* init_ansi_classes () */
484                                                       /* and
485                                                          shortcut_escape ().  */
486 
487 static unsigned char  ASCII_Digits [] = "0123456789"; /* Same for all */
488                                                       /* locales.     */
489 static int            Is_Case_Insensitive;
490 static int            Match_Newline;
491 
492 static int            Enable_Counting_Quantifier = 1;
493 static unsigned char  Brace_Char;
494 static unsigned char  Default_Meta_Char [] = "{.*+?[(|)^<>$";
495 static unsigned char *Meta_Char;
496 
497 typedef struct { long lower; long upper; } len_range;
498 
499 /* Forward declarations for functions used by `CompileRE'. */
500 
501 static unsigned char * alternative     (int *flag_param, len_range *range_param);
502 static unsigned char * back_ref        (unsigned char *c, int *flag_param,
503                                         int emit);
504 static unsigned char * chunk           (int paren, int *flag_param, len_range *range_param);
505 static void            emit_byte       (unsigned char c);
506 static void            emit_class_byte (unsigned char c);
507 static unsigned char * emit_node       (int op_code);
508 static unsigned char * emit_special    (unsigned char op_code,
509                                         unsigned long test_val,
510                                         int index);
511 static unsigned char   literal_escape  (unsigned char c);
512 static unsigned char   numeric_escape  (unsigned char c, unsigned char **parse);
513 static unsigned char * atom            (int *flag_param, len_range *range_param);
514 static void            reg_error       (char *str);
515 static unsigned char * insert          (unsigned char op, unsigned char *opnd,
516                                         long min, long max, int index);
517 static unsigned char * next_ptr        (unsigned char *ptr);
518 static void            offset_tail     (unsigned char *ptr, int offset,
519                                         unsigned char *val);
520 static void            branch_tail     (unsigned char *ptr, int offset,
521                                         unsigned char *val);
522 static unsigned char * piece           (int *flag_param, len_range *range_param);
523 static void            tail            (unsigned char *search_from,
524                                         unsigned char *point_t);
525 static unsigned char * shortcut_escape (unsigned char c, int *flag_param,
526                                         int emit);
527 
528 static int             init_ansi_classes  (void);
529 
530 /*----------------------------------------------------------------------*
531  * CompileRE
532  *
533  * Compiles a regular expression into the internal format used by
534  * `ExecRE'.
535  *
536  * The default behaviour wrt. case sensitivity and newline matching can
537  * be controlled through the defaultFlags argument (Markus Schwarzenberg).
538  * Future extensions are possible by using other flag bits.
539  * Note that currently only the case sensitivity flag is effectively used.
540  *
541  * Beware that the optimization and preparation code in here knows about
542  * some of the structure of the compiled regexp.
543  *----------------------------------------------------------------------*/
544 
CompileRE(const char * exp,char ** errorText,int defaultFlags)545 regexp * CompileRE (const char *exp, char **errorText, int defaultFlags) {
546 
547    register                regexp *comp_regex = NULL;
548    register unsigned char *scan;
549                      int   flags_local, pass;
550 	 	     len_range range_local;
551 
552    if (Enable_Counting_Quantifier) {
553       Brace_Char  = '{';
554       Meta_Char   = &Default_Meta_Char [0];
555    } else {
556       Brace_Char  = '*';                    /* Bypass the '{' in */
557       Meta_Char   = &Default_Meta_Char [1]; /* Default_Meta_Char */
558    }
559 
560    /* Set up errorText to receive failure reports. */
561 
562     Error_Ptr = errorText;
563    *Error_Ptr = "";
564 
565    if (exp == NULL) REG_FAIL ("NULL argument, `CompileRE\'");
566 
567    /* Initialize arrays used by function `shortcut_escape'. */
568 
569    if (!init_ansi_classes ()) REG_FAIL ("internal error #1, `CompileRE\'");
570 
571    Code_Emit_Ptr = &Compute_Size;
572    Reg_Size      = 0UL;
573 
574    /* We can't allocate space until we know how big the compiled form will be,
575       but we can't compile it (and thus know how big it is) until we've got a
576       place to put the code.  So we cheat: we compile it twice, once with code
577       generation turned off and size counting turned on, and once "for real".
578       This also means that we don't allocate space until we are sure that the
579       thing really will compile successfully, and we never have to move the
580       code and thus invalidate pointers into it.  (Note that it has to be in
581       one piece because free() must be able to free it all.) */
582 
583    for (pass = 1; pass <= 2; pass++) {
584       /*-------------------------------------------*
585        * FIRST  PASS: Determine size and legality. *
586        * SECOND PASS: Emit code.                   *
587        *-------------------------------------------*/
588 
589       /*  Schwarzenberg:
590        *  If defaultFlags = 0 use standard defaults:
591        *    Is_Case_Insensitive: Case sensitive is the default
592        *    Match_Newline:       Newlines are NOT matched by default
593        *                         in character classes
594        */
595       Is_Case_Insensitive = ((defaultFlags & REDFLT_CASE_INSENSITIVE) ? 1 : 0);
596       Match_Newline = 0;  /* ((defaultFlags & REDFLT_MATCH_NEWLINE)   ? 1 : 0);
597                              Currently not used. Uncomment if needed. */
598 
599       Reg_Parse       = (unsigned char *) exp;
600       Total_Paren     = 1;
601       Num_Braces      = 0;
602       Closed_Parens   = 0;
603       Paren_Has_Width = 0;
604 
605       emit_byte (MAGIC);
606       emit_byte ('%');  /* Placeholder for num of capturing parentheses.    */
607       emit_byte ('%');  /* Placeholder for num of general {m,n} constructs. */
608 
609       if (chunk (NO_PAREN, &flags_local, &range_local) == NULL)
610 	  return (NULL); /* Something went wrong */
611       if (pass == 1) {
612          if (Reg_Size >= MAX_COMPILED_SIZE) {
613             /* Too big for NEXT pointers NEXT_PTR_SIZE bytes long to span.
614                This is a real issue since the first BRANCH node usually points
615                to the end of the compiled regex code. */
616 
617             sprintf  (Error_Text, "regexp > %lu bytes", MAX_COMPILED_SIZE);
618             REG_FAIL (Error_Text);
619          }
620 
621          /* Allocate memory. */
622 
623          comp_regex = (regexp *) NEditMalloc (sizeof (regexp) + Reg_Size);
624 
625          if (comp_regex == NULL) REG_FAIL ("out of memory in `CompileRE\'");
626 
627          Code_Emit_Ptr = (unsigned char *) comp_regex->program;
628       }
629    }
630 
631    comp_regex->program [1] = (unsigned char) Total_Paren - 1;
632    comp_regex->program [2] = (unsigned char) Num_Braces;
633 
634    /*----------------------------------------*
635     * Dig out information for optimizations. *
636     *----------------------------------------*/
637 
638    comp_regex->match_start = '\0';   /* Worst-case defaults. */
639    comp_regex->anchor      =   0;
640 
641    /* First BRANCH. */
642 
643    scan = (unsigned char *) (comp_regex->program + REGEX_START_OFFSET);
644 
645    if (GET_OP_CODE (next_ptr (scan)) == END) { /* Only one top-level choice. */
646       scan = OPERAND (scan);
647 
648       /* Starting-point info. */
649 
650       if (GET_OP_CODE (scan) == EXACTLY) {
651          comp_regex->match_start = *OPERAND (scan);
652 
653       } else if (PLUS <= GET_OP_CODE (scan) &&
654                          GET_OP_CODE (scan) <= LAZY_PLUS) {
655 
656          /* Allow x+ or x+? at the start of the regex to be
657             optimized. */
658 
659          if (GET_OP_CODE (scan + NODE_SIZE) == EXACTLY) {
660             comp_regex->match_start = *OPERAND (scan + NODE_SIZE);
661          }
662       } else if (GET_OP_CODE (scan) == BOL) {
663          comp_regex->anchor++;
664       }
665    }
666 
667    return (comp_regex);
668 }
669 
670 /*----------------------------------------------------------------------*
671  * chunk                                                                *
672  *                                                                      *
673  * Process main body of regex or process a parenthesized "thing".       *
674  *                                                                      *
675  * Caller must absorb opening parenthesis.                              *
676  *                                                                      *
677  * Combining parenthesis handling with the base level of regular        *
678  * expression is a trifle forced, but the need to tie the tails of the  *
679  * branches to what follows makes it hard to avoid.                     *
680  *----------------------------------------------------------------------*/
681 
chunk(int paren,int * flag_param,len_range * range_param)682 static unsigned char * chunk (int paren, int *flag_param,
683                               len_range *range_param) {
684 
685    register unsigned char *ret_val = NULL;
686    register unsigned char *this_branch;
687    register unsigned char *ender = NULL;
688    register          int   this_paren = 0;
689                      int   flags_local, first = 1, zero_width, i;
690                      int   old_sensitive = Is_Case_Insensitive;
691                      int   old_newline   = Match_Newline;
692 		     len_range range_local;
693 		     int   look_only = 0;
694             unsigned char *emit_look_behind_bounds = NULL;
695 
696 
697    *flag_param = HAS_WIDTH;  /* Tentatively. */
698    range_param->lower = 0;   /* Idem */
699    range_param->upper = 0;
700 
701    /* Make an OPEN node, if parenthesized. */
702 
703    if (paren == PAREN) {
704       if (Total_Paren >= NSUBEXP) {
705          sprintf (Error_Text, "number of ()'s > %d", (int) NSUBEXP);
706          REG_FAIL (Error_Text);
707       }
708 
709       this_paren = Total_Paren; Total_Paren++;
710       ret_val    = emit_node (OPEN + this_paren);
711    } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
712       *flag_param = WORST;  /* Look ahead is zero width. */
713       look_only   = 1;
714       ret_val     = emit_node (paren);
715    } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
716       *flag_param = WORST;  /* Look behind is zero width. */
717       look_only   = 1;
718       /* We'll overwrite the zero length later on, so we save the ptr */
719       ret_val 	  = emit_special (paren, 0, 0);
720       emit_look_behind_bounds = ret_val + NODE_SIZE;
721    } else if (paren == INSENSITIVE) {
722       Is_Case_Insensitive = 1;
723    } else if (paren == SENSITIVE) {
724       Is_Case_Insensitive = 0;
725    } else if (paren == NEWLINE) {
726       Match_Newline = 1;
727    } else if (paren == NO_NEWLINE) {
728       Match_Newline = 0;
729    }
730 
731    /* Pick up the branches, linking them together. */
732 
733    do {
734       this_branch = alternative (&flags_local, &range_local);
735 
736       if (this_branch == NULL) return (NULL);
737 
738       if (first) {
739          first = 0;
740 	 *range_param = range_local;
741          if (ret_val == NULL) ret_val = this_branch;
742       } else if (range_param->lower >= 0) {
743 	  if (range_local.lower >= 0) {
744              if (range_local.lower < range_param->lower)
745                 range_param->lower = range_local.lower;
746              if (range_local.upper > range_param->upper)
747                 range_param->upper = range_local.upper;
748           } else {
749 	      range_param->lower = -1; /* Branches have different lengths */
750 	      range_param->upper = -1;
751 	  }
752       }
753 
754       tail (ret_val, this_branch);  /* Connect BRANCH -> BRANCH. */
755 
756       /* If any alternative could be zero width, consider the whole
757          parenthisized thing to be zero width. */
758 
759       if (!(flags_local & HAS_WIDTH)) *flag_param &= ~HAS_WIDTH;
760 
761       /* Are there more alternatives to process? */
762 
763       if (*Reg_Parse != '|') break;
764 
765       Reg_Parse++;
766    } while (1);
767 
768    /* Make a closing node, and hook it on the end. */
769 
770    if (paren == PAREN) {
771       ender = emit_node (CLOSE + this_paren);
772 
773    } else if (paren == NO_PAREN) {
774       ender = emit_node (END);
775 
776    } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
777       ender = emit_node (LOOK_AHEAD_CLOSE);
778 
779    } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
780       ender = emit_node (LOOK_BEHIND_CLOSE);
781 
782    } else {
783       ender = emit_node (NOTHING);
784    }
785 
786    tail (ret_val, ender);
787 
788    /* Hook the tails of the branch alternatives to the closing node. */
789 
790    for (this_branch = ret_val; this_branch != NULL; ) {
791       branch_tail (this_branch, NODE_SIZE, ender);
792       this_branch = next_ptr (this_branch);
793    }
794 
795    /* Check for proper termination. */
796 
797    if (paren != NO_PAREN && *Reg_Parse++ != ')') {
798       REG_FAIL ("missing right parenthesis \')\'");
799    } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
800       if (*Reg_Parse == ')') {
801          REG_FAIL ("missing left parenthesis \'(\'");
802       } else {
803          REG_FAIL ("junk on end");  /* "Can't happen" - NOTREACHED */
804       }
805    }
806 
807    /* Check whether look behind has a fixed size */
808 
809    if (emit_look_behind_bounds) {
810        if (range_param->lower < 0) {
811 	   REG_FAIL ("look-behind does not have a bounded size");
812        }
813        if (range_param->upper > 65535L) {
814 	   REG_FAIL ("max. look-behind size is too large (>65535)")
815        }
816        if (Code_Emit_Ptr != &Compute_Size) {
817           *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->lower);
818           *emit_look_behind_bounds++ = PUT_OFFSET_R (range_param->lower);
819           *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->upper);
820           *emit_look_behind_bounds   = PUT_OFFSET_R (range_param->upper);
821        }
822    }
823 
824    /* For look ahead/behind, the length must be set to zero again */
825    if (look_only) {
826        range_param->lower = 0;
827        range_param->upper = 0;
828    }
829 
830    zero_width = 0;
831 
832    /* Set a bit in Closed_Parens to let future calls to function `back_ref'
833       know that we have closed this set of parentheses. */
834 
835    if (paren == PAREN && this_paren <= (int)sizeof (Closed_Parens) * CHAR_BIT) {
836       SET_BIT (Closed_Parens, this_paren);
837 
838       /* Determine if a parenthesized expression is modified by a quantifier
839          that can have zero width. */
840 
841       if (*(Reg_Parse) == '?' || *(Reg_Parse) == '*') {
842          zero_width++;
843       } else if (*(Reg_Parse) == '{' && Brace_Char == '{') {
844          if (*(Reg_Parse + 1) == ',' || *(Reg_Parse + 1) == '}') {
845             zero_width++;
846          } else if (*(Reg_Parse + 1) == '0') {
847             i = 2;
848 
849             while (*(Reg_Parse + i) == '0') i++;
850 
851             if (*(Reg_Parse + i) == ',') zero_width++;
852          }
853       }
854    }
855 
856    /* If this set of parentheses is known to never match the empty string, set
857       a bit in Paren_Has_Width to let future calls to function back_ref know
858       that this set of parentheses has non-zero width.  This will allow star
859       (*) or question (?) quantifiers to be aplied to a back-reference that
860       refers to this set of parentheses. */
861 
862    if ((*flag_param & HAS_WIDTH)  &&
863         paren == PAREN            &&
864         !zero_width               &&
865         this_paren <= (int)(sizeof (Paren_Has_Width) * CHAR_BIT)) {
866 
867       SET_BIT (Paren_Has_Width, this_paren);
868    }
869 
870    Is_Case_Insensitive = old_sensitive;
871    Match_Newline       = old_newline;
872 
873    return (ret_val);
874 }
875 
876 /*----------------------------------------------------------------------*
877  * alternative
878  *
879  * Processes one alternative of an '|' operator.  Connects the NEXT
880  * pointers of each regex atom together sequentialy.
881  *----------------------------------------------------------------------*/
882 
alternative(int * flag_param,len_range * range_param)883 static unsigned char * alternative (int *flag_param, len_range *range_param) {
884 
885    register unsigned char *ret_val;
886    register unsigned char *chain;
887    register unsigned char *latest;
888                      int   flags_local;
889 		     len_range range_local;
890 
891    *flag_param = WORST;  /* Tentatively. */
892    range_param->lower = 0; /* Idem */
893    range_param->upper = 0;
894 
895    ret_val = emit_node (BRANCH);
896    chain   = NULL;
897 
898    /* Loop until we hit the start of the next alternative, the end of this set
899       of alternatives (end of parentheses), or the end of the regex. */
900 
901    while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
902       latest = piece (&flags_local, &range_local);
903 
904       if (latest == NULL) return (NULL); /* Something went wrong. */
905 
906       *flag_param |= flags_local & HAS_WIDTH;
907       if (range_local.lower < 0) {
908 	  /* Not a fixed length */
909 	  range_param->lower = -1;
910 	  range_param->upper = -1;
911       } else if (range_param->lower >= 0) {
912 	  range_param->lower += range_local.lower;
913 	  range_param->upper += range_local.upper;
914       }
915 
916       if (chain != NULL) { /* Connect the regex atoms together sequentialy. */
917          tail (chain, latest);
918       }
919 
920       chain = latest;
921    }
922 
923    if (chain == NULL) {  /* Loop ran zero times. */
924       (void) emit_node (NOTHING);
925    }
926 
927    return (ret_val);
928 }
929 
930 /*----------------------------------------------------------------------*
931  * piece - something followed by possible '*', '+', '?', or "{m,n}"
932  *
933  * Note that the branching code sequences used for the general cases of
934  * *, +. ?, and {m,n} are somewhat optimized:  they use the same
935  * NOTHING node as both the endmarker for their branch list and the
936  * body of the last branch. It might seem that this node could be
937  * dispensed with entirely, but the endmarker role is not redundant.
938  *----------------------------------------------------------------------*/
939 
piece(int * flag_param,len_range * range_param)940 static unsigned char * piece (int *flag_param, len_range *range_param) {
941 
942    register unsigned char *ret_val;
943    register unsigned char *next;
944    register unsigned char  op_code;
945             unsigned long  min_max [2] = {REG_ZERO, REG_INFINITY};
946             int            flags_local, i, brace_present = 0;
947             int            lazy = 0, comma_present = 0;
948             int            digit_present [2] = {0,0};
949 	    len_range      range_local;
950 
951    ret_val = atom (&flags_local, &range_local);
952 
953    if (ret_val == NULL) return (NULL);  /* Something went wrong. */
954 
955    op_code = *Reg_Parse;
956 
957    if (!IS_QUANTIFIER (op_code)) {
958       *flag_param = flags_local;
959       *range_param = range_local;
960       return (ret_val);
961    } else if (op_code == '{') { /* {n,m} quantifier present */
962       brace_present++;
963       Reg_Parse++;
964 
965       /* This code will allow specifying a counting range in any of the
966          following forms:
967 
968          {m,n}  between m and n.
969          {,n}   same as {0,n} or between 0 and infinity.
970          {m,}   same as {m,0} or between m and infinity.
971          {m}    same as {m,m} or exactly m.
972          {,}    same as {0,0} or between 0 and infinity or just '*'.
973          {}     same as {0,0} or between 0 and infinity or just '*'.
974 
975          Note that specifying a max of zero, {m,0} is not allowed in the regex
976          itself, but it is implemented internally that way to support '*', '+',
977          and {min,} constructs and signals an unlimited number. */
978 
979       for (i = 0; i < 2; i++) {
980          /* Look for digits of number and convert as we go.  The numeric maximum
981             value for max and min of 65,535 is due to using 2 bytes to store
982             each value in the compiled regex code. */
983 
984          while (isdigit (*Reg_Parse)) {
985             /* (6553 * 10 + 6) > 65535 (16 bit max) */
986 
987             if ((min_max [i] == 6553UL && (*Reg_Parse - '0') <= 5) ||
988                 (min_max [i] <= 6552UL)) {
989 
990                min_max [i] = (min_max [i] * 10UL) +
991                              (unsigned long) (*Reg_Parse - '0');
992                Reg_Parse++;
993 
994                digit_present [i]++;
995             } else {
996                if (i == 0) {
997                   sprintf (Error_Text, "min operand of {%lu%c,???} > 65535",
998                            min_max [0], *Reg_Parse);
999                } else {
1000                   sprintf (Error_Text, "max operand of {%lu,%lu%c} > 65535",
1001                            min_max [0], min_max [1], *Reg_Parse);
1002                }
1003 
1004                REG_FAIL (Error_Text);
1005             }
1006          }
1007 
1008          if (!comma_present && *Reg_Parse == ',') {
1009             comma_present++;
1010             Reg_Parse++;
1011          }
1012       }
1013 
1014       /* A max of zero can not be specified directly in the regex since it would
1015          signal a max of infinity.  This code specifically disallows `{0,0}',
1016          `{,0}', and `{0}' which really means nothing to humans but would be
1017          interpreted as `{0,infinity}' or `*' if we didn't make this check. */
1018 
1019       if (digit_present [0] && (min_max [0] == REG_ZERO) && !comma_present) {
1020 
1021          REG_FAIL ("{0} is an invalid range");
1022       } else if (digit_present [0] && (min_max [0] == REG_ZERO) &&
1023                  digit_present [1] && (min_max [1] == REG_ZERO)) {
1024 
1025          REG_FAIL ("{0,0} is an invalid range");
1026       } else if (digit_present [1] && (min_max [1] == REG_ZERO)) {
1027          if (digit_present [0]) {
1028             sprintf (Error_Text, "{%lu,0} is an invalid range", min_max [0]);
1029             REG_FAIL (Error_Text);
1030          } else {
1031             REG_FAIL ("{,0} is an invalid range");
1032          }
1033       }
1034 
1035       if (!comma_present) min_max [1] = min_max [0]; /* {x} means {x,x} */
1036 
1037       if (*Reg_Parse != '}') {
1038          REG_FAIL ("{m,n} specification missing right \'}\'");
1039 
1040       } else if (min_max [1] != REG_INFINITY && min_max [0] > min_max [1]) {
1041          /* Disallow a backward range. */
1042 
1043          sprintf (Error_Text, "{%lu,%lu} is an invalid range",
1044                   min_max [0], min_max [1]);
1045          REG_FAIL (Error_Text);
1046       }
1047    }
1048 
1049    Reg_Parse++;
1050 
1051    /* Check for a minimal matching (non-greedy or "lazy") specification. */
1052 
1053    if (*Reg_Parse == '?') {
1054       lazy = 1;
1055       Reg_Parse++;
1056    }
1057 
1058    /* Avoid overhead of counting if possible */
1059 
1060    if (op_code == '{') {
1061       if (min_max [0] == REG_ZERO && min_max [1] == REG_INFINITY) {
1062          op_code = '*';
1063       } else if (min_max [0] == REG_ONE  && min_max [1] == REG_INFINITY) {
1064          op_code = '+';
1065       } else if (min_max [0] == REG_ZERO && min_max [1] == REG_ONE) {
1066          op_code = '?';
1067       } else if (min_max [0] == REG_ONE  && min_max [1] == REG_ONE) {
1068          /* "x{1,1}" is the same as "x".  No need to pollute the compiled
1069              regex with such nonsense. */
1070 
1071          *flag_param = flags_local;
1072 	 *range_param = range_local;
1073          return (ret_val);
1074       } else if (Num_Braces > (int)UCHAR_MAX) {
1075          sprintf (Error_Text, "number of {m,n} constructs > %d", UCHAR_MAX);
1076          REG_FAIL (Error_Text);
1077       }
1078    }
1079 
1080    if (op_code == '+') min_max [0] = REG_ONE;
1081    if (op_code == '?') min_max [1] = REG_ONE;
1082 
1083    /* It is dangerous to apply certain quantifiers to a possibly zero width
1084       item. */
1085 
1086    if (!(flags_local & HAS_WIDTH)) {
1087       if (brace_present) {
1088          sprintf (Error_Text, "{%lu,%lu} operand could be empty",
1089                   min_max [0], min_max [1]);
1090       } else {
1091          sprintf (Error_Text, "%c operand could be empty", op_code);
1092       }
1093 
1094       REG_FAIL (Error_Text);
1095    }
1096 
1097    *flag_param = (min_max [0] > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
1098    if (range_local.lower >= 0) {
1099        if (min_max[1] != REG_INFINITY) {
1100 	   range_param->lower = range_local.lower * min_max[0];
1101 	   range_param->upper = range_local.upper * min_max[1];
1102        } else {
1103 	   range_param->lower = -1; /* Not a fixed-size length */
1104 	   range_param->upper = -1;
1105        }
1106    } else {
1107        range_param->lower = -1; /* Not a fixed-size length */
1108        range_param->upper = -1;
1109    }
1110 
1111    /*---------------------------------------------------------------------*
1112     *          Symbol  Legend  For  Node  Structure  Diagrams
1113     *---------------------------------------------------------------------*
1114     * (...) = general grouped thing
1115     * B     = (B)ranch,  K = bac(K),  N = (N)othing
1116     * I     = (I)nitialize count,     C = Increment (C)ount
1117     * T~m   = (T)est against mini(m)um- go to NEXT pointer if >= operand
1118     * T~x   = (T)est against ma(x)imum- go to NEXT pointer if >= operand
1119     * '~'   = NEXT pointer, \___| = forward pointer, |___/ = Backward pointer
1120     *---------------------------------------------------------------------*/
1121 
1122    if (op_code == '*' && (flags_local & SIMPLE)) {
1123       insert ((lazy ? LAZY_STAR : STAR), ret_val, 0UL, 0UL, 0);
1124 
1125    } else if (op_code == '+' && (flags_local & SIMPLE)) {
1126       insert (lazy ? LAZY_PLUS : PLUS, ret_val, 0UL, 0UL, 0);
1127 
1128    } else if (op_code == '?' && (flags_local & SIMPLE)) {
1129       insert (lazy ? LAZY_QUESTION : QUESTION, ret_val, 0UL, 0UL, 0);
1130 
1131    } else if (op_code == '{' && (flags_local & SIMPLE)) {
1132       insert (lazy ? LAZY_BRACE : BRACE, ret_val, min_max [0], min_max [1], 0);
1133 
1134    } else if ((op_code == '*' || op_code == '+') && lazy) {
1135       /*  Node structure for (x)*?    Node structure for (x)+? construct.
1136        *  construct.                  (Same as (x)*? except for initial
1137        *                              forward jump into parenthesis.)
1138        *
1139        *                                  ___6____
1140        *   _______5_______               /________|______
1141        *  | _4__        1_\             /| ____   |     _\
1142        *  |/    |       / |\           / |/    |  |    / |\
1143        *  B~ N~ B~ (...)~ K~ N~       N~ B~ N~ B~ (...)~ K~ N~
1144        *      \  \___2_______|               \  \___________|
1145        *       \_____3_______|                \_____________|
1146        *
1147        */
1148 
1149       tail (ret_val, emit_node (BACK));                 /* 1 */
1150       (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);    /* 2,4 */
1151       (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);    /* 3 */
1152 
1153       next = emit_node (NOTHING);                       /* 2,3 */
1154 
1155       offset_tail (ret_val, NODE_SIZE, next);           /* 2 */
1156       tail        (ret_val, next);                      /* 3 */
1157       insert      (BRANCH, ret_val, 0UL, 0UL, 0);       /* 4,5 */
1158       tail        (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1159       offset_tail (ret_val, 3 * NODE_SIZE, ret_val);    /* 5 */
1160 
1161       if (op_code == '+') {
1162          insert (NOTHING, ret_val, 0UL, 0UL, 0);        /* 6 */
1163          tail   (ret_val, ret_val + (4 * NODE_SIZE));   /* 6 */
1164       }
1165    } else if (op_code == '*') {
1166       /* Node structure for (x)* construct.
1167        *      ____1_____
1168        *     |          \
1169        *     B~ (...)~ K~ B~ N~
1170        *      \      \_|2 |\_|
1171        *       \__3_______|  4
1172        */
1173 
1174       insert  (BRANCH, ret_val, 0UL, 0UL, 0);             /* 1,3 */
1175       offset_tail (ret_val, NODE_SIZE, emit_node (BACK)); /* 2 */
1176       offset_tail (ret_val, NODE_SIZE, ret_val);          /* 1 */
1177       tail    (ret_val, emit_node (BRANCH));              /* 3 */
1178       tail    (ret_val, emit_node (NOTHING));             /* 4 */
1179    } else if (op_code == '+') {
1180       /* Node structure for (x)+ construct.
1181        *
1182        *      ____2_____
1183        *     |          \
1184        *     (...)~ B~ K~ B~ N~
1185        *          \_|\____|\_|
1186        *          1     3    4
1187        */
1188 
1189       next = emit_node (BRANCH);            /* 1 */
1190 
1191       tail (ret_val, next);                 /* 1 */
1192       tail (emit_node (BACK), ret_val);     /* 2 */
1193       tail (next, emit_node (BRANCH));      /* 3 */
1194       tail (ret_val, emit_node (NOTHING));  /* 4 */
1195    } else if (op_code == '?' && lazy) {
1196       /* Node structure for (x)?? construct.
1197        *       _4__        1_
1198        *      /    |       / |
1199        *     B~ N~ B~ (...)~ N~
1200        *         \  \___2____|
1201        *          \_____3____|
1202        */
1203 
1204       (void) insert  (BRANCH,  ret_val, 0UL, 0UL, 0);      /* 2,4 */
1205       (void) insert  (NOTHING, ret_val, 0UL, 0UL, 0);      /* 3 */
1206 
1207       next = emit_node (NOTHING);                          /* 1,2,3 */
1208 
1209       offset_tail (ret_val, 2 * NODE_SIZE, next);          /* 1 */
1210       offset_tail (ret_val,     NODE_SIZE, next);          /* 2 */
1211       tail        (ret_val, next);                         /* 3 */
1212       insert      (BRANCH,  ret_val, 0UL, 0UL, 0);         /* 4 */
1213       tail        (ret_val, (ret_val + (2 * NODE_SIZE)));  /* 4 */
1214 
1215    } else if (op_code == '?') {
1216       /* Node structure for (x)? construct.
1217        *       ___1____  _2
1218        *      /        |/ |
1219        *     B~ (...)~ B~ N~
1220        *             \__3_|
1221        */
1222 
1223       insert (BRANCH, ret_val, 0UL, 0UL, 0);   /* 1 */
1224       tail   (ret_val, emit_node (BRANCH));    /* 1 */
1225 
1226       next = emit_node (NOTHING);              /* 2,3 */
1227 
1228       tail        (ret_val, next);             /* 2 */
1229       offset_tail (ret_val, NODE_SIZE, next);  /* 3 */
1230    } else if (op_code == '{' && min_max [0] == min_max [1]) {
1231       /* Node structure for (x){m}, (x){m}?, (x){m,m}, or (x){m,m}? constructs.
1232        * Note that minimal and maximal matching mean the same thing when we
1233        * specify the minimum and maximum to be the same value.
1234        *       _______3_____
1235        *      |    1_  _2   \
1236        *      |    / |/ |    \
1237        *   I~ (...)~ C~ T~m K~ N~
1238        *    \_|          \_____|
1239        *     5              4
1240        */
1241 
1242       tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces));         /* 1 */
1243       tail (ret_val, emit_special (TEST_COUNT, min_max [0], Num_Braces));/* 2 */
1244       tail (emit_node (BACK), ret_val);                                  /* 3 */
1245       tail (ret_val, emit_node (NOTHING));                               /* 4 */
1246 
1247       next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);         /* 5 */
1248 
1249       tail (ret_val, next);                                              /* 5 */
1250 
1251       Num_Braces++;
1252    } else if (op_code == '{' && lazy) {
1253       if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1254          /* Node structure for (x){0,n}? or {,n}? construct.
1255           *       _________3____________
1256           *    8_| _4__        1_  _2   \
1257           *    / |/    |       / |/ |    \
1258           *   I~ B~ N~ B~ (...)~ C~ T~x K~ N~
1259           *          \  \            \__7__|
1260           *           \  \_________6_______|
1261           *            \______5____________|
1262           */
1263 
1264          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1265 
1266          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,7 */
1267 
1268          tail (ret_val, next);                                      /* 2 */
1269          (void) insert (BRANCH,  ret_val, 0UL, 0UL, Num_Braces);    /* 4,6 */
1270          (void) insert (NOTHING, ret_val, 0UL, 0UL, Num_Braces);    /* 5 */
1271          (void) insert (BRANCH,  ret_val, 0UL, 0UL, Num_Braces);    /* 3,4,8 */
1272          tail (emit_node (BACK), ret_val);                          /* 3 */
1273          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 4 */
1274 
1275          next = emit_node (NOTHING);                                /* 5,6,7 */
1276 
1277          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1278          offset_tail (ret_val, 2 * NODE_SIZE, next);                /* 6 */
1279          offset_tail (ret_val, 3 * NODE_SIZE, next);                /* 7 */
1280 
1281          next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1282 
1283          tail (ret_val, next);                                      /* 8 */
1284 
1285       } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1286          /* Node structure for (x){m,}? construct.
1287           *       ______8_________________
1288           *      |         _______3_____  \
1289           *      | _7__   |    1_  _2   \  \
1290           *      |/    |  |    / |/ |    \  \
1291           *   I~ B~ N~ B~ (...)~ C~ T~m K~ K~ N~
1292           *    \_____\__\_|          \_4___|  |
1293           *       9   \  \_________5__________|
1294           *            \_______6______________|
1295           */
1296 
1297          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1298 
1299          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,4 */
1300 
1301          tail (ret_val, next);                                      /* 2 */
1302          tail (emit_node (BACK), ret_val);                          /* 3 */
1303          tail (ret_val, emit_node (BACK));                          /* 4 */
1304          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 5,7 */
1305          (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);             /* 6 */
1306 
1307          next = emit_node (NOTHING);                                /* 5,6 */
1308 
1309          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1310          tail (ret_val, next);                                      /* 6 */
1311          (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);             /* 7,8 */
1312          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 7 */
1313          offset_tail (ret_val, 3 * NODE_SIZE, ret_val);             /* 8 */
1314          (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1315          tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE));    /* 9 */
1316 
1317       } else {
1318          /* Node structure for (x){m,n}? construct.
1319           *       ______9_____________________
1320           *      |         _____________3___  \
1321           *      | __8_   |    1_  _2       \  \
1322           *      |/    |  |    / |/ |        \  \
1323           *   I~ B~ N~ B~ (...)~ C~ T~x T~m K~ K~ N~
1324           *    \_____\__\_|          \   \__4__|  |
1325           *      10   \  \            \_7_________|
1326           *            \  \_________6_____________|
1327           *             \_______5_________________|
1328           */
1329 
1330          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1331 
1332          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,7 */
1333 
1334          tail (ret_val, next);                                      /* 2 */
1335 
1336          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1337 
1338          tail (emit_node (BACK), ret_val);                          /* 3 */
1339          tail (next, emit_node (BACK));                             /* 4 */
1340          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 6,8 */
1341          (void) insert (NOTHING, ret_val, 0UL, 0UL, 0);             /* 5 */
1342          (void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);             /* 8,9 */
1343 
1344          next = emit_node (NOTHING);                                /* 5,6,7 */
1345 
1346          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1347          offset_tail (ret_val, 2 * NODE_SIZE, next);                /* 6 */
1348          offset_tail (ret_val, 3 * NODE_SIZE, next);                /* 7 */
1349          tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 8 */
1350          offset_tail (next, -NODE_SIZE, ret_val);                   /* 9 */
1351          insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);        /* 10 */
1352          tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE));    /* 10 */
1353       }
1354 
1355       Num_Braces++;
1356    } else if (op_code == '{') {
1357       if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1358          /* Node structure for (x){0,n} or (x){,n} construct.
1359           *
1360           *       ___3____________
1361           *      |       1_  _2   \   5_
1362           *      |       / |/ |    \  / |
1363           *   I~ B~ (...)~ C~ T~x K~ B~ N~
1364           *    \_|\            \_6___|__|
1365           *    7   \________4________|
1366           */
1367 
1368          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1369 
1370          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,6 */
1371 
1372          tail (ret_val, next);                                      /* 2 */
1373          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 3,4,7 */
1374          tail (emit_node (BACK), ret_val);                          /* 3 */
1375 
1376          next = emit_node (BRANCH);                                 /* 4,5 */
1377 
1378          tail (ret_val, next);                                      /* 4 */
1379          tail (next, emit_node (NOTHING));                          /* 5,6 */
1380          offset_tail (ret_val, NODE_SIZE, next);                    /* 6 */
1381 
1382          next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 7 */
1383 
1384          tail (ret_val, next);                                      /* 7 */
1385 
1386       } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1387          /* Node structure for (x){m,} construct.
1388           *       __________4________
1389           *      |    __3__________  \
1390           *     _|___|    1_  _2   \  \    _7
1391           *    / | 8 |    / |/ |    \  \  / |
1392           *   I~ B~  (...)~ C~ T~m K~ K~ B~ N~
1393           *       \             \_5___|  |
1394           *        \__________6__________|
1395           */
1396 
1397          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1398 
1399          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2 */
1400 
1401          tail (ret_val, next);                                      /* 2 */
1402          tail (emit_node (BACK), ret_val);                          /* 3 */
1403          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 4,6 */
1404 
1405          next = emit_node (BACK);                                   /* 4 */
1406 
1407          tail        (next, ret_val);                               /* 4 */
1408          offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1409          tail        (ret_val, emit_node (BRANCH));                 /* 6 */
1410          tail        (ret_val, emit_node (NOTHING));                /* 7 */
1411 
1412          insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);        /* 8 */
1413 
1414          tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE));    /* 8 */
1415 
1416       } else {
1417          /* Node structure for (x){m,n} construct.
1418           *       _____6________________
1419           *      |   _____________3___  \
1420           *    9_|__|    1_  _2       \  \    _8
1421           *    / |  |    / |/ |        \  \  / |
1422           *   I~ B~ (...)~ C~ T~x T~m K~ K~ B~ N~
1423           *       \            \   \__4__|  |  |
1424           *        \            \_7_________|__|
1425           *         \_________5_____________|
1426           */
1427 
1428          tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1429 
1430          next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,4 */
1431 
1432          tail (ret_val, next);                                      /* 2 */
1433 
1434          next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1435 
1436          tail (emit_node (BACK), ret_val);                          /* 3 */
1437          tail (next, emit_node (BACK));                             /* 4 */
1438          (void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 5,6 */
1439 
1440          next = emit_node (BRANCH);                                 /* 5,8 */
1441 
1442          tail        (ret_val, next);                               /* 5 */
1443          offset_tail (next, -NODE_SIZE, ret_val);                   /* 6 */
1444 
1445          next = emit_node (NOTHING);                                /* 7,8 */
1446 
1447          offset_tail (ret_val, NODE_SIZE, next);                    /* 7 */
1448 
1449          offset_tail (next, -NODE_SIZE, next);                      /* 8 */
1450          (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1451          tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE));    /* 9 */
1452       }
1453 
1454       Num_Braces++;
1455    } else {
1456       /* We get here if the IS_QUANTIFIER macro is not coordinated properly
1457          with this function. */
1458 
1459       REG_FAIL ("internal error #2, `piece\'");
1460    }
1461 
1462    if (IS_QUANTIFIER (*Reg_Parse)) {
1463       if (op_code == '{') {
1464          sprintf (Error_Text, "nested quantifiers, {m,n}%c", *Reg_Parse);
1465       } else {
1466          sprintf (Error_Text, "nested quantifiers, %c%c", op_code, *Reg_Parse);
1467       }
1468 
1469       REG_FAIL (Error_Text);
1470    }
1471 
1472    return (ret_val);
1473 }
1474 
1475 /*----------------------------------------------------------------------*
1476  * atom
1477  *
1478  * Process one regex item at the lowest level
1479  *
1480  * OPTIMIZATION:  Lumps a continuous sequence of ordinary characters
1481  * together so that it can turn them into a single EXACTLY node, which
1482  * is smaller to store and faster to run.
1483  *----------------------------------------------------------------------*/
1484 
atom(int * flag_param,len_range * range_param)1485 static unsigned char * atom (int *flag_param, len_range *range_param) {
1486 
1487    register unsigned char *ret_val;
1488             unsigned char  test;
1489             int            flags_local;
1490 	    len_range      range_local;
1491 
1492    *flag_param = WORST;  /* Tentatively. */
1493    range_param->lower = 0; /* Idem */
1494    range_param->upper = 0;
1495 
1496    /* Process any regex comments, e.g. `(?# match next token->)'.  The
1497       terminating right parenthesis can not be escaped.  The comment stops at
1498       the first right parenthesis encountered (or the end of the regex
1499       string)... period.  Handles multiple sequential comments,
1500       e.g. `(?# one)(?# two)...'  */
1501 
1502    while (*Reg_Parse      == '(' &&
1503          *(Reg_Parse + 1) == '?' &&
1504          *(Reg_Parse + 2) == '#') {
1505 
1506       Reg_Parse += 3;
1507 
1508       while (*Reg_Parse != ')' && *Reg_Parse != '\0') {
1509          Reg_Parse++;
1510       }
1511 
1512       if (*Reg_Parse == ')') {
1513          Reg_Parse++;
1514       }
1515 
1516       if (*Reg_Parse == ')' || *Reg_Parse == '|' || *Reg_Parse == '\0') {
1517          /* Hit end of regex string or end of parenthesized regex; have to
1518           return "something" (i.e. a NOTHING node) to avoid generating an
1519           error. */
1520 
1521          ret_val = emit_node (NOTHING);
1522 
1523          return (ret_val);
1524       }
1525    }
1526 
1527    switch (*Reg_Parse++) {
1528       case '^':
1529          ret_val = emit_node (BOL);
1530          break;
1531 
1532       case '$':
1533          ret_val = emit_node (EOL);
1534          break;
1535 
1536       case '<':
1537          ret_val = emit_node (BOWORD);
1538          break;
1539 
1540       case '>':
1541          ret_val = emit_node (EOWORD);
1542          break;
1543 
1544       case '.':
1545          if (Match_Newline) {
1546             ret_val = emit_node (EVERY);
1547          } else {
1548             ret_val = emit_node (ANY);
1549          }
1550 
1551          *flag_param |= (HAS_WIDTH | SIMPLE);
1552 	 range_param->lower = 1;
1553 	 range_param->upper = 1;
1554 	 break;
1555 
1556       case '(':
1557          if (*Reg_Parse == '?') {  /* Special parenthetical expression */
1558             Reg_Parse++;
1559 	    range_local.lower = 0; /* Make sure it is always used */
1560 	    range_local.upper = 0;
1561 
1562             if (*Reg_Parse == ':') {
1563                Reg_Parse++;
1564                ret_val = chunk (NO_CAPTURE, &flags_local, &range_local);
1565             } else if (*Reg_Parse == '=') {
1566                Reg_Parse++;
1567                ret_val = chunk (POS_AHEAD_OPEN, &flags_local, &range_local);
1568             } else if (*Reg_Parse == '!') {
1569                Reg_Parse++;
1570                ret_val = chunk (NEG_AHEAD_OPEN, &flags_local, &range_local);
1571             } else if (*Reg_Parse == 'i') {
1572                Reg_Parse++;
1573                ret_val = chunk (INSENSITIVE, &flags_local, &range_local);
1574             } else if (*Reg_Parse == 'I') {
1575                Reg_Parse++;
1576                ret_val = chunk (SENSITIVE, &flags_local, &range_local);
1577             } else if (*Reg_Parse == 'n') {
1578                Reg_Parse++;
1579                ret_val = chunk (NEWLINE, &flags_local, &range_local);
1580             } else if (*Reg_Parse == 'N') {
1581                Reg_Parse++;
1582                ret_val = chunk (NO_NEWLINE, &flags_local, &range_local);
1583             } else if (*Reg_Parse == '<') {
1584                Reg_Parse++;
1585 	       if (*Reg_Parse == '=') {
1586 	          Reg_Parse++;
1587                   ret_val = chunk (POS_BEHIND_OPEN, &flags_local, &range_local);
1588 	       } else if (*Reg_Parse == '!') {
1589 	          Reg_Parse++;
1590                   ret_val = chunk (NEG_BEHIND_OPEN, &flags_local, &range_local);
1591 	       } else {
1592                   sprintf (Error_Text,
1593                            "invalid look-behind syntax, \"(?<%c...)\"",
1594                            *Reg_Parse);
1595 
1596                   REG_FAIL (Error_Text);
1597 	       }
1598             } else {
1599                sprintf (Error_Text,
1600                         "invalid grouping syntax, \"(?%c...)\"",
1601                         *Reg_Parse);
1602 
1603                REG_FAIL (Error_Text);
1604             }
1605          } else { /* Normal capturing parentheses */
1606             ret_val = chunk (PAREN, &flags_local, &range_local);
1607          }
1608 
1609          if (ret_val == NULL) return (NULL);  /* Something went wrong. */
1610 
1611          /* Add HAS_WIDTH flag if it was set by call to chunk. */
1612 
1613          *flag_param |= flags_local & HAS_WIDTH;
1614 	 *range_param = range_local;
1615 
1616          break;
1617 
1618       case '\0':
1619       case '|':
1620       case ')':
1621          REG_FAIL ("internal error #3, `atom\'");  /* Supposed to be  */
1622                                                    /* caught earlier. */
1623       case '?':
1624       case '+':
1625       case '*':
1626          sprintf (Error_Text, "%c follows nothing", *(Reg_Parse - 1));
1627          REG_FAIL (Error_Text);
1628 
1629       case '{':
1630          if (Enable_Counting_Quantifier) {
1631             REG_FAIL ("{m,n} follows nothing");
1632          } else {
1633             ret_val = emit_node (EXACTLY); /* Treat braces as literals. */
1634             emit_byte ('{');
1635             emit_byte ('\0');
1636 	    range_param->lower = 1;
1637 	    range_param->upper = 1;
1638          }
1639 
1640          break;
1641 
1642       case '[':
1643          {
1644             register unsigned int  second_value;
1645             register unsigned int  last_value;
1646                      unsigned char last_emit = 0;
1647 
1648             /* Handle characters that can only occur at the start of a class. */
1649 
1650             if (*Reg_Parse == '^') { /* Complement of range. */
1651                ret_val = emit_node (ANY_BUT);
1652                Reg_Parse++;
1653 
1654                /* All negated classes include newline unless escaped with
1655                   a "(?n)" switch. */
1656 
1657                if (!Match_Newline) emit_byte ('\n');
1658             } else {
1659                ret_val = emit_node (ANY_OF);
1660             }
1661 
1662             if (*Reg_Parse == ']' || *Reg_Parse == '-') {
1663                /* If '-' or ']' is the first character in a class,
1664                   it is a literal character in the class. */
1665 
1666                last_emit = *Reg_Parse;
1667                emit_byte (*Reg_Parse);
1668                Reg_Parse++;
1669             }
1670 
1671             /* Handle the rest of the class characters. */
1672 
1673             while (*Reg_Parse != '\0' && *Reg_Parse != ']') {
1674                if (*Reg_Parse == '-') { /* Process a range, e.g [a-z]. */
1675                   Reg_Parse++;
1676 
1677                   if (*Reg_Parse == ']' || *Reg_Parse == '\0') {
1678                      /* If '-' is the last character in a class it is a literal
1679                         character.  If `Reg_Parse' points to the end of the
1680                         regex string, an error will be generated later. */
1681 
1682                      emit_byte ('-');
1683                      last_emit = '-';
1684                   } else {
1685                      /* We must get the range starting character value from the
1686                         emitted code since it may have been an escaped
1687                         character.  `second_value' is set one larger than the
1688                         just emitted character value.  This is done since
1689                         `second_value' is used as the start value for the loop
1690                         that emits the values in the range.  Since we have
1691                         already emitted the first character of the class, we do
1692                         not want to emit it again. */
1693 
1694                      second_value = ((unsigned int) last_emit) + 1;
1695 
1696                      if (*Reg_Parse == '\\') {
1697                         /* Handle escaped characters within a class range.
1698                            Specifically disallow shortcut escapes as the end of
1699                            a class range.  To allow this would be ambiguous
1700                            since shortcut escapes represent a set of characters,
1701                            and it would not be clear which character of the
1702                            class should be treated as the "last" character. */
1703 
1704                         Reg_Parse++;
1705 
1706                         if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1707                            last_value = (unsigned int) test;
1708                         } else if ((test = literal_escape (*Reg_Parse))) {
1709                            last_value = (unsigned int) test;
1710                         } else if (shortcut_escape (*Reg_Parse,
1711                                                     NULL,
1712                                                     CHECK_CLASS_ESCAPE)) {
1713                            sprintf (Error_Text,
1714                                     "\\%c is not allowed as range operand",
1715                                     *Reg_Parse);
1716 
1717                            REG_FAIL (Error_Text);
1718                         } else {
1719                            sprintf (
1720                               Error_Text,
1721                               "\\%c is an invalid char class escape sequence",
1722                               *Reg_Parse);
1723 
1724                            REG_FAIL (Error_Text);
1725                         }
1726                      } else {
1727                         last_value = U_CHAR_AT (Reg_Parse);
1728                      }
1729 
1730                      if (Is_Case_Insensitive) {
1731                         second_value =
1732                            (unsigned int) tolower ((int) second_value);
1733                         last_value =
1734                            (unsigned int) tolower ((int) last_value);
1735                      }
1736 
1737                      /* For case insensitive, something like [A-_] will
1738                         generate an error here since ranges are converted to
1739                         lower case. */
1740 
1741                      if (second_value - 1 > last_value) {
1742                         REG_FAIL ("invalid [] range");
1743                      }
1744 
1745                      /* If only one character in range (e.g [a-a]) then this
1746                         loop is not run since the first character of any range
1747                         was emitted by the previous iteration of while loop. */
1748 
1749                      for (; second_value <= last_value; second_value++) {
1750                         emit_class_byte (second_value);
1751                      }
1752 
1753                      last_emit = (unsigned char) last_value;
1754 
1755                      Reg_Parse++;
1756 
1757                   } /* End class character range code. */
1758                } else if (*Reg_Parse == '\\') {
1759                   Reg_Parse++;
1760 
1761                   if ((test = numeric_escape (*Reg_Parse, &Reg_Parse)) != '\0') {
1762                      emit_class_byte (test);
1763 
1764                      last_emit = test;
1765                   } else if ((test = literal_escape (*Reg_Parse)) != '\0') {
1766                      emit_byte (test);
1767                      last_emit = test;
1768                   } else if (shortcut_escape (*Reg_Parse,
1769                                                NULL,
1770                                                CHECK_CLASS_ESCAPE)) {
1771 
1772                      if (*(Reg_Parse + 1) == '-') {
1773                         /* Specifically disallow shortcut escapes as the start
1774                            of a character class range (see comment above.) */
1775 
1776                         sprintf (Error_Text,
1777                                  "\\%c not allowed as range operand",
1778                                  *Reg_Parse);
1779 
1780                         REG_FAIL (Error_Text);
1781                      } else {
1782                         /* Emit the bytes that are part of the shortcut
1783                            escape sequence's range (e.g. \d = 0123456789) */
1784 
1785                         shortcut_escape (*Reg_Parse, NULL, EMIT_CLASS_BYTES);
1786                      }
1787                   } else {
1788                      sprintf (Error_Text,
1789                               "\\%c is an invalid char class escape sequence",
1790                               *Reg_Parse);
1791 
1792                      REG_FAIL (Error_Text);
1793                   }
1794 
1795                   Reg_Parse++;
1796 
1797                   /* End of class escaped sequence code */
1798                } else {
1799                   emit_class_byte (*Reg_Parse); /* Ordinary class character. */
1800 
1801                   last_emit = *Reg_Parse;
1802                   Reg_Parse++;
1803                }
1804             } /* End of while (*Reg_Parse != '\0' && *Reg_Parse != ']') */
1805 
1806             if (*Reg_Parse != ']') REG_FAIL ("missing right \']\'");
1807 
1808             emit_byte('\0');
1809 
1810             /* NOTE: it is impossible to specify an empty class.  This is
1811                because [] would be interpreted as "begin character class"
1812                followed by a literal ']' character and no "end character class"
1813                delimiter (']').  Because of this, it is always safe to assume
1814                that a class HAS_WIDTH. */
1815 
1816             Reg_Parse++;
1817 	    *flag_param |= HAS_WIDTH | SIMPLE;
1818 	    range_param->lower = 1;
1819 	    range_param->upper = 1;
1820          }
1821 
1822          break; /* End of character class code. */
1823 
1824       case '\\':
1825          /* Force Error_Text to have a length of zero.  This way we can tell if
1826             either of the calls to shortcut_escape() or back_ref() fill
1827             Error_Text with an error message. */
1828 
1829          Error_Text [0] = '\0';
1830 
1831          if ((ret_val = shortcut_escape (*Reg_Parse, flag_param, EMIT_NODE))) {
1832 
1833             Reg_Parse++;
1834 	    range_param->lower = 1;
1835 	    range_param->upper = 1;
1836             break;
1837 
1838          } else if ((ret_val = back_ref (Reg_Parse, flag_param, EMIT_NODE))) {
1839             /* Can't make any assumptions about a back-reference as to SIMPLE
1840                or HAS_WIDTH.  For example (^|<) is neither simple nor has
1841                width.  So we don't flip bits in flag_param here. */
1842 
1843             Reg_Parse++;
1844             /* Back-references always have an unknown length */
1845 	    range_param->lower = -1;
1846 	    range_param->upper = -1;
1847 	    break;
1848          }
1849 
1850          if (strlen (Error_Text) > 0) REG_FAIL (Error_Text);
1851 
1852          /* At this point it is apparent that the escaped character is not a
1853             shortcut escape or back-reference.  Back up one character to allow
1854             the default code to include it as an ordinary character. */
1855 
1856          /* Fall through to Default case to handle literal escapes and numeric
1857             escapes. */
1858 
1859       default:
1860          Reg_Parse--; /* If we fell through from the above code, we are now
1861                          pointing at the back slash (\) character. */
1862          {
1863             unsigned char *parse_save;
1864                      int   len = 0;
1865 
1866             if (Is_Case_Insensitive) {
1867                ret_val = emit_node (SIMILAR);
1868             } else {
1869                ret_val = emit_node (EXACTLY);
1870             }
1871 
1872             /* Loop until we find a meta character, shortcut escape, back
1873                reference, or end of regex string. */
1874 
1875             for (; *Reg_Parse != '\0' &&
1876                    !strchr ((char *) Meta_Char, (int) *Reg_Parse);
1877                  len++) {
1878 
1879                /* Save where we are in case we have to back
1880                   this character out. */
1881 
1882                parse_save = Reg_Parse;
1883 
1884                if (*Reg_Parse == '\\') {
1885                   Reg_Parse++; /* Point to escaped character */
1886 
1887                   Error_Text [0] = '\0';  /* See comment above. */
1888 
1889                   if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1890                      if (Is_Case_Insensitive) {
1891                         emit_byte (tolower (test));
1892                      } else {
1893                         emit_byte (test);
1894                      }
1895                   } else if ((test = literal_escape (*Reg_Parse))) {
1896                      emit_byte (test);
1897                   } else if (back_ref (Reg_Parse, NULL, CHECK_ESCAPE)) {
1898                      /* Leave back reference for next `atom' call */
1899 
1900                      Reg_Parse--; break;
1901                   } else if (shortcut_escape (*Reg_Parse, NULL, CHECK_ESCAPE)) {
1902                      /* Leave shortcut escape for next `atom' call */
1903 
1904                      Reg_Parse--; break;
1905                   } else {
1906                      if (strlen (Error_Text) == 0) {
1907                         /* None of the above calls generated an error message
1908                            so generate our own here. */
1909 
1910                         sprintf (Error_Text,
1911                                  "\\%c is an invalid escape sequence",
1912                                  *Reg_Parse);
1913                      }
1914 
1915                      REG_FAIL (Error_Text);
1916                   }
1917 
1918                   Reg_Parse++;
1919                } else {
1920                   /* Ordinary character */
1921 
1922                   if (Is_Case_Insensitive) {
1923                      emit_byte (tolower (*Reg_Parse));
1924                   } else {
1925                      emit_byte (*Reg_Parse);
1926                   }
1927 
1928                   Reg_Parse++;
1929                }
1930 
1931                /* If next regex token is a quantifier (?, +. *, or {m,n}) and
1932                   our EXACTLY node so far is more than one character, leave the
1933                   last character to be made into an EXACTLY node one character
1934                   wide for the multiplier to act on.  For example 'abcd* would
1935                   have an EXACTLY node with an 'abc' operand followed by a STAR
1936                   node followed by another EXACTLY node with a 'd' operand. */
1937 
1938                if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
1939                   Reg_Parse = parse_save; /* Point to previous regex token. */
1940 
1941                   if (Code_Emit_Ptr == &Compute_Size) {
1942                      Reg_Size--;
1943                   } else {
1944                      Code_Emit_Ptr--; /* Write over previously emitted byte. */
1945                   }
1946 
1947                   break;
1948                }
1949             }
1950 
1951             if (len <= 0) REG_FAIL ("internal error #4, `atom\'");
1952 
1953             *flag_param |= HAS_WIDTH;
1954 
1955             if (len == 1) *flag_param |= SIMPLE;
1956 
1957 	    range_param->lower = len;
1958 	    range_param->upper = len;
1959 
1960             emit_byte ('\0');
1961          }
1962       } /* END switch (*Reg_Parse++) */
1963 
1964    return (ret_val);
1965 }
1966 
1967 /*----------------------------------------------------------------------*
1968  * emit_node
1969  *
1970  * Emit (if appropriate) the op code for a regex node atom.
1971  *
1972  * The NEXT pointer is initialized to NULL.
1973  *
1974  * Returns a pointer to the START of the emitted node.
1975  *----------------------------------------------------------------------*/
1976 
emit_node(int op_code)1977 static unsigned char * emit_node (int op_code) {
1978 
1979    register unsigned char *ret_val;
1980    register unsigned char *ptr;
1981 
1982    ret_val = Code_Emit_Ptr; /* Return address of start of node */
1983 
1984    if (ret_val == &Compute_Size) {
1985       Reg_Size += NODE_SIZE;
1986    } else {
1987        ptr   = ret_val;
1988       *ptr++ = (unsigned char) op_code;
1989       *ptr++ = '\0'; /* Null "NEXT" pointer. */
1990       *ptr++ = '\0';
1991 
1992       Code_Emit_Ptr = ptr;
1993    }
1994 
1995    return (ret_val);
1996 }
1997 
1998 /*----------------------------------------------------------------------*
1999  * emit_byte
2000  *
2001  * Emit (if appropriate) a byte of code (usually part of an operand.)
2002  *----------------------------------------------------------------------*/
2003 
emit_byte(unsigned char c)2004 static void emit_byte (unsigned char c) {
2005 
2006    if (Code_Emit_Ptr == &Compute_Size) {
2007       Reg_Size++;
2008    } else {
2009       *Code_Emit_Ptr++ = c;
2010    }
2011 }
2012 
2013 /*----------------------------------------------------------------------*
2014  * emit_class_byte
2015  *
2016  * Emit (if appropriate) a byte of code (usually part of a character
2017  * class operand.)
2018  *----------------------------------------------------------------------*/
2019 
emit_class_byte(unsigned char c)2020 static void emit_class_byte (unsigned char c) {
2021 
2022    if (Code_Emit_Ptr == &Compute_Size) {
2023       Reg_Size++;
2024 
2025       if (Is_Case_Insensitive && isalpha (c)) Reg_Size++;
2026    } else if (Is_Case_Insensitive && isalpha (c)) {
2027       /* For case insensitive character classes, emit both upper and lower case
2028          versions of alphabetical characters. */
2029 
2030       *Code_Emit_Ptr++ = tolower (c);
2031       *Code_Emit_Ptr++ = toupper (c);
2032    } else {
2033       *Code_Emit_Ptr++ = c;
2034    }
2035 }
2036 
2037 /*----------------------------------------------------------------------*
2038  * emit_special
2039  *
2040  * Emit nodes that need special processing.
2041  *----------------------------------------------------------------------*/
2042 
emit_special(unsigned char op_code,unsigned long test_val,int index)2043 static unsigned char * emit_special (
2044    unsigned char op_code,
2045    unsigned long test_val,
2046             int  index) {
2047 
2048    register unsigned char *ret_val = &Compute_Size;
2049    register unsigned char *ptr;
2050 
2051    if (Code_Emit_Ptr == &Compute_Size) {
2052       switch (op_code) {
2053 	 case POS_BEHIND_OPEN:
2054 	 case NEG_BEHIND_OPEN:
2055 	    Reg_Size += LENGTH_SIZE;   /* Length of the look-behind match */
2056 	    Reg_Size += NODE_SIZE;     /* Make room for the node */
2057 	    break;
2058 
2059          case TEST_COUNT:
2060             Reg_Size += NEXT_PTR_SIZE; /* Make room for a test value. */
2061 
2062          case INC_COUNT:
2063             Reg_Size += INDEX_SIZE;    /* Make room for an index value. */
2064 
2065          default:
2066             Reg_Size += NODE_SIZE;     /* Make room for the node. */
2067       }
2068    } else {
2069       ret_val = emit_node (op_code); /* Return the address for start of node. */
2070       ptr     = Code_Emit_Ptr;
2071 
2072       if (op_code == INC_COUNT || op_code == TEST_COUNT) {
2073          *ptr++ = (unsigned char) index;
2074 
2075          if (op_code == TEST_COUNT) {
2076             *ptr++ = PUT_OFFSET_L (test_val);
2077             *ptr++ = PUT_OFFSET_R (test_val);
2078          }
2079       } else if (op_code == POS_BEHIND_OPEN || op_code == NEG_BEHIND_OPEN) {
2080          *ptr++ = PUT_OFFSET_L (test_val);
2081          *ptr++ = PUT_OFFSET_R (test_val);
2082          *ptr++ = PUT_OFFSET_L (test_val);
2083          *ptr++ = PUT_OFFSET_R (test_val);
2084       }
2085 
2086       Code_Emit_Ptr = ptr;
2087    }
2088 
2089    return (ret_val);
2090 }
2091 
2092 /*----------------------------------------------------------------------*
2093  * insert
2094  *
2095  * Insert a node in front of already emitted node(s).  Means relocating
2096  * the operand.  Code_Emit_Ptr points one byte past the just emitted
2097  * node and operand.  The parameter `insert_pos' points to the location
2098  * where the new node is to be inserted.
2099  *----------------------------------------------------------------------*/
2100 
insert(unsigned char op,unsigned char * insert_pos,long min,long max,int index)2101 static unsigned char * insert (
2102    unsigned char  op,
2103    unsigned char *insert_pos,
2104    long           min,
2105    long           max,
2106    int            index) {
2107 
2108    register unsigned char *src;
2109    register unsigned char *dst;
2110             unsigned char *place;
2111                      int   insert_size = NODE_SIZE;
2112 
2113    if (op == BRACE || op == LAZY_BRACE) {
2114       /* Make room for the min and max values. */
2115 
2116       insert_size += (2 * NEXT_PTR_SIZE);
2117    } else if (op == INIT_COUNT) {
2118       /* Make room for an index value . */
2119 
2120       insert_size += INDEX_SIZE;
2121    }
2122 
2123    if (Code_Emit_Ptr == &Compute_Size) {
2124       Reg_Size += insert_size;
2125       return &Compute_Size;
2126    }
2127 
2128    src            = Code_Emit_Ptr;
2129    Code_Emit_Ptr += insert_size;
2130    dst            = Code_Emit_Ptr;
2131 
2132    /* Relocate the existing emitted code to make room for the new node. */
2133 
2134    while (src > insert_pos) *--dst = *--src;
2135 
2136     place   = insert_pos;   /* Where operand used to be. */
2137    *place++ = op;           /* Inserted operand. */
2138    *place++ = '\0';         /* NEXT pointer for inserted operand. */
2139    *place++ = '\0';
2140 
2141    if (op == BRACE || op == LAZY_BRACE) {
2142       *place++ = PUT_OFFSET_L (min);
2143       *place++ = PUT_OFFSET_R (min);
2144 
2145       *place++ = PUT_OFFSET_L (max);
2146       *place++ = PUT_OFFSET_R (max);
2147    } else if (op == INIT_COUNT) {
2148       *place++ = (unsigned char) index;
2149    }
2150 
2151    return place; /* Return a pointer to the start of the code moved. */
2152 }
2153 
2154 /*----------------------------------------------------------------------*
2155  * tail - Set the next-pointer at the end of a node chain.
2156  *----------------------------------------------------------------------*/
2157 
tail(unsigned char * search_from,unsigned char * point_to)2158 static void tail (unsigned char *search_from, unsigned char *point_to) {
2159 
2160    register unsigned char *scan;
2161    register unsigned char *next;
2162    register          int   offset;
2163 
2164    if (search_from == &Compute_Size) return;
2165 
2166    /* Find the last node in the chain (node with a null NEXT pointer) */
2167 
2168    scan = search_from;
2169 
2170    for (;;) {
2171       next = next_ptr (scan);
2172 
2173       if (!next) break;
2174 
2175       scan = next;
2176    }
2177 
2178    if (GET_OP_CODE (scan) == BACK) {
2179       offset = scan - point_to;
2180    } else {
2181       offset = point_to - scan;
2182    }
2183 
2184    /* Set NEXT pointer */
2185 
2186    *(scan + 1) = PUT_OFFSET_L (offset);
2187    *(scan + 2) = PUT_OFFSET_R (offset);
2188 }
2189 
2190 /*--------------------------------------------------------------------*
2191  * offset_tail
2192  *
2193  * Perform a tail operation on (ptr + offset).
2194  *--------------------------------------------------------------------*/
2195 
offset_tail(unsigned char * ptr,int offset,unsigned char * val)2196 static void offset_tail (unsigned char *ptr, int offset, unsigned char *val) {
2197 
2198    if (ptr == &Compute_Size || ptr == NULL) return;
2199 
2200    tail (ptr + offset, val);
2201 }
2202 
2203 /*--------------------------------------------------------------------*
2204  * branch_tail
2205  *
2206  * Perform a tail operation on (ptr + offset) but only if `ptr' is a
2207  * BRANCH node.
2208  *--------------------------------------------------------------------*/
2209 
branch_tail(unsigned char * ptr,int offset,unsigned char * val)2210 static void branch_tail (unsigned char *ptr, int offset, unsigned char *val) {
2211 
2212    if (ptr == &Compute_Size || ptr == NULL ||GET_OP_CODE (ptr) != BRANCH) {
2213       return;
2214    }
2215 
2216    tail (ptr + offset, val);
2217 }
2218 
2219 /*--------------------------------------------------------------------*
2220  * shortcut_escape
2221  *
2222  * Implements convenient escape sequences that represent entire
2223  * character classes or special location assertions (similar to escapes
2224  * supported by Perl)
2225  *                                                  _
2226  *    \d     Digits                  [0-9]           |
2227  *    \D     NOT a digit             [^0-9]          | (Examples
2228  *    \l     Letters                 [a-zA-Z]        |  at left
2229  *    \L     NOT a Letter            [^a-zA-Z]       |    are
2230  *    \s     Whitespace              [ \t\n\r\f\v]   |    for
2231  *    \S     NOT Whitespace          [^ \t\n\r\f\v]  |     C
2232  *    \w     "Word" character        [a-zA-Z0-9_]    |   Locale)
2233  *    \W     NOT a "Word" character  [^a-zA-Z0-9_]  _|
2234  *
2235  *    \B     Matches any character that is NOT a word-delimiter
2236  *
2237  *    Codes for the "emit" parameter:
2238  *
2239  *    EMIT_NODE
2240  *       Emit a shortcut node.  Shortcut nodes have an implied set of
2241  *       class characters.  This helps keep the compiled regex string
2242  *       small.
2243  *
2244  *    EMIT_CLASS_BYTES
2245  *       Emit just the equivalent characters of the class.  This makes
2246  *       the escape usable from within a class, e.g. [a-fA-F\d].  Only
2247  *       \d, \D, \s, \S, \w, and \W can be used within a class.
2248  *
2249  *    CHECK_ESCAPE
2250  *       Only verify that this is a valid shortcut escape.
2251  *
2252  *    CHECK_CLASS_ESCAPE
2253  *       Same as CHECK_ESCAPE but only allows characters valid within
2254  *       a class.
2255  *
2256  *--------------------------------------------------------------------*/
2257 
shortcut_escape(unsigned char c,int * flag_param,int emit)2258 static unsigned char * shortcut_escape (
2259    unsigned char  c,
2260    int           *flag_param,
2261    int            emit) {
2262 
2263    register unsigned char *class   = NULL;
2264    static   unsigned char *codes   = (unsigned char *) "ByYdDlLsSwW";
2265             unsigned char *ret_val = (unsigned char *) 1; /* Assume success. */
2266             unsigned char *valid_codes;
2267 
2268    if (emit == EMIT_CLASS_BYTES || emit == CHECK_CLASS_ESCAPE) {
2269       valid_codes = codes + 3; /* \B, \y and \Y are not allowed in classes */
2270    } else {
2271       valid_codes = codes;
2272    }
2273 
2274    if (!strchr ((char *) valid_codes, (int) c)) {
2275       return NULL; /* Not a valid shortcut escape sequence */
2276    } else if (emit == CHECK_ESCAPE || emit == CHECK_CLASS_ESCAPE) {
2277       return ret_val; /* Just checking if this is a valid shortcut escape. */
2278    }
2279 
2280    switch (c) {
2281       case 'd':
2282       case 'D':
2283          if (emit == EMIT_CLASS_BYTES) {
2284             class = ASCII_Digits;
2285          } else if (emit == EMIT_NODE) {
2286             ret_val = (islower (c) ? emit_node (DIGIT)
2287                                    : emit_node (NOT_DIGIT));
2288          }
2289 
2290          break;
2291 
2292       case 'l':
2293       case 'L':
2294          if (emit == EMIT_CLASS_BYTES) {
2295             class = Letter_Char;
2296          } else if (emit == EMIT_NODE) {
2297             ret_val = (islower (c) ? emit_node (LETTER)
2298                                    : emit_node (NOT_LETTER));
2299          }
2300 
2301          break;
2302 
2303       case 's':
2304       case 'S':
2305          if (emit == EMIT_CLASS_BYTES) {
2306             if (Match_Newline) emit_byte ('\n');
2307 
2308             class = White_Space;
2309          } else if (emit == EMIT_NODE) {
2310             if (Match_Newline) {
2311                ret_val = (islower (c) ? emit_node (SPACE_NL)
2312                                       : emit_node (NOT_SPACE_NL));
2313             } else {
2314                ret_val = (islower (c) ? emit_node (SPACE)
2315                                       : emit_node (NOT_SPACE));
2316             }
2317          }
2318 
2319          break;
2320 
2321       case 'w':
2322       case 'W':
2323          if (emit == EMIT_CLASS_BYTES) {
2324             class = Word_Char;
2325          } else if (emit == EMIT_NODE) {
2326             ret_val = (islower (c) ? emit_node (WORD_CHAR)
2327                                    : emit_node (NOT_WORD_CHAR));
2328          }
2329 
2330          break;
2331 
2332       /* Since the delimiter table is not available at regex compile time \B,
2333          \Y and \Y can only generate a node.  At run time, the delimiter table
2334          will be available for these nodes to use. */
2335 
2336       case 'y':
2337 
2338          if (emit == EMIT_NODE) {
2339             ret_val = emit_node (IS_DELIM);
2340          } else {
2341             REG_FAIL ("internal error #5 `shortcut_escape\'");
2342          }
2343 
2344          break;
2345 
2346       case 'Y':
2347 
2348          if (emit == EMIT_NODE) {
2349             ret_val = emit_node (NOT_DELIM);
2350          } else {
2351             REG_FAIL ("internal error #6 `shortcut_escape\'");
2352          }
2353 
2354          break;
2355 
2356       case 'B':
2357 
2358          if (emit == EMIT_NODE) {
2359             ret_val = emit_node (NOT_BOUNDARY);
2360          } else {
2361             REG_FAIL ("internal error #7 `shortcut_escape\'");
2362          }
2363 
2364          break;
2365 
2366       default:
2367          /* We get here if there isn't a case for every character in
2368             the string "codes" */
2369 
2370          REG_FAIL ("internal error #8 `shortcut_escape\'");
2371    }
2372 
2373    if (emit == EMIT_NODE  &&  c != 'B') {
2374       *flag_param |= (HAS_WIDTH | SIMPLE);
2375    }
2376 
2377    if (class) {
2378       /* Emit bytes within a character class operand. */
2379 
2380       while (*class != '\0') {
2381          emit_byte (*class++);
2382       }
2383    }
2384 
2385    return ret_val;
2386 }
2387 
2388 /*--------------------------------------------------------------------*
2389  * numeric_escape
2390  *
2391  * Implements hex and octal numeric escape sequence syntax.
2392  *
2393  * Hexadecimal Escape: \x##    Max of two digits  Must have leading 'x'.
2394  * Octal Escape:       \0###   Max of three digits and not greater
2395  *                             than 377 octal.  Must have leading zero.
2396  *
2397  * Returns the actual character value or NULL if not a valid hex or
2398  * octal escape.  REG_FAIL is called if \x0, \x00, \0, \00, \000, or
2399  * \0000 is specified.
2400  *--------------------------------------------------------------------*/
2401 
numeric_escape(unsigned char c,unsigned char ** parse)2402 static unsigned char numeric_escape (
2403    unsigned char    c,
2404    unsigned char  **parse) {
2405 
2406    static unsigned char digits [] = "fedcbaFEDCBA9876543210";
2407 
2408    static unsigned int digit_val [] = {
2409       15, 14, 13, 12, 11, 10,                  /* Lower case Hex digits */
2410       15, 14, 13, 12, 11, 10,                  /* Upper case Hex digits */
2411        9,  8,  7,  6,  5,  4,  3,  2,  1,  0}; /* Decimal Digits */
2412 
2413    register unsigned char *scan;
2414    register unsigned char *pos_ptr;
2415    register unsigned char *digit_str;
2416             unsigned int   value     =  0;
2417             unsigned int   radix     =  8;
2418                      int   width     =  3; /* Can not be bigger than \0377 */
2419                      int   pos_delta = 14;
2420                      int   i, pos;
2421 
2422    switch (c) {
2423       case '0':
2424          digit_str = digits + pos_delta; /* Only use Octal digits, i.e. 0-7. */
2425 
2426          break;
2427 
2428       case 'x':
2429       case 'X':
2430          width     =  2;     /* Can not be bigger than \0377 */
2431          radix     = 16;
2432          pos_delta =  0;
2433          digit_str = digits; /* Use all of the digit characters. */
2434 
2435          break;
2436 
2437       default:
2438          return ('\0'); /* Not a numeric escape */
2439    }
2440 
2441    scan = *parse; scan++; /* Only change *parse on success. */
2442 
2443    pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2444 
2445    for (i = 0; pos_ptr != NULL && (i < width); i++) {
2446       pos   = (pos_ptr - digit_str) + pos_delta;
2447       value = (value * radix) + digit_val [pos];
2448 
2449       /* If this digit makes the value over 255, treat this digit as a literal
2450          character instead of part of the numeric escape.  For example, \0777
2451          will be processed as \077 (an 'M') and a literal '7' character, NOT
2452          511 decimal which is > 255. */
2453 
2454       if (value > 255) {
2455          /* Back out calculations for last digit processed. */
2456 
2457          value -= digit_val [pos];
2458          value /= radix;
2459 
2460          break; /* Note that scan will not be incremented and still points to
2461                    the digit that caused overflow.  It will be decremented by
2462                    the "else" below to point to the last character that is
2463                    considered to be part of the octal escape. */
2464       }
2465 
2466       scan++;
2467       pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2468    }
2469 
2470    /* Handle the case of "\0" i.e. trying to specify a NULL character. */
2471 
2472    if (value == 0) {
2473       if (c == '0') {
2474          sprintf (Error_Text, "\\00 is an invalid octal escape");
2475       } else {
2476          sprintf (Error_Text, "\\%c0 is an invalid hexadecimal escape", c);
2477       }
2478    } else {
2479       /* Point to the last character of the number on success. */
2480 
2481       scan--;
2482       *parse = scan;
2483    }
2484 
2485    return (unsigned char) value;
2486 }
2487 
2488 /*--------------------------------------------------------------------*
2489  * literal_escape
2490  *
2491  * Recognize escaped literal characters (prefixed with backslash),
2492  * and translate them into the corresponding character.
2493  *
2494  * Returns the proper character value or NULL if not a valid literal
2495  * escape.
2496  *--------------------------------------------------------------------*/
2497 
literal_escape(unsigned char c)2498 static unsigned char literal_escape (unsigned char c) {
2499 
2500    static unsigned char valid_escape [] =  {
2501       'a',   'b',
2502       'e',
2503       'f',   'n',   'r',   't',   'v',   '(',    ')',   '-',   '[',   ']',
2504       '<',   '>',   '{',   '}',   '.',   '\\',   '|',   '^',   '$',   '*',
2505       '+',   '?',   '&',   '\0'
2506    };
2507 
2508    static unsigned char value [] = {
2509       '\a',  '\b',
2510 #ifdef EBCDIC_CHARSET
2511       0x27,  /* Escape character in IBM's EBCDIC character set. */
2512 #else
2513       0x1B,  /* Escape character in ASCII character set. */
2514 #endif
2515       '\f',  '\n',  '\r',  '\t',  '\v',  '(',    ')',   '-',   '[',   ']',
2516       '<',   '>',   '{',   '}',   '.',   '\\',   '|',   '^',   '$',   '*',
2517       '+',   '?',   '&',   '\0'
2518    };
2519 
2520    int i;
2521 
2522    for (i = 0; valid_escape [i] != '\0'; i++) {
2523       if (c == valid_escape [i]) return value [i];
2524    }
2525 
2526    return '\0';
2527 }
2528 
2529 /*--------------------------------------------------------------------*
2530  * back_ref
2531  *
2532  * Process a request to match a previous parenthesized thing.
2533  * Parenthetical entities are numbered beginning at 1 by counting
2534  * opening parentheses from left to to right.  \0 would represent
2535  * whole match, but would confuse numeric_escape as an octal escape,
2536  * so it is forbidden.
2537  *
2538  * Constructs of the form \~1, \~2, etc. are cross-regex back
2539  * references and are used in syntax highlighting patterns to match
2540  * text previously matched by another regex. *** IMPLEMENT LATER ***
2541  *--------------------------------------------------------------------*/
2542 
back_ref(unsigned char * c,int * flag_param,int emit)2543 static unsigned char * back_ref (
2544    unsigned char *c,
2545    int           *flag_param,
2546    int            emit) {
2547 
2548    int  paren_no, c_offset = 0, is_cross_regex = 0;
2549 
2550    unsigned char *ret_val;
2551 
2552    /* Implement cross regex backreferences later. */
2553 
2554    /* if (*c == (unsigned char) ('~')) {
2555       c_offset++;
2556       is_cross_regex++;
2557    } */
2558 
2559    paren_no = (int) (*(c + c_offset) - (unsigned char) ('0'));
2560 
2561    if (!isdigit (*(c + c_offset)) || /* Only \1, \2, ... \9 are supported.  */
2562        paren_no == 0) {              /* Should be caught by numeric_escape. */
2563 
2564       return NULL;
2565    }
2566 
2567    /* Make sure parentheses for requested back-reference are complete. */
2568 
2569    if (!is_cross_regex && !TEST_BIT (Closed_Parens, paren_no)) {
2570       sprintf (Error_Text, "\\%d is an illegal back reference", paren_no);
2571       return NULL;
2572    }
2573 
2574    if (emit == EMIT_NODE) {
2575       if (is_cross_regex) {
2576          Reg_Parse++; /* Skip past the '~' in a cross regex back reference.
2577                          We only do this if we are emitting code. */
2578 
2579          if (Is_Case_Insensitive) {
2580             ret_val = emit_node (X_REGEX_BR_CI);
2581          } else {
2582             ret_val = emit_node (X_REGEX_BR);
2583          }
2584       } else {
2585          if (Is_Case_Insensitive) {
2586             ret_val = emit_node (BACK_REF_CI);
2587          } else {
2588             ret_val = emit_node (BACK_REF);
2589          }
2590       }
2591 
2592       emit_byte ((unsigned char) paren_no);
2593 
2594       if (is_cross_regex || TEST_BIT (Paren_Has_Width, paren_no)) {
2595          *flag_param |= HAS_WIDTH;
2596       }
2597    } else if (emit == CHECK_ESCAPE) {
2598       ret_val = (unsigned char *) 1;
2599    } else {
2600       ret_val = NULL;
2601    }
2602 
2603    return ret_val;
2604 }
2605 
2606 /*======================================================================*
2607  *  Regex execution related code
2608  *======================================================================*/
2609 
2610  /* Global work variables for `ExecRE'. */
2611 
2612 static unsigned char  *Reg_Input;           /* String-input pointer.         */
2613 static unsigned char  *Start_Of_String;     /* Beginning of input, for ^     */
2614                                             /* and < checks.                 */
2615 static unsigned char  *End_Of_String;       /* Logical end of input (if
2616    				               supplied, till \0 otherwise)  */
2617 static unsigned char  *Look_Behind_To;      /* Position till were look behind
2618                                                can safely check back         */
2619 static unsigned char **Start_Ptr_Ptr;       /* Pointer to `startp' array.    */
2620 static unsigned char **End_Ptr_Ptr;         /* Ditto for `endp'.             */
2621 static unsigned char  *Extent_Ptr_FW;       /* Forward extent pointer        */
2622 static unsigned char  *Extent_Ptr_BW;       /* Backward extent pointer       */
2623 static unsigned char  *Back_Ref_Start [10]; /* Back_Ref_Start [0] and        */
2624 static unsigned char  *Back_Ref_End   [10]; /* Back_Ref_End [0] are not      */
2625                                             /* used. This simplifies         */
2626                                             /* indexing.                     */
2627 /*
2628  * Measured recursion limits:
2629  *    Linux:      +/-  40 000 (up to 110 000)
2630  *    Solaris:    +/-  85 000
2631  *    HP-UX 11:   +/- 325 000
2632  *
2633  * So 10 000 ought to be safe.
2634  */
2635 #define REGEX_RECURSION_LIMIT 10000
2636 static int Recursion_Count;          /* Recursion counter */
2637 static int Recursion_Limit_Exceeded; /* Recursion limit exceeded flag */
2638 
2639 #define AT_END_OF_STRING(X) (*(X) == (unsigned char)'\0' ||\
2640                              (End_Of_String != NULL && (X) >= End_Of_String))
2641 
2642 /* static regexp *Cross_Regex_Backref; */
2643 
2644 static int Prev_Is_BOL;
2645 static int Succ_Is_EOL;
2646 static int Prev_Is_Delim;
2647 static int Succ_Is_Delim;
2648 
2649 /* Define a pointer to an array to hold general (...){m,n} counts. */
2650 
2651 typedef struct brace_counts {
2652     unsigned long count [1]; /* More unwarranted chumminess with compiler. */
2653 } brace_counts;
2654 
2655 static struct brace_counts *Brace;
2656 
2657 /* Default table for determining whether a character is a word delimiter. */
2658 
2659 static unsigned char  Default_Delimiters [UCHAR_MAX+1] = {0};
2660 
2661 static unsigned char *Current_Delimiters;  /* Current delimiter table */
2662 
2663 /* Forward declarations of functions used by `ExecRE' */
2664 
2665 static int             attempt            (regexp *, unsigned char *);
2666 static int             match              (unsigned char *, int *);
2667 static unsigned long   greedy             (unsigned char *, long);
2668 static void            adjustcase         (unsigned char *, int, unsigned char);
2669 static unsigned char * makeDelimiterTable (unsigned char *, unsigned char *);
2670 
2671 /*
2672  * ExecRE - match a `regexp' structure against a string
2673  *
2674  * If `end' is non-NULL, matches may not BEGIN past end, but may extend past
2675  * it.  If reverse is true, `end' must be specified, and searching begins at
2676  * `end'.  "isbol" should be set to true if the beginning of the string is the
2677  * actual beginning of a line (since `ExecRE' can't look backwards from the
2678  * beginning to find whether there was a newline before).  Likewise, "isbow"
2679  * asks whether the string is preceded by a word delimiter.  End of string is
2680  * always treated as a word and line boundary (there may be cases where it
2681  * shouldn't be, in which case, this should be changed).  "delimit" (if
2682  * non-null) specifies a null-terminated string of characters to be considered
2683  * word delimiters matching "<" and ">".  if "delimit" is NULL, the default
2684  * delimiters (as set in SetREDefaultWordDelimiters) are used.
2685  * Look_behind_to indicates the position till where it is safe to
2686  * perform look-behind matches. If set, it should be smaller than or equal
2687  * to the start position of the search (pointed at by string). If it is NULL,
2688  * it defaults to the start position.
2689  * Finally, match_to indicates the logical end of the string, till where
2690  * matches are allowed to extend. Note that look-ahead patterns may look
2691  * past that boundary. If match_to is set to NULL, the terminating \0 is
2692  * assumed to correspond to the logical boundary. Match_to, if set, must be
2693  * larger than or equal to end, if set.
2694  */
2695 
ExecRE(regexp * prog,const char * string,const char * end,int reverse,char prev_char,char succ_char,const char * delimiters,const char * look_behind_to,const char * match_to)2696 int ExecRE(regexp *prog, const char* string, const char* end, int reverse,
2697         char prev_char, char succ_char, const char* delimiters,
2698         const char* look_behind_to, const char* match_to)
2699 {
2700 
2701    register unsigned char  *str;
2702             unsigned char **s_ptr;
2703             unsigned char **e_ptr;
2704                      int    ret_val = 0;
2705             unsigned char   tempDelimitTable [256];
2706                      int    i;
2707 
2708    /* Check for valid parameters. */
2709 
2710    if (prog == NULL || string == NULL) {
2711       reg_error ("NULL parameter to `ExecRE\'");
2712       goto SINGLE_RETURN;
2713    }
2714 
2715    /* Check validity of program. */
2716 
2717    if (U_CHAR_AT (prog->program) != MAGIC) {
2718       reg_error ("corrupted program");
2719       goto SINGLE_RETURN;
2720    }
2721 
2722    s_ptr = (unsigned char **) prog->startp;
2723    e_ptr = (unsigned char **) prog->endp;
2724 
2725    /* If caller has supplied delimiters, make a delimiter table */
2726 
2727    if (delimiters == NULL) {
2728       Current_Delimiters = Default_Delimiters;
2729    } else {
2730       Current_Delimiters = makeDelimiterTable (
2731                               (unsigned char *) delimiters,
2732                               (unsigned char *) tempDelimitTable);
2733    }
2734 
2735    /* Remember the logical end of the string. */
2736 
2737    End_Of_String = (unsigned char *) match_to;
2738 
2739    if (end == NULL && reverse) {
2740       for (end = string; !AT_END_OF_STRING((unsigned char*)end); end++) ;
2741       succ_char = '\n';
2742    } else if (end == NULL) {
2743       succ_char = '\n';
2744    }
2745 
2746    /* Initialize arrays used by shortcut_escape. */
2747 
2748    if (!init_ansi_classes ()) goto SINGLE_RETURN;
2749 
2750    /* Remember the beginning of the string for matching BOL */
2751 
2752    Start_Of_String    = (unsigned char *) string;
2753    Look_Behind_To     = (unsigned char *) (look_behind_to?look_behind_to:string);
2754 
2755    Prev_Is_BOL        = ((prev_char == '\n') || (prev_char == '\0') ? 1 : 0);
2756    Succ_Is_EOL        = ((succ_char == '\n') || (succ_char == '\0') ? 1 : 0);
2757    Prev_Is_Delim      = (Current_Delimiters [(unsigned char)prev_char] ? 1 : 0);
2758    Succ_Is_Delim      = (Current_Delimiters [(unsigned char)succ_char] ? 1 : 0);
2759 
2760    Total_Paren        = (int) (prog->program [1]);
2761    Num_Braces         = (int) (prog->program [2]);
2762 
2763    /* Reset the recursion detection flag */
2764    Recursion_Limit_Exceeded = 0;
2765 
2766    /* Allocate memory for {m,n} construct counting variables if need be. */
2767 
2768    if (Num_Braces > 0) {
2769       Brace =
2770          (brace_counts *) NEditMalloc (sizeof (brace_counts) * (size_t) Num_Braces);
2771 
2772       if (Brace == NULL) {
2773          reg_error ("out of memory in `ExecRE\'");
2774          goto SINGLE_RETURN;
2775       }
2776    } else {
2777       Brace = NULL;
2778    }
2779 
2780    /* Initialize the first nine (9) capturing parentheses start and end
2781       pointers to point to the start of the search string.  This is to prevent
2782       crashes when later trying to reference captured parens that do not exist
2783       in the compiled regex.  We only need to do the first nine since users
2784       can only specify \1, \2, ... \9. */
2785 
2786    for (i = 9; i > 0; i--) {
2787       *s_ptr++ = (unsigned char *) string;
2788       *e_ptr++ = (unsigned char *) string;
2789    }
2790 
2791    if (!reverse) { /* Forward Search */
2792       if (prog->anchor) {
2793          /* Search is anchored at BOL */
2794 
2795          if (attempt (prog, (unsigned char *) string)) {
2796             ret_val = 1;
2797             goto SINGLE_RETURN;
2798          }
2799 
2800          for (str = (unsigned char *) string;
2801              !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2802               str++) {
2803 
2804             if (*str == '\n') {
2805                if (attempt (prog, str + 1)) {
2806                   ret_val = 1;
2807                   break;
2808                }
2809             }
2810          }
2811 
2812          goto SINGLE_RETURN;
2813 
2814       } else if (prog->match_start != '\0') {
2815          /* We know what char match must start with. */
2816 
2817          for (str = (unsigned char *) string;
2818              !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2819               str++) {
2820 
2821             if (*str == (unsigned char)prog->match_start) {
2822                if (attempt (prog, str)) {
2823                   ret_val = 1;
2824                   break;
2825                }
2826             }
2827          }
2828 
2829          goto SINGLE_RETURN;
2830       } else {
2831          /* General case */
2832 
2833          for (str = (unsigned char *) string;
2834              !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2835               str++) {
2836 
2837             if (attempt (prog, str)) {
2838                ret_val = 1;
2839                break;
2840             }
2841          }
2842 
2843          /* Beware of a single $ matching \0 */
2844          if (!Recursion_Limit_Exceeded && !ret_val && AT_END_OF_STRING(str) && str != (unsigned char *) end) {
2845             if (attempt (prog, str)) {
2846                ret_val = 1;
2847             }
2848          }
2849 
2850          goto SINGLE_RETURN;
2851       }
2852    } else { /* Search reverse, same as forward, but loops run backward */
2853 
2854       /* Make sure that we don't start matching beyond the logical end */
2855       if (End_Of_String != NULL && (unsigned char*)end > End_Of_String) {
2856          end = (const char*)End_Of_String;
2857       }
2858 
2859       if (prog->anchor) {
2860          /* Search is anchored at BOL */
2861 
2862          for (str = (unsigned char *)(end - 1);
2863               str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2864               str--) {
2865 
2866             if (*str == '\n') {
2867                if (attempt (prog, str + 1)) {
2868                   ret_val = 1;
2869                   goto SINGLE_RETURN;
2870                }
2871             }
2872          }
2873 
2874          if (!Recursion_Limit_Exceeded && attempt (prog, (unsigned char *) string)) {
2875             ret_val = 1;
2876             goto SINGLE_RETURN;
2877          }
2878 
2879          goto SINGLE_RETURN;
2880       } else if (prog->match_start != '\0') {
2881          /* We know what char match must start with. */
2882 
2883          for (str =  (unsigned char *) end;
2884               str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2885               str--) {
2886 
2887             if (*str == (unsigned char)prog->match_start) {
2888                if (attempt (prog, str)) {
2889                   ret_val = 1;
2890                   break;
2891                }
2892             }
2893          }
2894 
2895          goto SINGLE_RETURN;
2896       } else {
2897          /* General case */
2898 
2899          for (str =  (unsigned char *) end;
2900               str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2901               str--) {
2902 
2903             if (attempt (prog, str)) {
2904                ret_val = 1;
2905                break;
2906             }
2907          }
2908       }
2909    }
2910 
2911    SINGLE_RETURN: if (Brace) free (Brace);
2912 
2913    if (Recursion_Limit_Exceeded) return (0);
2914 
2915    return (ret_val);
2916 }
2917 
2918 /*--------------------------------------------------------------------*
2919  * init_ansi_classes
2920  *
2921  * Generate character class sets using locale aware ANSI C functions.
2922  *
2923  *--------------------------------------------------------------------*/
2924 
init_ansi_classes(void)2925 static int init_ansi_classes (void) {
2926 
2927    static int initialized = 0;
2928    static int underscore = (int) '_';
2929           int i, word_count, letter_count, space_count;
2930 
2931    if (!initialized) {
2932       initialized  = 1; /* Only need to generate character sets once. */
2933       word_count   = 0;
2934       letter_count = 0;
2935       space_count  = 0;
2936 
2937       for (i = 1; i < (int)UCHAR_MAX; i++) {
2938          if (isalnum (i) || i == underscore) {
2939             Word_Char [word_count++] = (unsigned char) i;
2940          }
2941 
2942          if (isalpha (i)) {
2943             Letter_Char [letter_count++] = (unsigned char) i;
2944          }
2945 
2946          /* Note: Whether or not newline is considered to be whitespace is
2947             handled by switches within the original regex and is thus omitted
2948             here. */
2949 
2950          if (isspace (i) && (i != (int) '\n')) {
2951             White_Space [space_count++] = (unsigned char) i;
2952          }
2953 
2954          /* Make sure arrays are big enough.  ("- 2" because of zero array
2955             origin and we need to leave room for the NULL terminator.) */
2956 
2957          if (word_count   > (ALNUM_CHAR_SIZE  - 2) ||
2958              space_count  > (WHITE_SPACE_SIZE - 2) ||
2959              letter_count > (ALNUM_CHAR_SIZE  - 2)) {
2960 
2961             reg_error ("internal error #9 `init_ansi_classes\'");
2962             return (0);
2963          }
2964       }
2965 
2966       Word_Char   [word_count]  = '\0';
2967       Letter_Char [word_count]  = '\0';
2968       White_Space [space_count] = '\0';
2969    }
2970 
2971    return (1);
2972 }
2973 
2974 /*----------------------------------------------------------------------*
2975  * attempt - try match at specific point, returns: 0 failure, 1 success
2976  *----------------------------------------------------------------------*/
2977 
attempt(regexp * prog,unsigned char * string)2978 static int attempt (regexp *prog, unsigned char *string) {
2979 
2980    register          int    i;
2981    register unsigned char **s_ptr;
2982    register unsigned char **e_ptr;
2983    		     int    branch_index = 0; /* Must be set to zero ! */
2984 
2985    Reg_Input      = string;
2986    Start_Ptr_Ptr  = (unsigned char **) prog->startp;
2987    End_Ptr_Ptr    = (unsigned char **) prog->endp;
2988    s_ptr          = (unsigned char **) prog->startp;
2989    e_ptr          = (unsigned char **) prog->endp;
2990 
2991    /* Reset the recursion counter. */
2992    Recursion_Count = 0;
2993 
2994    /* Overhead due to capturing parentheses. */
2995 
2996    Extent_Ptr_BW = string;
2997    Extent_Ptr_FW = NULL;
2998 
2999    for (i = Total_Paren + 1; i > 0; i--) {
3000       *s_ptr++ = NULL;
3001       *e_ptr++ = NULL;
3002    }
3003 
3004    if (match ((unsigned char *) (prog->program + REGEX_START_OFFSET),
3005 	&branch_index)) {
3006       prog->startp [0] = (char *) string;
3007       prog->endp   [0] = (char *) Reg_Input;     /* <-- One char AFTER  */
3008       prog->extentpBW  = (char *) Extent_Ptr_BW; /*     matched string! */
3009       prog->extentpFW  = (char *) Extent_Ptr_FW;
3010       prog->top_branch = branch_index;
3011 
3012       return (1);
3013    } else {
3014       return (0);
3015    }
3016 }
3017 
3018 /*----------------------------------------------------------------------*
3019  * match - main matching routine
3020  *
3021  * Conceptually the strategy is simple: check to see whether the
3022  * current node matches, call self recursively to see whether the rest
3023  * matches, and then act accordingly.  In practice we make some effort
3024  * to avoid recursion, in particular by going through "ordinary" nodes
3025  * (that don't need to know whether the rest of the match failed) by a
3026  * loop instead of by recursion.  Returns 0 failure, 1 success.
3027  *----------------------------------------------------------------------*/
3028 #define MATCH_RETURN(X)\
3029  { --Recursion_Count; return (X); }
3030 #define CHECK_RECURSION_LIMIT\
3031  if (Recursion_Limit_Exceeded) MATCH_RETURN (0);
3032 
match(unsigned char * prog,int * branch_index_param)3033 static int match (unsigned char *prog, int *branch_index_param) {
3034 
3035    register unsigned char *scan;  /* Current node. */
3036             unsigned char *next;  /* Next node. */
3037    register int next_ptr_offset;  /* Used by the NEXT_PTR () macro */
3038 
3039    if (++Recursion_Count > REGEX_RECURSION_LIMIT) {
3040        if (!Recursion_Limit_Exceeded) /* Prevent duplicate errors */
3041            reg_error("recursion limit exceeded, please respecify expression");
3042        Recursion_Limit_Exceeded = 1;
3043        MATCH_RETURN (0);
3044    }
3045 
3046 
3047    scan = prog;
3048 
3049    while (scan != NULL) {
3050       NEXT_PTR (scan, next);
3051 
3052       switch (GET_OP_CODE (scan)) {
3053          case BRANCH:
3054             {
3055                register unsigned char *save;
3056 	       register int branch_index_local = 0;
3057 
3058                if (GET_OP_CODE (next) != BRANCH) {  /* No choice. */
3059                   next = OPERAND (scan);   /* Avoid recursion. */
3060                } else {
3061                   do {
3062                      save = Reg_Input;
3063 
3064                      if (match (OPERAND (scan), NULL))
3065 		     {
3066 			if (branch_index_param)
3067 			   *branch_index_param = branch_index_local;
3068   			MATCH_RETURN (1);
3069 		     }
3070 
3071 		     CHECK_RECURSION_LIMIT
3072 
3073 		     ++branch_index_local;
3074 
3075                      Reg_Input = save; /* Backtrack. */
3076                      NEXT_PTR (scan, scan);
3077                   } while (scan != NULL && GET_OP_CODE (scan) == BRANCH);
3078 
3079                   MATCH_RETURN (0); /* NOT REACHED */
3080                }
3081             }
3082 
3083             break;
3084 
3085          case EXACTLY:
3086             {
3087                register int            len;
3088                register unsigned char *opnd;
3089 
3090                opnd = OPERAND (scan);
3091 
3092                /* Inline the first character, for speed. */
3093 
3094                if (*opnd != *Reg_Input) MATCH_RETURN (0);
3095 
3096                len = strlen ((char *) opnd);
3097 
3098                if (End_Of_String != NULL && Reg_Input + len > End_Of_String) {
3099                    MATCH_RETURN (0);
3100                }
3101 
3102                if (len > 1  &&
3103                    strncmp ((char *) opnd, (char *) Reg_Input, len) != 0) {
3104 
3105                    MATCH_RETURN (0);
3106                }
3107 
3108                Reg_Input += len;
3109             }
3110 
3111             break;
3112 
3113          case SIMILAR:
3114             {
3115                register unsigned char *opnd;
3116                register unsigned char  test;
3117 
3118                opnd = OPERAND (scan);
3119 
3120                /* Note: the SIMILAR operand was converted to lower case during
3121                   regex compile. */
3122 
3123                while ((test = *opnd++) != '\0') {
3124                   if (AT_END_OF_STRING(Reg_Input) ||
3125                       tolower (*Reg_Input++) != test) {
3126 
3127                       MATCH_RETURN (0);
3128                   }
3129                }
3130             }
3131 
3132             break;
3133 
3134          case BOL: /* `^' (beginning of line anchor) */
3135             if (Reg_Input == Start_Of_String) {
3136                if (Prev_Is_BOL) break;
3137             } else if (*(Reg_Input - 1) == '\n') {
3138                break;
3139             }
3140 
3141             MATCH_RETURN (0);
3142 
3143          case EOL: /* `$' anchor matches end of line and end of string */
3144             if (*Reg_Input == '\n' || (AT_END_OF_STRING(Reg_Input) && Succ_Is_EOL)) {
3145                break;
3146             }
3147 
3148             MATCH_RETURN (0);
3149 
3150          case BOWORD: /* `<' (beginning of word anchor) */
3151             /* Check to see if the current character is not a delimiter
3152                and the preceding character is. */
3153             {
3154 	       int prev_is_delim;
3155 	       if (Reg_Input == Start_Of_String) {
3156 		   prev_is_delim = Prev_Is_Delim;
3157 	       } else {
3158 		   prev_is_delim = Current_Delimiters [ *(Reg_Input - 1) ];
3159 	       }
3160 	       if (prev_is_delim) {
3161 		   int current_is_delim;
3162 		   if (AT_END_OF_STRING(Reg_Input)) {
3163 		      current_is_delim = Succ_Is_Delim;
3164 		   } else {
3165 		      current_is_delim = Current_Delimiters [ *Reg_Input ];
3166 		   }
3167 		   if (!current_is_delim) break;
3168 	       }
3169 	    }
3170 
3171             MATCH_RETURN (0);
3172 
3173          case EOWORD: /* `>' (end of word anchor) */
3174             /* Check to see if the current character is a delimiter
3175 	       and the preceding character is not. */
3176             {
3177 	       int prev_is_delim;
3178 	       if (Reg_Input == Start_Of_String) {
3179 		   prev_is_delim = Prev_Is_Delim;
3180 	       } else {
3181 		   prev_is_delim = Current_Delimiters [ *(Reg_Input-1) ];
3182 	       }
3183 	       if (!prev_is_delim) {
3184 		   int current_is_delim;
3185 		   if (AT_END_OF_STRING(Reg_Input)) {
3186 		      current_is_delim = Succ_Is_Delim;
3187 		   } else {
3188 		      current_is_delim = Current_Delimiters [ *Reg_Input ];
3189 		   }
3190 		   if (current_is_delim) break;
3191 	       }
3192 	    }
3193 
3194             MATCH_RETURN (0);
3195 
3196          case NOT_BOUNDARY: /* \B (NOT a word boundary) */
3197             {
3198 	       int prev_is_delim;
3199 	       int current_is_delim;
3200 	       if (Reg_Input == Start_Of_String) {
3201 		   prev_is_delim = Prev_Is_Delim;
3202 	       } else {
3203 		   prev_is_delim = Current_Delimiters [ *(Reg_Input-1) ];
3204 	       }
3205 	       if (AT_END_OF_STRING(Reg_Input)) {
3206 		  current_is_delim = Succ_Is_Delim;
3207 	       } else {
3208 		  current_is_delim = Current_Delimiters [ *Reg_Input ];
3209 	       }
3210 	       if (!(prev_is_delim ^ current_is_delim)) break;
3211 	    }
3212 
3213             MATCH_RETURN (0);
3214 
3215          case IS_DELIM: /* \y (A word delimiter character.) */
3216             if (Current_Delimiters [ *Reg_Input ] &&
3217                 !AT_END_OF_STRING(Reg_Input)) {
3218                Reg_Input++; break;
3219             }
3220 
3221             MATCH_RETURN (0);
3222 
3223          case NOT_DELIM: /* \Y (NOT a word delimiter character.) */
3224             if (!Current_Delimiters [ *Reg_Input ] &&
3225                 !AT_END_OF_STRING(Reg_Input)) {
3226                Reg_Input++; break;
3227             }
3228 
3229             MATCH_RETURN (0);
3230 
3231          case WORD_CHAR: /* \w (word character; alpha-numeric or underscore) */
3232             if ((isalnum ((int) *Reg_Input) || *Reg_Input == '_') &&
3233                 !AT_END_OF_STRING(Reg_Input)) {
3234                Reg_Input++; break;
3235             }
3236 
3237             MATCH_RETURN (0);
3238 
3239          case NOT_WORD_CHAR:/* \W (NOT a word character) */
3240             if (isalnum ((int) *Reg_Input) ||
3241                 *Reg_Input == '_'          ||
3242                 *Reg_Input == '\n'         ||
3243                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3244 
3245             Reg_Input++; break;
3246 
3247          case ANY: /* `.' (matches any character EXCEPT newline) */
3248             if (AT_END_OF_STRING(Reg_Input) || *Reg_Input == '\n') MATCH_RETURN (0);
3249 
3250             Reg_Input++; break;
3251 
3252          case EVERY: /* `.' (matches any character INCLUDING newline) */
3253             if (AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3254 
3255             Reg_Input++; break;
3256 
3257          case DIGIT: /* \d, same as [0123456789] */
3258             if (!isdigit ((int) *Reg_Input) ||
3259                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3260 
3261             Reg_Input++; break;
3262 
3263          case NOT_DIGIT: /* \D, same as [^0123456789] */
3264             if (isdigit ((int) *Reg_Input) ||
3265                 *Reg_Input == '\n'         ||
3266                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3267 
3268             Reg_Input++; break;
3269 
3270          case LETTER: /* \l, same as [a-zA-Z] */
3271             if (!isalpha ((int) *Reg_Input) ||
3272                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3273 
3274             Reg_Input++; break;
3275 
3276          case NOT_LETTER: /* \L, same as [^0123456789] */
3277             if (isalpha ((int) *Reg_Input)  ||
3278                 *Reg_Input == '\n' ||
3279                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3280 
3281             Reg_Input++; break;
3282 
3283          case SPACE: /* \s, same as [ \t\r\f\v] */
3284             if (!isspace ((int) *Reg_Input) ||
3285                 *Reg_Input == '\n'          ||
3286                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3287 
3288             Reg_Input++; break;
3289 
3290          case SPACE_NL: /* \s, same as [\n \t\r\f\v] */
3291             if (!isspace ((int) *Reg_Input) ||
3292                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3293 
3294             Reg_Input++; break;
3295 
3296          case NOT_SPACE: /* \S, same as [^\n \t\r\f\v] */
3297             if (isspace ((int) *Reg_Input) ||
3298                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3299 
3300             Reg_Input++; break;
3301 
3302          case NOT_SPACE_NL: /* \S, same as [^ \t\r\f\v] */
3303             if ((isspace ((int) *Reg_Input) && *Reg_Input != '\n') ||
3304                 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3305 
3306             Reg_Input++; break;
3307 
3308          case ANY_OF:  /* [...] character class. */
3309             if (AT_END_OF_STRING(Reg_Input))
3310                MATCH_RETURN (0); /* Needed because strchr ()
3311                                     considers \0 as a member
3312                                     of the character set. */
3313 
3314             if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) == NULL) {
3315                MATCH_RETURN (0);
3316             }
3317 
3318             Reg_Input++; break;
3319 
3320          case ANY_BUT: /* [^...] Negated character class-- does NOT normally
3321                        match newline (\n added usually to operand at compile
3322                        time.) */
3323 
3324             if (AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0); /* See comment for ANY_OF. */
3325 
3326             if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) != NULL) {
3327                MATCH_RETURN (0);
3328             }
3329 
3330             Reg_Input++; break;
3331 
3332          case NOTHING:
3333          case BACK:
3334             break;
3335 
3336          case STAR:
3337          case PLUS:
3338          case QUESTION:
3339          case BRACE:
3340 
3341          case LAZY_STAR:
3342          case LAZY_PLUS:
3343          case LAZY_QUESTION:
3344          case LAZY_BRACE:
3345             {
3346                register unsigned long  num_matched = REG_ZERO;
3347                register unsigned long  min = ULONG_MAX, max = REG_ZERO;
3348                register unsigned char *save;
3349                register unsigned char  next_char;
3350                         unsigned char *next_op;
3351                         int            lazy = 0;
3352 
3353                /* Lookahead (when possible) to avoid useless match attempts
3354                   when we know what character comes next. */
3355 
3356                if (GET_OP_CODE (next) == EXACTLY) {
3357                   next_char = *OPERAND (next);
3358                } else {
3359                   next_char = '\0';/* i.e. Don't know what next character is. */
3360                }
3361 
3362                next_op = OPERAND (scan);
3363 
3364                switch (GET_OP_CODE (scan)) {
3365                   case LAZY_STAR:
3366                      lazy = 1;
3367                   case STAR:
3368                      min = REG_ZERO;
3369                      max = ULONG_MAX;
3370                      break;
3371 
3372                   case LAZY_PLUS:
3373                      lazy = 1;
3374                   case PLUS:
3375                      min = REG_ONE;
3376                      max = ULONG_MAX;
3377                      break;
3378 
3379                   case LAZY_QUESTION:
3380                      lazy = 1;
3381                   case QUESTION:
3382                      min = REG_ZERO;
3383                      max = REG_ONE;
3384                      break;
3385 
3386                   case LAZY_BRACE:
3387                      lazy = 1;
3388                   case BRACE:
3389                      min = (unsigned long)
3390                            GET_OFFSET (scan + NEXT_PTR_SIZE);
3391 
3392                      max = (unsigned long)
3393                            GET_OFFSET (scan + (2 * NEXT_PTR_SIZE));
3394 
3395                      if (max <= REG_INFINITY) max = ULONG_MAX;
3396 
3397                      next_op = OPERAND (scan + (2 * NEXT_PTR_SIZE));
3398                }
3399 
3400                save = Reg_Input;
3401 
3402                if (lazy) {
3403                   if ( min > REG_ZERO) num_matched = greedy (next_op, min);
3404                } else {
3405                   num_matched = greedy (next_op, max);
3406                }
3407 
3408                while (min <= num_matched && num_matched <= max) {
3409                   if (next_char == '\0' || next_char == *Reg_Input) {
3410                      if (match (next, NULL)) MATCH_RETURN (1);
3411 
3412                      CHECK_RECURSION_LIMIT
3413                   }
3414 
3415                   /* Couldn't or didn't match. */
3416 
3417                   if (lazy) {
3418                      if (!greedy (next_op, 1)) MATCH_RETURN (0);
3419 
3420                      num_matched++; /* Inch forward. */
3421                   } else if (num_matched > REG_ZERO) {
3422                      num_matched--; /* Back up. */
3423                   } else if (min == REG_ZERO && num_matched == REG_ZERO) {
3424                      break;
3425                   }
3426 
3427                   Reg_Input = save + num_matched;
3428                }
3429 
3430                MATCH_RETURN (0);
3431             }
3432 
3433             break;
3434 
3435          case END:
3436             if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3437                Extent_Ptr_FW = Reg_Input;
3438             }
3439 
3440             MATCH_RETURN (1);  /* Success! */
3441 
3442             break;
3443 
3444          case INIT_COUNT:
3445             Brace->count [*OPERAND (scan)] = REG_ZERO;
3446 
3447             break;
3448 
3449          case INC_COUNT:
3450             Brace->count [*OPERAND (scan)]++;
3451 
3452             break;
3453 
3454          case TEST_COUNT:
3455             if (Brace->count [*OPERAND (scan)] <
3456                (unsigned long) GET_OFFSET (scan + NEXT_PTR_SIZE + INDEX_SIZE)) {
3457 
3458                next = scan + NODE_SIZE + INDEX_SIZE + NEXT_PTR_SIZE;
3459             }
3460 
3461             break;
3462 
3463          case BACK_REF:
3464          case BACK_REF_CI:
3465          /* case X_REGEX_BR:    */
3466          /* case X_REGEX_BR_CI: *** IMPLEMENT LATER */
3467             {
3468                register unsigned char *captured, *finish;
3469                                  int   paren_no;
3470 
3471                paren_no = (int) *OPERAND (scan);
3472 
3473                /* if (GET_OP_CODE (scan) == X_REGEX_BR ||
3474                    GET_OP_CODE (scan) == X_REGEX_BR_CI) {
3475 
3476                   if (Cross_Regex_Backref == NULL) MATCH_RETURN (0);
3477 
3478                   captured =
3479                      (unsigned char *) Cross_Regex_Backref->startp [paren_no];
3480 
3481                   finish =
3482                      (unsigned char *) Cross_Regex_Backref->endp   [paren_no];
3483                } else { */
3484                   captured = Back_Ref_Start [paren_no];
3485                   finish   = Back_Ref_End   [paren_no];
3486                /* } */
3487 
3488                if ((captured != NULL) && (finish != NULL)) {
3489                   if (captured > finish) MATCH_RETURN (0);
3490 
3491                   if (GET_OP_CODE (scan) == BACK_REF_CI  /* ||
3492                       GET_OP_CODE (scan) == X_REGEX_BR_CI*/ ) {
3493 
3494                      while (captured < finish) {
3495                         if (AT_END_OF_STRING(Reg_Input) ||
3496                             tolower (*captured++) != tolower (*Reg_Input++)) {
3497                            MATCH_RETURN (0);
3498                         }
3499                      }
3500                   } else {
3501                      while (captured < finish) {
3502                         if (AT_END_OF_STRING(Reg_Input) ||
3503                             *captured++ != *Reg_Input++) MATCH_RETURN (0);
3504                      }
3505                   }
3506 
3507                   break;
3508                } else {
3509                   MATCH_RETURN (0);
3510                }
3511             }
3512 
3513          case POS_AHEAD_OPEN:
3514          case NEG_AHEAD_OPEN:
3515             {
3516                register unsigned char *save;
3517                register unsigned char *saved_end;
3518                                  int   answer;
3519 
3520                save      = Reg_Input;
3521 
3522                /* Temporarily ignore the logical end of the string, to allow
3523                   lookahead past the end. */
3524                saved_end = End_Of_String;
3525                End_Of_String = NULL;
3526 
3527                answer    = match (next, NULL); /* Does the look-ahead regex match? */
3528 
3529                CHECK_RECURSION_LIMIT
3530 
3531                if ((GET_OP_CODE (scan) == POS_AHEAD_OPEN) ? answer : !answer) {
3532                   /* Remember the last (most to the right) character position
3533                      that we consume in the input for a successful match.  This
3534                      is info that may be needed should an attempt be made to
3535                      match the exact same text at the exact same place.  Since
3536                      look-aheads backtrack, a regex with a trailing look-ahead
3537                      may need more text than it matches to accomplish a
3538                      re-match. */
3539 
3540                   if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3541                      Extent_Ptr_FW = Reg_Input;
3542                   }
3543 
3544                   Reg_Input = save; /* Backtrack to look-ahead start. */
3545                   End_Of_String = saved_end; /* Restore logical end. */
3546 
3547                   /* Jump to the node just after the (?=...) or (?!...)
3548                      Construct. */
3549 
3550                   next = next_ptr (OPERAND (scan)); /* Skip 1st branch */
3551 		  /* Skip the chain of branches inside the look-ahead */
3552 		  while(GET_OP_CODE(next) == BRANCH)
3553 		      next = next_ptr (next);
3554                   next = next_ptr (next); /* Skip the LOOK_AHEAD_CLOSE */
3555                } else {
3556                   Reg_Input = save; /* Backtrack to look-ahead start. */
3557                   End_Of_String = saved_end; /* Restore logical end. */
3558 
3559                   MATCH_RETURN (0);
3560                }
3561             }
3562 
3563             break;
3564 
3565          case POS_BEHIND_OPEN:
3566          case NEG_BEHIND_OPEN:
3567             {
3568                register unsigned char *save;
3569                                  int   answer;
3570                register 	 int   offset, upper;
3571                                  int   lower;
3572                                  int   found = 0;
3573                         unsigned char *saved_end;
3574 
3575                save      = Reg_Input;
3576                saved_end = End_Of_String;
3577 
3578                /* Prevent overshoot (greedy matching could end past the
3579                   current position) by tightening the matching boundary.
3580                   Lookahead inside lookbehind can still cross that boundary. */
3581                End_Of_String = Reg_Input;
3582 
3583                lower = GET_LOWER (scan);
3584                upper = GET_UPPER (scan);
3585 
3586                /* Start with the shortest match first. This is the most
3587                   efficient direction in general.
3588                   Note! Negative look behind is _very_ tricky when the length
3589                   is not constant: we have to make sure the expression doesn't
3590                   match for _any_ of the starting positions. */
3591                for (offset = lower; offset <= upper; ++offset) {
3592 	          Reg_Input = save - offset;
3593 
3594                   if (Reg_Input < Look_Behind_To) {
3595                      /* No need to look any further */
3596                      break;
3597            	  }
3598 
3599                   answer    = match (next, NULL); /* Does the look-behind regex match? */
3600 
3601                   CHECK_RECURSION_LIMIT
3602 
3603                   /* The match must have ended at the current position;
3604                      otherwise it is invalid */
3605                   if (answer && Reg_Input == save) {
3606                      /* It matched, exactly far enough */
3607                      found = 1;
3608 
3609                      /* Remember the last (most to the left) character position
3610                         that we consume in the input for a successful match.
3611                         This is info that may be needed should an attempt be
3612                         made to match the exact same text at the exact same
3613                         place. Since look-behind backtracks, a regex with a
3614                         leading look-behind may need more text than it matches
3615                         to accomplish a re-match. */
3616 
3617                      if (Extent_Ptr_BW == NULL ||
3618                          (Extent_Ptr_BW - (save - offset)) > 0) {
3619                         Extent_Ptr_BW = save - offset;
3620                      }
3621 
3622                      break;
3623                   }
3624                }
3625 
3626 	       /* Always restore the position and the logical string end. */
3627 	       Reg_Input = save;
3628                End_Of_String = saved_end;
3629 
3630                if ((GET_OP_CODE (scan) == POS_BEHIND_OPEN) ? found : !found) {
3631                   /* The look-behind matches, so we must jump to the next
3632                      node. The look-behind node is followed by a chain of
3633                      branches (contents of the look-behind expression), and
3634                      terminated by a look-behind-close node. */
3635                   next = next_ptr (OPERAND (scan) + LENGTH_SIZE); /* 1st branch */
3636                   /* Skip the chained branches inside the look-ahead */
3637                   while (GET_OP_CODE (next) == BRANCH)
3638                      next = next_ptr (next);
3639                   next = next_ptr (next); /* Skip LOOK_BEHIND_CLOSE */
3640                } else {
3641                   /* Not a match */
3642                   MATCH_RETURN (0);
3643                }
3644             }
3645             break;
3646 
3647          case LOOK_AHEAD_CLOSE:
3648          case LOOK_BEHIND_CLOSE:
3649             MATCH_RETURN (1);  /* We have reached the end of the look-ahead or
3650 	                    look-behind which implies that we matched it,
3651 			    so return TRUE. */
3652          default:
3653             if ((GET_OP_CODE (scan) > OPEN) &&
3654                 (GET_OP_CODE (scan) < OPEN + NSUBEXP)) {
3655 
3656                register          int   no;
3657                register unsigned char *save;
3658 
3659                no   = GET_OP_CODE (scan) - OPEN;
3660                save = Reg_Input;
3661 
3662                if (no < 10) {
3663                   Back_Ref_Start [no] = save;
3664                   Back_Ref_End   [no] = NULL;
3665                }
3666 
3667                if (match (next, NULL)) {
3668                   /* Do not set `Start_Ptr_Ptr' if some later invocation (think
3669                      recursion) of the same parentheses already has. */
3670 
3671                   if (Start_Ptr_Ptr [no] == NULL) Start_Ptr_Ptr [no] = save;
3672 
3673                   MATCH_RETURN (1);
3674                } else {
3675                   MATCH_RETURN (0);
3676                }
3677             } else if ((GET_OP_CODE (scan) > CLOSE) &&
3678                        (GET_OP_CODE (scan) < CLOSE + NSUBEXP)) {
3679 
3680                register          int   no;
3681                register unsigned char *save;
3682 
3683                no   = GET_OP_CODE (scan) - CLOSE;
3684                save = Reg_Input;
3685 
3686                if (no < 10) Back_Ref_End [no] = save;
3687 
3688                if (match (next, NULL)) {
3689                   /* Do not set `End_Ptr_Ptr' if some later invocation of the
3690                      same parentheses already has. */
3691 
3692                   if (End_Ptr_Ptr [no] == NULL) End_Ptr_Ptr [no] = save;
3693 
3694                   MATCH_RETURN (1);
3695                } else {
3696                   MATCH_RETURN (0);
3697                }
3698             } else {
3699                reg_error ("memory corruption, `match\'");
3700 
3701                MATCH_RETURN (0);
3702             }
3703 
3704             break;
3705       }
3706 
3707       scan = next;
3708    }
3709 
3710    /* We get here only if there's trouble -- normally "case END" is
3711       the terminating point. */
3712 
3713    reg_error ("corrupted pointers, `match\'");
3714 
3715    MATCH_RETURN (0);
3716 }
3717 
3718 /*----------------------------------------------------------------------*
3719  * greedy
3720  *
3721  * Repeatedly match something simple up to "max" times. If max <= 0
3722  * then match as much as possible (max = infinity).  Uses unsigned long
3723  * variables to maximize the amount of text matchable for unbounded
3724  * qualifiers like '*' and '+'.  This will allow at least 4,294,967,295
3725  * matches (4 Gig!) for an ANSI C compliant compiler.  If you are
3726  * applying a regex to something bigger than that, you shouldn't be
3727  * using NEdit!
3728  *
3729  * Returns the actual number of matches.
3730  *----------------------------------------------------------------------*/
3731 
greedy(unsigned char * p,long max)3732 static unsigned long greedy (unsigned char *p, long max) {
3733 
3734    register unsigned char *input_str;
3735    register unsigned char *operand;
3736    register unsigned long  count = REG_ZERO;
3737    register unsigned long  max_cmp;
3738 
3739    input_str = Reg_Input;
3740    operand   = OPERAND (p); /* Literal char or start of class characters. */
3741    max_cmp   = (max > 0) ? (unsigned long) max : ULONG_MAX;
3742 
3743    switch (GET_OP_CODE (p)) {
3744       case ANY:
3745          /* Race to the end of the line or string. Dot DOESN'T match
3746             newline. */
3747 
3748          while (count < max_cmp              &&
3749                 *input_str != '\n'           &&
3750                 !AT_END_OF_STRING(input_str)) {
3751             count++; input_str++;
3752          }
3753 
3754          break;
3755 
3756       case EVERY:
3757          /* Race to the end of the line or string. Dot DOES match newline. */
3758 
3759          while (count < max_cmp               &&
3760                 !AT_END_OF_STRING(input_str)) {
3761             count++; input_str++;
3762          }
3763 
3764          break;
3765 
3766       case EXACTLY: /* Count occurrences of single character operand. */
3767          while (count < max_cmp               &&
3768                 *operand == *input_str        &&
3769                 !AT_END_OF_STRING(input_str)) {
3770             count++; input_str++;
3771          }
3772 
3773          break;
3774 
3775       case SIMILAR: /* Case insensitive version of EXACTLY */
3776          while (count < max_cmp                  &&
3777                 *operand == tolower (*input_str) &&
3778                 !AT_END_OF_STRING(input_str)) {
3779             count++; input_str++;
3780          }
3781 
3782          break;
3783 
3784       case ANY_OF:  /* [...] character class. */
3785          while (count < max_cmp                                      &&
3786                 strchr ((char *) operand, (int) *input_str) != NULL  &&
3787                 !AT_END_OF_STRING(input_str)) {
3788 
3789             count++; input_str++;
3790          }
3791 
3792          break;
3793 
3794       case ANY_BUT: /* [^...] Negated character class- does NOT normally
3795                        match newline (\n added usually to operand at compile
3796                        time.) */
3797 
3798          while (count < max_cmp                                      &&
3799                 strchr ((char *) operand, (int) *input_str) == NULL  &&
3800                 !AT_END_OF_STRING(input_str)) {
3801 
3802             count++; input_str++;
3803          }
3804 
3805          break;
3806 
3807       case IS_DELIM: /* \y (not a word delimiter char)
3808                          NOTE: '\n' and '\0' are always word delimiters. */
3809 
3810          while (count < max_cmp                   &&
3811                 Current_Delimiters [ *input_str ] &&
3812                 !AT_END_OF_STRING(input_str)) {
3813             count++; input_str++;
3814          }
3815 
3816          break;
3817 
3818       case NOT_DELIM: /* \Y (not a word delimiter char)
3819                          NOTE: '\n' and '\0' are always word delimiters. */
3820 
3821          while (count < max_cmp                    &&
3822                 !Current_Delimiters [ *input_str ] &&
3823                 !AT_END_OF_STRING(input_str)) {
3824             count++; input_str++;
3825          }
3826 
3827          break;
3828 
3829       case WORD_CHAR: /* \w (word character, alpha-numeric or underscore) */
3830          while (count < max_cmp                     &&
3831                 (isalnum ((int) *input_str) ||
3832                  *input_str == (unsigned char) '_') &&
3833                 !AT_END_OF_STRING(input_str)) {
3834 
3835             count++; input_str++;
3836          }
3837 
3838          break;
3839 
3840       case NOT_WORD_CHAR:/* \W (NOT a word character) */
3841          while (count < max_cmp                      &&
3842                 !isalnum ((int) *input_str)          &&
3843                 *input_str != (unsigned char) '_'    &&
3844                 *input_str != (unsigned char) '\n'   &&
3845                 !AT_END_OF_STRING(input_str)) {
3846 
3847             count++; input_str++;
3848          }
3849 
3850          break;
3851 
3852       case DIGIT: /* same as [0123456789] */
3853          while (count < max_cmp              &&
3854                 isdigit ((int) *input_str)   &&
3855                 !AT_END_OF_STRING(input_str)) {
3856             count++; input_str++;
3857          }
3858 
3859          break;
3860 
3861       case NOT_DIGIT: /* same as [^0123456789] */
3862          while (count < max_cmp              &&
3863                 !isdigit ((int) *input_str)  &&
3864                 *input_str != '\n'           &&
3865                 !AT_END_OF_STRING(input_str)) {
3866 
3867             count++; input_str++;
3868          }
3869 
3870          break;
3871 
3872       case SPACE: /* same as [ \t\r\f\v]-- doesn't match newline. */
3873          while (count < max_cmp             &&
3874                 isspace ((int) *input_str)  &&
3875                 *input_str != '\n'          &&
3876                 !AT_END_OF_STRING(input_str)) {
3877 
3878             count++; input_str++;
3879          }
3880 
3881          break;
3882 
3883       case SPACE_NL: /* same as [\n \t\r\f\v]-- matches newline. */
3884          while (count < max_cmp             &&
3885                 isspace ((int) *input_str)  &&
3886                 !AT_END_OF_STRING(input_str)) {
3887 
3888             count++; input_str++;
3889          }
3890 
3891          break;
3892 
3893       case NOT_SPACE: /* same as [^\n \t\r\f\v]-- doesn't match newline. */
3894          while (count < max_cmp              &&
3895                 !isspace ((int) *input_str)  &&
3896                 !AT_END_OF_STRING(input_str)) {
3897 
3898             count++; input_str++;
3899          }
3900 
3901          break;
3902 
3903       case NOT_SPACE_NL: /* same as [^ \t\r\f\v]-- matches newline. */
3904          while (count < max_cmp                                     &&
3905                (!isspace ((int) *input_str) || *input_str == '\n')  &&
3906                 !AT_END_OF_STRING(input_str)) {
3907 
3908             count++; input_str++;
3909          }
3910 
3911          break;
3912 
3913       case LETTER: /* same as [a-zA-Z] */
3914          while (count < max_cmp             &&
3915                 isalpha ((int) *input_str)  &&
3916                 !AT_END_OF_STRING(input_str)) {
3917 
3918             count++; input_str++;
3919          }
3920 
3921          break;
3922 
3923       case NOT_LETTER: /* same as [^a-zA-Z] */
3924          while (count < max_cmp              &&
3925                 !isalpha ((int) *input_str)  &&
3926                 *input_str != '\n'           &&
3927                 !AT_END_OF_STRING(input_str)) {
3928 
3929             count++; input_str++;
3930          }
3931 
3932          break;
3933 
3934       default:
3935          /* Called inappropriately.  Only atoms that are SIMPLE should
3936             generate a call to greedy.  The above cases should cover
3937             all the atoms that are SIMPLE. */
3938 
3939          reg_error ("internal error #10 `greedy\'");
3940          count = 0U;  /* Best we can do. */
3941    }
3942 
3943    /* Point to character just after last matched character. */
3944 
3945    Reg_Input = input_str;
3946 
3947    return (count);
3948 }
3949 
3950 /*----------------------------------------------------------------------*
3951  * next_ptr - compute the address of a node's "NEXT" pointer.
3952  * Note: a simplified inline version is available via the NEXT_PTR() macro,
3953  *       but that one is only to be used at time-critical places (see the
3954  *       description of the macro).
3955  *----------------------------------------------------------------------*/
3956 
next_ptr(unsigned char * ptr)3957 static unsigned char * next_ptr (unsigned char *ptr) {
3958 
3959    register int offset;
3960 
3961    if (ptr == &Compute_Size) return (NULL);
3962 
3963    offset = GET_OFFSET (ptr);
3964 
3965    if (offset == 0) return (NULL);
3966 
3967    if (GET_OP_CODE (ptr) == BACK) {
3968       return (ptr - offset);
3969    } else {
3970       return (ptr + offset);
3971    }
3972 }
3973 
3974 /*
3975 **  SubstituteRE - Perform substitutions after a `regexp' match.
3976 **
3977 **  This function cleanly shortens results of more than max length to max.
3978 **  To give the caller a chance to react to this the function returns False
3979 **  on any error. The substitution will still be executed.
3980 */
SubstituteRE(const regexp * prog,const char * source,char * dest,int max)3981 Boolean SubstituteRE(const regexp* prog, const char* source, char* dest,
3982         int max)
3983 {
3984 
3985    register unsigned char *src;
3986             unsigned char *src_alias;
3987    register unsigned char *dst;
3988    register unsigned char  c;
3989    register unsigned char  test;
3990    register          int   paren_no;
3991    register          int   len;
3992    register unsigned char  chgcase;
3993    Boolean anyWarnings = False;
3994 
3995    if (prog == NULL || source == NULL || dest == NULL) {
3996       reg_error ("NULL parm to `SubstituteRE\'");
3997 
3998       return False;
3999    }
4000 
4001    if (U_CHAR_AT (prog->program) != MAGIC) {
4002       reg_error ("damaged regexp passed to `SubstituteRE\'");
4003 
4004       return False;
4005    }
4006 
4007    src = (unsigned char *) source;
4008    dst = (unsigned char *) dest;
4009 
4010    while ((c = *src++) != '\0') {
4011       chgcase  = '\0';
4012       paren_no = -1;
4013 
4014       if (c == '\\') {
4015          /* Process any case altering tokens, i.e \u, \U, \l, \L. */
4016 
4017          if (*src == 'u' || *src == 'U' || *src == 'l' || *src == 'L') {
4018             chgcase = *src;
4019             src++;
4020             c = *src++;
4021 
4022             if (c == '\0') break;
4023          }
4024       }
4025 
4026       if (c == '&') {
4027          paren_no = 0;
4028       } else if (c == '\\') {
4029          /* Can not pass register variable `&src' to function `numeric_escape'
4030             so make a non-register copy that we can take the address of. */
4031 
4032          src_alias = src;
4033 
4034          if ('1' <= *src && *src <=  '9') {
4035             paren_no = (int) *src++ - (int) '0';
4036 
4037          } else if ((test = literal_escape (*src)) != '\0') {
4038             c = test; src++;
4039 
4040          } else if ((test = numeric_escape (*src, &src_alias)) != '\0') {
4041             c   = test;
4042             src = src_alias; src++;
4043 
4044             /* NOTE: if an octal escape for zero is attempted (e.g. \000), it
4045                will be treated as a literal string. */
4046          } else if (*src == '\0') {
4047             /* If '\' is the last character of the replacement string, it is
4048                interpreted as a literal backslash. */
4049 
4050             c = '\\';
4051          } else {
4052             c = *src++; /* Allow any escape sequence (This is  */
4053          }              /* INCONSISTENT with the `CompileRE'   */
4054       }                 /* mind set of issuing an error!       */
4055 
4056       if (paren_no < 0) { /* Ordinary character. */
4057          if (((char *) dst - (char *) dest) >= (max - 1)) {
4058             reg_error("replacing expression in `SubstituteRE\' too long; "
4059                       "truncating");
4060             anyWarnings = True;
4061             break;
4062          } else {
4063             *dst++ = c;
4064          }
4065       } else if (prog->startp [paren_no] != NULL &&
4066                  prog->endp   [paren_no] != NULL) {
4067 
4068          len = prog->endp [paren_no] - prog->startp [paren_no];
4069 
4070          if (((char *) dst + len - (char *) dest) >= max-1) {
4071             reg_error("replacing expression in `SubstituteRE\' too long; "
4072                       "truncating");
4073             anyWarnings = True;
4074             len = max - ((char *) dst - (char *) dest) - 1;
4075          }
4076 
4077          (void) strncpy ((char *) dst, (char *) prog->startp [paren_no], len);
4078 
4079          if (chgcase != '\0') adjustcase (dst, len, chgcase);
4080 
4081          dst += len;
4082 
4083          if (len != 0 && *(dst - 1) == '\0') {  /* strncpy hit NUL. */
4084             reg_error ("damaged match string in `SubstituteRE\'");
4085             anyWarnings = True;
4086          }
4087       }
4088    }
4089 
4090    *dst = '\0';
4091 
4092    return !anyWarnings;
4093 }
4094 
adjustcase(unsigned char * str,int len,unsigned char chgcase)4095 static void adjustcase (unsigned char *str, int len, unsigned char chgcase) {
4096 
4097    register unsigned char *string = str;
4098             int            i;
4099 
4100    /* The tokens \u and \l only modify the first character while the tokens
4101       \U and \L modify the entire string. */
4102 
4103    if (islower (chgcase) && len > 0) len = 1;
4104 
4105    switch (chgcase) {
4106       case 'u':
4107       case 'U':
4108          for (i = 0; i < len; i++) {
4109             *(string + i) = toupper ((int) *(string + i));
4110          }
4111 
4112          break;
4113 
4114       case 'l':
4115       case 'L':
4116          for (i = 0; i < len; i++) {
4117             *(string + i) = tolower ((int) *(string + i));
4118          }
4119 
4120          break;
4121    }
4122 }
4123 
4124 /*----------------------------------------------------------------------*
4125  * reg_error
4126  *----------------------------------------------------------------------*/
4127 
reg_error(char * str)4128 static void reg_error (char *str) {
4129 
4130    fprintf (
4131       stderr,
4132       "nedit: Internal error processing regular expression (%s)\n",
4133       str);
4134 }
4135 
4136 /*----------------------------------------------------------------------*
4137  * makeDelimiterTable
4138  *
4139  * Translate a null-terminated string of delimiters into a 256 byte
4140  * lookup table for determining whether a character is a delimiter or
4141  * not.
4142  *
4143  * Table must be allocated by the caller.
4144  *
4145  * Return value is a pointer to the table.
4146  *----------------------------------------------------------------------*/
4147 
makeDelimiterTable(unsigned char * delimiters,unsigned char * table)4148 static unsigned char * makeDelimiterTable (
4149    unsigned char *delimiters,
4150    unsigned char *table) {
4151 
4152    unsigned char *c;
4153 
4154    memset (table, 0, 256);
4155 
4156    for (c = (unsigned char *) delimiters; *c != '\0'; c++) {
4157       table [*c] = 1;
4158    }
4159 
4160    table [(int) '\0'] = 1; /* These       */
4161    table [(int) '\t'] = 1; /* characters  */
4162    table [(int) '\n'] = 1; /* are always  */
4163    table [(int) ' ' ] = 1; /* delimiters. */
4164 
4165    return table;
4166 }
4167 
4168 /*----------------------------------------------------------------------*
4169  * SetREDefaultWordDelimiters
4170  *
4171  * Builds a default delimiter table that persists across `ExecRE' calls.
4172  *----------------------------------------------------------------------*/
4173 
SetREDefaultWordDelimiters(char * delimiters)4174 void SetREDefaultWordDelimiters (char *delimiters) {
4175    makeDelimiterTable ((unsigned char *) delimiters, Default_Delimiters);
4176 }
4177