1 /*------------------------------------------------------------------------*
2 * `CompileRE', `ExecRE', and `substituteRE' -- regular expression parsing
3 *
4 * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp' and
5 * `regexec' code adapted for NEdit.
6 *
7 * .-------------------------------------------------------------------.
8 * | ORIGINAL COPYRIGHT NOTICE: |
9 * | |
10 * | Copyright (c) 1986 by University of Toronto. |
11 * | Written by Henry Spencer. Not derived from licensed software. |
12 * | |
13 * | Permission is granted to anyone to use this software for any |
14 * | purpose on any computer system, and to redistribute it freely, |
15 * | subject to the following restrictions: |
16 * | |
17 * | 1. The author is not responsible for the consequences of use of |
18 * | this software, no matter how awful, even if they arise |
19 * | from defects in it. |
20 * | |
21 * | 2. The origin of this software must not be misrepresented, either |
22 * | by explicit claim or by omission. |
23 * | |
24 * | 3. Altered versions must be plainly marked as such, and must not |
25 * | be misrepresented as being the original software. |
26 * `-------------------------------------------------------------------'
27 *
28 * This is free software; you can redistribute it and/or modify it under the
29 * terms of the GNU General Public License as published by the Free Software
30 * Foundation; either version 2 of the License, or (at your option) any later
31 * version. In addition, you may distribute version of this program linked to
32 * Motif or Open Motif. See README for details.
33 *
34 * This software is distributed in the hope that it will be useful, but WITHOUT
35 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
36 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
37 * for more details.
38 *
39 * You should have received a copy of the GNU General Public License along with
40 * software; if not, write to the Free Software Foundation, Inc., 59 Temple
41 * Place, Suite 330, Boston, MA 02111-1307 USA
42 *
43 *
44 * BEWARE that some of this code is subtly aware of the way operator
45 * precedence is structured in regular expressions. Serious changes in
46 * regular-expression syntax might require a total rethink.
47 * -- Henry Spencer
48 * (Yes, it did!) -- Christopher Conrad, Dec. 1999
49 *
50 * January, 1994, Mark Edel
51 * Consolidated files, changed names of external functions to avoid
52 * potential conflicts with native regcomp and regexec functions, changed
53 * error reporting to NEdit form, added multi-line and reverse searching,
54 * and added \n \t \u \U \l \L.
55 *
56 * June, 1996, Mark Edel
57 * Bug in NEXT macro, didn't work for expressions which compiled to over
58 * 256 bytes.
59 *
60 * December, 1999, Christopher Conrad
61 * Reformatted code for readability, improved error output, added octal and
62 * hexadecimal escapes, added back-references (\1-\9), added positive look
63 * ahead: (?=...), added negative lookahead: (?!...), added non-capturing
64 * parentheses: (?:...), added case insensitive constructs (?i...) and
65 * (?I...), added newline matching constructs (?n...) and (?N...), added
66 * regex comments: (?#...), added shortcut escapes: \d\D\l\L\s\S\w\W\y\Y.
67 * Added "not a word boundary" anchor \B.
68 *
69 * July, 2002, Eddy De Greef
70 * Added look behind, both positive (?<=...) and negative (?<!...) for
71 * bounded-length patterns.
72 *
73 * November, 2004, Eddy De Greef
74 * Added constrained matching (allowing specification of the logical end
75 * of the string iso. matching till \0), and fixed several (probably
76 * very old) string overrun errors that could easily result in crashes,
77 * especially in client code.
78 */
79
80 #ifdef HAVE_CONFIG_H
81 #include "../config.h"
82 #endif
83
84 #include "regularExp.h"
85 #include "../util/nedit_malloc.h"
86
87 #include <ctype.h>
88 #include <limits.h>
89 #include <stdio.h>
90 #include <stdlib.h>
91 #include <string.h>
92
93 #ifdef HAVE_DEBUG_H
94 #include "../debug.h"
95 #endif
96
97
98 /* The first byte of the regexp internal `program' is a magic number to help
99 gaurd against corrupted data; the compiled regex code really begins in the
100 second byte. */
101
102 #define MAGIC 0234
103
104 /* The "internal use only" fields in `regexp.h' are present to pass info from
105 * `CompileRE' to `ExecRE' which permits the execute phase to run lots faster on
106 * simple cases. They are:
107 *
108 * match_start Character that must begin a match; '\0' if none obvious.
109 * anchor Is the match anchored (at beginning-of-line only)?
110 *
111 * `match_start' and `anchor' permit very fast decisions on suitable starting
112 * points for a match, considerably reducing the work done by ExecRE. */
113
114 /* STRUCTURE FOR A REGULAR EXPRESSION (regex) `PROGRAM'.
115 *
116 * This is essentially a linear encoding of a nondeterministic finite-state
117 * machine or NFA (aka syntax charts or `railroad normal form' in parsing
118 * technology). Each node is an opcode plus a NEXT pointer, possibly
119 * followed by operands. NEXT pointers of all nodes except BRANCH implement
120 * concatenation; a NEXT pointer with a BRANCH on both ends of it is
121 * connecting two alternatives. (Here we have one of the subtle syntax
122 * dependencies: an individual BRANCH (as opposed to a collection of them) is
123 * never concatenated with anything because of operator precedence.) The
124 * operand of some types of nodes is a literal string; for others, it is a node
125 * leading into a sub-FSM. In particular, the operand of a BRANCH node is the
126 * first node of the branch. (NB this is _NOT_ a tree structure: the tail of
127 * the branch connects to the thing following the set of BRANCHes.)
128 *
129 * The opcodes are: */
130
131 /* DEFINITION VALUE MEANING */
132
133 #define END 1 /* End of program. */
134
135 /* Zero width positional assertions. */
136
137 #define BOL 2 /* Match position at beginning of line. */
138 #define EOL 3 /* Match position at end of line. */
139 #define BOWORD 4 /* Match "" representing word delimiter or BOL */
140 #define EOWORD 5 /* Match "" representing word delimiter or EOL */
141 #define NOT_BOUNDARY 6 /* Not word boundary (\B, opposite of < and >) */
142
143 /* Op codes with null terminated string operands. */
144
145 #define EXACTLY 7 /* Match this string. */
146 #define SIMILAR 8 /* Match this case insensitive string */
147 #define ANY_OF 9 /* Match any character in the set. */
148 #define ANY_BUT 10 /* Match any character not in the set. */
149
150 /* Op codes to match any character. */
151
152 #define ANY 11 /* Match any one character (implements '.') */
153 #define EVERY 12 /* Same as ANY but matches newline. */
154
155 /* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
156
157 #define DIGIT 13 /* Match any digit, i.e. [0123456789] */
158 #define NOT_DIGIT 14 /* Match any non-digit, i.e. [^0123456789] */
159 #define LETTER 15 /* Match any letter character [a-zA-Z] */
160 #define NOT_LETTER 16 /* Match any non-letter character [^a-zA-Z] */
161 #define SPACE 17 /* Match any whitespace character EXCEPT \n */
162 #define SPACE_NL 18 /* Match any whitespace character INCLUDING \n */
163 #define NOT_SPACE 19 /* Match any non-whitespace character */
164 #define NOT_SPACE_NL 20 /* Same as NOT_SPACE but matches newline. */
165 #define WORD_CHAR 21 /* Match any word character [a-zA-Z0-9_] */
166 #define NOT_WORD_CHAR 22 /* Match any non-word character [^a-zA-Z0-9_] */
167 #define IS_DELIM 23 /* Match any character that's a word delimiter */
168 #define NOT_DELIM 24 /* Match any character NOT a word delimiter */
169
170 /* Quantifier nodes. (Only applied to SIMPLE nodes. Quantifiers applied to non
171 SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
172
173 #define STAR 25 /* Match this (simple) thing 0 or more times. */
174 #define LAZY_STAR 26 /* Minimal matching STAR */
175 #define QUESTION 27 /* Match this (simple) thing 0 or 1 times. */
176 #define LAZY_QUESTION 28 /* Minimal matching QUESTION */
177 #define PLUS 29 /* Match this (simple) thing 1 or more times. */
178 #define LAZY_PLUS 30 /* Minimal matching PLUS */
179 #define BRACE 31 /* Match this (simple) thing m to n times. */
180 #define LAZY_BRACE 32 /* Minimal matching BRACE */
181
182 /* Nodes used to build complex constructs. */
183
184 #define NOTHING 33 /* Match empty string (always matches) */
185 #define BRANCH 34 /* Match this alternative, or the next... */
186 #define BACK 35 /* Always matches, NEXT ptr points backward. */
187 #define INIT_COUNT 36 /* Initialize {m,n} counter to zero */
188 #define INC_COUNT 37 /* Increment {m,n} counter by one */
189 #define TEST_COUNT 38 /* Test {m,n} counter against operand */
190
191 /* Back Reference nodes. */
192
193 #define BACK_REF 39 /* Match latest matched parenthesized text */
194 #define BACK_REF_CI 40 /* Case insensitive version of BACK_REF */
195 #define X_REGEX_BR 41 /* Cross-Regex Back-Ref for syntax highlighting */
196 #define X_REGEX_BR_CI 42 /* Case insensitive version of X_REGEX_BR_CI */
197
198 /* Various nodes used to implement parenthetical constructs. */
199
200 #define POS_AHEAD_OPEN 43 /* Begin positive look ahead */
201 #define NEG_AHEAD_OPEN 44 /* Begin negative look ahead */
202 #define LOOK_AHEAD_CLOSE 45 /* End positive or negative look ahead */
203
204 #define POS_BEHIND_OPEN 46 /* Begin positive look behind */
205 #define NEG_BEHIND_OPEN 47 /* Begin negative look behind */
206 #define LOOK_BEHIND_CLOSE 48 /* Close look behind */
207
208 #define OPEN 49 /* Open for capturing parentheses. */
209
210 /* OPEN+1 is number 1, etc. */
211 #define CLOSE (OPEN + NSUBEXP) /* Close for capturing parentheses. */
212
213 #define LAST_PAREN (CLOSE + NSUBEXP)
214
215 #if (LAST_PAREN > UCHAR_MAX)
216 #error "Too many parentheses for storage in an unsigned char (LAST_PAREN too big.)"
217 #endif
218
219 /* The next_ptr () function can consume up to 30% of the time during matching
220 because it is called an immense number of times (an average of 25
221 next_ptr() calls per match() call was witnessed for Perl syntax
222 highlighting). Therefore it is well worth removing some of the function
223 call overhead by selectively inlining the next_ptr() calls. Moreover,
224 the inlined code can be simplified for matching because one of the tests,
225 only necessary during compilation, can be left out.
226 The net result of using this inlined version at two critical places is
227 a 25% speedup (again, witnesses on Perl syntax highlighting). */
228
229 #define NEXT_PTR(in_ptr, out_ptr)\
230 next_ptr_offset = GET_OFFSET (in_ptr);\
231 if (next_ptr_offset == 0)\
232 out_ptr = NULL;\
233 else {\
234 if (GET_OP_CODE (in_ptr) == BACK)\
235 out_ptr = in_ptr - next_ptr_offset;\
236 else \
237 out_ptr = in_ptr + next_ptr_offset;\
238 }
239
240 /* OPCODE NOTES:
241 ------------
242
243 All nodes consist of an 8 bit op code followed by 2 bytes that make up a 16
244 bit NEXT pointer. Some nodes have a null terminated character string operand
245 following the NEXT pointer. Other nodes may have an 8 bit index operand.
246 The TEST_COUNT node has an index operand followed by a 16 bit test value.
247 The BRACE and LAZY_BRACE nodes have two 16 bit values for min and max but no
248 index value.
249
250 SIMILAR
251 Operand(s): null terminated string
252
253 Implements a case insensitive match of a string. Mostly intended for use
254 in syntax highlighting patterns for keywords of languages like FORTRAN
255 and Ada that are case insensitive. The regex text in this node is
256 converted to lower case during regex compile.
257
258 DIGIT, NOT_DIGIT, LETTER, NOT_LETTER, SPACE, NOT_SPACE, WORD_CHAR,
259 NOT_WORD_CHAR
260 Operand(s): None
261
262 Implements shortcut escapes \d, \D, \l, \L, \s, \S, \w, \W. The locale
263 aware ANSI functions isdigit(), isalpha(), isalnun(), and isspace() are
264 used to implement these in the hopes of increasing portability.
265
266 NOT_BOUNDARY
267 Operand(s): None
268
269 Implements \B as a zero width assertion that the current character is
270 NOT on a word boundary. Word boundaries are defined to be the position
271 between two characters where one of those characters is one of the
272 dynamically defined word delimiters, and the other character is not.
273
274 IS_DELIM
275 Operand(s): None
276
277 Implements \y as any character that is one of the dynamically
278 specified word delimiters.
279
280 NOT_DELIM
281 Operand(s): None
282
283 Implements \Y as any character that is NOT one of the dynamically
284 specified word delimiters.
285
286 STAR, PLUS, QUESTION, and complex '*', '+', and '?'
287 Operand(s): None (Note: NEXT pointer is usually zero. The code that
288 processes this node skips over it.)
289
290 Complex (parenthesized) versions implemented as circular BRANCH
291 structures using BACK. SIMPLE versions (one character per match) are
292 implemented separately for speed and to minimize recursion.
293
294 BRACE, LAZY_BRACE
295 Operand(s): minimum value (2 bytes), maximum value (2 bytes)
296
297 Implements the {m,n} construct for atoms that are SIMPLE.
298
299 BRANCH
300 Operand(s): None
301
302 The set of branches constituting a single choice are hooked together
303 with their NEXT pointers, since precedence prevents anything being
304 concatenated to any individual branch. The NEXT pointer of the last
305 BRANCH in a choice points to the thing following the whole choice. This
306 is also where the final NEXT pointer of each individual branch points;
307 each branch starts with the operand node of a BRANCH node.
308
309 BACK
310 Operand(s): None
311
312 Normal NEXT pointers all implicitly point forward. Back implicitly
313 points backward. BACK exists to make loop structures possible.
314
315 INIT_COUNT
316 Operand(s): index (1 byte)
317
318 Initializes the count array element referenced by the index operand.
319 This node is used to build general (i.e. parenthesized) {m,n} constructs.
320
321 INC_COUNT
322 Operand(s): index (1 byte)
323
324 Increments the count array element referenced by the index operand.
325 This node is used to build general (i.e. parenthesized) {m,n} constructs.
326
327 TEST_COUNT
328 Operand(s): index (1 byte), test value (2 bytes)
329
330 Tests the current value of the count array element specified by the
331 index operand against the test value. If the current value is less than
332 the test value, control passes to the node after that TEST_COUNT node.
333 Otherwise control passes to the node referenced by the NEXT pointer for
334 the TEST_COUNT node. This node is used to build general (i.e.
335 parenthesized) {m,n} constructs.
336
337 BACK_REF, BACK_REF_CI
338 Operand(s): index (1 byte, value 1-9)
339
340 Implements back references. This node will attempt to match whatever text
341 was most recently captured by the index'th set of parentheses.
342 BACK_REF_CI is case insensitive version.
343
344 X_REGEX_BR, X_REGEX_BR_CI
345 (NOT IMPLEMENTED YET)
346
347 Operand(s): index (1 byte, value 1-9)
348
349 Implements back references into a previously matched but separate regular
350 expression. This is used by syntax highlighting patterns. This node will
351 attempt to match whatever text was most captured by the index'th set of
352 parentheses of the separate regex passed to ExecRE. X_REGEX_BR_CI is case
353 insensitive version.
354
355 POS_AHEAD_OPEN, NEG_AHEAD_OPEN, LOOK_AHEAD_CLOSE
356
357 Operand(s): None
358
359 Implements positive and negative look ahead. Look ahead is an assertion
360 that something is either there or not there. Once this is determined the
361 regex engine backtracks to where it was just before the look ahead was
362 encountered, i.e. look ahead is a zero width assertion.
363
364 POS_BEHIND_OPEN, NEG_BEHIND_OPEN, LOOK_BEHIND_CLOSE
365
366 Operand(s): 2x2 bytes for OPEN (match boundaries), None for CLOSE
367
368 Implements positive and negative look behind. Look behind is an assertion
369 that something is either there or not there in front of the current
370 position. Look behind is a zero width assertion, with the additional
371 constraint that it must have a bounded length (for complexity and
372 efficiency reasons; note that most other implementation even impose
373 fixed length).
374
375 OPEN, CLOSE
376
377 Operand(s): None
378
379 OPEN + n = Start of parenthesis 'n', CLOSE + n = Close of parenthesis
380 'n', and are numbered at compile time.
381 */
382
383 /* A node is one char of opcode followed by two chars of NEXT pointer plus
384 * any operands. NEXT pointers are stored as two 8-bit pieces, high order
385 * first. The value is a positive offset from the opcode of the node
386 * containing it. An operand, if any, simply follows the node. (Note that
387 * much of the code generation knows about this implicit relationship.)
388 *
389 * Using two bytes for NEXT_PTR_SIZE is vast overkill for most things,
390 * but allows patterns to get big without disasters. */
391
392 #define OP_CODE_SIZE 1
393 #define NEXT_PTR_SIZE 2
394 #define INDEX_SIZE 1
395 #define LENGTH_SIZE 4
396 #define NODE_SIZE (NEXT_PTR_SIZE + OP_CODE_SIZE)
397
398 #define GET_OP_CODE(p) (*(unsigned char *)(p))
399 #define OPERAND(p) ((p) + NODE_SIZE)
400 #define GET_OFFSET(p) ((( *((p) + 1) & 0377) << 8) + (( *((p) + 2)) & 0377))
401 #define PUT_OFFSET_L(v) (unsigned char)(((v) >> 8) & 0377)
402 #define PUT_OFFSET_R(v) (unsigned char) ((v) & 0377)
403 #define GET_LOWER(p) ((( *((p) + NODE_SIZE) & 0377) << 8) + \
404 (( *((p) + NODE_SIZE+1)) & 0377))
405 #define GET_UPPER(p) ((( *((p) + NODE_SIZE+2) & 0377) << 8) + \
406 (( *((p) + NODE_SIZE+3)) & 0377))
407
408 /* Utility definitions. */
409
410 #define REG_FAIL(m) {*Error_Ptr = (m); return (NULL);}
411 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || \
412 (c) == '?' || (c) == Brace_Char)
413 #define SET_BIT(i,n) ((i) |= (1 << ((n) - 1)))
414 #define TEST_BIT(i,n) ((i) & (1 << ((n) - 1)))
415 #define U_CHAR_AT(p) ((unsigned int) *(unsigned char *)(p))
416
417 /* Flags to be passed up and down via function parameters during compile. */
418
419 #define WORST 0 /* Worst case. No assumptions can be made.*/
420 #define HAS_WIDTH 1 /* Known never to match null string. */
421 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
422
423 #define NO_PAREN 0 /* Only set by initial call to "chunk". */
424 #define PAREN 1 /* Used for normal capturing parentheses. */
425 #define NO_CAPTURE 2 /* Non-capturing parentheses (grouping only). */
426 #define INSENSITIVE 3 /* Case insensitive parenthetical construct */
427 #define SENSITIVE 4 /* Case sensitive parenthetical construct */
428 #define NEWLINE 5 /* Construct to match newlines in most cases */
429 #define NO_NEWLINE 6 /* Construct to match newlines normally */
430
431 #define REG_INFINITY 0UL
432 #define REG_ZERO 0UL
433 #define REG_ONE 1UL
434
435 /* Flags for function shortcut_escape() */
436
437 #define CHECK_ESCAPE 0 /* Check an escape sequence for validity only. */
438 #define CHECK_CLASS_ESCAPE 1 /* Check the validity of an escape within a
439 character class */
440 #define EMIT_CLASS_BYTES 2 /* Emit equivalent character class bytes,
441 e.g \d=0123456789 */
442 #define EMIT_NODE 3 /* Emit the appropriate node. */
443
444 /* Array sizes for arrays used by function init_ansi_classes. */
445
446 #define WHITE_SPACE_SIZE 16
447 #define ALNUM_CHAR_SIZE 256
448
449 /* Number of bytes to offset from the beginning of the regex program to the start
450 of the actual compiled regex code, i.e. skipping over the MAGIC number and
451 the two counters at the front. */
452
453 #define REGEX_START_OFFSET 3
454
455 #define MAX_COMPILED_SIZE 32767UL /* Largest size a compiled regex can be.
456 Probably could be 65535UL. */
457
458 /* Global work variables for `CompileRE'. */
459
460 static unsigned char *Reg_Parse; /* Input scan ptr (scans user's regex) */
461 static int Total_Paren; /* Parentheses, (), counter. */
462 static int Num_Braces; /* Number of general {m,n} constructs.
463 {m,n} quantifiers of SIMPLE atoms are
464 not included in this count. */
465 static int Closed_Parens; /* Bit flags indicating () closure. */
466 static int Paren_Has_Width; /* Bit flags indicating ()'s that are
467 known to not match the empty string */
468 static unsigned char Compute_Size; /* Address of this used as flag. */
469 static unsigned char *Code_Emit_Ptr; /* When Code_Emit_Ptr is set to
470 &Compute_Size no code is emitted.
471 Instead, the size of code that WOULD
472 have been generated is accumulated in
473 Reg_Size. Otherwise, Code_Emit_Ptr
474 points to where compiled regex code is
475 to be written. */
476 static unsigned long Reg_Size; /* Size of compiled regex code. */
477 static char **Error_Ptr; /* Place to store error messages so
478 they can be returned by `CompileRE' */
479 static char Error_Text [128];/* Sting to build error messages in. */
480
481 static unsigned char White_Space [WHITE_SPACE_SIZE]; /* Arrays used by */
482 static unsigned char Word_Char [ALNUM_CHAR_SIZE]; /* functions */
483 static unsigned char Letter_Char [ALNUM_CHAR_SIZE]; /* init_ansi_classes () */
484 /* and
485 shortcut_escape (). */
486
487 static unsigned char ASCII_Digits [] = "0123456789"; /* Same for all */
488 /* locales. */
489 static int Is_Case_Insensitive;
490 static int Match_Newline;
491
492 static int Enable_Counting_Quantifier = 1;
493 static unsigned char Brace_Char;
494 static unsigned char Default_Meta_Char [] = "{.*+?[(|)^<>$";
495 static unsigned char *Meta_Char;
496
497 typedef struct { long lower; long upper; } len_range;
498
499 /* Forward declarations for functions used by `CompileRE'. */
500
501 static unsigned char * alternative (int *flag_param, len_range *range_param);
502 static unsigned char * back_ref (unsigned char *c, int *flag_param,
503 int emit);
504 static unsigned char * chunk (int paren, int *flag_param, len_range *range_param);
505 static void emit_byte (unsigned char c);
506 static void emit_class_byte (unsigned char c);
507 static unsigned char * emit_node (int op_code);
508 static unsigned char * emit_special (unsigned char op_code,
509 unsigned long test_val,
510 int index);
511 static unsigned char literal_escape (unsigned char c);
512 static unsigned char numeric_escape (unsigned char c, unsigned char **parse);
513 static unsigned char * atom (int *flag_param, len_range *range_param);
514 static void reg_error (char *str);
515 static unsigned char * insert (unsigned char op, unsigned char *opnd,
516 long min, long max, int index);
517 static unsigned char * next_ptr (unsigned char *ptr);
518 static void offset_tail (unsigned char *ptr, int offset,
519 unsigned char *val);
520 static void branch_tail (unsigned char *ptr, int offset,
521 unsigned char *val);
522 static unsigned char * piece (int *flag_param, len_range *range_param);
523 static void tail (unsigned char *search_from,
524 unsigned char *point_t);
525 static unsigned char * shortcut_escape (unsigned char c, int *flag_param,
526 int emit);
527
528 static int init_ansi_classes (void);
529
530 /*----------------------------------------------------------------------*
531 * CompileRE
532 *
533 * Compiles a regular expression into the internal format used by
534 * `ExecRE'.
535 *
536 * The default behaviour wrt. case sensitivity and newline matching can
537 * be controlled through the defaultFlags argument (Markus Schwarzenberg).
538 * Future extensions are possible by using other flag bits.
539 * Note that currently only the case sensitivity flag is effectively used.
540 *
541 * Beware that the optimization and preparation code in here knows about
542 * some of the structure of the compiled regexp.
543 *----------------------------------------------------------------------*/
544
CompileRE(const char * exp,char ** errorText,int defaultFlags)545 regexp * CompileRE (const char *exp, char **errorText, int defaultFlags) {
546
547 register regexp *comp_regex = NULL;
548 register unsigned char *scan;
549 int flags_local, pass;
550 len_range range_local;
551
552 if (Enable_Counting_Quantifier) {
553 Brace_Char = '{';
554 Meta_Char = &Default_Meta_Char [0];
555 } else {
556 Brace_Char = '*'; /* Bypass the '{' in */
557 Meta_Char = &Default_Meta_Char [1]; /* Default_Meta_Char */
558 }
559
560 /* Set up errorText to receive failure reports. */
561
562 Error_Ptr = errorText;
563 *Error_Ptr = "";
564
565 if (exp == NULL) REG_FAIL ("NULL argument, `CompileRE\'");
566
567 /* Initialize arrays used by function `shortcut_escape'. */
568
569 if (!init_ansi_classes ()) REG_FAIL ("internal error #1, `CompileRE\'");
570
571 Code_Emit_Ptr = &Compute_Size;
572 Reg_Size = 0UL;
573
574 /* We can't allocate space until we know how big the compiled form will be,
575 but we can't compile it (and thus know how big it is) until we've got a
576 place to put the code. So we cheat: we compile it twice, once with code
577 generation turned off and size counting turned on, and once "for real".
578 This also means that we don't allocate space until we are sure that the
579 thing really will compile successfully, and we never have to move the
580 code and thus invalidate pointers into it. (Note that it has to be in
581 one piece because free() must be able to free it all.) */
582
583 for (pass = 1; pass <= 2; pass++) {
584 /*-------------------------------------------*
585 * FIRST PASS: Determine size and legality. *
586 * SECOND PASS: Emit code. *
587 *-------------------------------------------*/
588
589 /* Schwarzenberg:
590 * If defaultFlags = 0 use standard defaults:
591 * Is_Case_Insensitive: Case sensitive is the default
592 * Match_Newline: Newlines are NOT matched by default
593 * in character classes
594 */
595 Is_Case_Insensitive = ((defaultFlags & REDFLT_CASE_INSENSITIVE) ? 1 : 0);
596 Match_Newline = 0; /* ((defaultFlags & REDFLT_MATCH_NEWLINE) ? 1 : 0);
597 Currently not used. Uncomment if needed. */
598
599 Reg_Parse = (unsigned char *) exp;
600 Total_Paren = 1;
601 Num_Braces = 0;
602 Closed_Parens = 0;
603 Paren_Has_Width = 0;
604
605 emit_byte (MAGIC);
606 emit_byte ('%'); /* Placeholder for num of capturing parentheses. */
607 emit_byte ('%'); /* Placeholder for num of general {m,n} constructs. */
608
609 if (chunk (NO_PAREN, &flags_local, &range_local) == NULL)
610 return (NULL); /* Something went wrong */
611 if (pass == 1) {
612 if (Reg_Size >= MAX_COMPILED_SIZE) {
613 /* Too big for NEXT pointers NEXT_PTR_SIZE bytes long to span.
614 This is a real issue since the first BRANCH node usually points
615 to the end of the compiled regex code. */
616
617 sprintf (Error_Text, "regexp > %lu bytes", MAX_COMPILED_SIZE);
618 REG_FAIL (Error_Text);
619 }
620
621 /* Allocate memory. */
622
623 comp_regex = (regexp *) NEditMalloc (sizeof (regexp) + Reg_Size);
624
625 if (comp_regex == NULL) REG_FAIL ("out of memory in `CompileRE\'");
626
627 Code_Emit_Ptr = (unsigned char *) comp_regex->program;
628 }
629 }
630
631 comp_regex->program [1] = (unsigned char) Total_Paren - 1;
632 comp_regex->program [2] = (unsigned char) Num_Braces;
633
634 /*----------------------------------------*
635 * Dig out information for optimizations. *
636 *----------------------------------------*/
637
638 comp_regex->match_start = '\0'; /* Worst-case defaults. */
639 comp_regex->anchor = 0;
640
641 /* First BRANCH. */
642
643 scan = (unsigned char *) (comp_regex->program + REGEX_START_OFFSET);
644
645 if (GET_OP_CODE (next_ptr (scan)) == END) { /* Only one top-level choice. */
646 scan = OPERAND (scan);
647
648 /* Starting-point info. */
649
650 if (GET_OP_CODE (scan) == EXACTLY) {
651 comp_regex->match_start = *OPERAND (scan);
652
653 } else if (PLUS <= GET_OP_CODE (scan) &&
654 GET_OP_CODE (scan) <= LAZY_PLUS) {
655
656 /* Allow x+ or x+? at the start of the regex to be
657 optimized. */
658
659 if (GET_OP_CODE (scan + NODE_SIZE) == EXACTLY) {
660 comp_regex->match_start = *OPERAND (scan + NODE_SIZE);
661 }
662 } else if (GET_OP_CODE (scan) == BOL) {
663 comp_regex->anchor++;
664 }
665 }
666
667 return (comp_regex);
668 }
669
670 /*----------------------------------------------------------------------*
671 * chunk *
672 * *
673 * Process main body of regex or process a parenthesized "thing". *
674 * *
675 * Caller must absorb opening parenthesis. *
676 * *
677 * Combining parenthesis handling with the base level of regular *
678 * expression is a trifle forced, but the need to tie the tails of the *
679 * branches to what follows makes it hard to avoid. *
680 *----------------------------------------------------------------------*/
681
chunk(int paren,int * flag_param,len_range * range_param)682 static unsigned char * chunk (int paren, int *flag_param,
683 len_range *range_param) {
684
685 register unsigned char *ret_val = NULL;
686 register unsigned char *this_branch;
687 register unsigned char *ender = NULL;
688 register int this_paren = 0;
689 int flags_local, first = 1, zero_width, i;
690 int old_sensitive = Is_Case_Insensitive;
691 int old_newline = Match_Newline;
692 len_range range_local;
693 int look_only = 0;
694 unsigned char *emit_look_behind_bounds = NULL;
695
696
697 *flag_param = HAS_WIDTH; /* Tentatively. */
698 range_param->lower = 0; /* Idem */
699 range_param->upper = 0;
700
701 /* Make an OPEN node, if parenthesized. */
702
703 if (paren == PAREN) {
704 if (Total_Paren >= NSUBEXP) {
705 sprintf (Error_Text, "number of ()'s > %d", (int) NSUBEXP);
706 REG_FAIL (Error_Text);
707 }
708
709 this_paren = Total_Paren; Total_Paren++;
710 ret_val = emit_node (OPEN + this_paren);
711 } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
712 *flag_param = WORST; /* Look ahead is zero width. */
713 look_only = 1;
714 ret_val = emit_node (paren);
715 } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
716 *flag_param = WORST; /* Look behind is zero width. */
717 look_only = 1;
718 /* We'll overwrite the zero length later on, so we save the ptr */
719 ret_val = emit_special (paren, 0, 0);
720 emit_look_behind_bounds = ret_val + NODE_SIZE;
721 } else if (paren == INSENSITIVE) {
722 Is_Case_Insensitive = 1;
723 } else if (paren == SENSITIVE) {
724 Is_Case_Insensitive = 0;
725 } else if (paren == NEWLINE) {
726 Match_Newline = 1;
727 } else if (paren == NO_NEWLINE) {
728 Match_Newline = 0;
729 }
730
731 /* Pick up the branches, linking them together. */
732
733 do {
734 this_branch = alternative (&flags_local, &range_local);
735
736 if (this_branch == NULL) return (NULL);
737
738 if (first) {
739 first = 0;
740 *range_param = range_local;
741 if (ret_val == NULL) ret_val = this_branch;
742 } else if (range_param->lower >= 0) {
743 if (range_local.lower >= 0) {
744 if (range_local.lower < range_param->lower)
745 range_param->lower = range_local.lower;
746 if (range_local.upper > range_param->upper)
747 range_param->upper = range_local.upper;
748 } else {
749 range_param->lower = -1; /* Branches have different lengths */
750 range_param->upper = -1;
751 }
752 }
753
754 tail (ret_val, this_branch); /* Connect BRANCH -> BRANCH. */
755
756 /* If any alternative could be zero width, consider the whole
757 parenthisized thing to be zero width. */
758
759 if (!(flags_local & HAS_WIDTH)) *flag_param &= ~HAS_WIDTH;
760
761 /* Are there more alternatives to process? */
762
763 if (*Reg_Parse != '|') break;
764
765 Reg_Parse++;
766 } while (1);
767
768 /* Make a closing node, and hook it on the end. */
769
770 if (paren == PAREN) {
771 ender = emit_node (CLOSE + this_paren);
772
773 } else if (paren == NO_PAREN) {
774 ender = emit_node (END);
775
776 } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
777 ender = emit_node (LOOK_AHEAD_CLOSE);
778
779 } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
780 ender = emit_node (LOOK_BEHIND_CLOSE);
781
782 } else {
783 ender = emit_node (NOTHING);
784 }
785
786 tail (ret_val, ender);
787
788 /* Hook the tails of the branch alternatives to the closing node. */
789
790 for (this_branch = ret_val; this_branch != NULL; ) {
791 branch_tail (this_branch, NODE_SIZE, ender);
792 this_branch = next_ptr (this_branch);
793 }
794
795 /* Check for proper termination. */
796
797 if (paren != NO_PAREN && *Reg_Parse++ != ')') {
798 REG_FAIL ("missing right parenthesis \')\'");
799 } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
800 if (*Reg_Parse == ')') {
801 REG_FAIL ("missing left parenthesis \'(\'");
802 } else {
803 REG_FAIL ("junk on end"); /* "Can't happen" - NOTREACHED */
804 }
805 }
806
807 /* Check whether look behind has a fixed size */
808
809 if (emit_look_behind_bounds) {
810 if (range_param->lower < 0) {
811 REG_FAIL ("look-behind does not have a bounded size");
812 }
813 if (range_param->upper > 65535L) {
814 REG_FAIL ("max. look-behind size is too large (>65535)")
815 }
816 if (Code_Emit_Ptr != &Compute_Size) {
817 *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->lower);
818 *emit_look_behind_bounds++ = PUT_OFFSET_R (range_param->lower);
819 *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->upper);
820 *emit_look_behind_bounds = PUT_OFFSET_R (range_param->upper);
821 }
822 }
823
824 /* For look ahead/behind, the length must be set to zero again */
825 if (look_only) {
826 range_param->lower = 0;
827 range_param->upper = 0;
828 }
829
830 zero_width = 0;
831
832 /* Set a bit in Closed_Parens to let future calls to function `back_ref'
833 know that we have closed this set of parentheses. */
834
835 if (paren == PAREN && this_paren <= (int)sizeof (Closed_Parens) * CHAR_BIT) {
836 SET_BIT (Closed_Parens, this_paren);
837
838 /* Determine if a parenthesized expression is modified by a quantifier
839 that can have zero width. */
840
841 if (*(Reg_Parse) == '?' || *(Reg_Parse) == '*') {
842 zero_width++;
843 } else if (*(Reg_Parse) == '{' && Brace_Char == '{') {
844 if (*(Reg_Parse + 1) == ',' || *(Reg_Parse + 1) == '}') {
845 zero_width++;
846 } else if (*(Reg_Parse + 1) == '0') {
847 i = 2;
848
849 while (*(Reg_Parse + i) == '0') i++;
850
851 if (*(Reg_Parse + i) == ',') zero_width++;
852 }
853 }
854 }
855
856 /* If this set of parentheses is known to never match the empty string, set
857 a bit in Paren_Has_Width to let future calls to function back_ref know
858 that this set of parentheses has non-zero width. This will allow star
859 (*) or question (?) quantifiers to be aplied to a back-reference that
860 refers to this set of parentheses. */
861
862 if ((*flag_param & HAS_WIDTH) &&
863 paren == PAREN &&
864 !zero_width &&
865 this_paren <= (int)(sizeof (Paren_Has_Width) * CHAR_BIT)) {
866
867 SET_BIT (Paren_Has_Width, this_paren);
868 }
869
870 Is_Case_Insensitive = old_sensitive;
871 Match_Newline = old_newline;
872
873 return (ret_val);
874 }
875
876 /*----------------------------------------------------------------------*
877 * alternative
878 *
879 * Processes one alternative of an '|' operator. Connects the NEXT
880 * pointers of each regex atom together sequentialy.
881 *----------------------------------------------------------------------*/
882
alternative(int * flag_param,len_range * range_param)883 static unsigned char * alternative (int *flag_param, len_range *range_param) {
884
885 register unsigned char *ret_val;
886 register unsigned char *chain;
887 register unsigned char *latest;
888 int flags_local;
889 len_range range_local;
890
891 *flag_param = WORST; /* Tentatively. */
892 range_param->lower = 0; /* Idem */
893 range_param->upper = 0;
894
895 ret_val = emit_node (BRANCH);
896 chain = NULL;
897
898 /* Loop until we hit the start of the next alternative, the end of this set
899 of alternatives (end of parentheses), or the end of the regex. */
900
901 while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
902 latest = piece (&flags_local, &range_local);
903
904 if (latest == NULL) return (NULL); /* Something went wrong. */
905
906 *flag_param |= flags_local & HAS_WIDTH;
907 if (range_local.lower < 0) {
908 /* Not a fixed length */
909 range_param->lower = -1;
910 range_param->upper = -1;
911 } else if (range_param->lower >= 0) {
912 range_param->lower += range_local.lower;
913 range_param->upper += range_local.upper;
914 }
915
916 if (chain != NULL) { /* Connect the regex atoms together sequentialy. */
917 tail (chain, latest);
918 }
919
920 chain = latest;
921 }
922
923 if (chain == NULL) { /* Loop ran zero times. */
924 (void) emit_node (NOTHING);
925 }
926
927 return (ret_val);
928 }
929
930 /*----------------------------------------------------------------------*
931 * piece - something followed by possible '*', '+', '?', or "{m,n}"
932 *
933 * Note that the branching code sequences used for the general cases of
934 * *, +. ?, and {m,n} are somewhat optimized: they use the same
935 * NOTHING node as both the endmarker for their branch list and the
936 * body of the last branch. It might seem that this node could be
937 * dispensed with entirely, but the endmarker role is not redundant.
938 *----------------------------------------------------------------------*/
939
piece(int * flag_param,len_range * range_param)940 static unsigned char * piece (int *flag_param, len_range *range_param) {
941
942 register unsigned char *ret_val;
943 register unsigned char *next;
944 register unsigned char op_code;
945 unsigned long min_max [2] = {REG_ZERO, REG_INFINITY};
946 int flags_local, i, brace_present = 0;
947 int lazy = 0, comma_present = 0;
948 int digit_present [2] = {0,0};
949 len_range range_local;
950
951 ret_val = atom (&flags_local, &range_local);
952
953 if (ret_val == NULL) return (NULL); /* Something went wrong. */
954
955 op_code = *Reg_Parse;
956
957 if (!IS_QUANTIFIER (op_code)) {
958 *flag_param = flags_local;
959 *range_param = range_local;
960 return (ret_val);
961 } else if (op_code == '{') { /* {n,m} quantifier present */
962 brace_present++;
963 Reg_Parse++;
964
965 /* This code will allow specifying a counting range in any of the
966 following forms:
967
968 {m,n} between m and n.
969 {,n} same as {0,n} or between 0 and infinity.
970 {m,} same as {m,0} or between m and infinity.
971 {m} same as {m,m} or exactly m.
972 {,} same as {0,0} or between 0 and infinity or just '*'.
973 {} same as {0,0} or between 0 and infinity or just '*'.
974
975 Note that specifying a max of zero, {m,0} is not allowed in the regex
976 itself, but it is implemented internally that way to support '*', '+',
977 and {min,} constructs and signals an unlimited number. */
978
979 for (i = 0; i < 2; i++) {
980 /* Look for digits of number and convert as we go. The numeric maximum
981 value for max and min of 65,535 is due to using 2 bytes to store
982 each value in the compiled regex code. */
983
984 while (isdigit (*Reg_Parse)) {
985 /* (6553 * 10 + 6) > 65535 (16 bit max) */
986
987 if ((min_max [i] == 6553UL && (*Reg_Parse - '0') <= 5) ||
988 (min_max [i] <= 6552UL)) {
989
990 min_max [i] = (min_max [i] * 10UL) +
991 (unsigned long) (*Reg_Parse - '0');
992 Reg_Parse++;
993
994 digit_present [i]++;
995 } else {
996 if (i == 0) {
997 sprintf (Error_Text, "min operand of {%lu%c,???} > 65535",
998 min_max [0], *Reg_Parse);
999 } else {
1000 sprintf (Error_Text, "max operand of {%lu,%lu%c} > 65535",
1001 min_max [0], min_max [1], *Reg_Parse);
1002 }
1003
1004 REG_FAIL (Error_Text);
1005 }
1006 }
1007
1008 if (!comma_present && *Reg_Parse == ',') {
1009 comma_present++;
1010 Reg_Parse++;
1011 }
1012 }
1013
1014 /* A max of zero can not be specified directly in the regex since it would
1015 signal a max of infinity. This code specifically disallows `{0,0}',
1016 `{,0}', and `{0}' which really means nothing to humans but would be
1017 interpreted as `{0,infinity}' or `*' if we didn't make this check. */
1018
1019 if (digit_present [0] && (min_max [0] == REG_ZERO) && !comma_present) {
1020
1021 REG_FAIL ("{0} is an invalid range");
1022 } else if (digit_present [0] && (min_max [0] == REG_ZERO) &&
1023 digit_present [1] && (min_max [1] == REG_ZERO)) {
1024
1025 REG_FAIL ("{0,0} is an invalid range");
1026 } else if (digit_present [1] && (min_max [1] == REG_ZERO)) {
1027 if (digit_present [0]) {
1028 sprintf (Error_Text, "{%lu,0} is an invalid range", min_max [0]);
1029 REG_FAIL (Error_Text);
1030 } else {
1031 REG_FAIL ("{,0} is an invalid range");
1032 }
1033 }
1034
1035 if (!comma_present) min_max [1] = min_max [0]; /* {x} means {x,x} */
1036
1037 if (*Reg_Parse != '}') {
1038 REG_FAIL ("{m,n} specification missing right \'}\'");
1039
1040 } else if (min_max [1] != REG_INFINITY && min_max [0] > min_max [1]) {
1041 /* Disallow a backward range. */
1042
1043 sprintf (Error_Text, "{%lu,%lu} is an invalid range",
1044 min_max [0], min_max [1]);
1045 REG_FAIL (Error_Text);
1046 }
1047 }
1048
1049 Reg_Parse++;
1050
1051 /* Check for a minimal matching (non-greedy or "lazy") specification. */
1052
1053 if (*Reg_Parse == '?') {
1054 lazy = 1;
1055 Reg_Parse++;
1056 }
1057
1058 /* Avoid overhead of counting if possible */
1059
1060 if (op_code == '{') {
1061 if (min_max [0] == REG_ZERO && min_max [1] == REG_INFINITY) {
1062 op_code = '*';
1063 } else if (min_max [0] == REG_ONE && min_max [1] == REG_INFINITY) {
1064 op_code = '+';
1065 } else if (min_max [0] == REG_ZERO && min_max [1] == REG_ONE) {
1066 op_code = '?';
1067 } else if (min_max [0] == REG_ONE && min_max [1] == REG_ONE) {
1068 /* "x{1,1}" is the same as "x". No need to pollute the compiled
1069 regex with such nonsense. */
1070
1071 *flag_param = flags_local;
1072 *range_param = range_local;
1073 return (ret_val);
1074 } else if (Num_Braces > (int)UCHAR_MAX) {
1075 sprintf (Error_Text, "number of {m,n} constructs > %d", UCHAR_MAX);
1076 REG_FAIL (Error_Text);
1077 }
1078 }
1079
1080 if (op_code == '+') min_max [0] = REG_ONE;
1081 if (op_code == '?') min_max [1] = REG_ONE;
1082
1083 /* It is dangerous to apply certain quantifiers to a possibly zero width
1084 item. */
1085
1086 if (!(flags_local & HAS_WIDTH)) {
1087 if (brace_present) {
1088 sprintf (Error_Text, "{%lu,%lu} operand could be empty",
1089 min_max [0], min_max [1]);
1090 } else {
1091 sprintf (Error_Text, "%c operand could be empty", op_code);
1092 }
1093
1094 REG_FAIL (Error_Text);
1095 }
1096
1097 *flag_param = (min_max [0] > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
1098 if (range_local.lower >= 0) {
1099 if (min_max[1] != REG_INFINITY) {
1100 range_param->lower = range_local.lower * min_max[0];
1101 range_param->upper = range_local.upper * min_max[1];
1102 } else {
1103 range_param->lower = -1; /* Not a fixed-size length */
1104 range_param->upper = -1;
1105 }
1106 } else {
1107 range_param->lower = -1; /* Not a fixed-size length */
1108 range_param->upper = -1;
1109 }
1110
1111 /*---------------------------------------------------------------------*
1112 * Symbol Legend For Node Structure Diagrams
1113 *---------------------------------------------------------------------*
1114 * (...) = general grouped thing
1115 * B = (B)ranch, K = bac(K), N = (N)othing
1116 * I = (I)nitialize count, C = Increment (C)ount
1117 * T~m = (T)est against mini(m)um- go to NEXT pointer if >= operand
1118 * T~x = (T)est against ma(x)imum- go to NEXT pointer if >= operand
1119 * '~' = NEXT pointer, \___| = forward pointer, |___/ = Backward pointer
1120 *---------------------------------------------------------------------*/
1121
1122 if (op_code == '*' && (flags_local & SIMPLE)) {
1123 insert ((lazy ? LAZY_STAR : STAR), ret_val, 0UL, 0UL, 0);
1124
1125 } else if (op_code == '+' && (flags_local & SIMPLE)) {
1126 insert (lazy ? LAZY_PLUS : PLUS, ret_val, 0UL, 0UL, 0);
1127
1128 } else if (op_code == '?' && (flags_local & SIMPLE)) {
1129 insert (lazy ? LAZY_QUESTION : QUESTION, ret_val, 0UL, 0UL, 0);
1130
1131 } else if (op_code == '{' && (flags_local & SIMPLE)) {
1132 insert (lazy ? LAZY_BRACE : BRACE, ret_val, min_max [0], min_max [1], 0);
1133
1134 } else if ((op_code == '*' || op_code == '+') && lazy) {
1135 /* Node structure for (x)*? Node structure for (x)+? construct.
1136 * construct. (Same as (x)*? except for initial
1137 * forward jump into parenthesis.)
1138 *
1139 * ___6____
1140 * _______5_______ /________|______
1141 * | _4__ 1_\ /| ____ | _\
1142 * |/ | / |\ / |/ | | / |\
1143 * B~ N~ B~ (...)~ K~ N~ N~ B~ N~ B~ (...)~ K~ N~
1144 * \ \___2_______| \ \___________|
1145 * \_____3_______| \_____________|
1146 *
1147 */
1148
1149 tail (ret_val, emit_node (BACK)); /* 1 */
1150 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 2,4 */
1151 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 3 */
1152
1153 next = emit_node (NOTHING); /* 2,3 */
1154
1155 offset_tail (ret_val, NODE_SIZE, next); /* 2 */
1156 tail (ret_val, next); /* 3 */
1157 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4,5 */
1158 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1159 offset_tail (ret_val, 3 * NODE_SIZE, ret_val); /* 5 */
1160
1161 if (op_code == '+') {
1162 insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 6 */
1163 tail (ret_val, ret_val + (4 * NODE_SIZE)); /* 6 */
1164 }
1165 } else if (op_code == '*') {
1166 /* Node structure for (x)* construct.
1167 * ____1_____
1168 * | \
1169 * B~ (...)~ K~ B~ N~
1170 * \ \_|2 |\_|
1171 * \__3_______| 4
1172 */
1173
1174 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 1,3 */
1175 offset_tail (ret_val, NODE_SIZE, emit_node (BACK)); /* 2 */
1176 offset_tail (ret_val, NODE_SIZE, ret_val); /* 1 */
1177 tail (ret_val, emit_node (BRANCH)); /* 3 */
1178 tail (ret_val, emit_node (NOTHING)); /* 4 */
1179 } else if (op_code == '+') {
1180 /* Node structure for (x)+ construct.
1181 *
1182 * ____2_____
1183 * | \
1184 * (...)~ B~ K~ B~ N~
1185 * \_|\____|\_|
1186 * 1 3 4
1187 */
1188
1189 next = emit_node (BRANCH); /* 1 */
1190
1191 tail (ret_val, next); /* 1 */
1192 tail (emit_node (BACK), ret_val); /* 2 */
1193 tail (next, emit_node (BRANCH)); /* 3 */
1194 tail (ret_val, emit_node (NOTHING)); /* 4 */
1195 } else if (op_code == '?' && lazy) {
1196 /* Node structure for (x)?? construct.
1197 * _4__ 1_
1198 * / | / |
1199 * B~ N~ B~ (...)~ N~
1200 * \ \___2____|
1201 * \_____3____|
1202 */
1203
1204 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 2,4 */
1205 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 3 */
1206
1207 next = emit_node (NOTHING); /* 1,2,3 */
1208
1209 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 1 */
1210 offset_tail (ret_val, NODE_SIZE, next); /* 2 */
1211 tail (ret_val, next); /* 3 */
1212 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4 */
1213 tail (ret_val, (ret_val + (2 * NODE_SIZE))); /* 4 */
1214
1215 } else if (op_code == '?') {
1216 /* Node structure for (x)? construct.
1217 * ___1____ _2
1218 * / |/ |
1219 * B~ (...)~ B~ N~
1220 * \__3_|
1221 */
1222
1223 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 1 */
1224 tail (ret_val, emit_node (BRANCH)); /* 1 */
1225
1226 next = emit_node (NOTHING); /* 2,3 */
1227
1228 tail (ret_val, next); /* 2 */
1229 offset_tail (ret_val, NODE_SIZE, next); /* 3 */
1230 } else if (op_code == '{' && min_max [0] == min_max [1]) {
1231 /* Node structure for (x){m}, (x){m}?, (x){m,m}, or (x){m,m}? constructs.
1232 * Note that minimal and maximal matching mean the same thing when we
1233 * specify the minimum and maximum to be the same value.
1234 * _______3_____
1235 * | 1_ _2 \
1236 * | / |/ | \
1237 * I~ (...)~ C~ T~m K~ N~
1238 * \_| \_____|
1239 * 5 4
1240 */
1241
1242 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1243 tail (ret_val, emit_special (TEST_COUNT, min_max [0], Num_Braces));/* 2 */
1244 tail (emit_node (BACK), ret_val); /* 3 */
1245 tail (ret_val, emit_node (NOTHING)); /* 4 */
1246
1247 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 5 */
1248
1249 tail (ret_val, next); /* 5 */
1250
1251 Num_Braces++;
1252 } else if (op_code == '{' && lazy) {
1253 if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1254 /* Node structure for (x){0,n}? or {,n}? construct.
1255 * _________3____________
1256 * 8_| _4__ 1_ _2 \
1257 * / |/ | / |/ | \
1258 * I~ B~ N~ B~ (...)~ C~ T~x K~ N~
1259 * \ \ \__7__|
1260 * \ \_________6_______|
1261 * \______5____________|
1262 */
1263
1264 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1265
1266 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,7 */
1267
1268 tail (ret_val, next); /* 2 */
1269 (void) insert (BRANCH, ret_val, 0UL, 0UL, Num_Braces); /* 4,6 */
1270 (void) insert (NOTHING, ret_val, 0UL, 0UL, Num_Braces); /* 5 */
1271 (void) insert (BRANCH, ret_val, 0UL, 0UL, Num_Braces); /* 3,4,8 */
1272 tail (emit_node (BACK), ret_val); /* 3 */
1273 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1274
1275 next = emit_node (NOTHING); /* 5,6,7 */
1276
1277 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1278 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 6 */
1279 offset_tail (ret_val, 3 * NODE_SIZE, next); /* 7 */
1280
1281 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1282
1283 tail (ret_val, next); /* 8 */
1284
1285 } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1286 /* Node structure for (x){m,}? construct.
1287 * ______8_________________
1288 * | _______3_____ \
1289 * | _7__ | 1_ _2 \ \
1290 * |/ | | / |/ | \ \
1291 * I~ B~ N~ B~ (...)~ C~ T~m K~ K~ N~
1292 * \_____\__\_| \_4___| |
1293 * 9 \ \_________5__________|
1294 * \_______6______________|
1295 */
1296
1297 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1298
1299 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,4 */
1300
1301 tail (ret_val, next); /* 2 */
1302 tail (emit_node (BACK), ret_val); /* 3 */
1303 tail (ret_val, emit_node (BACK)); /* 4 */
1304 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 5,7 */
1305 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 6 */
1306
1307 next = emit_node (NOTHING); /* 5,6 */
1308
1309 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1310 tail (ret_val, next); /* 6 */
1311 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 7,8 */
1312 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 7 */
1313 offset_tail (ret_val, 3 * NODE_SIZE, ret_val); /* 8 */
1314 (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1315 tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE)); /* 9 */
1316
1317 } else {
1318 /* Node structure for (x){m,n}? construct.
1319 * ______9_____________________
1320 * | _____________3___ \
1321 * | __8_ | 1_ _2 \ \
1322 * |/ | | / |/ | \ \
1323 * I~ B~ N~ B~ (...)~ C~ T~x T~m K~ K~ N~
1324 * \_____\__\_| \ \__4__| |
1325 * 10 \ \ \_7_________|
1326 * \ \_________6_____________|
1327 * \_______5_________________|
1328 */
1329
1330 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1331
1332 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,7 */
1333
1334 tail (ret_val, next); /* 2 */
1335
1336 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1337
1338 tail (emit_node (BACK), ret_val); /* 3 */
1339 tail (next, emit_node (BACK)); /* 4 */
1340 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 6,8 */
1341 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 5 */
1342 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 8,9 */
1343
1344 next = emit_node (NOTHING); /* 5,6,7 */
1345
1346 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1347 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 6 */
1348 offset_tail (ret_val, 3 * NODE_SIZE, next); /* 7 */
1349 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 8 */
1350 offset_tail (next, -NODE_SIZE, ret_val); /* 9 */
1351 insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 10 */
1352 tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE)); /* 10 */
1353 }
1354
1355 Num_Braces++;
1356 } else if (op_code == '{') {
1357 if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1358 /* Node structure for (x){0,n} or (x){,n} construct.
1359 *
1360 * ___3____________
1361 * | 1_ _2 \ 5_
1362 * | / |/ | \ / |
1363 * I~ B~ (...)~ C~ T~x K~ B~ N~
1364 * \_|\ \_6___|__|
1365 * 7 \________4________|
1366 */
1367
1368 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1369
1370 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,6 */
1371
1372 tail (ret_val, next); /* 2 */
1373 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 3,4,7 */
1374 tail (emit_node (BACK), ret_val); /* 3 */
1375
1376 next = emit_node (BRANCH); /* 4,5 */
1377
1378 tail (ret_val, next); /* 4 */
1379 tail (next, emit_node (NOTHING)); /* 5,6 */
1380 offset_tail (ret_val, NODE_SIZE, next); /* 6 */
1381
1382 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 7 */
1383
1384 tail (ret_val, next); /* 7 */
1385
1386 } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1387 /* Node structure for (x){m,} construct.
1388 * __________4________
1389 * | __3__________ \
1390 * _|___| 1_ _2 \ \ _7
1391 * / | 8 | / |/ | \ \ / |
1392 * I~ B~ (...)~ C~ T~m K~ K~ B~ N~
1393 * \ \_5___| |
1394 * \__________6__________|
1395 */
1396
1397 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1398
1399 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2 */
1400
1401 tail (ret_val, next); /* 2 */
1402 tail (emit_node (BACK), ret_val); /* 3 */
1403 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4,6 */
1404
1405 next = emit_node (BACK); /* 4 */
1406
1407 tail (next, ret_val); /* 4 */
1408 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1409 tail (ret_val, emit_node (BRANCH)); /* 6 */
1410 tail (ret_val, emit_node (NOTHING)); /* 7 */
1411
1412 insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1413
1414 tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE)); /* 8 */
1415
1416 } else {
1417 /* Node structure for (x){m,n} construct.
1418 * _____6________________
1419 * | _____________3___ \
1420 * 9_|__| 1_ _2 \ \ _8
1421 * / | | / |/ | \ \ / |
1422 * I~ B~ (...)~ C~ T~x T~m K~ K~ B~ N~
1423 * \ \ \__4__| | |
1424 * \ \_7_________|__|
1425 * \_________5_____________|
1426 */
1427
1428 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1429
1430 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,4 */
1431
1432 tail (ret_val, next); /* 2 */
1433
1434 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1435
1436 tail (emit_node (BACK), ret_val); /* 3 */
1437 tail (next, emit_node (BACK)); /* 4 */
1438 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 5,6 */
1439
1440 next = emit_node (BRANCH); /* 5,8 */
1441
1442 tail (ret_val, next); /* 5 */
1443 offset_tail (next, -NODE_SIZE, ret_val); /* 6 */
1444
1445 next = emit_node (NOTHING); /* 7,8 */
1446
1447 offset_tail (ret_val, NODE_SIZE, next); /* 7 */
1448
1449 offset_tail (next, -NODE_SIZE, next); /* 8 */
1450 (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1451 tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE)); /* 9 */
1452 }
1453
1454 Num_Braces++;
1455 } else {
1456 /* We get here if the IS_QUANTIFIER macro is not coordinated properly
1457 with this function. */
1458
1459 REG_FAIL ("internal error #2, `piece\'");
1460 }
1461
1462 if (IS_QUANTIFIER (*Reg_Parse)) {
1463 if (op_code == '{') {
1464 sprintf (Error_Text, "nested quantifiers, {m,n}%c", *Reg_Parse);
1465 } else {
1466 sprintf (Error_Text, "nested quantifiers, %c%c", op_code, *Reg_Parse);
1467 }
1468
1469 REG_FAIL (Error_Text);
1470 }
1471
1472 return (ret_val);
1473 }
1474
1475 /*----------------------------------------------------------------------*
1476 * atom
1477 *
1478 * Process one regex item at the lowest level
1479 *
1480 * OPTIMIZATION: Lumps a continuous sequence of ordinary characters
1481 * together so that it can turn them into a single EXACTLY node, which
1482 * is smaller to store and faster to run.
1483 *----------------------------------------------------------------------*/
1484
atom(int * flag_param,len_range * range_param)1485 static unsigned char * atom (int *flag_param, len_range *range_param) {
1486
1487 register unsigned char *ret_val;
1488 unsigned char test;
1489 int flags_local;
1490 len_range range_local;
1491
1492 *flag_param = WORST; /* Tentatively. */
1493 range_param->lower = 0; /* Idem */
1494 range_param->upper = 0;
1495
1496 /* Process any regex comments, e.g. `(?# match next token->)'. The
1497 terminating right parenthesis can not be escaped. The comment stops at
1498 the first right parenthesis encountered (or the end of the regex
1499 string)... period. Handles multiple sequential comments,
1500 e.g. `(?# one)(?# two)...' */
1501
1502 while (*Reg_Parse == '(' &&
1503 *(Reg_Parse + 1) == '?' &&
1504 *(Reg_Parse + 2) == '#') {
1505
1506 Reg_Parse += 3;
1507
1508 while (*Reg_Parse != ')' && *Reg_Parse != '\0') {
1509 Reg_Parse++;
1510 }
1511
1512 if (*Reg_Parse == ')') {
1513 Reg_Parse++;
1514 }
1515
1516 if (*Reg_Parse == ')' || *Reg_Parse == '|' || *Reg_Parse == '\0') {
1517 /* Hit end of regex string or end of parenthesized regex; have to
1518 return "something" (i.e. a NOTHING node) to avoid generating an
1519 error. */
1520
1521 ret_val = emit_node (NOTHING);
1522
1523 return (ret_val);
1524 }
1525 }
1526
1527 switch (*Reg_Parse++) {
1528 case '^':
1529 ret_val = emit_node (BOL);
1530 break;
1531
1532 case '$':
1533 ret_val = emit_node (EOL);
1534 break;
1535
1536 case '<':
1537 ret_val = emit_node (BOWORD);
1538 break;
1539
1540 case '>':
1541 ret_val = emit_node (EOWORD);
1542 break;
1543
1544 case '.':
1545 if (Match_Newline) {
1546 ret_val = emit_node (EVERY);
1547 } else {
1548 ret_val = emit_node (ANY);
1549 }
1550
1551 *flag_param |= (HAS_WIDTH | SIMPLE);
1552 range_param->lower = 1;
1553 range_param->upper = 1;
1554 break;
1555
1556 case '(':
1557 if (*Reg_Parse == '?') { /* Special parenthetical expression */
1558 Reg_Parse++;
1559 range_local.lower = 0; /* Make sure it is always used */
1560 range_local.upper = 0;
1561
1562 if (*Reg_Parse == ':') {
1563 Reg_Parse++;
1564 ret_val = chunk (NO_CAPTURE, &flags_local, &range_local);
1565 } else if (*Reg_Parse == '=') {
1566 Reg_Parse++;
1567 ret_val = chunk (POS_AHEAD_OPEN, &flags_local, &range_local);
1568 } else if (*Reg_Parse == '!') {
1569 Reg_Parse++;
1570 ret_val = chunk (NEG_AHEAD_OPEN, &flags_local, &range_local);
1571 } else if (*Reg_Parse == 'i') {
1572 Reg_Parse++;
1573 ret_val = chunk (INSENSITIVE, &flags_local, &range_local);
1574 } else if (*Reg_Parse == 'I') {
1575 Reg_Parse++;
1576 ret_val = chunk (SENSITIVE, &flags_local, &range_local);
1577 } else if (*Reg_Parse == 'n') {
1578 Reg_Parse++;
1579 ret_val = chunk (NEWLINE, &flags_local, &range_local);
1580 } else if (*Reg_Parse == 'N') {
1581 Reg_Parse++;
1582 ret_val = chunk (NO_NEWLINE, &flags_local, &range_local);
1583 } else if (*Reg_Parse == '<') {
1584 Reg_Parse++;
1585 if (*Reg_Parse == '=') {
1586 Reg_Parse++;
1587 ret_val = chunk (POS_BEHIND_OPEN, &flags_local, &range_local);
1588 } else if (*Reg_Parse == '!') {
1589 Reg_Parse++;
1590 ret_val = chunk (NEG_BEHIND_OPEN, &flags_local, &range_local);
1591 } else {
1592 sprintf (Error_Text,
1593 "invalid look-behind syntax, \"(?<%c...)\"",
1594 *Reg_Parse);
1595
1596 REG_FAIL (Error_Text);
1597 }
1598 } else {
1599 sprintf (Error_Text,
1600 "invalid grouping syntax, \"(?%c...)\"",
1601 *Reg_Parse);
1602
1603 REG_FAIL (Error_Text);
1604 }
1605 } else { /* Normal capturing parentheses */
1606 ret_val = chunk (PAREN, &flags_local, &range_local);
1607 }
1608
1609 if (ret_val == NULL) return (NULL); /* Something went wrong. */
1610
1611 /* Add HAS_WIDTH flag if it was set by call to chunk. */
1612
1613 *flag_param |= flags_local & HAS_WIDTH;
1614 *range_param = range_local;
1615
1616 break;
1617
1618 case '\0':
1619 case '|':
1620 case ')':
1621 REG_FAIL ("internal error #3, `atom\'"); /* Supposed to be */
1622 /* caught earlier. */
1623 case '?':
1624 case '+':
1625 case '*':
1626 sprintf (Error_Text, "%c follows nothing", *(Reg_Parse - 1));
1627 REG_FAIL (Error_Text);
1628
1629 case '{':
1630 if (Enable_Counting_Quantifier) {
1631 REG_FAIL ("{m,n} follows nothing");
1632 } else {
1633 ret_val = emit_node (EXACTLY); /* Treat braces as literals. */
1634 emit_byte ('{');
1635 emit_byte ('\0');
1636 range_param->lower = 1;
1637 range_param->upper = 1;
1638 }
1639
1640 break;
1641
1642 case '[':
1643 {
1644 register unsigned int second_value;
1645 register unsigned int last_value;
1646 unsigned char last_emit = 0;
1647
1648 /* Handle characters that can only occur at the start of a class. */
1649
1650 if (*Reg_Parse == '^') { /* Complement of range. */
1651 ret_val = emit_node (ANY_BUT);
1652 Reg_Parse++;
1653
1654 /* All negated classes include newline unless escaped with
1655 a "(?n)" switch. */
1656
1657 if (!Match_Newline) emit_byte ('\n');
1658 } else {
1659 ret_val = emit_node (ANY_OF);
1660 }
1661
1662 if (*Reg_Parse == ']' || *Reg_Parse == '-') {
1663 /* If '-' or ']' is the first character in a class,
1664 it is a literal character in the class. */
1665
1666 last_emit = *Reg_Parse;
1667 emit_byte (*Reg_Parse);
1668 Reg_Parse++;
1669 }
1670
1671 /* Handle the rest of the class characters. */
1672
1673 while (*Reg_Parse != '\0' && *Reg_Parse != ']') {
1674 if (*Reg_Parse == '-') { /* Process a range, e.g [a-z]. */
1675 Reg_Parse++;
1676
1677 if (*Reg_Parse == ']' || *Reg_Parse == '\0') {
1678 /* If '-' is the last character in a class it is a literal
1679 character. If `Reg_Parse' points to the end of the
1680 regex string, an error will be generated later. */
1681
1682 emit_byte ('-');
1683 last_emit = '-';
1684 } else {
1685 /* We must get the range starting character value from the
1686 emitted code since it may have been an escaped
1687 character. `second_value' is set one larger than the
1688 just emitted character value. This is done since
1689 `second_value' is used as the start value for the loop
1690 that emits the values in the range. Since we have
1691 already emitted the first character of the class, we do
1692 not want to emit it again. */
1693
1694 second_value = ((unsigned int) last_emit) + 1;
1695
1696 if (*Reg_Parse == '\\') {
1697 /* Handle escaped characters within a class range.
1698 Specifically disallow shortcut escapes as the end of
1699 a class range. To allow this would be ambiguous
1700 since shortcut escapes represent a set of characters,
1701 and it would not be clear which character of the
1702 class should be treated as the "last" character. */
1703
1704 Reg_Parse++;
1705
1706 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1707 last_value = (unsigned int) test;
1708 } else if ((test = literal_escape (*Reg_Parse))) {
1709 last_value = (unsigned int) test;
1710 } else if (shortcut_escape (*Reg_Parse,
1711 NULL,
1712 CHECK_CLASS_ESCAPE)) {
1713 sprintf (Error_Text,
1714 "\\%c is not allowed as range operand",
1715 *Reg_Parse);
1716
1717 REG_FAIL (Error_Text);
1718 } else {
1719 sprintf (
1720 Error_Text,
1721 "\\%c is an invalid char class escape sequence",
1722 *Reg_Parse);
1723
1724 REG_FAIL (Error_Text);
1725 }
1726 } else {
1727 last_value = U_CHAR_AT (Reg_Parse);
1728 }
1729
1730 if (Is_Case_Insensitive) {
1731 second_value =
1732 (unsigned int) tolower ((int) second_value);
1733 last_value =
1734 (unsigned int) tolower ((int) last_value);
1735 }
1736
1737 /* For case insensitive, something like [A-_] will
1738 generate an error here since ranges are converted to
1739 lower case. */
1740
1741 if (second_value - 1 > last_value) {
1742 REG_FAIL ("invalid [] range");
1743 }
1744
1745 /* If only one character in range (e.g [a-a]) then this
1746 loop is not run since the first character of any range
1747 was emitted by the previous iteration of while loop. */
1748
1749 for (; second_value <= last_value; second_value++) {
1750 emit_class_byte (second_value);
1751 }
1752
1753 last_emit = (unsigned char) last_value;
1754
1755 Reg_Parse++;
1756
1757 } /* End class character range code. */
1758 } else if (*Reg_Parse == '\\') {
1759 Reg_Parse++;
1760
1761 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse)) != '\0') {
1762 emit_class_byte (test);
1763
1764 last_emit = test;
1765 } else if ((test = literal_escape (*Reg_Parse)) != '\0') {
1766 emit_byte (test);
1767 last_emit = test;
1768 } else if (shortcut_escape (*Reg_Parse,
1769 NULL,
1770 CHECK_CLASS_ESCAPE)) {
1771
1772 if (*(Reg_Parse + 1) == '-') {
1773 /* Specifically disallow shortcut escapes as the start
1774 of a character class range (see comment above.) */
1775
1776 sprintf (Error_Text,
1777 "\\%c not allowed as range operand",
1778 *Reg_Parse);
1779
1780 REG_FAIL (Error_Text);
1781 } else {
1782 /* Emit the bytes that are part of the shortcut
1783 escape sequence's range (e.g. \d = 0123456789) */
1784
1785 shortcut_escape (*Reg_Parse, NULL, EMIT_CLASS_BYTES);
1786 }
1787 } else {
1788 sprintf (Error_Text,
1789 "\\%c is an invalid char class escape sequence",
1790 *Reg_Parse);
1791
1792 REG_FAIL (Error_Text);
1793 }
1794
1795 Reg_Parse++;
1796
1797 /* End of class escaped sequence code */
1798 } else {
1799 emit_class_byte (*Reg_Parse); /* Ordinary class character. */
1800
1801 last_emit = *Reg_Parse;
1802 Reg_Parse++;
1803 }
1804 } /* End of while (*Reg_Parse != '\0' && *Reg_Parse != ']') */
1805
1806 if (*Reg_Parse != ']') REG_FAIL ("missing right \']\'");
1807
1808 emit_byte('\0');
1809
1810 /* NOTE: it is impossible to specify an empty class. This is
1811 because [] would be interpreted as "begin character class"
1812 followed by a literal ']' character and no "end character class"
1813 delimiter (']'). Because of this, it is always safe to assume
1814 that a class HAS_WIDTH. */
1815
1816 Reg_Parse++;
1817 *flag_param |= HAS_WIDTH | SIMPLE;
1818 range_param->lower = 1;
1819 range_param->upper = 1;
1820 }
1821
1822 break; /* End of character class code. */
1823
1824 case '\\':
1825 /* Force Error_Text to have a length of zero. This way we can tell if
1826 either of the calls to shortcut_escape() or back_ref() fill
1827 Error_Text with an error message. */
1828
1829 Error_Text [0] = '\0';
1830
1831 if ((ret_val = shortcut_escape (*Reg_Parse, flag_param, EMIT_NODE))) {
1832
1833 Reg_Parse++;
1834 range_param->lower = 1;
1835 range_param->upper = 1;
1836 break;
1837
1838 } else if ((ret_val = back_ref (Reg_Parse, flag_param, EMIT_NODE))) {
1839 /* Can't make any assumptions about a back-reference as to SIMPLE
1840 or HAS_WIDTH. For example (^|<) is neither simple nor has
1841 width. So we don't flip bits in flag_param here. */
1842
1843 Reg_Parse++;
1844 /* Back-references always have an unknown length */
1845 range_param->lower = -1;
1846 range_param->upper = -1;
1847 break;
1848 }
1849
1850 if (strlen (Error_Text) > 0) REG_FAIL (Error_Text);
1851
1852 /* At this point it is apparent that the escaped character is not a
1853 shortcut escape or back-reference. Back up one character to allow
1854 the default code to include it as an ordinary character. */
1855
1856 /* Fall through to Default case to handle literal escapes and numeric
1857 escapes. */
1858
1859 default:
1860 Reg_Parse--; /* If we fell through from the above code, we are now
1861 pointing at the back slash (\) character. */
1862 {
1863 unsigned char *parse_save;
1864 int len = 0;
1865
1866 if (Is_Case_Insensitive) {
1867 ret_val = emit_node (SIMILAR);
1868 } else {
1869 ret_val = emit_node (EXACTLY);
1870 }
1871
1872 /* Loop until we find a meta character, shortcut escape, back
1873 reference, or end of regex string. */
1874
1875 for (; *Reg_Parse != '\0' &&
1876 !strchr ((char *) Meta_Char, (int) *Reg_Parse);
1877 len++) {
1878
1879 /* Save where we are in case we have to back
1880 this character out. */
1881
1882 parse_save = Reg_Parse;
1883
1884 if (*Reg_Parse == '\\') {
1885 Reg_Parse++; /* Point to escaped character */
1886
1887 Error_Text [0] = '\0'; /* See comment above. */
1888
1889 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1890 if (Is_Case_Insensitive) {
1891 emit_byte (tolower (test));
1892 } else {
1893 emit_byte (test);
1894 }
1895 } else if ((test = literal_escape (*Reg_Parse))) {
1896 emit_byte (test);
1897 } else if (back_ref (Reg_Parse, NULL, CHECK_ESCAPE)) {
1898 /* Leave back reference for next `atom' call */
1899
1900 Reg_Parse--; break;
1901 } else if (shortcut_escape (*Reg_Parse, NULL, CHECK_ESCAPE)) {
1902 /* Leave shortcut escape for next `atom' call */
1903
1904 Reg_Parse--; break;
1905 } else {
1906 if (strlen (Error_Text) == 0) {
1907 /* None of the above calls generated an error message
1908 so generate our own here. */
1909
1910 sprintf (Error_Text,
1911 "\\%c is an invalid escape sequence",
1912 *Reg_Parse);
1913 }
1914
1915 REG_FAIL (Error_Text);
1916 }
1917
1918 Reg_Parse++;
1919 } else {
1920 /* Ordinary character */
1921
1922 if (Is_Case_Insensitive) {
1923 emit_byte (tolower (*Reg_Parse));
1924 } else {
1925 emit_byte (*Reg_Parse);
1926 }
1927
1928 Reg_Parse++;
1929 }
1930
1931 /* If next regex token is a quantifier (?, +. *, or {m,n}) and
1932 our EXACTLY node so far is more than one character, leave the
1933 last character to be made into an EXACTLY node one character
1934 wide for the multiplier to act on. For example 'abcd* would
1935 have an EXACTLY node with an 'abc' operand followed by a STAR
1936 node followed by another EXACTLY node with a 'd' operand. */
1937
1938 if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
1939 Reg_Parse = parse_save; /* Point to previous regex token. */
1940
1941 if (Code_Emit_Ptr == &Compute_Size) {
1942 Reg_Size--;
1943 } else {
1944 Code_Emit_Ptr--; /* Write over previously emitted byte. */
1945 }
1946
1947 break;
1948 }
1949 }
1950
1951 if (len <= 0) REG_FAIL ("internal error #4, `atom\'");
1952
1953 *flag_param |= HAS_WIDTH;
1954
1955 if (len == 1) *flag_param |= SIMPLE;
1956
1957 range_param->lower = len;
1958 range_param->upper = len;
1959
1960 emit_byte ('\0');
1961 }
1962 } /* END switch (*Reg_Parse++) */
1963
1964 return (ret_val);
1965 }
1966
1967 /*----------------------------------------------------------------------*
1968 * emit_node
1969 *
1970 * Emit (if appropriate) the op code for a regex node atom.
1971 *
1972 * The NEXT pointer is initialized to NULL.
1973 *
1974 * Returns a pointer to the START of the emitted node.
1975 *----------------------------------------------------------------------*/
1976
emit_node(int op_code)1977 static unsigned char * emit_node (int op_code) {
1978
1979 register unsigned char *ret_val;
1980 register unsigned char *ptr;
1981
1982 ret_val = Code_Emit_Ptr; /* Return address of start of node */
1983
1984 if (ret_val == &Compute_Size) {
1985 Reg_Size += NODE_SIZE;
1986 } else {
1987 ptr = ret_val;
1988 *ptr++ = (unsigned char) op_code;
1989 *ptr++ = '\0'; /* Null "NEXT" pointer. */
1990 *ptr++ = '\0';
1991
1992 Code_Emit_Ptr = ptr;
1993 }
1994
1995 return (ret_val);
1996 }
1997
1998 /*----------------------------------------------------------------------*
1999 * emit_byte
2000 *
2001 * Emit (if appropriate) a byte of code (usually part of an operand.)
2002 *----------------------------------------------------------------------*/
2003
emit_byte(unsigned char c)2004 static void emit_byte (unsigned char c) {
2005
2006 if (Code_Emit_Ptr == &Compute_Size) {
2007 Reg_Size++;
2008 } else {
2009 *Code_Emit_Ptr++ = c;
2010 }
2011 }
2012
2013 /*----------------------------------------------------------------------*
2014 * emit_class_byte
2015 *
2016 * Emit (if appropriate) a byte of code (usually part of a character
2017 * class operand.)
2018 *----------------------------------------------------------------------*/
2019
emit_class_byte(unsigned char c)2020 static void emit_class_byte (unsigned char c) {
2021
2022 if (Code_Emit_Ptr == &Compute_Size) {
2023 Reg_Size++;
2024
2025 if (Is_Case_Insensitive && isalpha (c)) Reg_Size++;
2026 } else if (Is_Case_Insensitive && isalpha (c)) {
2027 /* For case insensitive character classes, emit both upper and lower case
2028 versions of alphabetical characters. */
2029
2030 *Code_Emit_Ptr++ = tolower (c);
2031 *Code_Emit_Ptr++ = toupper (c);
2032 } else {
2033 *Code_Emit_Ptr++ = c;
2034 }
2035 }
2036
2037 /*----------------------------------------------------------------------*
2038 * emit_special
2039 *
2040 * Emit nodes that need special processing.
2041 *----------------------------------------------------------------------*/
2042
emit_special(unsigned char op_code,unsigned long test_val,int index)2043 static unsigned char * emit_special (
2044 unsigned char op_code,
2045 unsigned long test_val,
2046 int index) {
2047
2048 register unsigned char *ret_val = &Compute_Size;
2049 register unsigned char *ptr;
2050
2051 if (Code_Emit_Ptr == &Compute_Size) {
2052 switch (op_code) {
2053 case POS_BEHIND_OPEN:
2054 case NEG_BEHIND_OPEN:
2055 Reg_Size += LENGTH_SIZE; /* Length of the look-behind match */
2056 Reg_Size += NODE_SIZE; /* Make room for the node */
2057 break;
2058
2059 case TEST_COUNT:
2060 Reg_Size += NEXT_PTR_SIZE; /* Make room for a test value. */
2061
2062 case INC_COUNT:
2063 Reg_Size += INDEX_SIZE; /* Make room for an index value. */
2064
2065 default:
2066 Reg_Size += NODE_SIZE; /* Make room for the node. */
2067 }
2068 } else {
2069 ret_val = emit_node (op_code); /* Return the address for start of node. */
2070 ptr = Code_Emit_Ptr;
2071
2072 if (op_code == INC_COUNT || op_code == TEST_COUNT) {
2073 *ptr++ = (unsigned char) index;
2074
2075 if (op_code == TEST_COUNT) {
2076 *ptr++ = PUT_OFFSET_L (test_val);
2077 *ptr++ = PUT_OFFSET_R (test_val);
2078 }
2079 } else if (op_code == POS_BEHIND_OPEN || op_code == NEG_BEHIND_OPEN) {
2080 *ptr++ = PUT_OFFSET_L (test_val);
2081 *ptr++ = PUT_OFFSET_R (test_val);
2082 *ptr++ = PUT_OFFSET_L (test_val);
2083 *ptr++ = PUT_OFFSET_R (test_val);
2084 }
2085
2086 Code_Emit_Ptr = ptr;
2087 }
2088
2089 return (ret_val);
2090 }
2091
2092 /*----------------------------------------------------------------------*
2093 * insert
2094 *
2095 * Insert a node in front of already emitted node(s). Means relocating
2096 * the operand. Code_Emit_Ptr points one byte past the just emitted
2097 * node and operand. The parameter `insert_pos' points to the location
2098 * where the new node is to be inserted.
2099 *----------------------------------------------------------------------*/
2100
insert(unsigned char op,unsigned char * insert_pos,long min,long max,int index)2101 static unsigned char * insert (
2102 unsigned char op,
2103 unsigned char *insert_pos,
2104 long min,
2105 long max,
2106 int index) {
2107
2108 register unsigned char *src;
2109 register unsigned char *dst;
2110 unsigned char *place;
2111 int insert_size = NODE_SIZE;
2112
2113 if (op == BRACE || op == LAZY_BRACE) {
2114 /* Make room for the min and max values. */
2115
2116 insert_size += (2 * NEXT_PTR_SIZE);
2117 } else if (op == INIT_COUNT) {
2118 /* Make room for an index value . */
2119
2120 insert_size += INDEX_SIZE;
2121 }
2122
2123 if (Code_Emit_Ptr == &Compute_Size) {
2124 Reg_Size += insert_size;
2125 return &Compute_Size;
2126 }
2127
2128 src = Code_Emit_Ptr;
2129 Code_Emit_Ptr += insert_size;
2130 dst = Code_Emit_Ptr;
2131
2132 /* Relocate the existing emitted code to make room for the new node. */
2133
2134 while (src > insert_pos) *--dst = *--src;
2135
2136 place = insert_pos; /* Where operand used to be. */
2137 *place++ = op; /* Inserted operand. */
2138 *place++ = '\0'; /* NEXT pointer for inserted operand. */
2139 *place++ = '\0';
2140
2141 if (op == BRACE || op == LAZY_BRACE) {
2142 *place++ = PUT_OFFSET_L (min);
2143 *place++ = PUT_OFFSET_R (min);
2144
2145 *place++ = PUT_OFFSET_L (max);
2146 *place++ = PUT_OFFSET_R (max);
2147 } else if (op == INIT_COUNT) {
2148 *place++ = (unsigned char) index;
2149 }
2150
2151 return place; /* Return a pointer to the start of the code moved. */
2152 }
2153
2154 /*----------------------------------------------------------------------*
2155 * tail - Set the next-pointer at the end of a node chain.
2156 *----------------------------------------------------------------------*/
2157
tail(unsigned char * search_from,unsigned char * point_to)2158 static void tail (unsigned char *search_from, unsigned char *point_to) {
2159
2160 register unsigned char *scan;
2161 register unsigned char *next;
2162 register int offset;
2163
2164 if (search_from == &Compute_Size) return;
2165
2166 /* Find the last node in the chain (node with a null NEXT pointer) */
2167
2168 scan = search_from;
2169
2170 for (;;) {
2171 next = next_ptr (scan);
2172
2173 if (!next) break;
2174
2175 scan = next;
2176 }
2177
2178 if (GET_OP_CODE (scan) == BACK) {
2179 offset = scan - point_to;
2180 } else {
2181 offset = point_to - scan;
2182 }
2183
2184 /* Set NEXT pointer */
2185
2186 *(scan + 1) = PUT_OFFSET_L (offset);
2187 *(scan + 2) = PUT_OFFSET_R (offset);
2188 }
2189
2190 /*--------------------------------------------------------------------*
2191 * offset_tail
2192 *
2193 * Perform a tail operation on (ptr + offset).
2194 *--------------------------------------------------------------------*/
2195
offset_tail(unsigned char * ptr,int offset,unsigned char * val)2196 static void offset_tail (unsigned char *ptr, int offset, unsigned char *val) {
2197
2198 if (ptr == &Compute_Size || ptr == NULL) return;
2199
2200 tail (ptr + offset, val);
2201 }
2202
2203 /*--------------------------------------------------------------------*
2204 * branch_tail
2205 *
2206 * Perform a tail operation on (ptr + offset) but only if `ptr' is a
2207 * BRANCH node.
2208 *--------------------------------------------------------------------*/
2209
branch_tail(unsigned char * ptr,int offset,unsigned char * val)2210 static void branch_tail (unsigned char *ptr, int offset, unsigned char *val) {
2211
2212 if (ptr == &Compute_Size || ptr == NULL ||GET_OP_CODE (ptr) != BRANCH) {
2213 return;
2214 }
2215
2216 tail (ptr + offset, val);
2217 }
2218
2219 /*--------------------------------------------------------------------*
2220 * shortcut_escape
2221 *
2222 * Implements convenient escape sequences that represent entire
2223 * character classes or special location assertions (similar to escapes
2224 * supported by Perl)
2225 * _
2226 * \d Digits [0-9] |
2227 * \D NOT a digit [^0-9] | (Examples
2228 * \l Letters [a-zA-Z] | at left
2229 * \L NOT a Letter [^a-zA-Z] | are
2230 * \s Whitespace [ \t\n\r\f\v] | for
2231 * \S NOT Whitespace [^ \t\n\r\f\v] | C
2232 * \w "Word" character [a-zA-Z0-9_] | Locale)
2233 * \W NOT a "Word" character [^a-zA-Z0-9_] _|
2234 *
2235 * \B Matches any character that is NOT a word-delimiter
2236 *
2237 * Codes for the "emit" parameter:
2238 *
2239 * EMIT_NODE
2240 * Emit a shortcut node. Shortcut nodes have an implied set of
2241 * class characters. This helps keep the compiled regex string
2242 * small.
2243 *
2244 * EMIT_CLASS_BYTES
2245 * Emit just the equivalent characters of the class. This makes
2246 * the escape usable from within a class, e.g. [a-fA-F\d]. Only
2247 * \d, \D, \s, \S, \w, and \W can be used within a class.
2248 *
2249 * CHECK_ESCAPE
2250 * Only verify that this is a valid shortcut escape.
2251 *
2252 * CHECK_CLASS_ESCAPE
2253 * Same as CHECK_ESCAPE but only allows characters valid within
2254 * a class.
2255 *
2256 *--------------------------------------------------------------------*/
2257
shortcut_escape(unsigned char c,int * flag_param,int emit)2258 static unsigned char * shortcut_escape (
2259 unsigned char c,
2260 int *flag_param,
2261 int emit) {
2262
2263 register unsigned char *class = NULL;
2264 static unsigned char *codes = (unsigned char *) "ByYdDlLsSwW";
2265 unsigned char *ret_val = (unsigned char *) 1; /* Assume success. */
2266 unsigned char *valid_codes;
2267
2268 if (emit == EMIT_CLASS_BYTES || emit == CHECK_CLASS_ESCAPE) {
2269 valid_codes = codes + 3; /* \B, \y and \Y are not allowed in classes */
2270 } else {
2271 valid_codes = codes;
2272 }
2273
2274 if (!strchr ((char *) valid_codes, (int) c)) {
2275 return NULL; /* Not a valid shortcut escape sequence */
2276 } else if (emit == CHECK_ESCAPE || emit == CHECK_CLASS_ESCAPE) {
2277 return ret_val; /* Just checking if this is a valid shortcut escape. */
2278 }
2279
2280 switch (c) {
2281 case 'd':
2282 case 'D':
2283 if (emit == EMIT_CLASS_BYTES) {
2284 class = ASCII_Digits;
2285 } else if (emit == EMIT_NODE) {
2286 ret_val = (islower (c) ? emit_node (DIGIT)
2287 : emit_node (NOT_DIGIT));
2288 }
2289
2290 break;
2291
2292 case 'l':
2293 case 'L':
2294 if (emit == EMIT_CLASS_BYTES) {
2295 class = Letter_Char;
2296 } else if (emit == EMIT_NODE) {
2297 ret_val = (islower (c) ? emit_node (LETTER)
2298 : emit_node (NOT_LETTER));
2299 }
2300
2301 break;
2302
2303 case 's':
2304 case 'S':
2305 if (emit == EMIT_CLASS_BYTES) {
2306 if (Match_Newline) emit_byte ('\n');
2307
2308 class = White_Space;
2309 } else if (emit == EMIT_NODE) {
2310 if (Match_Newline) {
2311 ret_val = (islower (c) ? emit_node (SPACE_NL)
2312 : emit_node (NOT_SPACE_NL));
2313 } else {
2314 ret_val = (islower (c) ? emit_node (SPACE)
2315 : emit_node (NOT_SPACE));
2316 }
2317 }
2318
2319 break;
2320
2321 case 'w':
2322 case 'W':
2323 if (emit == EMIT_CLASS_BYTES) {
2324 class = Word_Char;
2325 } else if (emit == EMIT_NODE) {
2326 ret_val = (islower (c) ? emit_node (WORD_CHAR)
2327 : emit_node (NOT_WORD_CHAR));
2328 }
2329
2330 break;
2331
2332 /* Since the delimiter table is not available at regex compile time \B,
2333 \Y and \Y can only generate a node. At run time, the delimiter table
2334 will be available for these nodes to use. */
2335
2336 case 'y':
2337
2338 if (emit == EMIT_NODE) {
2339 ret_val = emit_node (IS_DELIM);
2340 } else {
2341 REG_FAIL ("internal error #5 `shortcut_escape\'");
2342 }
2343
2344 break;
2345
2346 case 'Y':
2347
2348 if (emit == EMIT_NODE) {
2349 ret_val = emit_node (NOT_DELIM);
2350 } else {
2351 REG_FAIL ("internal error #6 `shortcut_escape\'");
2352 }
2353
2354 break;
2355
2356 case 'B':
2357
2358 if (emit == EMIT_NODE) {
2359 ret_val = emit_node (NOT_BOUNDARY);
2360 } else {
2361 REG_FAIL ("internal error #7 `shortcut_escape\'");
2362 }
2363
2364 break;
2365
2366 default:
2367 /* We get here if there isn't a case for every character in
2368 the string "codes" */
2369
2370 REG_FAIL ("internal error #8 `shortcut_escape\'");
2371 }
2372
2373 if (emit == EMIT_NODE && c != 'B') {
2374 *flag_param |= (HAS_WIDTH | SIMPLE);
2375 }
2376
2377 if (class) {
2378 /* Emit bytes within a character class operand. */
2379
2380 while (*class != '\0') {
2381 emit_byte (*class++);
2382 }
2383 }
2384
2385 return ret_val;
2386 }
2387
2388 /*--------------------------------------------------------------------*
2389 * numeric_escape
2390 *
2391 * Implements hex and octal numeric escape sequence syntax.
2392 *
2393 * Hexadecimal Escape: \x## Max of two digits Must have leading 'x'.
2394 * Octal Escape: \0### Max of three digits and not greater
2395 * than 377 octal. Must have leading zero.
2396 *
2397 * Returns the actual character value or NULL if not a valid hex or
2398 * octal escape. REG_FAIL is called if \x0, \x00, \0, \00, \000, or
2399 * \0000 is specified.
2400 *--------------------------------------------------------------------*/
2401
numeric_escape(unsigned char c,unsigned char ** parse)2402 static unsigned char numeric_escape (
2403 unsigned char c,
2404 unsigned char **parse) {
2405
2406 static unsigned char digits [] = "fedcbaFEDCBA9876543210";
2407
2408 static unsigned int digit_val [] = {
2409 15, 14, 13, 12, 11, 10, /* Lower case Hex digits */
2410 15, 14, 13, 12, 11, 10, /* Upper case Hex digits */
2411 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* Decimal Digits */
2412
2413 register unsigned char *scan;
2414 register unsigned char *pos_ptr;
2415 register unsigned char *digit_str;
2416 unsigned int value = 0;
2417 unsigned int radix = 8;
2418 int width = 3; /* Can not be bigger than \0377 */
2419 int pos_delta = 14;
2420 int i, pos;
2421
2422 switch (c) {
2423 case '0':
2424 digit_str = digits + pos_delta; /* Only use Octal digits, i.e. 0-7. */
2425
2426 break;
2427
2428 case 'x':
2429 case 'X':
2430 width = 2; /* Can not be bigger than \0377 */
2431 radix = 16;
2432 pos_delta = 0;
2433 digit_str = digits; /* Use all of the digit characters. */
2434
2435 break;
2436
2437 default:
2438 return ('\0'); /* Not a numeric escape */
2439 }
2440
2441 scan = *parse; scan++; /* Only change *parse on success. */
2442
2443 pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2444
2445 for (i = 0; pos_ptr != NULL && (i < width); i++) {
2446 pos = (pos_ptr - digit_str) + pos_delta;
2447 value = (value * radix) + digit_val [pos];
2448
2449 /* If this digit makes the value over 255, treat this digit as a literal
2450 character instead of part of the numeric escape. For example, \0777
2451 will be processed as \077 (an 'M') and a literal '7' character, NOT
2452 511 decimal which is > 255. */
2453
2454 if (value > 255) {
2455 /* Back out calculations for last digit processed. */
2456
2457 value -= digit_val [pos];
2458 value /= radix;
2459
2460 break; /* Note that scan will not be incremented and still points to
2461 the digit that caused overflow. It will be decremented by
2462 the "else" below to point to the last character that is
2463 considered to be part of the octal escape. */
2464 }
2465
2466 scan++;
2467 pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2468 }
2469
2470 /* Handle the case of "\0" i.e. trying to specify a NULL character. */
2471
2472 if (value == 0) {
2473 if (c == '0') {
2474 sprintf (Error_Text, "\\00 is an invalid octal escape");
2475 } else {
2476 sprintf (Error_Text, "\\%c0 is an invalid hexadecimal escape", c);
2477 }
2478 } else {
2479 /* Point to the last character of the number on success. */
2480
2481 scan--;
2482 *parse = scan;
2483 }
2484
2485 return (unsigned char) value;
2486 }
2487
2488 /*--------------------------------------------------------------------*
2489 * literal_escape
2490 *
2491 * Recognize escaped literal characters (prefixed with backslash),
2492 * and translate them into the corresponding character.
2493 *
2494 * Returns the proper character value or NULL if not a valid literal
2495 * escape.
2496 *--------------------------------------------------------------------*/
2497
literal_escape(unsigned char c)2498 static unsigned char literal_escape (unsigned char c) {
2499
2500 static unsigned char valid_escape [] = {
2501 'a', 'b',
2502 'e',
2503 'f', 'n', 'r', 't', 'v', '(', ')', '-', '[', ']',
2504 '<', '>', '{', '}', '.', '\\', '|', '^', '$', '*',
2505 '+', '?', '&', '\0'
2506 };
2507
2508 static unsigned char value [] = {
2509 '\a', '\b',
2510 #ifdef EBCDIC_CHARSET
2511 0x27, /* Escape character in IBM's EBCDIC character set. */
2512 #else
2513 0x1B, /* Escape character in ASCII character set. */
2514 #endif
2515 '\f', '\n', '\r', '\t', '\v', '(', ')', '-', '[', ']',
2516 '<', '>', '{', '}', '.', '\\', '|', '^', '$', '*',
2517 '+', '?', '&', '\0'
2518 };
2519
2520 int i;
2521
2522 for (i = 0; valid_escape [i] != '\0'; i++) {
2523 if (c == valid_escape [i]) return value [i];
2524 }
2525
2526 return '\0';
2527 }
2528
2529 /*--------------------------------------------------------------------*
2530 * back_ref
2531 *
2532 * Process a request to match a previous parenthesized thing.
2533 * Parenthetical entities are numbered beginning at 1 by counting
2534 * opening parentheses from left to to right. \0 would represent
2535 * whole match, but would confuse numeric_escape as an octal escape,
2536 * so it is forbidden.
2537 *
2538 * Constructs of the form \~1, \~2, etc. are cross-regex back
2539 * references and are used in syntax highlighting patterns to match
2540 * text previously matched by another regex. *** IMPLEMENT LATER ***
2541 *--------------------------------------------------------------------*/
2542
back_ref(unsigned char * c,int * flag_param,int emit)2543 static unsigned char * back_ref (
2544 unsigned char *c,
2545 int *flag_param,
2546 int emit) {
2547
2548 int paren_no, c_offset = 0, is_cross_regex = 0;
2549
2550 unsigned char *ret_val;
2551
2552 /* Implement cross regex backreferences later. */
2553
2554 /* if (*c == (unsigned char) ('~')) {
2555 c_offset++;
2556 is_cross_regex++;
2557 } */
2558
2559 paren_no = (int) (*(c + c_offset) - (unsigned char) ('0'));
2560
2561 if (!isdigit (*(c + c_offset)) || /* Only \1, \2, ... \9 are supported. */
2562 paren_no == 0) { /* Should be caught by numeric_escape. */
2563
2564 return NULL;
2565 }
2566
2567 /* Make sure parentheses for requested back-reference are complete. */
2568
2569 if (!is_cross_regex && !TEST_BIT (Closed_Parens, paren_no)) {
2570 sprintf (Error_Text, "\\%d is an illegal back reference", paren_no);
2571 return NULL;
2572 }
2573
2574 if (emit == EMIT_NODE) {
2575 if (is_cross_regex) {
2576 Reg_Parse++; /* Skip past the '~' in a cross regex back reference.
2577 We only do this if we are emitting code. */
2578
2579 if (Is_Case_Insensitive) {
2580 ret_val = emit_node (X_REGEX_BR_CI);
2581 } else {
2582 ret_val = emit_node (X_REGEX_BR);
2583 }
2584 } else {
2585 if (Is_Case_Insensitive) {
2586 ret_val = emit_node (BACK_REF_CI);
2587 } else {
2588 ret_val = emit_node (BACK_REF);
2589 }
2590 }
2591
2592 emit_byte ((unsigned char) paren_no);
2593
2594 if (is_cross_regex || TEST_BIT (Paren_Has_Width, paren_no)) {
2595 *flag_param |= HAS_WIDTH;
2596 }
2597 } else if (emit == CHECK_ESCAPE) {
2598 ret_val = (unsigned char *) 1;
2599 } else {
2600 ret_val = NULL;
2601 }
2602
2603 return ret_val;
2604 }
2605
2606 /*======================================================================*
2607 * Regex execution related code
2608 *======================================================================*/
2609
2610 /* Global work variables for `ExecRE'. */
2611
2612 static unsigned char *Reg_Input; /* String-input pointer. */
2613 static unsigned char *Start_Of_String; /* Beginning of input, for ^ */
2614 /* and < checks. */
2615 static unsigned char *End_Of_String; /* Logical end of input (if
2616 supplied, till \0 otherwise) */
2617 static unsigned char *Look_Behind_To; /* Position till were look behind
2618 can safely check back */
2619 static unsigned char **Start_Ptr_Ptr; /* Pointer to `startp' array. */
2620 static unsigned char **End_Ptr_Ptr; /* Ditto for `endp'. */
2621 static unsigned char *Extent_Ptr_FW; /* Forward extent pointer */
2622 static unsigned char *Extent_Ptr_BW; /* Backward extent pointer */
2623 static unsigned char *Back_Ref_Start [10]; /* Back_Ref_Start [0] and */
2624 static unsigned char *Back_Ref_End [10]; /* Back_Ref_End [0] are not */
2625 /* used. This simplifies */
2626 /* indexing. */
2627 /*
2628 * Measured recursion limits:
2629 * Linux: +/- 40 000 (up to 110 000)
2630 * Solaris: +/- 85 000
2631 * HP-UX 11: +/- 325 000
2632 *
2633 * So 10 000 ought to be safe.
2634 */
2635 #define REGEX_RECURSION_LIMIT 10000
2636 static int Recursion_Count; /* Recursion counter */
2637 static int Recursion_Limit_Exceeded; /* Recursion limit exceeded flag */
2638
2639 #define AT_END_OF_STRING(X) (*(X) == (unsigned char)'\0' ||\
2640 (End_Of_String != NULL && (X) >= End_Of_String))
2641
2642 /* static regexp *Cross_Regex_Backref; */
2643
2644 static int Prev_Is_BOL;
2645 static int Succ_Is_EOL;
2646 static int Prev_Is_Delim;
2647 static int Succ_Is_Delim;
2648
2649 /* Define a pointer to an array to hold general (...){m,n} counts. */
2650
2651 typedef struct brace_counts {
2652 unsigned long count [1]; /* More unwarranted chumminess with compiler. */
2653 } brace_counts;
2654
2655 static struct brace_counts *Brace;
2656
2657 /* Default table for determining whether a character is a word delimiter. */
2658
2659 static unsigned char Default_Delimiters [UCHAR_MAX+1] = {0};
2660
2661 static unsigned char *Current_Delimiters; /* Current delimiter table */
2662
2663 /* Forward declarations of functions used by `ExecRE' */
2664
2665 static int attempt (regexp *, unsigned char *);
2666 static int match (unsigned char *, int *);
2667 static unsigned long greedy (unsigned char *, long);
2668 static void adjustcase (unsigned char *, int, unsigned char);
2669 static unsigned char * makeDelimiterTable (unsigned char *, unsigned char *);
2670
2671 /*
2672 * ExecRE - match a `regexp' structure against a string
2673 *
2674 * If `end' is non-NULL, matches may not BEGIN past end, but may extend past
2675 * it. If reverse is true, `end' must be specified, and searching begins at
2676 * `end'. "isbol" should be set to true if the beginning of the string is the
2677 * actual beginning of a line (since `ExecRE' can't look backwards from the
2678 * beginning to find whether there was a newline before). Likewise, "isbow"
2679 * asks whether the string is preceded by a word delimiter. End of string is
2680 * always treated as a word and line boundary (there may be cases where it
2681 * shouldn't be, in which case, this should be changed). "delimit" (if
2682 * non-null) specifies a null-terminated string of characters to be considered
2683 * word delimiters matching "<" and ">". if "delimit" is NULL, the default
2684 * delimiters (as set in SetREDefaultWordDelimiters) are used.
2685 * Look_behind_to indicates the position till where it is safe to
2686 * perform look-behind matches. If set, it should be smaller than or equal
2687 * to the start position of the search (pointed at by string). If it is NULL,
2688 * it defaults to the start position.
2689 * Finally, match_to indicates the logical end of the string, till where
2690 * matches are allowed to extend. Note that look-ahead patterns may look
2691 * past that boundary. If match_to is set to NULL, the terminating \0 is
2692 * assumed to correspond to the logical boundary. Match_to, if set, must be
2693 * larger than or equal to end, if set.
2694 */
2695
ExecRE(regexp * prog,const char * string,const char * end,int reverse,char prev_char,char succ_char,const char * delimiters,const char * look_behind_to,const char * match_to)2696 int ExecRE(regexp *prog, const char* string, const char* end, int reverse,
2697 char prev_char, char succ_char, const char* delimiters,
2698 const char* look_behind_to, const char* match_to)
2699 {
2700
2701 register unsigned char *str;
2702 unsigned char **s_ptr;
2703 unsigned char **e_ptr;
2704 int ret_val = 0;
2705 unsigned char tempDelimitTable [256];
2706 int i;
2707
2708 /* Check for valid parameters. */
2709
2710 if (prog == NULL || string == NULL) {
2711 reg_error ("NULL parameter to `ExecRE\'");
2712 goto SINGLE_RETURN;
2713 }
2714
2715 /* Check validity of program. */
2716
2717 if (U_CHAR_AT (prog->program) != MAGIC) {
2718 reg_error ("corrupted program");
2719 goto SINGLE_RETURN;
2720 }
2721
2722 s_ptr = (unsigned char **) prog->startp;
2723 e_ptr = (unsigned char **) prog->endp;
2724
2725 /* If caller has supplied delimiters, make a delimiter table */
2726
2727 if (delimiters == NULL) {
2728 Current_Delimiters = Default_Delimiters;
2729 } else {
2730 Current_Delimiters = makeDelimiterTable (
2731 (unsigned char *) delimiters,
2732 (unsigned char *) tempDelimitTable);
2733 }
2734
2735 /* Remember the logical end of the string. */
2736
2737 End_Of_String = (unsigned char *) match_to;
2738
2739 if (end == NULL && reverse) {
2740 for (end = string; !AT_END_OF_STRING((unsigned char*)end); end++) ;
2741 succ_char = '\n';
2742 } else if (end == NULL) {
2743 succ_char = '\n';
2744 }
2745
2746 /* Initialize arrays used by shortcut_escape. */
2747
2748 if (!init_ansi_classes ()) goto SINGLE_RETURN;
2749
2750 /* Remember the beginning of the string for matching BOL */
2751
2752 Start_Of_String = (unsigned char *) string;
2753 Look_Behind_To = (unsigned char *) (look_behind_to?look_behind_to:string);
2754
2755 Prev_Is_BOL = ((prev_char == '\n') || (prev_char == '\0') ? 1 : 0);
2756 Succ_Is_EOL = ((succ_char == '\n') || (succ_char == '\0') ? 1 : 0);
2757 Prev_Is_Delim = (Current_Delimiters [(unsigned char)prev_char] ? 1 : 0);
2758 Succ_Is_Delim = (Current_Delimiters [(unsigned char)succ_char] ? 1 : 0);
2759
2760 Total_Paren = (int) (prog->program [1]);
2761 Num_Braces = (int) (prog->program [2]);
2762
2763 /* Reset the recursion detection flag */
2764 Recursion_Limit_Exceeded = 0;
2765
2766 /* Allocate memory for {m,n} construct counting variables if need be. */
2767
2768 if (Num_Braces > 0) {
2769 Brace =
2770 (brace_counts *) NEditMalloc (sizeof (brace_counts) * (size_t) Num_Braces);
2771
2772 if (Brace == NULL) {
2773 reg_error ("out of memory in `ExecRE\'");
2774 goto SINGLE_RETURN;
2775 }
2776 } else {
2777 Brace = NULL;
2778 }
2779
2780 /* Initialize the first nine (9) capturing parentheses start and end
2781 pointers to point to the start of the search string. This is to prevent
2782 crashes when later trying to reference captured parens that do not exist
2783 in the compiled regex. We only need to do the first nine since users
2784 can only specify \1, \2, ... \9. */
2785
2786 for (i = 9; i > 0; i--) {
2787 *s_ptr++ = (unsigned char *) string;
2788 *e_ptr++ = (unsigned char *) string;
2789 }
2790
2791 if (!reverse) { /* Forward Search */
2792 if (prog->anchor) {
2793 /* Search is anchored at BOL */
2794
2795 if (attempt (prog, (unsigned char *) string)) {
2796 ret_val = 1;
2797 goto SINGLE_RETURN;
2798 }
2799
2800 for (str = (unsigned char *) string;
2801 !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2802 str++) {
2803
2804 if (*str == '\n') {
2805 if (attempt (prog, str + 1)) {
2806 ret_val = 1;
2807 break;
2808 }
2809 }
2810 }
2811
2812 goto SINGLE_RETURN;
2813
2814 } else if (prog->match_start != '\0') {
2815 /* We know what char match must start with. */
2816
2817 for (str = (unsigned char *) string;
2818 !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2819 str++) {
2820
2821 if (*str == (unsigned char)prog->match_start) {
2822 if (attempt (prog, str)) {
2823 ret_val = 1;
2824 break;
2825 }
2826 }
2827 }
2828
2829 goto SINGLE_RETURN;
2830 } else {
2831 /* General case */
2832
2833 for (str = (unsigned char *) string;
2834 !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2835 str++) {
2836
2837 if (attempt (prog, str)) {
2838 ret_val = 1;
2839 break;
2840 }
2841 }
2842
2843 /* Beware of a single $ matching \0 */
2844 if (!Recursion_Limit_Exceeded && !ret_val && AT_END_OF_STRING(str) && str != (unsigned char *) end) {
2845 if (attempt (prog, str)) {
2846 ret_val = 1;
2847 }
2848 }
2849
2850 goto SINGLE_RETURN;
2851 }
2852 } else { /* Search reverse, same as forward, but loops run backward */
2853
2854 /* Make sure that we don't start matching beyond the logical end */
2855 if (End_Of_String != NULL && (unsigned char*)end > End_Of_String) {
2856 end = (const char*)End_Of_String;
2857 }
2858
2859 if (prog->anchor) {
2860 /* Search is anchored at BOL */
2861
2862 for (str = (unsigned char *)(end - 1);
2863 str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2864 str--) {
2865
2866 if (*str == '\n') {
2867 if (attempt (prog, str + 1)) {
2868 ret_val = 1;
2869 goto SINGLE_RETURN;
2870 }
2871 }
2872 }
2873
2874 if (!Recursion_Limit_Exceeded && attempt (prog, (unsigned char *) string)) {
2875 ret_val = 1;
2876 goto SINGLE_RETURN;
2877 }
2878
2879 goto SINGLE_RETURN;
2880 } else if (prog->match_start != '\0') {
2881 /* We know what char match must start with. */
2882
2883 for (str = (unsigned char *) end;
2884 str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2885 str--) {
2886
2887 if (*str == (unsigned char)prog->match_start) {
2888 if (attempt (prog, str)) {
2889 ret_val = 1;
2890 break;
2891 }
2892 }
2893 }
2894
2895 goto SINGLE_RETURN;
2896 } else {
2897 /* General case */
2898
2899 for (str = (unsigned char *) end;
2900 str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2901 str--) {
2902
2903 if (attempt (prog, str)) {
2904 ret_val = 1;
2905 break;
2906 }
2907 }
2908 }
2909 }
2910
2911 SINGLE_RETURN: if (Brace) free (Brace);
2912
2913 if (Recursion_Limit_Exceeded) return (0);
2914
2915 return (ret_val);
2916 }
2917
2918 /*--------------------------------------------------------------------*
2919 * init_ansi_classes
2920 *
2921 * Generate character class sets using locale aware ANSI C functions.
2922 *
2923 *--------------------------------------------------------------------*/
2924
init_ansi_classes(void)2925 static int init_ansi_classes (void) {
2926
2927 static int initialized = 0;
2928 static int underscore = (int) '_';
2929 int i, word_count, letter_count, space_count;
2930
2931 if (!initialized) {
2932 initialized = 1; /* Only need to generate character sets once. */
2933 word_count = 0;
2934 letter_count = 0;
2935 space_count = 0;
2936
2937 for (i = 1; i < (int)UCHAR_MAX; i++) {
2938 if (isalnum (i) || i == underscore) {
2939 Word_Char [word_count++] = (unsigned char) i;
2940 }
2941
2942 if (isalpha (i)) {
2943 Letter_Char [letter_count++] = (unsigned char) i;
2944 }
2945
2946 /* Note: Whether or not newline is considered to be whitespace is
2947 handled by switches within the original regex and is thus omitted
2948 here. */
2949
2950 if (isspace (i) && (i != (int) '\n')) {
2951 White_Space [space_count++] = (unsigned char) i;
2952 }
2953
2954 /* Make sure arrays are big enough. ("- 2" because of zero array
2955 origin and we need to leave room for the NULL terminator.) */
2956
2957 if (word_count > (ALNUM_CHAR_SIZE - 2) ||
2958 space_count > (WHITE_SPACE_SIZE - 2) ||
2959 letter_count > (ALNUM_CHAR_SIZE - 2)) {
2960
2961 reg_error ("internal error #9 `init_ansi_classes\'");
2962 return (0);
2963 }
2964 }
2965
2966 Word_Char [word_count] = '\0';
2967 Letter_Char [word_count] = '\0';
2968 White_Space [space_count] = '\0';
2969 }
2970
2971 return (1);
2972 }
2973
2974 /*----------------------------------------------------------------------*
2975 * attempt - try match at specific point, returns: 0 failure, 1 success
2976 *----------------------------------------------------------------------*/
2977
attempt(regexp * prog,unsigned char * string)2978 static int attempt (regexp *prog, unsigned char *string) {
2979
2980 register int i;
2981 register unsigned char **s_ptr;
2982 register unsigned char **e_ptr;
2983 int branch_index = 0; /* Must be set to zero ! */
2984
2985 Reg_Input = string;
2986 Start_Ptr_Ptr = (unsigned char **) prog->startp;
2987 End_Ptr_Ptr = (unsigned char **) prog->endp;
2988 s_ptr = (unsigned char **) prog->startp;
2989 e_ptr = (unsigned char **) prog->endp;
2990
2991 /* Reset the recursion counter. */
2992 Recursion_Count = 0;
2993
2994 /* Overhead due to capturing parentheses. */
2995
2996 Extent_Ptr_BW = string;
2997 Extent_Ptr_FW = NULL;
2998
2999 for (i = Total_Paren + 1; i > 0; i--) {
3000 *s_ptr++ = NULL;
3001 *e_ptr++ = NULL;
3002 }
3003
3004 if (match ((unsigned char *) (prog->program + REGEX_START_OFFSET),
3005 &branch_index)) {
3006 prog->startp [0] = (char *) string;
3007 prog->endp [0] = (char *) Reg_Input; /* <-- One char AFTER */
3008 prog->extentpBW = (char *) Extent_Ptr_BW; /* matched string! */
3009 prog->extentpFW = (char *) Extent_Ptr_FW;
3010 prog->top_branch = branch_index;
3011
3012 return (1);
3013 } else {
3014 return (0);
3015 }
3016 }
3017
3018 /*----------------------------------------------------------------------*
3019 * match - main matching routine
3020 *
3021 * Conceptually the strategy is simple: check to see whether the
3022 * current node matches, call self recursively to see whether the rest
3023 * matches, and then act accordingly. In practice we make some effort
3024 * to avoid recursion, in particular by going through "ordinary" nodes
3025 * (that don't need to know whether the rest of the match failed) by a
3026 * loop instead of by recursion. Returns 0 failure, 1 success.
3027 *----------------------------------------------------------------------*/
3028 #define MATCH_RETURN(X)\
3029 { --Recursion_Count; return (X); }
3030 #define CHECK_RECURSION_LIMIT\
3031 if (Recursion_Limit_Exceeded) MATCH_RETURN (0);
3032
match(unsigned char * prog,int * branch_index_param)3033 static int match (unsigned char *prog, int *branch_index_param) {
3034
3035 register unsigned char *scan; /* Current node. */
3036 unsigned char *next; /* Next node. */
3037 register int next_ptr_offset; /* Used by the NEXT_PTR () macro */
3038
3039 if (++Recursion_Count > REGEX_RECURSION_LIMIT) {
3040 if (!Recursion_Limit_Exceeded) /* Prevent duplicate errors */
3041 reg_error("recursion limit exceeded, please respecify expression");
3042 Recursion_Limit_Exceeded = 1;
3043 MATCH_RETURN (0);
3044 }
3045
3046
3047 scan = prog;
3048
3049 while (scan != NULL) {
3050 NEXT_PTR (scan, next);
3051
3052 switch (GET_OP_CODE (scan)) {
3053 case BRANCH:
3054 {
3055 register unsigned char *save;
3056 register int branch_index_local = 0;
3057
3058 if (GET_OP_CODE (next) != BRANCH) { /* No choice. */
3059 next = OPERAND (scan); /* Avoid recursion. */
3060 } else {
3061 do {
3062 save = Reg_Input;
3063
3064 if (match (OPERAND (scan), NULL))
3065 {
3066 if (branch_index_param)
3067 *branch_index_param = branch_index_local;
3068 MATCH_RETURN (1);
3069 }
3070
3071 CHECK_RECURSION_LIMIT
3072
3073 ++branch_index_local;
3074
3075 Reg_Input = save; /* Backtrack. */
3076 NEXT_PTR (scan, scan);
3077 } while (scan != NULL && GET_OP_CODE (scan) == BRANCH);
3078
3079 MATCH_RETURN (0); /* NOT REACHED */
3080 }
3081 }
3082
3083 break;
3084
3085 case EXACTLY:
3086 {
3087 register int len;
3088 register unsigned char *opnd;
3089
3090 opnd = OPERAND (scan);
3091
3092 /* Inline the first character, for speed. */
3093
3094 if (*opnd != *Reg_Input) MATCH_RETURN (0);
3095
3096 len = strlen ((char *) opnd);
3097
3098 if (End_Of_String != NULL && Reg_Input + len > End_Of_String) {
3099 MATCH_RETURN (0);
3100 }
3101
3102 if (len > 1 &&
3103 strncmp ((char *) opnd, (char *) Reg_Input, len) != 0) {
3104
3105 MATCH_RETURN (0);
3106 }
3107
3108 Reg_Input += len;
3109 }
3110
3111 break;
3112
3113 case SIMILAR:
3114 {
3115 register unsigned char *opnd;
3116 register unsigned char test;
3117
3118 opnd = OPERAND (scan);
3119
3120 /* Note: the SIMILAR operand was converted to lower case during
3121 regex compile. */
3122
3123 while ((test = *opnd++) != '\0') {
3124 if (AT_END_OF_STRING(Reg_Input) ||
3125 tolower (*Reg_Input++) != test) {
3126
3127 MATCH_RETURN (0);
3128 }
3129 }
3130 }
3131
3132 break;
3133
3134 case BOL: /* `^' (beginning of line anchor) */
3135 if (Reg_Input == Start_Of_String) {
3136 if (Prev_Is_BOL) break;
3137 } else if (*(Reg_Input - 1) == '\n') {
3138 break;
3139 }
3140
3141 MATCH_RETURN (0);
3142
3143 case EOL: /* `$' anchor matches end of line and end of string */
3144 if (*Reg_Input == '\n' || (AT_END_OF_STRING(Reg_Input) && Succ_Is_EOL)) {
3145 break;
3146 }
3147
3148 MATCH_RETURN (0);
3149
3150 case BOWORD: /* `<' (beginning of word anchor) */
3151 /* Check to see if the current character is not a delimiter
3152 and the preceding character is. */
3153 {
3154 int prev_is_delim;
3155 if (Reg_Input == Start_Of_String) {
3156 prev_is_delim = Prev_Is_Delim;
3157 } else {
3158 prev_is_delim = Current_Delimiters [ *(Reg_Input - 1) ];
3159 }
3160 if (prev_is_delim) {
3161 int current_is_delim;
3162 if (AT_END_OF_STRING(Reg_Input)) {
3163 current_is_delim = Succ_Is_Delim;
3164 } else {
3165 current_is_delim = Current_Delimiters [ *Reg_Input ];
3166 }
3167 if (!current_is_delim) break;
3168 }
3169 }
3170
3171 MATCH_RETURN (0);
3172
3173 case EOWORD: /* `>' (end of word anchor) */
3174 /* Check to see if the current character is a delimiter
3175 and the preceding character is not. */
3176 {
3177 int prev_is_delim;
3178 if (Reg_Input == Start_Of_String) {
3179 prev_is_delim = Prev_Is_Delim;
3180 } else {
3181 prev_is_delim = Current_Delimiters [ *(Reg_Input-1) ];
3182 }
3183 if (!prev_is_delim) {
3184 int current_is_delim;
3185 if (AT_END_OF_STRING(Reg_Input)) {
3186 current_is_delim = Succ_Is_Delim;
3187 } else {
3188 current_is_delim = Current_Delimiters [ *Reg_Input ];
3189 }
3190 if (current_is_delim) break;
3191 }
3192 }
3193
3194 MATCH_RETURN (0);
3195
3196 case NOT_BOUNDARY: /* \B (NOT a word boundary) */
3197 {
3198 int prev_is_delim;
3199 int current_is_delim;
3200 if (Reg_Input == Start_Of_String) {
3201 prev_is_delim = Prev_Is_Delim;
3202 } else {
3203 prev_is_delim = Current_Delimiters [ *(Reg_Input-1) ];
3204 }
3205 if (AT_END_OF_STRING(Reg_Input)) {
3206 current_is_delim = Succ_Is_Delim;
3207 } else {
3208 current_is_delim = Current_Delimiters [ *Reg_Input ];
3209 }
3210 if (!(prev_is_delim ^ current_is_delim)) break;
3211 }
3212
3213 MATCH_RETURN (0);
3214
3215 case IS_DELIM: /* \y (A word delimiter character.) */
3216 if (Current_Delimiters [ *Reg_Input ] &&
3217 !AT_END_OF_STRING(Reg_Input)) {
3218 Reg_Input++; break;
3219 }
3220
3221 MATCH_RETURN (0);
3222
3223 case NOT_DELIM: /* \Y (NOT a word delimiter character.) */
3224 if (!Current_Delimiters [ *Reg_Input ] &&
3225 !AT_END_OF_STRING(Reg_Input)) {
3226 Reg_Input++; break;
3227 }
3228
3229 MATCH_RETURN (0);
3230
3231 case WORD_CHAR: /* \w (word character; alpha-numeric or underscore) */
3232 if ((isalnum ((int) *Reg_Input) || *Reg_Input == '_') &&
3233 !AT_END_OF_STRING(Reg_Input)) {
3234 Reg_Input++; break;
3235 }
3236
3237 MATCH_RETURN (0);
3238
3239 case NOT_WORD_CHAR:/* \W (NOT a word character) */
3240 if (isalnum ((int) *Reg_Input) ||
3241 *Reg_Input == '_' ||
3242 *Reg_Input == '\n' ||
3243 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3244
3245 Reg_Input++; break;
3246
3247 case ANY: /* `.' (matches any character EXCEPT newline) */
3248 if (AT_END_OF_STRING(Reg_Input) || *Reg_Input == '\n') MATCH_RETURN (0);
3249
3250 Reg_Input++; break;
3251
3252 case EVERY: /* `.' (matches any character INCLUDING newline) */
3253 if (AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3254
3255 Reg_Input++; break;
3256
3257 case DIGIT: /* \d, same as [0123456789] */
3258 if (!isdigit ((int) *Reg_Input) ||
3259 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3260
3261 Reg_Input++; break;
3262
3263 case NOT_DIGIT: /* \D, same as [^0123456789] */
3264 if (isdigit ((int) *Reg_Input) ||
3265 *Reg_Input == '\n' ||
3266 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3267
3268 Reg_Input++; break;
3269
3270 case LETTER: /* \l, same as [a-zA-Z] */
3271 if (!isalpha ((int) *Reg_Input) ||
3272 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3273
3274 Reg_Input++; break;
3275
3276 case NOT_LETTER: /* \L, same as [^0123456789] */
3277 if (isalpha ((int) *Reg_Input) ||
3278 *Reg_Input == '\n' ||
3279 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3280
3281 Reg_Input++; break;
3282
3283 case SPACE: /* \s, same as [ \t\r\f\v] */
3284 if (!isspace ((int) *Reg_Input) ||
3285 *Reg_Input == '\n' ||
3286 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3287
3288 Reg_Input++; break;
3289
3290 case SPACE_NL: /* \s, same as [\n \t\r\f\v] */
3291 if (!isspace ((int) *Reg_Input) ||
3292 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3293
3294 Reg_Input++; break;
3295
3296 case NOT_SPACE: /* \S, same as [^\n \t\r\f\v] */
3297 if (isspace ((int) *Reg_Input) ||
3298 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3299
3300 Reg_Input++; break;
3301
3302 case NOT_SPACE_NL: /* \S, same as [^ \t\r\f\v] */
3303 if ((isspace ((int) *Reg_Input) && *Reg_Input != '\n') ||
3304 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3305
3306 Reg_Input++; break;
3307
3308 case ANY_OF: /* [...] character class. */
3309 if (AT_END_OF_STRING(Reg_Input))
3310 MATCH_RETURN (0); /* Needed because strchr ()
3311 considers \0 as a member
3312 of the character set. */
3313
3314 if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) == NULL) {
3315 MATCH_RETURN (0);
3316 }
3317
3318 Reg_Input++; break;
3319
3320 case ANY_BUT: /* [^...] Negated character class-- does NOT normally
3321 match newline (\n added usually to operand at compile
3322 time.) */
3323
3324 if (AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0); /* See comment for ANY_OF. */
3325
3326 if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) != NULL) {
3327 MATCH_RETURN (0);
3328 }
3329
3330 Reg_Input++; break;
3331
3332 case NOTHING:
3333 case BACK:
3334 break;
3335
3336 case STAR:
3337 case PLUS:
3338 case QUESTION:
3339 case BRACE:
3340
3341 case LAZY_STAR:
3342 case LAZY_PLUS:
3343 case LAZY_QUESTION:
3344 case LAZY_BRACE:
3345 {
3346 register unsigned long num_matched = REG_ZERO;
3347 register unsigned long min = ULONG_MAX, max = REG_ZERO;
3348 register unsigned char *save;
3349 register unsigned char next_char;
3350 unsigned char *next_op;
3351 int lazy = 0;
3352
3353 /* Lookahead (when possible) to avoid useless match attempts
3354 when we know what character comes next. */
3355
3356 if (GET_OP_CODE (next) == EXACTLY) {
3357 next_char = *OPERAND (next);
3358 } else {
3359 next_char = '\0';/* i.e. Don't know what next character is. */
3360 }
3361
3362 next_op = OPERAND (scan);
3363
3364 switch (GET_OP_CODE (scan)) {
3365 case LAZY_STAR:
3366 lazy = 1;
3367 case STAR:
3368 min = REG_ZERO;
3369 max = ULONG_MAX;
3370 break;
3371
3372 case LAZY_PLUS:
3373 lazy = 1;
3374 case PLUS:
3375 min = REG_ONE;
3376 max = ULONG_MAX;
3377 break;
3378
3379 case LAZY_QUESTION:
3380 lazy = 1;
3381 case QUESTION:
3382 min = REG_ZERO;
3383 max = REG_ONE;
3384 break;
3385
3386 case LAZY_BRACE:
3387 lazy = 1;
3388 case BRACE:
3389 min = (unsigned long)
3390 GET_OFFSET (scan + NEXT_PTR_SIZE);
3391
3392 max = (unsigned long)
3393 GET_OFFSET (scan + (2 * NEXT_PTR_SIZE));
3394
3395 if (max <= REG_INFINITY) max = ULONG_MAX;
3396
3397 next_op = OPERAND (scan + (2 * NEXT_PTR_SIZE));
3398 }
3399
3400 save = Reg_Input;
3401
3402 if (lazy) {
3403 if ( min > REG_ZERO) num_matched = greedy (next_op, min);
3404 } else {
3405 num_matched = greedy (next_op, max);
3406 }
3407
3408 while (min <= num_matched && num_matched <= max) {
3409 if (next_char == '\0' || next_char == *Reg_Input) {
3410 if (match (next, NULL)) MATCH_RETURN (1);
3411
3412 CHECK_RECURSION_LIMIT
3413 }
3414
3415 /* Couldn't or didn't match. */
3416
3417 if (lazy) {
3418 if (!greedy (next_op, 1)) MATCH_RETURN (0);
3419
3420 num_matched++; /* Inch forward. */
3421 } else if (num_matched > REG_ZERO) {
3422 num_matched--; /* Back up. */
3423 } else if (min == REG_ZERO && num_matched == REG_ZERO) {
3424 break;
3425 }
3426
3427 Reg_Input = save + num_matched;
3428 }
3429
3430 MATCH_RETURN (0);
3431 }
3432
3433 break;
3434
3435 case END:
3436 if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3437 Extent_Ptr_FW = Reg_Input;
3438 }
3439
3440 MATCH_RETURN (1); /* Success! */
3441
3442 break;
3443
3444 case INIT_COUNT:
3445 Brace->count [*OPERAND (scan)] = REG_ZERO;
3446
3447 break;
3448
3449 case INC_COUNT:
3450 Brace->count [*OPERAND (scan)]++;
3451
3452 break;
3453
3454 case TEST_COUNT:
3455 if (Brace->count [*OPERAND (scan)] <
3456 (unsigned long) GET_OFFSET (scan + NEXT_PTR_SIZE + INDEX_SIZE)) {
3457
3458 next = scan + NODE_SIZE + INDEX_SIZE + NEXT_PTR_SIZE;
3459 }
3460
3461 break;
3462
3463 case BACK_REF:
3464 case BACK_REF_CI:
3465 /* case X_REGEX_BR: */
3466 /* case X_REGEX_BR_CI: *** IMPLEMENT LATER */
3467 {
3468 register unsigned char *captured, *finish;
3469 int paren_no;
3470
3471 paren_no = (int) *OPERAND (scan);
3472
3473 /* if (GET_OP_CODE (scan) == X_REGEX_BR ||
3474 GET_OP_CODE (scan) == X_REGEX_BR_CI) {
3475
3476 if (Cross_Regex_Backref == NULL) MATCH_RETURN (0);
3477
3478 captured =
3479 (unsigned char *) Cross_Regex_Backref->startp [paren_no];
3480
3481 finish =
3482 (unsigned char *) Cross_Regex_Backref->endp [paren_no];
3483 } else { */
3484 captured = Back_Ref_Start [paren_no];
3485 finish = Back_Ref_End [paren_no];
3486 /* } */
3487
3488 if ((captured != NULL) && (finish != NULL)) {
3489 if (captured > finish) MATCH_RETURN (0);
3490
3491 if (GET_OP_CODE (scan) == BACK_REF_CI /* ||
3492 GET_OP_CODE (scan) == X_REGEX_BR_CI*/ ) {
3493
3494 while (captured < finish) {
3495 if (AT_END_OF_STRING(Reg_Input) ||
3496 tolower (*captured++) != tolower (*Reg_Input++)) {
3497 MATCH_RETURN (0);
3498 }
3499 }
3500 } else {
3501 while (captured < finish) {
3502 if (AT_END_OF_STRING(Reg_Input) ||
3503 *captured++ != *Reg_Input++) MATCH_RETURN (0);
3504 }
3505 }
3506
3507 break;
3508 } else {
3509 MATCH_RETURN (0);
3510 }
3511 }
3512
3513 case POS_AHEAD_OPEN:
3514 case NEG_AHEAD_OPEN:
3515 {
3516 register unsigned char *save;
3517 register unsigned char *saved_end;
3518 int answer;
3519
3520 save = Reg_Input;
3521
3522 /* Temporarily ignore the logical end of the string, to allow
3523 lookahead past the end. */
3524 saved_end = End_Of_String;
3525 End_Of_String = NULL;
3526
3527 answer = match (next, NULL); /* Does the look-ahead regex match? */
3528
3529 CHECK_RECURSION_LIMIT
3530
3531 if ((GET_OP_CODE (scan) == POS_AHEAD_OPEN) ? answer : !answer) {
3532 /* Remember the last (most to the right) character position
3533 that we consume in the input for a successful match. This
3534 is info that may be needed should an attempt be made to
3535 match the exact same text at the exact same place. Since
3536 look-aheads backtrack, a regex with a trailing look-ahead
3537 may need more text than it matches to accomplish a
3538 re-match. */
3539
3540 if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3541 Extent_Ptr_FW = Reg_Input;
3542 }
3543
3544 Reg_Input = save; /* Backtrack to look-ahead start. */
3545 End_Of_String = saved_end; /* Restore logical end. */
3546
3547 /* Jump to the node just after the (?=...) or (?!...)
3548 Construct. */
3549
3550 next = next_ptr (OPERAND (scan)); /* Skip 1st branch */
3551 /* Skip the chain of branches inside the look-ahead */
3552 while(GET_OP_CODE(next) == BRANCH)
3553 next = next_ptr (next);
3554 next = next_ptr (next); /* Skip the LOOK_AHEAD_CLOSE */
3555 } else {
3556 Reg_Input = save; /* Backtrack to look-ahead start. */
3557 End_Of_String = saved_end; /* Restore logical end. */
3558
3559 MATCH_RETURN (0);
3560 }
3561 }
3562
3563 break;
3564
3565 case POS_BEHIND_OPEN:
3566 case NEG_BEHIND_OPEN:
3567 {
3568 register unsigned char *save;
3569 int answer;
3570 register int offset, upper;
3571 int lower;
3572 int found = 0;
3573 unsigned char *saved_end;
3574
3575 save = Reg_Input;
3576 saved_end = End_Of_String;
3577
3578 /* Prevent overshoot (greedy matching could end past the
3579 current position) by tightening the matching boundary.
3580 Lookahead inside lookbehind can still cross that boundary. */
3581 End_Of_String = Reg_Input;
3582
3583 lower = GET_LOWER (scan);
3584 upper = GET_UPPER (scan);
3585
3586 /* Start with the shortest match first. This is the most
3587 efficient direction in general.
3588 Note! Negative look behind is _very_ tricky when the length
3589 is not constant: we have to make sure the expression doesn't
3590 match for _any_ of the starting positions. */
3591 for (offset = lower; offset <= upper; ++offset) {
3592 Reg_Input = save - offset;
3593
3594 if (Reg_Input < Look_Behind_To) {
3595 /* No need to look any further */
3596 break;
3597 }
3598
3599 answer = match (next, NULL); /* Does the look-behind regex match? */
3600
3601 CHECK_RECURSION_LIMIT
3602
3603 /* The match must have ended at the current position;
3604 otherwise it is invalid */
3605 if (answer && Reg_Input == save) {
3606 /* It matched, exactly far enough */
3607 found = 1;
3608
3609 /* Remember the last (most to the left) character position
3610 that we consume in the input for a successful match.
3611 This is info that may be needed should an attempt be
3612 made to match the exact same text at the exact same
3613 place. Since look-behind backtracks, a regex with a
3614 leading look-behind may need more text than it matches
3615 to accomplish a re-match. */
3616
3617 if (Extent_Ptr_BW == NULL ||
3618 (Extent_Ptr_BW - (save - offset)) > 0) {
3619 Extent_Ptr_BW = save - offset;
3620 }
3621
3622 break;
3623 }
3624 }
3625
3626 /* Always restore the position and the logical string end. */
3627 Reg_Input = save;
3628 End_Of_String = saved_end;
3629
3630 if ((GET_OP_CODE (scan) == POS_BEHIND_OPEN) ? found : !found) {
3631 /* The look-behind matches, so we must jump to the next
3632 node. The look-behind node is followed by a chain of
3633 branches (contents of the look-behind expression), and
3634 terminated by a look-behind-close node. */
3635 next = next_ptr (OPERAND (scan) + LENGTH_SIZE); /* 1st branch */
3636 /* Skip the chained branches inside the look-ahead */
3637 while (GET_OP_CODE (next) == BRANCH)
3638 next = next_ptr (next);
3639 next = next_ptr (next); /* Skip LOOK_BEHIND_CLOSE */
3640 } else {
3641 /* Not a match */
3642 MATCH_RETURN (0);
3643 }
3644 }
3645 break;
3646
3647 case LOOK_AHEAD_CLOSE:
3648 case LOOK_BEHIND_CLOSE:
3649 MATCH_RETURN (1); /* We have reached the end of the look-ahead or
3650 look-behind which implies that we matched it,
3651 so return TRUE. */
3652 default:
3653 if ((GET_OP_CODE (scan) > OPEN) &&
3654 (GET_OP_CODE (scan) < OPEN + NSUBEXP)) {
3655
3656 register int no;
3657 register unsigned char *save;
3658
3659 no = GET_OP_CODE (scan) - OPEN;
3660 save = Reg_Input;
3661
3662 if (no < 10) {
3663 Back_Ref_Start [no] = save;
3664 Back_Ref_End [no] = NULL;
3665 }
3666
3667 if (match (next, NULL)) {
3668 /* Do not set `Start_Ptr_Ptr' if some later invocation (think
3669 recursion) of the same parentheses already has. */
3670
3671 if (Start_Ptr_Ptr [no] == NULL) Start_Ptr_Ptr [no] = save;
3672
3673 MATCH_RETURN (1);
3674 } else {
3675 MATCH_RETURN (0);
3676 }
3677 } else if ((GET_OP_CODE (scan) > CLOSE) &&
3678 (GET_OP_CODE (scan) < CLOSE + NSUBEXP)) {
3679
3680 register int no;
3681 register unsigned char *save;
3682
3683 no = GET_OP_CODE (scan) - CLOSE;
3684 save = Reg_Input;
3685
3686 if (no < 10) Back_Ref_End [no] = save;
3687
3688 if (match (next, NULL)) {
3689 /* Do not set `End_Ptr_Ptr' if some later invocation of the
3690 same parentheses already has. */
3691
3692 if (End_Ptr_Ptr [no] == NULL) End_Ptr_Ptr [no] = save;
3693
3694 MATCH_RETURN (1);
3695 } else {
3696 MATCH_RETURN (0);
3697 }
3698 } else {
3699 reg_error ("memory corruption, `match\'");
3700
3701 MATCH_RETURN (0);
3702 }
3703
3704 break;
3705 }
3706
3707 scan = next;
3708 }
3709
3710 /* We get here only if there's trouble -- normally "case END" is
3711 the terminating point. */
3712
3713 reg_error ("corrupted pointers, `match\'");
3714
3715 MATCH_RETURN (0);
3716 }
3717
3718 /*----------------------------------------------------------------------*
3719 * greedy
3720 *
3721 * Repeatedly match something simple up to "max" times. If max <= 0
3722 * then match as much as possible (max = infinity). Uses unsigned long
3723 * variables to maximize the amount of text matchable for unbounded
3724 * qualifiers like '*' and '+'. This will allow at least 4,294,967,295
3725 * matches (4 Gig!) for an ANSI C compliant compiler. If you are
3726 * applying a regex to something bigger than that, you shouldn't be
3727 * using NEdit!
3728 *
3729 * Returns the actual number of matches.
3730 *----------------------------------------------------------------------*/
3731
greedy(unsigned char * p,long max)3732 static unsigned long greedy (unsigned char *p, long max) {
3733
3734 register unsigned char *input_str;
3735 register unsigned char *operand;
3736 register unsigned long count = REG_ZERO;
3737 register unsigned long max_cmp;
3738
3739 input_str = Reg_Input;
3740 operand = OPERAND (p); /* Literal char or start of class characters. */
3741 max_cmp = (max > 0) ? (unsigned long) max : ULONG_MAX;
3742
3743 switch (GET_OP_CODE (p)) {
3744 case ANY:
3745 /* Race to the end of the line or string. Dot DOESN'T match
3746 newline. */
3747
3748 while (count < max_cmp &&
3749 *input_str != '\n' &&
3750 !AT_END_OF_STRING(input_str)) {
3751 count++; input_str++;
3752 }
3753
3754 break;
3755
3756 case EVERY:
3757 /* Race to the end of the line or string. Dot DOES match newline. */
3758
3759 while (count < max_cmp &&
3760 !AT_END_OF_STRING(input_str)) {
3761 count++; input_str++;
3762 }
3763
3764 break;
3765
3766 case EXACTLY: /* Count occurrences of single character operand. */
3767 while (count < max_cmp &&
3768 *operand == *input_str &&
3769 !AT_END_OF_STRING(input_str)) {
3770 count++; input_str++;
3771 }
3772
3773 break;
3774
3775 case SIMILAR: /* Case insensitive version of EXACTLY */
3776 while (count < max_cmp &&
3777 *operand == tolower (*input_str) &&
3778 !AT_END_OF_STRING(input_str)) {
3779 count++; input_str++;
3780 }
3781
3782 break;
3783
3784 case ANY_OF: /* [...] character class. */
3785 while (count < max_cmp &&
3786 strchr ((char *) operand, (int) *input_str) != NULL &&
3787 !AT_END_OF_STRING(input_str)) {
3788
3789 count++; input_str++;
3790 }
3791
3792 break;
3793
3794 case ANY_BUT: /* [^...] Negated character class- does NOT normally
3795 match newline (\n added usually to operand at compile
3796 time.) */
3797
3798 while (count < max_cmp &&
3799 strchr ((char *) operand, (int) *input_str) == NULL &&
3800 !AT_END_OF_STRING(input_str)) {
3801
3802 count++; input_str++;
3803 }
3804
3805 break;
3806
3807 case IS_DELIM: /* \y (not a word delimiter char)
3808 NOTE: '\n' and '\0' are always word delimiters. */
3809
3810 while (count < max_cmp &&
3811 Current_Delimiters [ *input_str ] &&
3812 !AT_END_OF_STRING(input_str)) {
3813 count++; input_str++;
3814 }
3815
3816 break;
3817
3818 case NOT_DELIM: /* \Y (not a word delimiter char)
3819 NOTE: '\n' and '\0' are always word delimiters. */
3820
3821 while (count < max_cmp &&
3822 !Current_Delimiters [ *input_str ] &&
3823 !AT_END_OF_STRING(input_str)) {
3824 count++; input_str++;
3825 }
3826
3827 break;
3828
3829 case WORD_CHAR: /* \w (word character, alpha-numeric or underscore) */
3830 while (count < max_cmp &&
3831 (isalnum ((int) *input_str) ||
3832 *input_str == (unsigned char) '_') &&
3833 !AT_END_OF_STRING(input_str)) {
3834
3835 count++; input_str++;
3836 }
3837
3838 break;
3839
3840 case NOT_WORD_CHAR:/* \W (NOT a word character) */
3841 while (count < max_cmp &&
3842 !isalnum ((int) *input_str) &&
3843 *input_str != (unsigned char) '_' &&
3844 *input_str != (unsigned char) '\n' &&
3845 !AT_END_OF_STRING(input_str)) {
3846
3847 count++; input_str++;
3848 }
3849
3850 break;
3851
3852 case DIGIT: /* same as [0123456789] */
3853 while (count < max_cmp &&
3854 isdigit ((int) *input_str) &&
3855 !AT_END_OF_STRING(input_str)) {
3856 count++; input_str++;
3857 }
3858
3859 break;
3860
3861 case NOT_DIGIT: /* same as [^0123456789] */
3862 while (count < max_cmp &&
3863 !isdigit ((int) *input_str) &&
3864 *input_str != '\n' &&
3865 !AT_END_OF_STRING(input_str)) {
3866
3867 count++; input_str++;
3868 }
3869
3870 break;
3871
3872 case SPACE: /* same as [ \t\r\f\v]-- doesn't match newline. */
3873 while (count < max_cmp &&
3874 isspace ((int) *input_str) &&
3875 *input_str != '\n' &&
3876 !AT_END_OF_STRING(input_str)) {
3877
3878 count++; input_str++;
3879 }
3880
3881 break;
3882
3883 case SPACE_NL: /* same as [\n \t\r\f\v]-- matches newline. */
3884 while (count < max_cmp &&
3885 isspace ((int) *input_str) &&
3886 !AT_END_OF_STRING(input_str)) {
3887
3888 count++; input_str++;
3889 }
3890
3891 break;
3892
3893 case NOT_SPACE: /* same as [^\n \t\r\f\v]-- doesn't match newline. */
3894 while (count < max_cmp &&
3895 !isspace ((int) *input_str) &&
3896 !AT_END_OF_STRING(input_str)) {
3897
3898 count++; input_str++;
3899 }
3900
3901 break;
3902
3903 case NOT_SPACE_NL: /* same as [^ \t\r\f\v]-- matches newline. */
3904 while (count < max_cmp &&
3905 (!isspace ((int) *input_str) || *input_str == '\n') &&
3906 !AT_END_OF_STRING(input_str)) {
3907
3908 count++; input_str++;
3909 }
3910
3911 break;
3912
3913 case LETTER: /* same as [a-zA-Z] */
3914 while (count < max_cmp &&
3915 isalpha ((int) *input_str) &&
3916 !AT_END_OF_STRING(input_str)) {
3917
3918 count++; input_str++;
3919 }
3920
3921 break;
3922
3923 case NOT_LETTER: /* same as [^a-zA-Z] */
3924 while (count < max_cmp &&
3925 !isalpha ((int) *input_str) &&
3926 *input_str != '\n' &&
3927 !AT_END_OF_STRING(input_str)) {
3928
3929 count++; input_str++;
3930 }
3931
3932 break;
3933
3934 default:
3935 /* Called inappropriately. Only atoms that are SIMPLE should
3936 generate a call to greedy. The above cases should cover
3937 all the atoms that are SIMPLE. */
3938
3939 reg_error ("internal error #10 `greedy\'");
3940 count = 0U; /* Best we can do. */
3941 }
3942
3943 /* Point to character just after last matched character. */
3944
3945 Reg_Input = input_str;
3946
3947 return (count);
3948 }
3949
3950 /*----------------------------------------------------------------------*
3951 * next_ptr - compute the address of a node's "NEXT" pointer.
3952 * Note: a simplified inline version is available via the NEXT_PTR() macro,
3953 * but that one is only to be used at time-critical places (see the
3954 * description of the macro).
3955 *----------------------------------------------------------------------*/
3956
next_ptr(unsigned char * ptr)3957 static unsigned char * next_ptr (unsigned char *ptr) {
3958
3959 register int offset;
3960
3961 if (ptr == &Compute_Size) return (NULL);
3962
3963 offset = GET_OFFSET (ptr);
3964
3965 if (offset == 0) return (NULL);
3966
3967 if (GET_OP_CODE (ptr) == BACK) {
3968 return (ptr - offset);
3969 } else {
3970 return (ptr + offset);
3971 }
3972 }
3973
3974 /*
3975 ** SubstituteRE - Perform substitutions after a `regexp' match.
3976 **
3977 ** This function cleanly shortens results of more than max length to max.
3978 ** To give the caller a chance to react to this the function returns False
3979 ** on any error. The substitution will still be executed.
3980 */
SubstituteRE(const regexp * prog,const char * source,char * dest,int max)3981 Boolean SubstituteRE(const regexp* prog, const char* source, char* dest,
3982 int max)
3983 {
3984
3985 register unsigned char *src;
3986 unsigned char *src_alias;
3987 register unsigned char *dst;
3988 register unsigned char c;
3989 register unsigned char test;
3990 register int paren_no;
3991 register int len;
3992 register unsigned char chgcase;
3993 Boolean anyWarnings = False;
3994
3995 if (prog == NULL || source == NULL || dest == NULL) {
3996 reg_error ("NULL parm to `SubstituteRE\'");
3997
3998 return False;
3999 }
4000
4001 if (U_CHAR_AT (prog->program) != MAGIC) {
4002 reg_error ("damaged regexp passed to `SubstituteRE\'");
4003
4004 return False;
4005 }
4006
4007 src = (unsigned char *) source;
4008 dst = (unsigned char *) dest;
4009
4010 while ((c = *src++) != '\0') {
4011 chgcase = '\0';
4012 paren_no = -1;
4013
4014 if (c == '\\') {
4015 /* Process any case altering tokens, i.e \u, \U, \l, \L. */
4016
4017 if (*src == 'u' || *src == 'U' || *src == 'l' || *src == 'L') {
4018 chgcase = *src;
4019 src++;
4020 c = *src++;
4021
4022 if (c == '\0') break;
4023 }
4024 }
4025
4026 if (c == '&') {
4027 paren_no = 0;
4028 } else if (c == '\\') {
4029 /* Can not pass register variable `&src' to function `numeric_escape'
4030 so make a non-register copy that we can take the address of. */
4031
4032 src_alias = src;
4033
4034 if ('1' <= *src && *src <= '9') {
4035 paren_no = (int) *src++ - (int) '0';
4036
4037 } else if ((test = literal_escape (*src)) != '\0') {
4038 c = test; src++;
4039
4040 } else if ((test = numeric_escape (*src, &src_alias)) != '\0') {
4041 c = test;
4042 src = src_alias; src++;
4043
4044 /* NOTE: if an octal escape for zero is attempted (e.g. \000), it
4045 will be treated as a literal string. */
4046 } else if (*src == '\0') {
4047 /* If '\' is the last character of the replacement string, it is
4048 interpreted as a literal backslash. */
4049
4050 c = '\\';
4051 } else {
4052 c = *src++; /* Allow any escape sequence (This is */
4053 } /* INCONSISTENT with the `CompileRE' */
4054 } /* mind set of issuing an error! */
4055
4056 if (paren_no < 0) { /* Ordinary character. */
4057 if (((char *) dst - (char *) dest) >= (max - 1)) {
4058 reg_error("replacing expression in `SubstituteRE\' too long; "
4059 "truncating");
4060 anyWarnings = True;
4061 break;
4062 } else {
4063 *dst++ = c;
4064 }
4065 } else if (prog->startp [paren_no] != NULL &&
4066 prog->endp [paren_no] != NULL) {
4067
4068 len = prog->endp [paren_no] - prog->startp [paren_no];
4069
4070 if (((char *) dst + len - (char *) dest) >= max-1) {
4071 reg_error("replacing expression in `SubstituteRE\' too long; "
4072 "truncating");
4073 anyWarnings = True;
4074 len = max - ((char *) dst - (char *) dest) - 1;
4075 }
4076
4077 (void) strncpy ((char *) dst, (char *) prog->startp [paren_no], len);
4078
4079 if (chgcase != '\0') adjustcase (dst, len, chgcase);
4080
4081 dst += len;
4082
4083 if (len != 0 && *(dst - 1) == '\0') { /* strncpy hit NUL. */
4084 reg_error ("damaged match string in `SubstituteRE\'");
4085 anyWarnings = True;
4086 }
4087 }
4088 }
4089
4090 *dst = '\0';
4091
4092 return !anyWarnings;
4093 }
4094
adjustcase(unsigned char * str,int len,unsigned char chgcase)4095 static void adjustcase (unsigned char *str, int len, unsigned char chgcase) {
4096
4097 register unsigned char *string = str;
4098 int i;
4099
4100 /* The tokens \u and \l only modify the first character while the tokens
4101 \U and \L modify the entire string. */
4102
4103 if (islower (chgcase) && len > 0) len = 1;
4104
4105 switch (chgcase) {
4106 case 'u':
4107 case 'U':
4108 for (i = 0; i < len; i++) {
4109 *(string + i) = toupper ((int) *(string + i));
4110 }
4111
4112 break;
4113
4114 case 'l':
4115 case 'L':
4116 for (i = 0; i < len; i++) {
4117 *(string + i) = tolower ((int) *(string + i));
4118 }
4119
4120 break;
4121 }
4122 }
4123
4124 /*----------------------------------------------------------------------*
4125 * reg_error
4126 *----------------------------------------------------------------------*/
4127
reg_error(char * str)4128 static void reg_error (char *str) {
4129
4130 fprintf (
4131 stderr,
4132 "nedit: Internal error processing regular expression (%s)\n",
4133 str);
4134 }
4135
4136 /*----------------------------------------------------------------------*
4137 * makeDelimiterTable
4138 *
4139 * Translate a null-terminated string of delimiters into a 256 byte
4140 * lookup table for determining whether a character is a delimiter or
4141 * not.
4142 *
4143 * Table must be allocated by the caller.
4144 *
4145 * Return value is a pointer to the table.
4146 *----------------------------------------------------------------------*/
4147
makeDelimiterTable(unsigned char * delimiters,unsigned char * table)4148 static unsigned char * makeDelimiterTable (
4149 unsigned char *delimiters,
4150 unsigned char *table) {
4151
4152 unsigned char *c;
4153
4154 memset (table, 0, 256);
4155
4156 for (c = (unsigned char *) delimiters; *c != '\0'; c++) {
4157 table [*c] = 1;
4158 }
4159
4160 table [(int) '\0'] = 1; /* These */
4161 table [(int) '\t'] = 1; /* characters */
4162 table [(int) '\n'] = 1; /* are always */
4163 table [(int) ' ' ] = 1; /* delimiters. */
4164
4165 return table;
4166 }
4167
4168 /*----------------------------------------------------------------------*
4169 * SetREDefaultWordDelimiters
4170 *
4171 * Builds a default delimiter table that persists across `ExecRE' calls.
4172 *----------------------------------------------------------------------*/
4173
SetREDefaultWordDelimiters(char * delimiters)4174 void SetREDefaultWordDelimiters (char *delimiters) {
4175 makeDelimiterTable ((unsigned char *) delimiters, Default_Delimiters);
4176 }
4177