1 /*------------------------------------------------------------------------*
2 * static const char CVSID[] = "$Id: regularExp.c,v 1.30 2006/08/13 21:47:45 yooden Exp $";
3 * `CompileRE', `ExecRE', and `substituteRE' -- regular expression parsing
4 *
5 * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp' and
6 * `regexec' code adapted for NEdit.
7 *
8 * .-------------------------------------------------------------------.
9 * | ORIGINAL COPYRIGHT NOTICE: |
10 * | |
11 * | Copyright (c) 1986 by University of Toronto. |
12 * | Written by Henry Spencer. Not derived from licensed software. |
13 * | |
14 * | Permission is granted to anyone to use this software for any |
15 * | purpose on any computer system, and to redistribute it freely, |
16 * | subject to the following restrictions: |
17 * | |
18 * | 1. The author is not responsible for the consequences of use of |
19 * | this software, no matter how awful, even if they arise |
20 * | from defects in it. |
21 * | |
22 * | 2. The origin of this software must not be misrepresented, either |
23 * | by explicit claim or by omission. |
24 * | |
25 * | 3. Altered versions should be plainly marked as such, and must not |
26 * | be misrepresented as being the original software. |
27 * `-------------------------------------------------------------------'
28 *
29 * This is free software; you can redistribute it and/or modify it under the
30 * terms of the GNU General Public License as published by the Free Software
31 * Foundation; either version 2 of the License, or (at your option) any later
32 * version. In addition, you may distribute version of this program linked to
33 * Motif or Open Motif. See README for details.
34 *
35 * This software is distributed in the hope that it will be useful, but WITHOUT
36 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
37 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
38 * for more details.
39 *
40 * You should have received a copy of the GNU General Public License along with
41 * software; if not, write to the Free Software Foundation, Inc., 59 Temple
42 * Place, Suite 330, Boston, MA 02111-1307 USA
43 *
44 *
45 * BEWARE that some of this code is subtly aware of the way operator
46 * precedence is structured in regular expressions. Serious changes in
47 * regular-expression syntax might require a total rethink.
48 * -- Henry Spencer
49 * (Yes, it did!) -- Christopher Conrad, Dec. 1999
50 *
51 * January, 1994, Mark Edel
52 * Consolidated files, changed names of external functions to avoid
53 * potential conflicts with native regcomp and regexec functions, changed
54 * error reporting to NEdit form, added multi-line and reverse searching,
55 * and added \n \t \u \U \l \L.
56 *
57 * June, 1996, Mark Edel
58 * Bug in NEXT macro, didn't work for expressions which compiled to over
59 * 256 bytes.
60 *
61 * December, 1999, Christopher Conrad
62 * Reformatted code for readability, improved error output, added octal and
63 * hexadecimal escapes, added back-references (\1-\9), added positive look
64 * ahead: (?=...), added negative lookahead: (?!...), added non-capturing
65 * parentheses: (?:...), added case insensitive constructs (?i...) and
66 * (?I...), added newline matching constructs (?n...) and (?N...), added
67 * regex comments: (?#...), added shortcut escapes: \d\D\l\L\s\S\w\W\y\Y.
68 * Added "not a word boundary" anchor \B.
69 *
70 * July, 2002, Eddy De Greef
71 * Added look behind, both positive (?<=...) and negative (?<!...) for
72 * bounded-length patterns.
73 *
74 * November, 2004, Eddy De Greef
75 * Added constrained matching (allowing specification of the logical end
76 * of the string iso. matching till \0), and fixed several (probably
77 * very old) string overrun errors that could easily result in crashes,
78 * especially in client code.
79 *
80 * June, 2008, David Weenink
81 * Adaptation for use in the Praat program.
82 * Changed reg_error to call Melder_error.
83 * Extra parameter for SubstituteRE to allow error differentiation
84 * (not enough memory can be repaired upstream).
85 *
86 * April, 2018, Paul Boersma
87 * Added Unicode-based definitions of letters, digits, spaces and word boundaries.
88 * Thereby removed shortcut escapes from classes.
89 */
90
91 #include <limits.h>
92 #include "melder.h"
93
94 /* The first byte of the regexp internal `program' is a magic number to help
95 guard against corrupted data; the compiled regex code really begins in the
96 second byte. */
97
98 #define MAGIC 0234
99
100 /* The "internal use only" fields in `regexp.h' are present to pass info from
101 * `CompileRE' to `ExecRE' which permits the execute phase to run lots faster on
102 * simple cases. They are:
103 *
104 * match_start Character that must begin a match; '\0' if none obvious.
105 * anchor Is the match anchored (at beginning-of-line only)?
106 *
107 * `match_start' and `anchor' permit very fast decisions on suitable starting
108 * points for a match, considerably reducing the work done by ExecRE. */
109
110 /* STRUCTURE FOR A REGULAR EXPRESSION (regex) `PROGRAM'.
111 *
112 * This is essentially a linear encoding of a nondeterministic finite-state
113 * machine or NFA (aka syntax charts or `railroad normal form' in parsing
114 * technology). Each node is an opcode plus a NEXT pointer, possibly
115 * followed by operands. NEXT pointers of all nodes except BRANCH implement
116 * concatenation; a NEXT pointer with a BRANCH on both ends of it is
117 * connecting two alternatives. (Here we have one of the subtle syntax
118 * dependencies: an individual BRANCH (as opposed to a collection of them) is
119 * never concatenated with anything because of operator precedence.) The
120 * operand of some types of nodes is a literal string; for others, it is a node
121 * leading into a sub-FSM. In particular, the operand of a BRANCH node is the
122 * first node of the branch. (NB this is _NOT_ a tree structure: the tail of
123 * the branch connects to the thing following the set of BRANCHes.)
124 *
125 * The opcodes are: */
126
127 /* DEFINITION VALUE MEANING */
128
129 #define END 1 /* End of program. */
130
131 /* Zero width positional assertions. */
132
133 #define BOL 2 /* Match position at beginning of line. */
134 #define EOL 3 /* Match position at end of line. */
135 #define BOWORD 4 /* Match "" representing word delimiter or BOL */
136 #define EOWORD 5 /* Match "" representing word delimiter or EOL */
137 #define NOT_BOUNDARY 6 /* Not word boundary (\B, opposite of < and >) */
138
139 /* Op codes with null terminated string operands. */
140
141 #define EXACTLY 7 /* Match this string. */
142 #define SIMILAR 8 /* Match this case insensitive string */
143 #define ANY_OF 9 /* Match any character in the set. */
144 #define ANY_BUT 10 /* Match any character not in the set. */
145
146 /* Op codes to match any character. */
147
148 #define ANY 11 /* Match any one character (implements '.') */
149 #define EVERY 12 /* Same as ANY but matches newline. */
150
151 /* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
152
153 #define DIGIT 13 /* Match any digit, i.e. [0123456789] */
154 #define NOT_DIGIT 14 /* Match any non-digit, i.e. [^0123456789] */
155 #define LETTER 15 /* Match any letter character [a-zA-Z] */
156 #define NOT_LETTER 16 /* Match any non-letter character [^a-zA-Z] */
157 #define SPACE 17 /* Match any whitespace character EXCEPT \n */
158 #define SPACE_NL 18 /* Match any whitespace character INCLUDING \n */
159 #define NOT_SPACE 19 /* Match any non-whitespace character */
160 #define NOT_SPACE_NL 20 /* Same as NOT_SPACE but matches newline. */
161 #define WORD_CHAR 21 /* Match any word character [a-zA-Z0-9_] */
162 #define NOT_WORD_CHAR 22 /* Match any non-word character [^a-zA-Z0-9_] */
163 #define IS_DELIM 23 /* Match any character that's a word delimiter */
164 #define NOT_DELIM 24 /* Match any character NOT a word delimiter */
165
166 /* Quantifier nodes. (Only applied to SIMPLE nodes. Quantifiers applied to non
167 SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
168
169 #define STAR 25 /* Match this (simple) thing 0 or more times. */
170 #define LAZY_STAR 26 /* Minimal matching STAR */
171 #define QUESTION 27 /* Match this (simple) thing 0 or 1 times. */
172 #define LAZY_QUESTION 28 /* Minimal matching QUESTION */
173 #define PLUS 29 /* Match this (simple) thing 1 or more times. */
174 #define LAZY_PLUS 30 /* Minimal matching PLUS */
175 #define BRACE 31 /* Match this (simple) thing m to n times. */
176 #define LAZY_BRACE 32 /* Minimal matching BRACE */
177
178 /* Nodes used to build complex constructs. */
179
180 #define NOTHING 33 /* Match empty string (always matches) */
181 #define BRANCH 34 /* Match this alternative, or the next... */
182 #define BACK 35 /* Always matches, NEXT ptr points backward. */
183 #define INIT_COUNT 36 /* Initialize {m,n} counter to zero */
184 #define INC_COUNT 37 /* Increment {m,n} counter by one */
185 #define TEST_COUNT 38 /* Test {m,n} counter against operand */
186
187 /* Back Reference nodes. */
188
189 #define BACK_REF 39 /* Match latest matched parenthesized text */
190 #define BACK_REF_CI 40 /* Case insensitive version of BACK_REF */
191 #define X_REGEX_BR 41 /* Cross-Regex Back-Ref for syntax highlighting */
192 #define X_REGEX_BR_CI 42 /* Case insensitive version of X_REGEX_BR_CI */
193
194 /* Various nodes used to implement parenthetical constructs. */
195
196 #define POS_AHEAD_OPEN 43 /* Begin positive look ahead */
197 #define NEG_AHEAD_OPEN 44 /* Begin negative look ahead */
198 #define LOOK_AHEAD_CLOSE 45 /* End positive or negative look ahead */
199
200 #define POS_BEHIND_OPEN 46 /* Begin positive look behind */
201 #define NEG_BEHIND_OPEN 47 /* Begin negative look behind */
202 #define LOOK_BEHIND_CLOSE 48 /* Close look behind */
203
204 #define OPEN 49 /* Open for capturing parentheses. */
205
206 /* OPEN+1 is number 1, etc. */
207 #define CLOSE (OPEN + NSUBEXP) /* Close for capturing parentheses. */
208
209 #define LAST_PAREN (CLOSE + NSUBEXP)
210
211 #if (LAST_PAREN > UCHAR_MAX)
212 #error "Too many parentheses for storage in an unsigned char (LAST_PAREN too big.)"
213 #endif
214
215 /* The next_ptr () function can consume up to 30% of the time during matching
216 because it is called an immense number of times (an average of 25
217 next_ptr() calls per match() call was witnessed for Perl syntax
218 highlighting). Therefore it is well worth removing some of the function
219 call overhead by selectively inlining the next_ptr() calls. Moreover,
220 the inlined code can be simplified for matching because one of the tests,
221 only necessary during compilation, can be left out.
222 The net result of using this inlined version at two critical places is
223 a 25% speedup (again, witnesses on Perl syntax highlighting). */
224
225 #define NEXT_PTR(in_ptr, out_ptr)\
226 next_ptr_offset = GET_OFFSET (in_ptr);\
227 if (next_ptr_offset == 0)\
228 out_ptr = NULL;\
229 else {\
230 if (GET_OP_CODE (in_ptr) == BACK)\
231 out_ptr = in_ptr - next_ptr_offset;\
232 else \
233 out_ptr = in_ptr + next_ptr_offset;\
234 }
235
236 /* OPCODE NOTES:
237 ------------
238
239 All nodes consist of an 8 bit op code followed by 2 bytes that make up a 16
240 bit NEXT pointer. Some nodes have a null terminated character string operand
241 following the NEXT pointer. Other nodes may have an 8 bit index operand.
242 The TEST_COUNT node has an index operand followed by a 16 bit test value.
243 The BRACE and LAZY_BRACE nodes have two 16 bit values for min and max but no
244 index value.
245
246 SIMILAR
247 Operand(s): null terminated string
248
249 Implements a case insensitive match of a string. Mostly intended for use
250 in syntax highlighting patterns for keywords of languages like FORTRAN
251 and Ada that are case insensitive. The regex text in this node is
252 converted to lower case during regex compile.
253
254 DIGIT, NOT_DIGIT, LETTER, NOT_LETTER, SPACE, NOT_SPACE, WORD_CHAR,
255 NOT_WORD_CHAR
256 Operand(s): None
257
258 Implements shortcut escapes \d, \D, \l, \L, \s, \S, \w, \W.
259 The Unicode-aware functions Melder_isDecimalNumber(), Melder_isLetter(),
260 Melder_isHorizontalSpace(), and Melder_isWordCharacter are
261 used to implement these in the hopes of increasing portability.
262
263 NOT_BOUNDARY
264 Operand(s): None
265
266 Implements \B as a zero width assertion that the current character is
267 NOT on a word boundary. Word boundaries are defined to be the position
268 between two characters where one of those characters is
269 a Unicode word character and the other character is not.
270
271 IS_DELIM
272 Operand(s): None
273
274 Implements \y as any character that is a Unicode word delimiter.
275
276 NOT_DELIM
277 Operand(s): None
278
279 Implements \Y as any character that is NOT a Unicode word delimiter.
280
281 STAR, PLUS, QUESTION, and complex '*', '+', and '?'
282 Operand(s): None (Note: NEXT pointer is usually zero. The code that
283 processes this node skips over it.)
284
285 Complex (parenthesized) versions implemented as circular BRANCH
286 structures using BACK. SIMPLE versions (one character per match) are
287 implemented separately for speed and to minimize recursion.
288
289 BRACE, LAZY_BRACE
290 Operand(s): minimum value (2 bytes), maximum value (2 bytes)
291
292 Implements the {m,n} construct for atoms that are SIMPLE.
293
294 BRANCH
295 Operand(s): None
296
297 The set of branches constituting a single choice are hooked together
298 with their NEXT pointers, since precedence prevents anything being
299 concatenated to any individual branch. The NEXT pointer of the last
300 BRANCH in a choice points to the thing following the whole choice. This
301 is also where the final NEXT pointer of each individual branch points;
302 each branch starts with the operand node of a BRANCH node.
303
304 BACK
305 Operand(s): None
306
307 Normal NEXT pointers all implicitly point forward. Back implicitly
308 points backward. BACK exists to make loop structures possible.
309
310 INIT_COUNT
311 Operand(s): index (1 byte)
312
313 Initializes the count array element referenced by the index operand.
314 This node is used to build general (i.e. parenthesized) {m,n} constructs.
315
316 INC_COUNT
317 Operand(s): index (1 byte)
318
319 Increments the count array element referenced by the index operand.
320 This node is used to build general (i.e. parenthesized) {m,n} constructs.
321
322 TEST_COUNT
323 Operand(s): index (1 byte), test value (2 bytes)
324
325 Tests the current value of the count array element specified by the
326 index operand against the test value. If the current value is less than
327 the test value, control passes to the node after that TEST_COUNT node.
328 Otherwise control passes to the node referenced by the NEXT pointer for
329 the TEST_COUNT node. This node is used to build general (i.e.
330 parenthesized) {m,n} constructs.
331
332 BACK_REF, BACK_REF_CI
333 Operand(s): index (1 byte, value 1-9)
334
335 Implements back references. This node will attempt to match whatever text
336 was most recently captured by the index'th set of parentheses.
337 BACK_REF_CI is case insensitive version.
338
339 X_REGEX_BR, X_REGEX_BR_CI
340 (NOT IMPLEMENTED YET)
341
342 Operand(s): index (1 byte, value 1-9)
343
344 Implements back references into a previously matched but separate regular
345 expression. This is used by syntax highlighting patterns. This node will
346 attempt to match whatever text was most captured by the index'th set of
347 parentheses of the separate regex passed to ExecRE. X_REGEX_BR_CI is case
348 insensitive version.
349
350 POS_AHEAD_OPEN, NEG_AHEAD_OPEN, LOOK_AHEAD_CLOSE
351
352 Operand(s): None
353
354 Implements positive and negative look ahead. Look ahead is an assertion
355 that something is either there or not there. Once this is determined the
356 regex engine backtracks to where it was just before the look ahead was
357 encountered, i.e. look ahead is a zero width assertion.
358
359 POS_BEHIND_OPEN, NEG_BEHIND_OPEN, LOOK_BEHIND_CLOSE
360
361 Operand(s): 2x2 bytes for OPEN (match boundaries), None for CLOSE
362
363 Implements positive and negative look behind. Look behind is an assertion
364 that something is either there or not there in front of the current
365 position. Look behind is a zero width assertion, with the additional
366 constraint that it must have a bounded length (for complexity and
367 efficiency reasons; note that most other implementation even impose
368 fixed length).
369
370 OPEN, CLOSE
371
372 Operand(s): None
373
374 OPEN + n = Start of parenthesis 'n', CLOSE + n = Close of parenthesis
375 'n', and are numbered at compile time.
376 */
377
378 /* A node is one char of opcode followed by two chars of NEXT pointer plus
379 * any operands. NEXT pointers are stored as two 8-bit pieces, high order
380 * first. The value is a positive offset from the opcode of the node
381 * containing it. An operand, if any, simply follows the node. (Note that
382 * much of the code generation knows about this implicit relationship.)
383 *
384 * Using two bytes for NEXT_PTR_SIZE is vast overkill for most things,
385 * but allows patterns to get big without disasters. */
386
387 #define OP_CODE_SIZE 1
388 #define NEXT_PTR_SIZE 2
389 #define INDEX_SIZE 1
390 #define LENGTH_SIZE 4
391 #define NODE_SIZE (NEXT_PTR_SIZE + OP_CODE_SIZE)
392
393 #define GET_OP_CODE(p) (*(char32 *)(p))
394 #define OPERAND(p) ((p) + NODE_SIZE)
395 #define GET_OFFSET(p) ((( *((p) + 1) & 0377) << 8) + (( *((p) + 2)) & 0377))
396 #define PUT_OFFSET_L(v) (char32)(((v) >> 8) & 0377)
397 #define PUT_OFFSET_R(v) (char32) ((v) & 0377)
398 #define GET_LOWER(p) ((( *((p) + NODE_SIZE) & 0377) << 8) + \
399 (( *((p) + NODE_SIZE+1)) & 0377))
400 #define GET_UPPER(p) ((( *((p) + NODE_SIZE+2) & 0377) << 8) + \
401 (( *((p) + NODE_SIZE+3)) & 0377))
402
403 /* Utility definitions. */
404
405 #define REG_FAIL(m) {*Error_Ptr = (m); return (NULL);}
406 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || \
407 (c) == '?' || (c) == Brace_Char)
408 #define SET_BIT(i,n) ((i) |= (1 << ((n) - 1)))
409 #define TEST_BIT(i,n) ((i) & (1 << ((n) - 1)))
410 #define U_CHAR_AT(p) ((unsigned int) *(char32 *)(p))
411
412 /* Flags to be passed up and down via function parameters during compile. */
413
414 #define WORST 0 /* Worst case. No assumptions can be made.*/
415 #define HAS_WIDTH 1 /* Known never to match null string. */
416 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
417
418 #define NO_PAREN 0 /* Only set by initial call to "chunk". */
419 #define PAREN 1 /* Used for normal capturing parentheses. */
420 #define NO_CAPTURE 2 /* Non-capturing parentheses (grouping only). */
421 #define INSENSITIVE 3 /* Case insensitive parenthetical construct */
422 #define SENSITIVE 4 /* Case sensitive parenthetical construct */
423 #define NEWLINE 5 /* Construct to match newlines in most cases */
424 #define NO_NEWLINE 6 /* Construct to match newlines normally */
425
426 #define REG_INFINITY 0UL
427 #define REG_ZERO 0UL
428 #define REG_ONE 1UL
429
430 /* Flags for function shortcut_escape() */
431
432 #define CHECK_ESCAPE 0 /* Check an escape sequence for validity only. */
433 #define CHECK_CLASS_ESCAPE 1 /* Check the validity of an escape within a character class */
434 #define EMIT_NODE 2 /* Emit the appropriate node. */
435
436 /* Number of bytes to offset from the beginning of the regex program to the start
437 of the actual compiled regex code, i.e. skipping over the MAGIC number and
438 the two counters at the front. */
439
440 #define REGEX_START_OFFSET 3
441
442 #define MAX_COMPILED_SIZE 32767UL /* Largest size a compiled regex can be.
443 Probably could be 65535UL. */
444
445 /* Global work variables for `CompileRE'. */
446
447 static char32 *Reg_Parse; /* Input scan ptr (scans user's regex) */
448 static int Total_Paren; /* Parentheses, (), counter. */
449 static int Num_Braces; /* Number of general {m,n} constructs.
450 {m,n} quantifiers of SIMPLE atoms are
451 not included in this count. */
452 static int Closed_Parens; /* Bit flags indicating () closure. */
453 static int Paren_Has_Width; /* Bit flags indicating ()'s that are
454 known to not match the empty string */
455 static char32 Compute_Size; /* Address of this used as flag. */
456 static char32 *Code_Emit_Ptr; /* When Code_Emit_Ptr is set to
457 &Compute_Size no code is emitted.
458 Instead, the size of code that WOULD
459 have been generated is accumulated in
460 Reg_Size. Otherwise, Code_Emit_Ptr
461 points to where compiled regex code is
462 to be written. */
463 static unsigned long Reg_Size; /* Size of compiled regex code. */
464 static const char32 **Error_Ptr; /* Place to store error messages so
465 they can be returned by `CompileRE' */
466 static char32 Error_Text [128];/* Sting to build error messages in. */
467
468 static int Is_Case_Insensitive;
469 static int Match_Newline;
470
471 static int Enable_Counting_Quantifier = 1;
472 static char32 Brace_Char;
473 static char32 Default_Meta_Char [] = { '{', '.', '*', '+', '?', '[', '(', '|', ')', '^', '<', '>', '$', '\0' };
474 static char32 *Meta_Char;
475
476 typedef struct {
477 long lower;
478 long upper;
479 } len_range;
480
481 /* Forward declarations for functions used by `CompileRE'. */
482
483 static char32 *alternative (int *flag_param, len_range *range_param);
484 static char32 *back_ref (char32 *c, int *flag_param, int emit);
485 static char32 *chunk (int paren, int *flag_param, len_range *range_param);
486 static void emit_byte (char32 c);
487 static void emit_class_byte (char32 c);
488 static char32 *emit_node (int op_code);
489 static char32 *emit_special (char32 op_code, unsigned long test_val, int index);
490 static char32 literal_escape (char32 c);
491 static char32 numeric_escape (char32 c, char32 **parse);
492 static char32 *atom (int *flag_param, len_range *range_param);
493 static void reg_error (const char32_t *str);
494 static char32 *insert (char32 op, char32 *opnd, long min, long max, int index);
495 static char32 *next_ptr (char32 *ptr);
496 static void offset_tail (char32 *ptr, int offset, char32 *val);
497 static void branch_tail (char32 *ptr, int offset, char32 *val);
498 static char32 *piece (int *flag_param, len_range *range_param);
499 static void tail (char32 *search_from, char32 *point_t);
500 static char32 *shortcut_escape (char32 c, int *flag_param, int emit);
501
502 /*----------------------------------------------------------------------*
503 * CompileRE
504 *
505 * Compiles a regular expression into the internal format used by
506 * `ExecRE'.
507 *
508 * The default behaviour wrt. case sensitivity and newline matching can
509 * be controlled through the defaultFlags argument (Markus Schwarzenberg).
510 * Future extensions are possible by using other flag bits.
511 * Note that currently only the case sensitivity flag is effectively used.
512 *
513 * Beware that the optimization and preparation code in here knows about
514 * some of the structure of the compiled regexp.
515 *----------------------------------------------------------------------*/
516
CompileRE_throwable(conststring32 expression,int defaultFlags)517 regexp *CompileRE_throwable (conststring32 expression, int defaultFlags) {
518 conststring32 compileMessage;
519 regexp *compiledRE = CompileRE (expression, & compileMessage, defaultFlags);
520 if (compiledRE == NULL)
521 Melder_throw (U"Regular expression: ", compileMessage, U" (", expression, U").");
522 return compiledRE;
523 }
524
CompileRE(conststring32 exp,conststring32 * errorText,int defaultFlags)525 regexp *CompileRE (conststring32 exp, conststring32 *errorText, int defaultFlags) {
526
527 regexp *comp_regex = NULL;
528 char32 *scan;
529 int flags_local, pass;
530 len_range range_local;
531
532 if (Enable_Counting_Quantifier) {
533 Brace_Char = '{';
534 Meta_Char = &Default_Meta_Char [0];
535 } else {
536 Brace_Char = '*'; /* Bypass the '{' in */
537 Meta_Char = &Default_Meta_Char [1]; /* Default_Meta_Char */
538 }
539
540 /* Set up errorText to receive failure reports. */
541
542 Error_Ptr = errorText;
543 *Error_Ptr = U"";
544
545 if (exp == NULL) {
546 REG_FAIL (U"NULL argument, `CompileRE\'");
547 }
548
549 Code_Emit_Ptr = &Compute_Size;
550 Reg_Size = 0UL;
551
552 /* We can't allocate space until we know how big the compiled form will be,
553 but we can't compile it (and thus know how big it is) until we've got a
554 place to put the code. So we cheat: we compile it twice, once with code
555 generation turned off and size counting turned on, and once "for real".
556 This also means that we don't allocate space until we are sure that the
557 thing really will compile successfully, and we never have to move the
558 code and thus invalidate pointers into it. (Note that it has to be in
559 one piece because free() should be able to free it all.) */
560
561 for (pass = 1; pass <= 2; pass++) {
562 /*-------------------------------------------*
563 * FIRST PASS: Determine size and legality. *
564 * SECOND PASS: Emit code. *
565 *-------------------------------------------*/
566
567 /* Schwarzenberg:
568 * If defaultFlags = 0 use standard defaults:
569 * Is_Case_Insensitive: Case sensitive is the default
570 * Match_Newline: Newlines are NOT matched by default
571 * in character classes
572 */
573 Is_Case_Insensitive = ( (defaultFlags & REDFLT_CASE_INSENSITIVE) ? 1 : 0);
574 Match_Newline = 0; /* ((defaultFlags & REDFLT_MATCH_NEWLINE) ? 1 : 0);
575 Currently not used. Uncomment if needed. */
576
577 Reg_Parse = (char32 *) exp;
578 Total_Paren = 1;
579 Num_Braces = 0;
580 Closed_Parens = 0;
581 Paren_Has_Width = 0;
582
583 emit_byte (MAGIC);
584 emit_byte ('%'); /* Placeholder for num of capturing parentheses. */
585 emit_byte ('%'); /* Placeholder for num of general {m,n} constructs. */
586
587 if (chunk (NO_PAREN, &flags_local, &range_local) == NULL) {
588 return (NULL); /* Something went wrong */
589 }
590 if (pass == 1) {
591 if (Reg_Size >= MAX_COMPILED_SIZE) {
592 /* Too big for NEXT pointers NEXT_PTR_SIZE bytes long to span.
593 This is a real issue since the first BRANCH node usually points
594 to the end of the compiled regex code. */
595
596 Melder_sprint (Error_Text,128, U"regexp > ", MAX_COMPILED_SIZE, U" bytes");
597 REG_FAIL (Error_Text);
598 }
599
600 /* Allocate memory. */
601
602 comp_regex = (regexp *) malloc (sizeof (regexp) + Reg_Size * sizeof (char32));
603
604 if (comp_regex == NULL) {
605 REG_FAIL (U"out of memory in `CompileRE\'");
606 }
607
608 Code_Emit_Ptr = (char32 *) comp_regex->program;
609 }
610 }
611
612 comp_regex->program [1] = (char32) Total_Paren - 1;
613 comp_regex->program [2] = (char32) Num_Braces;
614
615 /*----------------------------------------*
616 * Dig out information for optimizations. *
617 *----------------------------------------*/
618
619 comp_regex->match_start = '\0'; /* Worst-case defaults. */
620 comp_regex->anchor = 0;
621
622 /* First BRANCH. */
623
624 scan = (char32 *) (comp_regex->program + REGEX_START_OFFSET);
625
626 if (GET_OP_CODE (next_ptr (scan)) == END) { /* Only one top-level choice. */
627 scan = OPERAND (scan);
628
629 /* Starting-point info. */
630
631 if (GET_OP_CODE (scan) == EXACTLY) {
632 comp_regex->match_start = *OPERAND (scan);
633
634 } else if (PLUS <= GET_OP_CODE (scan) &&
635 GET_OP_CODE (scan) <= LAZY_PLUS) {
636
637 /* Allow x+ or x+? at the start of the regex to be
638 optimized. */
639
640 if (GET_OP_CODE (scan + NODE_SIZE) == EXACTLY) {
641 comp_regex->match_start = *OPERAND (scan + NODE_SIZE);
642 }
643 } else if (GET_OP_CODE (scan) == BOL) {
644 comp_regex->anchor++;
645 }
646 }
647
648 return (comp_regex);
649 }
650
651 /*----------------------------------------------------------------------*
652 * chunk *
653 * *
654 * Process main body of regex or process a parenthesized "thing". *
655 * *
656 * Caller must absorb opening parenthesis. *
657 * *
658 * Combining parenthesis handling with the base level of regular *
659 * expression is a trifle forced, but the need to tie the tails of the *
660 * branches to what follows makes it hard to avoid. *
661 *----------------------------------------------------------------------*/
662
chunk(int paren,int * flag_param,len_range * range_param)663 static char32 *chunk (int paren, int *flag_param, len_range *range_param) {
664
665 char32 *ret_val = NULL;
666 char32 *this_branch;
667 char32 *ender = NULL;
668 int this_paren = 0;
669 int flags_local, first = 1, zero_width, i;
670 int old_sensitive = Is_Case_Insensitive;
671 int old_newline = Match_Newline;
672 len_range range_local;
673 int look_only = 0;
674 char32 *emit_look_behind_bounds = NULL;
675
676
677 *flag_param = HAS_WIDTH; /* Tentatively. */
678 range_param->lower = 0; /* Idem */
679 range_param->upper = 0;
680
681 /* Make an OPEN node, if parenthesized. */
682
683 if (paren == PAREN) {
684 if (Total_Paren >= NSUBEXP) {
685 Melder_sprint (Error_Text,128, U"number of ()'s > ", NSUBEXP);
686 REG_FAIL (Error_Text)
687 }
688
689 this_paren = Total_Paren; Total_Paren++;
690 ret_val = emit_node (OPEN + this_paren);
691 } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
692 *flag_param = WORST; /* Look ahead is zero width. */
693 look_only = 1;
694 ret_val = emit_node (paren);
695 } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
696 *flag_param = WORST; /* Look behind is zero width. */
697 look_only = 1;
698 /* We'll overwrite the zero length later on, so we save the ptr */
699 ret_val = emit_special (paren, 0, 0);
700 emit_look_behind_bounds = ret_val + NODE_SIZE;
701 } else if (paren == INSENSITIVE) {
702 Is_Case_Insensitive = 1;
703 } else if (paren == SENSITIVE) {
704 Is_Case_Insensitive = 0;
705 } else if (paren == NEWLINE) {
706 Match_Newline = 1;
707 } else if (paren == NO_NEWLINE) {
708 Match_Newline = 0;
709 }
710
711 /* Pick up the branches, linking them together. */
712
713 do {
714 this_branch = alternative (&flags_local, &range_local);
715
716 if (this_branch == NULL) {
717 return (NULL);
718 }
719
720 if (first) {
721 first = 0;
722 *range_param = range_local;
723 if (ret_val == NULL) {
724 ret_val = this_branch;
725 }
726 } else if (range_param->lower >= 0) {
727 if (range_local.lower >= 0) {
728 if (range_local.lower < range_param->lower) {
729 range_param->lower = range_local.lower;
730 }
731 if (range_local.upper > range_param->upper) {
732 range_param->upper = range_local.upper;
733 }
734 } else {
735 range_param->lower = -1; /* Branches have different lengths */
736 range_param->upper = -1;
737 }
738 }
739
740 tail (ret_val, this_branch); /* Connect BRANCH -> BRANCH. */
741
742 /* If any alternative could be zero width, consider the whole
743 parenthisized thing to be zero width. */
744
745 if (! (flags_local & HAS_WIDTH)) {
746 *flag_param &= ~HAS_WIDTH;
747 }
748
749 /* Are there more alternatives to process? */
750
751 if (*Reg_Parse != '|') {
752 break;
753 }
754
755 Reg_Parse++;
756 } while (1);
757
758 /* Make a closing node, and hook it on the end. */
759
760 if (paren == PAREN) {
761 ender = emit_node (CLOSE + this_paren);
762
763 } else if (paren == NO_PAREN) {
764 ender = emit_node (END);
765
766 } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
767 ender = emit_node (LOOK_AHEAD_CLOSE);
768
769 } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
770 ender = emit_node (LOOK_BEHIND_CLOSE);
771
772 } else {
773 ender = emit_node (NOTHING);
774 }
775
776 tail (ret_val, ender);
777
778 /* Hook the tails of the branch alternatives to the closing node. */
779
780 for (this_branch = ret_val; this_branch != NULL;) {
781 branch_tail (this_branch, NODE_SIZE, ender);
782 this_branch = next_ptr (this_branch);
783 }
784
785 /* Check for proper termination. */
786
787 if (paren != NO_PAREN && *Reg_Parse++ != ')') {
788 REG_FAIL (U"missing right parenthesis \')\'");
789 } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
790 if (*Reg_Parse == ')') {
791 REG_FAIL (U"missing left parenthesis \'(\'");
792 } else {
793 REG_FAIL (U"junk on end"); /* "Can't happen" - NOTREACHED */
794 }
795 }
796
797 /* Check whether look behind has a fixed size */
798
799 if (emit_look_behind_bounds) {
800 if (range_param->lower < 0) {
801 REG_FAIL (U"look-behind does not have a bounded size");
802 }
803 if (range_param->upper > 65535L) {
804 REG_FAIL (U"max. look-behind size is too large (>65535)")
805 }
806 if (Code_Emit_Ptr != &Compute_Size) {
807 *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->lower);
808 *emit_look_behind_bounds++ = PUT_OFFSET_R (range_param->lower);
809 *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->upper);
810 *emit_look_behind_bounds = PUT_OFFSET_R (range_param->upper);
811 }
812 }
813
814 /* For look ahead/behind, the length should be set to zero again */
815 if (look_only) {
816 range_param->lower = 0;
817 range_param->upper = 0;
818 }
819
820 zero_width = 0;
821
822 /* Set a bit in Closed_Parens to let future calls to function `back_ref'
823 know that we have closed this set of parentheses. */
824
825 if (paren == PAREN && this_paren <= (int) sizeof (Closed_Parens) * CHAR_BIT) {
826 SET_BIT (Closed_Parens, this_paren);
827
828 /* Determine if a parenthesized expression is modified by a quantifier
829 that can have zero width. */
830
831 if (* (Reg_Parse) == '?' || * (Reg_Parse) == '*') {
832 zero_width++;
833 } else if (* (Reg_Parse) == '{' && Brace_Char == '{') {
834 if (* (Reg_Parse + 1) == ',' || * (Reg_Parse + 1) == '}') {
835 zero_width++;
836 } else if (* (Reg_Parse + 1) == '0') {
837 i = 2;
838
839 while (* (Reg_Parse + i) == '0') {
840 i++;
841 }
842
843 if (* (Reg_Parse + i) == ',') {
844 zero_width++;
845 }
846 }
847 }
848 }
849
850 /* If this set of parentheses is known to never match the empty string, set
851 a bit in Paren_Has_Width to let future calls to function back_ref know
852 that this set of parentheses has non-zero width. This will allow star
853 (*) or question (?) quantifiers to be aplied to a back-reference that
854 refers to this set of parentheses. */
855
856 if ( (*flag_param & HAS_WIDTH) &&
857 paren == PAREN &&
858 !zero_width &&
859 this_paren <= (int) (sizeof (Paren_Has_Width) * CHAR_BIT)) {
860
861 SET_BIT (Paren_Has_Width, this_paren);
862 }
863
864 Is_Case_Insensitive = old_sensitive;
865 Match_Newline = old_newline;
866
867 return (ret_val);
868 }
869
870 /*----------------------------------------------------------------------*
871 * alternative
872 *
873 * Processes one alternative of an '|' operator. Connects the NEXT
874 * pointers of each regex atom together sequentialy.
875 *----------------------------------------------------------------------*/
876
alternative(int * flag_param,len_range * range_param)877 static char32 *alternative (int *flag_param, len_range *range_param) {
878
879 char32 *ret_val;
880 char32 *chain;
881 char32 *latest;
882 int flags_local;
883 len_range range_local;
884
885 *flag_param = WORST; /* Tentatively. */
886 range_param->lower = 0; /* Idem */
887 range_param->upper = 0;
888
889 ret_val = emit_node (BRANCH);
890 chain = NULL;
891
892 /* Loop until we hit the start of the next alternative, the end of this set
893 of alternatives (end of parentheses), or the end of the regex. */
894
895 while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
896 latest = piece (&flags_local, &range_local);
897
898 if (latest == NULL) {
899 return (NULL); /* Something went wrong. */
900 }
901
902 *flag_param |= flags_local & HAS_WIDTH;
903 if (range_local.lower < 0) {
904 /* Not a fixed length */
905 range_param->lower = -1;
906 range_param->upper = -1;
907 } else if (range_param->lower >= 0) {
908 range_param->lower += range_local.lower;
909 range_param->upper += range_local.upper;
910 }
911
912 if (chain != NULL) { /* Connect the regex atoms together sequentialy. */
913 tail (chain, latest);
914 }
915
916 chain = latest;
917 }
918
919 if (chain == NULL) { /* Loop ran zero times. */
920 (void) emit_node (NOTHING);
921 }
922
923 return (ret_val);
924 }
925
926 /*----------------------------------------------------------------------*
927 * piece - something followed by possible '*', '+', '?', or "{m,n}"
928 *
929 * Note that the branching code sequences used for the general cases of
930 * *, +. ?, and {m,n} are somewhat optimized: they use the same
931 * NOTHING node as both the endmarker for their branch list and the
932 * body of the last branch. It might seem that this node could be
933 * dispensed with entirely, but the endmarker role is not redundant.
934 *----------------------------------------------------------------------*/
935
piece(int * flag_param,len_range * range_param)936 static char32 *piece (int *flag_param, len_range *range_param) {
937
938 char32 *ret_val;
939 char32 *next;
940 char32 op_code;
941 unsigned long min_max [2] = {REG_ZERO, REG_INFINITY};
942 int flags_local, i, brace_present = 0;
943 int lazy = 0, comma_present = 0;
944 int digit_present [2] = {0, 0};
945 len_range range_local;
946
947 ret_val = atom (&flags_local, &range_local);
948
949 if (ret_val == NULL) {
950 return (NULL); /* Something went wrong. */
951 }
952
953 op_code = *Reg_Parse;
954
955 if (!IS_QUANTIFIER (op_code)) {
956 *flag_param = flags_local;
957 *range_param = range_local;
958 return (ret_val);
959 } else if (op_code == '{') { /* {n,m} quantifier present */
960 brace_present++;
961 Reg_Parse++;
962
963 /* This code will allow specifying a counting range in any of the
964 following forms:
965
966 {m,n} between m and n.
967 {,n} same as {0,n} or between 0 and infinity.
968 {m,} same as {m,0} or between m and infinity.
969 {m} same as {m,m} or exactly m.
970 {,} same as {0,0} or between 0 and infinity or just '*'.
971 {} same as {0,0} or between 0 and infinity or just '*'.
972
973 Note that specifying a max of zero, {m,0} is not allowed in the regex
974 itself, but it is implemented internally that way to support '*', '+',
975 and {min,} constructs and signals an unlimited number. */
976
977 for (i = 0; i < 2; i++) {
978 /* Look for digits of number and convert as we go. The numeric maximum
979 value for max and min of 65,535 is due to using 2 bytes to store
980 each value in the compiled regex code. */
981
982 while (Melder_isAsciiDecimalNumber (*Reg_Parse)) {
983 /* (6553 * 10 + 6) > 65535 (16 bit max) */
984
985 if ( (min_max [i] == 6553UL && (*Reg_Parse - '0') <= 5) ||
986 (min_max [i] <= 6552UL)) {
987
988 min_max [i] = (min_max [i] * 10UL) +
989 (unsigned long) (*Reg_Parse - '0');
990 Reg_Parse++;
991
992 digit_present [i]++;
993 } else {
994 if (i == 0) {
995 Melder_sprint (Error_Text,128, U"min operand of {", min_max [0], *Reg_Parse, U",???} > 65535");
996 } else {
997 Melder_sprint (Error_Text,128, U"max operand of {", min_max [0], U",", min_max [1], *Reg_Parse, U"} > 65535");
998 }
999
1000 REG_FAIL (Error_Text);
1001 }
1002 }
1003
1004 if (!comma_present && *Reg_Parse == ',') {
1005 comma_present++;
1006 Reg_Parse++;
1007 }
1008 }
1009
1010 /* A max of zero cannot be specified directly in the regex since it would
1011 signal a max of infinity. This code specifically disallows `{0,0}',
1012 `{,0}', and `{0}' which really means nothing to humans but would be
1013 interpreted as `{0,infinity}' or `*' if we didn't make this check. */
1014
1015 if (digit_present [0] && (min_max [0] == REG_ZERO) && !comma_present) {
1016
1017 REG_FAIL (U"{0} is an invalid range");
1018 } else if (digit_present [0] && (min_max [0] == REG_ZERO) &&
1019 digit_present [1] && (min_max [1] == REG_ZERO)) {
1020
1021 REG_FAIL (U"{0,0} is an invalid range");
1022 } else if (digit_present [1] && (min_max [1] == REG_ZERO)) {
1023 if (digit_present [0]) {
1024 Melder_sprint (Error_Text,128, U"{", min_max [0], U",0} is an invalid range");
1025 REG_FAIL (Error_Text);
1026 } else {
1027 REG_FAIL (U"{,0} is an invalid range");
1028 }
1029 }
1030
1031 if (!comma_present) {
1032 min_max [1] = min_max [0];
1033 } /* {x} means {x,x} */
1034
1035 if (*Reg_Parse != '}') {
1036 REG_FAIL (U"{m,n} specification missing right \'}\'");
1037
1038 } else if (min_max [1] != REG_INFINITY && min_max [0] > min_max [1]) {
1039 /* Disallow a backward range. */
1040
1041 Melder_sprint (Error_Text,128, U"{", min_max [0], U",", min_max [1], U"} is an invalid range");
1042 REG_FAIL (Error_Text);
1043 }
1044 }
1045
1046 Reg_Parse++;
1047
1048 /* Check for a minimal matching (non-greedy or "lazy") specification. */
1049
1050 if (*Reg_Parse == '?') {
1051 lazy = 1;
1052 Reg_Parse++;
1053 }
1054
1055 /* Avoid overhead of counting if possible */
1056
1057 if (op_code == '{') {
1058 if (min_max [0] == REG_ZERO && min_max [1] == REG_INFINITY) {
1059 op_code = '*';
1060 } else if (min_max [0] == REG_ONE && min_max [1] == REG_INFINITY) {
1061 op_code = '+';
1062 } else if (min_max [0] == REG_ZERO && min_max [1] == REG_ONE) {
1063 op_code = '?';
1064 } else if (min_max [0] == REG_ONE && min_max [1] == REG_ONE) {
1065 /* "x{1,1}" is the same as "x". No need to pollute the compiled
1066 regex with such nonsense. */
1067
1068 *flag_param = flags_local;
1069 *range_param = range_local;
1070 return (ret_val);
1071 } else if (Num_Braces > (int) UCHAR_MAX) {
1072 Melder_sprint (Error_Text,128, U"number of {m,n} constructs > ", UCHAR_MAX);
1073 REG_FAIL (Error_Text);
1074 }
1075 }
1076
1077 if (op_code == '+') {
1078 min_max [0] = REG_ONE;
1079 }
1080 if (op_code == '?') {
1081 min_max [1] = REG_ONE;
1082 }
1083
1084 /* It is dangerous to apply certain quantifiers to a possibly zero width
1085 item. */
1086
1087 if (! (flags_local & HAS_WIDTH)) {
1088 if (brace_present) {
1089 Melder_sprint (Error_Text,128, U"{", min_max [0], U",", min_max [1], U"} operand could be empty");
1090 } else {
1091 Melder_sprint (Error_Text,128, op_code, U" operand could be empty");
1092 }
1093
1094 REG_FAIL (Error_Text);
1095 }
1096
1097 *flag_param = (min_max [0] > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
1098 if (range_local.lower >= 0) {
1099 if (min_max[1] != REG_INFINITY) {
1100 range_param->lower = range_local.lower * min_max[0];
1101 range_param->upper = range_local.upper * min_max[1];
1102 } else {
1103 range_param->lower = -1; /* Not a fixed-size length */
1104 range_param->upper = -1;
1105 }
1106 } else {
1107 range_param->lower = -1; /* Not a fixed-size length */
1108 range_param->upper = -1;
1109 }
1110
1111 /*---------------------------------------------------------------------*
1112 * Symbol Legend For Node Structure Diagrams
1113 *---------------------------------------------------------------------*
1114 * (...) = general grouped thing
1115 * B = (B)ranch, K = bac(K), N = (N)othing
1116 * I = (I)nitialize count, C = Increment (C)ount
1117 * T~m = (T)est against mini(m)um- go to NEXT pointer if >= operand
1118 * T~x = (T)est against ma(x)imum- go to NEXT pointer if >= operand
1119 * '~' = NEXT pointer, \___| = forward pointer, |___/ = Backward pointer
1120 *---------------------------------------------------------------------*/
1121
1122 if (op_code == '*' && (flags_local & SIMPLE)) {
1123 insert ( (lazy ? LAZY_STAR : STAR), ret_val, 0UL, 0UL, 0);
1124
1125 } else if (op_code == '+' && (flags_local & SIMPLE)) {
1126 insert (lazy ? LAZY_PLUS : PLUS, ret_val, 0UL, 0UL, 0);
1127
1128 } else if (op_code == '?' && (flags_local & SIMPLE)) {
1129 insert (lazy ? LAZY_QUESTION : QUESTION, ret_val, 0UL, 0UL, 0);
1130
1131 } else if (op_code == '{' && (flags_local & SIMPLE)) {
1132 insert (lazy ? LAZY_BRACE : BRACE, ret_val, min_max [0], min_max [1], 0);
1133
1134 } else if ( (op_code == '*' || op_code == '+') && lazy) {
1135 /* Node structure for (x)*? Node structure for (x)+? construct.
1136 * construct. (Same as (x)*? except for initial
1137 * forward jump into parenthesis.)
1138 *
1139 * ___6____
1140 * _______5_______ /________|______
1141 * | _4__ 1_\ /| ____ | _\
1142 * |/ | / |\ / |/ | | / |\
1143 * B~ N~ B~ (...)~ K~ N~ N~ B~ N~ B~ (...)~ K~ N~
1144 * \ \___2_______| \ \___________|
1145 * \_____3_______| \_____________|
1146 *
1147 */
1148
1149 tail (ret_val, emit_node (BACK)); /* 1 */
1150 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 2,4 */
1151 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 3 */
1152
1153 next = emit_node (NOTHING); /* 2,3 */
1154
1155 offset_tail (ret_val, NODE_SIZE, next); /* 2 */
1156 tail (ret_val, next); /* 3 */
1157 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4,5 */
1158 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1159 offset_tail (ret_val, 3 * NODE_SIZE, ret_val); /* 5 */
1160
1161 if (op_code == '+') {
1162 insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 6 */
1163 tail (ret_val, ret_val + (4 * NODE_SIZE)); /* 6 */
1164 }
1165 } else if (op_code == '*') {
1166 /* Node structure for (x)* construct.
1167 * ____1_____
1168 * | \
1169 * B~ (...)~ K~ B~ N~
1170 * \ \_|2 |\_|
1171 * \__3_______| 4
1172 */
1173
1174 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 1,3 */
1175 offset_tail (ret_val, NODE_SIZE, emit_node (BACK)); /* 2 */
1176 offset_tail (ret_val, NODE_SIZE, ret_val); /* 1 */
1177 tail (ret_val, emit_node (BRANCH)); /* 3 */
1178 tail (ret_val, emit_node (NOTHING)); /* 4 */
1179 } else if (op_code == '+') {
1180 /* Node structure for (x)+ construct.
1181 *
1182 * ____2_____
1183 * | \
1184 * (...)~ B~ K~ B~ N~
1185 * \_|\____|\_|
1186 * 1 3 4
1187 */
1188
1189 next = emit_node (BRANCH); /* 1 */
1190
1191 tail (ret_val, next); /* 1 */
1192 tail (emit_node (BACK), ret_val); /* 2 */
1193 tail (next, emit_node (BRANCH)); /* 3 */
1194 tail (ret_val, emit_node (NOTHING)); /* 4 */
1195 } else if (op_code == '?' && lazy) {
1196 /* Node structure for (x)?? construct.
1197 * _4__ 1_
1198 * / | / |
1199 * B~ N~ B~ (...)~ N~
1200 * \ \___2____|
1201 * \_____3____|
1202 */
1203
1204 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 2,4 */
1205 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 3 */
1206
1207 next = emit_node (NOTHING); /* 1,2,3 */
1208
1209 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 1 */
1210 offset_tail (ret_val, NODE_SIZE, next); /* 2 */
1211 tail (ret_val, next); /* 3 */
1212 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4 */
1213 tail (ret_val, (ret_val + (2 * NODE_SIZE))); /* 4 */
1214
1215 } else if (op_code == '?') {
1216 /* Node structure for (x)? construct.
1217 * ___1____ _2
1218 * / |/ |
1219 * B~ (...)~ B~ N~
1220 * \__3_|
1221 */
1222
1223 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 1 */
1224 tail (ret_val, emit_node (BRANCH)); /* 1 */
1225
1226 next = emit_node (NOTHING); /* 2,3 */
1227
1228 tail (ret_val, next); /* 2 */
1229 offset_tail (ret_val, NODE_SIZE, next); /* 3 */
1230 } else if (op_code == '{' && min_max [0] == min_max [1]) {
1231 /* Node structure for (x){m}, (x){m}?, (x){m,m}, or (x){m,m}? constructs.
1232 * Note that minimal and maximal matching mean the same thing when we
1233 * specify the minimum and maximum to be the same value.
1234 * _______3_____
1235 * | 1_ _2 \
1236 * | / |/ | \
1237 * I~ (...)~ C~ T~m K~ N~
1238 * \_| \_____|
1239 * 5 4
1240 */
1241
1242 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1243 tail (ret_val, emit_special (TEST_COUNT, min_max [0], Num_Braces));/* 2 */
1244 tail (emit_node (BACK), ret_val); /* 3 */
1245 tail (ret_val, emit_node (NOTHING)); /* 4 */
1246
1247 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 5 */
1248
1249 tail (ret_val, next); /* 5 */
1250
1251 Num_Braces++;
1252 } else if (op_code == '{' && lazy) {
1253 if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1254 /* Node structure for (x){0,n}? or {,n}? construct.
1255 * _________3____________
1256 * 8_| _4__ 1_ _2 \
1257 * / |/ | / |/ | \
1258 * I~ B~ N~ B~ (...)~ C~ T~x K~ N~
1259 * \ \ \__7__|
1260 * \ \_________6_______|
1261 * \______5____________|
1262 */
1263
1264 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1265
1266 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,7 */
1267
1268 tail (ret_val, next); /* 2 */
1269 (void) insert (BRANCH, ret_val, 0UL, 0UL, Num_Braces); /* 4,6 */
1270 (void) insert (NOTHING, ret_val, 0UL, 0UL, Num_Braces); /* 5 */
1271 (void) insert (BRANCH, ret_val, 0UL, 0UL, Num_Braces); /* 3,4,8 */
1272 tail (emit_node (BACK), ret_val); /* 3 */
1273 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1274
1275 next = emit_node (NOTHING); /* 5,6,7 */
1276
1277 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1278 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 6 */
1279 offset_tail (ret_val, 3 * NODE_SIZE, next); /* 7 */
1280
1281 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1282
1283 tail (ret_val, next); /* 8 */
1284
1285 } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1286 /* Node structure for (x){m,}? construct.
1287 * ______8_________________
1288 * | _______3_____ \
1289 * | _7__ | 1_ _2 \ \
1290 * |/ | | / |/ | \ \
1291 * I~ B~ N~ B~ (...)~ C~ T~m K~ K~ N~
1292 * \_____\__\_| \_4___| |
1293 * 9 \ \_________5__________|
1294 * \_______6______________|
1295 */
1296
1297 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1298
1299 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,4 */
1300
1301 tail (ret_val, next); /* 2 */
1302 tail (emit_node (BACK), ret_val); /* 3 */
1303 tail (ret_val, emit_node (BACK)); /* 4 */
1304 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 5,7 */
1305 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 6 */
1306
1307 next = emit_node (NOTHING); /* 5,6 */
1308
1309 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1310 tail (ret_val, next); /* 6 */
1311 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 7,8 */
1312 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 7 */
1313 offset_tail (ret_val, 3 * NODE_SIZE, ret_val); /* 8 */
1314 (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1315 tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE)); /* 9 */
1316
1317 } else {
1318 /* Node structure for (x){m,n}? construct.
1319 * ______9_____________________
1320 * | _____________3___ \
1321 * | __8_ | 1_ _2 \ \
1322 * |/ | | / |/ | \ \
1323 * I~ B~ N~ B~ (...)~ C~ T~x T~m K~ K~ N~
1324 * \_____\__\_| \ \__4__| |
1325 * 10 \ \ \_7_________|
1326 * \ \_________6_____________|
1327 * \_______5_________________|
1328 */
1329
1330 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1331
1332 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,7 */
1333
1334 tail (ret_val, next); /* 2 */
1335
1336 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1337
1338 tail (emit_node (BACK), ret_val); /* 3 */
1339 tail (next, emit_node (BACK)); /* 4 */
1340 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 6,8 */
1341 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 5 */
1342 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 8,9 */
1343
1344 next = emit_node (NOTHING); /* 5,6,7 */
1345
1346 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1347 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 6 */
1348 offset_tail (ret_val, 3 * NODE_SIZE, next); /* 7 */
1349 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 8 */
1350 offset_tail (next, -NODE_SIZE, ret_val); /* 9 */
1351 insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 10 */
1352 tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE)); /* 10 */
1353 }
1354
1355 Num_Braces++;
1356 } else if (op_code == '{') {
1357 if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1358 /* Node structure for (x){0,n} or (x){,n} construct.
1359 *
1360 * ___3____________
1361 * | 1_ _2 \ 5_
1362 * | / |/ | \ / |
1363 * I~ B~ (...)~ C~ T~x K~ B~ N~
1364 * \_|\ \_6___|__|
1365 * 7 \________4________|
1366 */
1367
1368 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1369
1370 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,6 */
1371
1372 tail (ret_val, next); /* 2 */
1373 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 3,4,7 */
1374 tail (emit_node (BACK), ret_val); /* 3 */
1375
1376 next = emit_node (BRANCH); /* 4,5 */
1377
1378 tail (ret_val, next); /* 4 */
1379 tail (next, emit_node (NOTHING)); /* 5,6 */
1380 offset_tail (ret_val, NODE_SIZE, next); /* 6 */
1381
1382 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 7 */
1383
1384 tail (ret_val, next); /* 7 */
1385
1386 } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1387 /* Node structure for (x){m,} construct.
1388 * __________4________
1389 * | __3__________ \
1390 * _|___| 1_ _2 \ \ _7
1391 * / | 8 | / |/ | \ \ / |
1392 * I~ B~ (...)~ C~ T~m K~ K~ B~ N~
1393 * \ \_5___| |
1394 * \__________6__________|
1395 */
1396
1397 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1398
1399 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2 */
1400
1401 tail (ret_val, next); /* 2 */
1402 tail (emit_node (BACK), ret_val); /* 3 */
1403 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4,6 */
1404
1405 next = emit_node (BACK); /* 4 */
1406
1407 tail (next, ret_val); /* 4 */
1408 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1409 tail (ret_val, emit_node (BRANCH)); /* 6 */
1410 tail (ret_val, emit_node (NOTHING)); /* 7 */
1411
1412 insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1413
1414 tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE)); /* 8 */
1415
1416 } else {
1417 /* Node structure for (x){m,n} construct.
1418 * _____6________________
1419 * | _____________3___ \
1420 * 9_|__| 1_ _2 \ \ _8
1421 * / | | / |/ | \ \ / |
1422 * I~ B~ (...)~ C~ T~x T~m K~ K~ B~ N~
1423 * \ \ \__4__| | |
1424 * \ \_7_________|__|
1425 * \_________5_____________|
1426 */
1427
1428 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1429
1430 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,4 */
1431
1432 tail (ret_val, next); /* 2 */
1433
1434 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1435
1436 tail (emit_node (BACK), ret_val); /* 3 */
1437 tail (next, emit_node (BACK)); /* 4 */
1438 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 5,6 */
1439
1440 next = emit_node (BRANCH); /* 5,8 */
1441
1442 tail (ret_val, next); /* 5 */
1443 offset_tail (next, -NODE_SIZE, ret_val); /* 6 */
1444
1445 next = emit_node (NOTHING); /* 7,8 */
1446
1447 offset_tail (ret_val, NODE_SIZE, next); /* 7 */
1448
1449 offset_tail (next, -NODE_SIZE, next); /* 8 */
1450 (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1451 tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE)); /* 9 */
1452 }
1453
1454 Num_Braces++;
1455 } else {
1456 /* We get here if the IS_QUANTIFIER macro is not coordinated properly
1457 with this function. */
1458
1459 REG_FAIL (U"internal error #2, `piece\'");
1460 }
1461
1462 if (IS_QUANTIFIER (*Reg_Parse)) {
1463 if (op_code == '{') {
1464 Melder_sprint (Error_Text,128, U"nested quantifiers, {m,n}", *Reg_Parse);
1465 } else {
1466 Melder_sprint (Error_Text,128, U"nested quantifiers, ", op_code, *Reg_Parse);
1467 }
1468
1469 REG_FAIL (Error_Text);
1470 }
1471
1472 return (ret_val);
1473 }
1474
1475 /*----------------------------------------------------------------------*
1476 * atom
1477 *
1478 * Process one regex item at the lowest level
1479 *
1480 * OPTIMIZATION: Lumps a continuous sequence of ordinary characters
1481 * together so that it can turn them into a single EXACTLY node, which
1482 * is smaller to store and faster to run.
1483 *----------------------------------------------------------------------*/
1484
atom(int * flag_param,len_range * range_param)1485 static char32 *atom (int *flag_param, len_range *range_param) {
1486
1487 char32 *ret_val;
1488 char32 test;
1489 int flags_local;
1490 len_range range_local;
1491
1492 *flag_param = WORST; /* Tentatively. */
1493 range_param->lower = 0; /* Idem */
1494 range_param->upper = 0;
1495
1496 /* Process any regex comments, e.g. `(?# match next token->)'. The
1497 terminating right parenthesis cannot be escaped. The comment stops at
1498 the first right parenthesis encountered (or the end of the regex
1499 string)... period. Handles multiple sequential comments,
1500 e.g. `(?# one)(?# two)...' */
1501
1502 while (*Reg_Parse == '(' &&
1503 * (Reg_Parse + 1) == '?' &&
1504 * (Reg_Parse + 2) == '#') {
1505
1506 Reg_Parse += 3;
1507
1508 while (*Reg_Parse != ')' && *Reg_Parse != '\0') {
1509 Reg_Parse++;
1510 }
1511
1512 if (*Reg_Parse == ')') {
1513 Reg_Parse++;
1514 }
1515
1516 if (*Reg_Parse == ')' || *Reg_Parse == '|' || *Reg_Parse == '\0') {
1517 /* Hit end of regex string or end of parenthesized regex; have to
1518 return "something" (i.e. a NOTHING node) to avoid generating an
1519 error. */
1520
1521 ret_val = emit_node (NOTHING);
1522
1523 return (ret_val);
1524 }
1525 }
1526
1527 switch (*Reg_Parse ++) {
1528 case U'^':
1529 ret_val = emit_node (BOL);
1530 break;
1531 case U'$':
1532 ret_val = emit_node (EOL);
1533 break;
1534 case U'<':
1535 ret_val = emit_node (BOWORD);
1536 break;
1537 case U'>':
1538 ret_val = emit_node (EOWORD);
1539 break;
1540 case U'.':
1541 if (Match_Newline) {
1542 ret_val = emit_node (EVERY);
1543 } else {
1544 ret_val = emit_node (ANY);
1545 }
1546
1547 *flag_param |= (HAS_WIDTH | SIMPLE);
1548 range_param->lower = 1;
1549 range_param->upper = 1;
1550 break;
1551 case U'(':
1552 if (*Reg_Parse == U'?') { /* Special parenthetical expression */
1553 Reg_Parse++;
1554 range_local.lower = 0; /* Make sure it is always used */
1555 range_local.upper = 0;
1556
1557 if (*Reg_Parse == U':') {
1558 Reg_Parse ++;
1559 ret_val = chunk (NO_CAPTURE, &flags_local, &range_local);
1560 } else if (*Reg_Parse == U'=') {
1561 Reg_Parse ++;
1562 ret_val = chunk (POS_AHEAD_OPEN, &flags_local, &range_local);
1563 } else if (*Reg_Parse == U'!') {
1564 Reg_Parse ++;
1565 ret_val = chunk (NEG_AHEAD_OPEN, &flags_local, &range_local);
1566 } else if (*Reg_Parse == U'i') {
1567 Reg_Parse ++;
1568 ret_val = chunk (INSENSITIVE, &flags_local, &range_local);
1569 } else if (*Reg_Parse == U'I') {
1570 Reg_Parse ++;
1571 ret_val = chunk (SENSITIVE, &flags_local, &range_local);
1572 } else if (*Reg_Parse == U'n') {
1573 Reg_Parse ++;
1574 ret_val = chunk (NEWLINE, &flags_local, &range_local);
1575 } else if (*Reg_Parse == U'N') {
1576 Reg_Parse ++;
1577 ret_val = chunk (NO_NEWLINE, &flags_local, &range_local);
1578 } else if (*Reg_Parse == U'<') {
1579 Reg_Parse ++;
1580 if (*Reg_Parse == U'=') {
1581 Reg_Parse ++;
1582 ret_val = chunk (POS_BEHIND_OPEN, &flags_local, &range_local);
1583 } else if (*Reg_Parse == U'!') {
1584 Reg_Parse ++;
1585 ret_val = chunk (NEG_BEHIND_OPEN, &flags_local, &range_local);
1586 } else {
1587 Melder_sprint (Error_Text,128, U"invalid look-behind syntax, \"(?<", *Reg_Parse, U"...)\"");
1588 REG_FAIL (Error_Text)
1589 }
1590 } else {
1591 Melder_sprint (Error_Text,128, U"invalid grouping syntax, \"(?", *Reg_Parse, U"...)\"");
1592 REG_FAIL (Error_Text)
1593 }
1594 } else { /* Normal capturing parentheses */
1595 ret_val = chunk (PAREN, & flags_local, & range_local);
1596 }
1597 if (! ret_val)
1598 return nullptr; // something went wrong
1599
1600 /* Add HAS_WIDTH flag if it was set by call to chunk. */
1601
1602 *flag_param |= flags_local & HAS_WIDTH;
1603 *range_param = range_local;
1604
1605 break;
1606
1607 case U'\0':
1608 case U'|':
1609 case U')':
1610 REG_FAIL (U"internal error #3, `atom\'") // supposed to be caught earlier
1611 case U'?':
1612 case U'+':
1613 case U'*':
1614 Melder_sprint (Error_Text,128, * (Reg_Parse - 1), U" follows nothing");
1615 REG_FAIL (Error_Text)
1616
1617 case U'{':
1618 if (Enable_Counting_Quantifier) {
1619 REG_FAIL (U"{m,n} follows nothing")
1620 } else {
1621 ret_val = emit_node (EXACTLY); /* Treat braces as literals. */
1622 emit_byte (U'{');
1623 emit_byte (U'\0');
1624 range_param->lower = 1;
1625 range_param->upper = 1;
1626 }
1627
1628 break;
1629
1630 case U'[': {
1631 char32 last_emit = U'\0';
1632
1633 /* Handle characters that can only occur at the start of a class. */
1634
1635 if (*Reg_Parse == U'^') { /* Complement of range. */
1636 ret_val = emit_node (ANY_BUT);
1637 Reg_Parse ++;
1638
1639 /* All negated classes include newline unless escaped with
1640 a "(?n)" switch. */
1641
1642 if (!Match_Newline) {
1643 emit_byte (U'\n');
1644 }
1645 } else {
1646 ret_val = emit_node (ANY_OF);
1647 }
1648
1649 if (*Reg_Parse == U']' || *Reg_Parse == U'-') {
1650 /* If '-' or ']' is the first character in a class,
1651 it is a literal character in the class. */
1652
1653 last_emit = *Reg_Parse;
1654 emit_byte (*Reg_Parse);
1655 Reg_Parse ++;
1656 }
1657
1658 /* Handle the rest of the class characters. */
1659
1660 while (*Reg_Parse != U'\0' && *Reg_Parse != U']') {
1661 if (*Reg_Parse == U'-') { /* Process a range, e.g [a-z]. */
1662 Reg_Parse ++;
1663
1664 if (*Reg_Parse == U']' || *Reg_Parse == U'\0') {
1665 /* If '-' is the last character in a class it is a literal
1666 character. If `Reg_Parse' points to the end of the
1667 regex string, an error will be generated later. */
1668
1669 emit_byte (U'-');
1670 last_emit = U'-';
1671 } else {
1672 /* We must get the range starting character value from the
1673 emitted code since it may have been an escaped
1674 character. `second_value' is set one larger than the
1675 just emitted character value. This is done since
1676 `second_value' is used as the start value for the loop
1677 that emits the values in the range. Since we have
1678 already emitted the first character of the class, we do
1679 not want to emit it again. */
1680
1681 char32 second_value = last_emit + 1, last_value;
1682
1683 if (*Reg_Parse == U'\\') {
1684 /* Handle escaped characters within a class range.
1685 Specifically disallow shortcut escapes as the end of
1686 a class range. To allow this would be ambiguous
1687 since shortcut escapes represent a set of characters,
1688 and it would not be clear which character of the
1689 class should be treated as the "last" character. */
1690
1691 Reg_Parse++;
1692
1693 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse)) != U'\0') {
1694 last_value = test;
1695 } else if ((test = literal_escape (*Reg_Parse)) != U'\0') {
1696 last_value = test;
1697 } else if (shortcut_escape (*Reg_Parse, NULL, CHECK_CLASS_ESCAPE)) {
1698 Melder_sprint (Error_Text, 128, U"\\", *Reg_Parse, U" is not allowed as range operand");
1699 REG_FAIL (Error_Text)
1700 } else {
1701 Melder_sprint (Error_Text, 128, U"\\", *Reg_Parse, U" is an invalid char class escape sequence");
1702 REG_FAIL (Error_Text)
1703 }
1704 } else {
1705 last_value = U_CHAR_AT (Reg_Parse);
1706 }
1707
1708 if (Is_Case_Insensitive) {
1709 second_value = Melder_toLowerCase (second_value); // TODO: this should do casefolding
1710 last_value = Melder_toLowerCase (last_value);
1711 }
1712
1713 /* For case insensitive, something like [A-_] will
1714 generate an error here since ranges are converted to
1715 lower case. */
1716
1717 if (second_value - 1 > last_value)
1718 REG_FAIL (U"invalid [] range")
1719
1720 /* If only one character in range (e.g [a-a]) then this
1721 loop is not run since the first character of any range
1722 was emitted by the previous iteration of while loop. */
1723
1724 for (; second_value <= last_value; second_value++)
1725 emit_class_byte (second_value);
1726 last_emit = (char32) last_value;
1727 Reg_Parse ++;
1728 } /* End class character range code. */
1729 } else if (*Reg_Parse == '\\') {
1730 Reg_Parse++;
1731
1732 if ( (test = numeric_escape (*Reg_Parse, &Reg_Parse)) != '\0') {
1733 emit_class_byte (test);
1734 last_emit = test;
1735 } else if ( (test = literal_escape (*Reg_Parse)) != '\0') {
1736 emit_byte (test);
1737 last_emit = test;
1738 } else if (shortcut_escape (*Reg_Parse,
1739 NULL,
1740 CHECK_CLASS_ESCAPE)) {
1741
1742 if (* (Reg_Parse + 1) == '-') {
1743 /* Specifically disallow shortcut escapes as the start
1744 of a character class range (see comment above.) */
1745
1746 Melder_sprint (Error_Text,128, U"\\", *Reg_Parse, U" not allowed as range operand");
1747
1748 REG_FAIL (Error_Text);
1749 } else {
1750 /* Emit the bytes that are part of the shortcut
1751 escape sequence's range (e.g. \d = 0123456789) */
1752
1753 Melder_sprint (Error_Text,128, U"Cannot have escape sequences such as \\", *Reg_Parse, U" inside a class");
1754 }
1755 } else {
1756 Melder_sprint (Error_Text,128, U"\\", *Reg_Parse, U" is an invalid char class escape sequence");
1757
1758 REG_FAIL (Error_Text);
1759 }
1760
1761 Reg_Parse++;
1762
1763 /* End of class escaped sequence code */
1764 } else {
1765 emit_class_byte (*Reg_Parse); /* Ordinary class character. */
1766
1767 last_emit = *Reg_Parse;
1768 Reg_Parse++;
1769 }
1770 } /* End of while (*Reg_Parse != U'\0' && *Reg_Parse != U']') */
1771
1772 if (*Reg_Parse != U']') {
1773 REG_FAIL (U"missing right \']\'");
1774 }
1775
1776 emit_byte (U'\0');
1777
1778 /* NOTE: it is impossible to specify an empty class. This is
1779 because [] would be interpreted as "begin character class"
1780 followed by a literal ']' character and no "end character class"
1781 delimiter (']'). Because of this, it is always safe to assume
1782 that a class HAS_WIDTH. */
1783
1784 Reg_Parse ++;
1785 *flag_param |= HAS_WIDTH | SIMPLE;
1786 range_param->lower = 1;
1787 range_param->upper = 1;
1788 }
1789
1790 break; /* End of character class code. */
1791
1792 case U'\\':
1793 /* Force Error_Text to have a length of zero. This way we can tell if
1794 either of the calls to shortcut_escape() or back_ref() fill
1795 Error_Text with an error message. */
1796
1797 Error_Text [0] = U'\0';
1798
1799 if ( (ret_val = shortcut_escape (*Reg_Parse, flag_param, EMIT_NODE))) {
1800 Reg_Parse ++;
1801 range_param->lower = 1;
1802 range_param->upper = 1;
1803 break;
1804 } else if ( (ret_val = back_ref (Reg_Parse, flag_param, EMIT_NODE))) {
1805 /* Can't make any assumptions about a back-reference as to SIMPLE
1806 or HAS_WIDTH. For example (^|<) is neither simple nor has
1807 width. So we don't flip bits in flag_param here. */
1808
1809 Reg_Parse++;
1810 /* Back-references always have an unknown length */
1811 range_param->lower = -1;
1812 range_param->upper = -1;
1813 break;
1814 }
1815 if (Error_Text [0] != U'\0')
1816 REG_FAIL (Error_Text)
1817
1818 /* At this point it is apparent that the escaped character is not a
1819 shortcut escape or back-reference. Back up one character to allow
1820 the default code to include it as an ordinary character. */
1821
1822 /* Fall through to Default case to handle literal escapes and numeric
1823 escapes. */
1824
1825 default:
1826 Reg_Parse --; /* If we fell through from the above code, we are now
1827 pointing at the back slash (\) character. */
1828 {
1829 char32 *parse_save;
1830 int len = 0;
1831
1832 if (Is_Case_Insensitive) {
1833 ret_val = emit_node (SIMILAR);
1834 } else {
1835 ret_val = emit_node (EXACTLY);
1836 }
1837
1838 /* Loop until we find a meta character, shortcut escape, back
1839 reference, or end of regex string. */
1840
1841 for (; *Reg_Parse != '\0' && ! str32chr (Meta_Char, *Reg_Parse); len++) {
1842
1843 /* Save where we are in case we have to back
1844 this character out. */
1845
1846 parse_save = Reg_Parse;
1847
1848 if (*Reg_Parse == U'\\') {
1849 Reg_Parse ++; // point to escaped character
1850
1851 Error_Text [0] = U'\0'; // see comment above
1852
1853 if ( (test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1854 if (Is_Case_Insensitive) {
1855 emit_byte (Melder_toLowerCase (test));
1856 } else {
1857 emit_byte (test);
1858 }
1859 } else if ( (test = literal_escape (*Reg_Parse))) {
1860 emit_byte (test);
1861 } else if (back_ref (Reg_Parse, NULL, CHECK_ESCAPE)) {
1862 /* Leave back reference for next `atom' call */
1863
1864 Reg_Parse --; break;
1865 } else if (shortcut_escape (*Reg_Parse, NULL, CHECK_ESCAPE)) {
1866 /* Leave shortcut escape for next `atom' call */
1867
1868 Reg_Parse --; break;
1869 } else {
1870 if (Error_Text [0] != U'\0') {
1871 /* None of the above calls generated an error message
1872 so generate our own here. */
1873
1874 Melder_sprint (Error_Text,128, U"\\", *Reg_Parse, U" is an invalid escape sequence");
1875 }
1876 REG_FAIL (Error_Text)
1877 }
1878 Reg_Parse ++;
1879 } else {
1880 /* Ordinary character */
1881
1882 if (Is_Case_Insensitive) {
1883 emit_byte (Melder_toLowerCase (*Reg_Parse));
1884 } else {
1885 emit_byte (*Reg_Parse);
1886 }
1887 Reg_Parse ++;
1888 }
1889
1890 /* If next regex token is a quantifier (?, +. *, or {m,n}) and
1891 our EXACTLY node so far is more than one character, leave the
1892 last character to be made into an EXACTLY node one character
1893 wide for the multiplier to act on. For example 'abcd* would
1894 have an EXACTLY node with an 'abc' operand followed by a STAR
1895 node followed by another EXACTLY node with a 'd' operand. */
1896
1897 if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
1898 Reg_Parse = parse_save; /* Point to previous regex token. */
1899 if (Code_Emit_Ptr == &Compute_Size) {
1900 Reg_Size--;
1901 } else {
1902 Code_Emit_Ptr--; /* Write over previously emitted byte. */
1903 }
1904 break;
1905 }
1906 }
1907 if (len <= 0)
1908 REG_FAIL (U"internal error #4, `atom\'")
1909
1910 *flag_param |= HAS_WIDTH;
1911
1912 if (len == 1)
1913 *flag_param |= SIMPLE;
1914
1915 range_param->lower = len;
1916 range_param->upper = len;
1917
1918 emit_byte (U'\0');
1919 }
1920 } /* END switch (*Reg_Parse++) */
1921
1922 return (ret_val);
1923 }
1924
1925 /*----------------------------------------------------------------------*
1926 * emit_node
1927 *
1928 * Emit (if appropriate) the op code for a regex node atom.
1929 *
1930 * The NEXT pointer is initialized to NULL.
1931 *
1932 * Returns a pointer to the START of the emitted node.
1933 *----------------------------------------------------------------------*/
1934
emit_node(int op_code)1935 static char32 *emit_node (int op_code) {
1936 char32 *ret_val = Code_Emit_Ptr; /* Return address of start of node */
1937 if (ret_val == & Compute_Size) {
1938 Reg_Size += NODE_SIZE;
1939 } else {
1940 char32 *ptr = ret_val;
1941 *ptr ++ = op_code;
1942 *ptr ++ = U'\0'; // null "NEXT" pointer
1943 *ptr ++ = U'\0';
1944 Code_Emit_Ptr = ptr;
1945 }
1946 return ret_val;
1947 }
1948
1949 /*----------------------------------------------------------------------*
1950 * emit_byte
1951 *
1952 * Emit (if appropriate) a byte of code (usually part of an operand.)
1953 *----------------------------------------------------------------------*/
1954
emit_byte(char32 c)1955 static void emit_byte (char32 c) {
1956 if (Code_Emit_Ptr == & Compute_Size) {
1957 Reg_Size ++;
1958 } else {
1959 *Code_Emit_Ptr ++ = c;
1960 }
1961 }
1962
1963 /*----------------------------------------------------------------------*
1964 * emit_class_byte
1965 *
1966 * Emit (if appropriate) a byte of code (usually part of a character
1967 * class operand.)
1968 *----------------------------------------------------------------------*/
1969
emit_class_byte(char32 c)1970 static void emit_class_byte (char32 c) {
1971 if (Code_Emit_Ptr == & Compute_Size) {
1972 Reg_Size ++;
1973 if (Is_Case_Insensitive && Melder_isLetter (c)) {
1974 Reg_Size ++;
1975 }
1976 } else if (Is_Case_Insensitive && Melder_isLetter (c)) {
1977 /* For case-insensitive character classes, emit both upper and lower case
1978 versions of alphabetical characters. */
1979
1980 *Code_Emit_Ptr ++ = Melder_toLowerCase (c);
1981 *Code_Emit_Ptr ++ = Melder_toUpperCase (c);
1982 } else {
1983 *Code_Emit_Ptr ++ = c;
1984 }
1985 }
1986
1987 /*----------------------------------------------------------------------*
1988 * emit_special
1989 *
1990 * Emit nodes that need special processing.
1991 *----------------------------------------------------------------------*/
1992
emit_special(char32 op_code,unsigned long test_val,int index)1993 static char32 *emit_special (
1994 char32 op_code,
1995 unsigned long test_val,
1996 int index)
1997 {
1998 char32 *ret_val = & Compute_Size;
1999 char32 *ptr;
2000
2001 if (Code_Emit_Ptr == & Compute_Size) {
2002 switch (op_code) {
2003 case POS_BEHIND_OPEN:
2004 case NEG_BEHIND_OPEN:
2005 Reg_Size += LENGTH_SIZE; /* Length of the look-behind match */
2006 Reg_Size += NODE_SIZE; /* Make room for the node */
2007 break;
2008 case TEST_COUNT:
2009 Reg_Size += NEXT_PTR_SIZE; /* Make room for a test value. */
2010 // ppgb FALLTHROUGH (both test and inc require index)
2011 case INC_COUNT:
2012 Reg_Size += INDEX_SIZE; /* Make room for an index value. */
2013 // ppgb FALLTHROUGH (everything requires node)
2014 default:
2015 Reg_Size += NODE_SIZE; /* Make room for the node. */
2016 }
2017 } else {
2018 ret_val = emit_node (op_code); /* Return the address for start of node. */
2019 ptr = Code_Emit_Ptr;
2020 if (op_code == INC_COUNT || op_code == TEST_COUNT) {
2021 *ptr ++ = (char32) index;
2022 if (op_code == TEST_COUNT) {
2023 *ptr ++ = PUT_OFFSET_L (test_val);
2024 *ptr ++ = PUT_OFFSET_R (test_val);
2025 }
2026 } else if (op_code == POS_BEHIND_OPEN || op_code == NEG_BEHIND_OPEN) {
2027 *ptr ++ = PUT_OFFSET_L (test_val);
2028 *ptr ++ = PUT_OFFSET_R (test_val);
2029 *ptr ++ = PUT_OFFSET_L (test_val);
2030 *ptr ++ = PUT_OFFSET_R (test_val);
2031 }
2032 Code_Emit_Ptr = ptr;
2033 }
2034 return ret_val;
2035 }
2036
2037 /*----------------------------------------------------------------------*
2038 * insert
2039 *
2040 * Insert a node in front of already emitted node(s). Means relocating
2041 * the operand. Code_Emit_Ptr points one byte past the just emitted
2042 * node and operand. The parameter `insert_pos' points to the location
2043 * where the new node is to be inserted.
2044 *----------------------------------------------------------------------*/
2045
insert(char32 op,char32 * insert_pos,long min,long max,int index)2046 static char32 *insert (
2047 char32 op,
2048 char32 *insert_pos,
2049 long min,
2050 long max,
2051 int index) {
2052
2053 char32 *src;
2054 char32 *dst;
2055 char32 *place;
2056 int insert_size = NODE_SIZE;
2057
2058 if (op == BRACE || op == LAZY_BRACE) {
2059 /* Make room for the min and max values. */
2060
2061 insert_size += (2 * NEXT_PTR_SIZE);
2062 } else if (op == INIT_COUNT) {
2063 /* Make room for an index value . */
2064
2065 insert_size += INDEX_SIZE;
2066 }
2067
2068 if (Code_Emit_Ptr == &Compute_Size) {
2069 Reg_Size += insert_size;
2070 return &Compute_Size;
2071 }
2072
2073 src = Code_Emit_Ptr;
2074 Code_Emit_Ptr += insert_size;
2075 dst = Code_Emit_Ptr;
2076
2077 /* Relocate the existing emitted code to make room for the new node. */
2078
2079 while (src > insert_pos) {
2080 *--dst = *--src;
2081 }
2082
2083 place = insert_pos; /* Where operand used to be. */
2084 *place++ = op; /* Inserted operand. */
2085 *place++ = '\0'; /* NEXT pointer for inserted operand. */
2086 *place++ = '\0';
2087
2088 if (op == BRACE || op == LAZY_BRACE) {
2089 *place++ = PUT_OFFSET_L (min);
2090 *place++ = PUT_OFFSET_R (min);
2091
2092 *place++ = PUT_OFFSET_L (max);
2093 *place++ = PUT_OFFSET_R (max);
2094 } else if (op == INIT_COUNT) {
2095 *place++ = (char32) index;
2096 }
2097
2098 return place; /* Return a pointer to the start of the code moved. */
2099 }
2100
2101 /*----------------------------------------------------------------------*
2102 * tail - Set the next-pointer at the end of a node chain.
2103 *----------------------------------------------------------------------*/
2104
tail(char32 * search_from,char32 * point_to)2105 static void tail (char32 *search_from, char32 *point_to) {
2106
2107 char32 *scan;
2108 char32 *next;
2109 int offset;
2110
2111 if (search_from == &Compute_Size) {
2112 return;
2113 }
2114
2115 /* Find the last node in the chain (node with a null NEXT pointer) */
2116
2117 scan = search_from;
2118
2119 for (;;) {
2120 next = next_ptr (scan);
2121
2122 if (!next) {
2123 break;
2124 }
2125
2126 scan = next;
2127 }
2128
2129 if (GET_OP_CODE (scan) == BACK) {
2130 offset = scan - point_to;
2131 } else {
2132 offset = point_to - scan;
2133 }
2134
2135 /* Set NEXT pointer */
2136
2137 * (scan + 1) = PUT_OFFSET_L (offset);
2138 * (scan + 2) = PUT_OFFSET_R (offset);
2139 }
2140
2141 /*--------------------------------------------------------------------*
2142 * offset_tail
2143 *
2144 * Perform a tail operation on (ptr + offset).
2145 *--------------------------------------------------------------------*/
2146
offset_tail(char32 * ptr,int offset,char32 * val)2147 static void offset_tail (char32 *ptr, int offset, char32 *val) {
2148 if (ptr == & Compute_Size || ! ptr)
2149 return;
2150 tail (ptr + offset, val);
2151 }
2152
2153 /*--------------------------------------------------------------------*
2154 * branch_tail
2155 *
2156 * Perform a tail operation on (ptr + offset) but only if `ptr' is a
2157 * BRANCH node.
2158 *--------------------------------------------------------------------*/
2159
branch_tail(char32 * ptr,int offset,char32 * val)2160 static void branch_tail (char32 *ptr, int offset, char32 *val) {
2161 if (ptr == & Compute_Size || ! ptr || GET_OP_CODE (ptr) != BRANCH)
2162 return;
2163 tail (ptr + offset, val);
2164 }
2165
2166 /*--------------------------------------------------------------------*
2167 * shortcut_escape
2168 *
2169 * Implements convenient escape sequences that represent entire
2170 * character classes or special location assertions (similar to escapes
2171 * supported by Perl)
2172 * _
2173 * \d Digits [0-9] |
2174 * \D NOT a digit [^0-9] | (Examples
2175 * \l Letters [a-zA-Z] | at left
2176 * \L NOT a Letter [^a-zA-Z] | are
2177 * \s Whitespace [ \t\n\r\f\v] | for
2178 * \S NOT Whitespace [^ \t\n\r\f\v] | ASCII)
2179 * \w "Word" character [a-zA-Z0-9_] |
2180 * \W NOT a "Word" character [^a-zA-Z0-9_] _|
2181 *
2182 * \B Matches any character that is NOT a word-delimiter
2183 *
2184 * Codes for the "emit" parameter:
2185 *
2186 * EMIT_NODE
2187 * Emit a shortcut node. Shortcut nodes have an implied set of
2188 * class characters. This helps keep the compiled regex string
2189 * small.
2190 *
2191 * CHECK_ESCAPE
2192 * Only verify that this is a valid shortcut escape.
2193 *
2194 * CHECK_CLASS_ESCAPE
2195 * Same as CHECK_ESCAPE but only allows characters valid within
2196 * a klas.
2197 *
2198 *--------------------------------------------------------------------*/
2199
shortcut_escape(char32 c,int * flag_param,int emit)2200 static char32 *shortcut_escape (
2201 char32 c,
2202 int *flag_param,
2203 int emit) {
2204
2205 char32 *klas = NULL;
2206 static const conststring32 codes = U"ByYwWdDlLsS";
2207 char32 *ret_val = (char32 *) 1; // assume success
2208 conststring32 valid_codes;
2209
2210 if (emit == CHECK_CLASS_ESCAPE) {
2211 valid_codes = codes + 5; /* \B, \y, \Y, \w and \W are not allowed in classes */
2212 } else {
2213 valid_codes = codes;
2214 }
2215
2216 if (! str32chr (valid_codes, c)) {
2217 return nullptr; // not a valid shortcut escape sequence
2218 } else if (emit == CHECK_ESCAPE || emit == CHECK_CLASS_ESCAPE) {
2219 return ret_val; // just checking if this is a valid shortcut escape
2220 }
2221
2222 switch (c) {
2223 case U'd':
2224 case U'D':
2225 if (emit == EMIT_NODE)
2226 ret_val = (iswlower ((int) c) ? emit_node (DIGIT) : emit_node (NOT_DIGIT));
2227 break;
2228 case U'l':
2229 case U'L':
2230 if (emit == EMIT_NODE)
2231 ret_val = (iswlower ((int) c) ? emit_node (LETTER) : emit_node (NOT_LETTER));
2232 break;
2233 case U's':
2234 case U'S':
2235 if (emit == EMIT_NODE) {
2236 if (Match_Newline) {
2237 ret_val = (iswlower ((int) c) ? emit_node (SPACE_NL) : emit_node (NOT_SPACE_NL));
2238 } else {
2239 ret_val = (iswlower ((int) c) ? emit_node (SPACE) : emit_node (NOT_SPACE));
2240 }
2241 }
2242 break;
2243 case U'w':
2244 case U'W':
2245 if (emit == EMIT_NODE) {
2246 ret_val = (iswlower ((int) c) ? emit_node (WORD_CHAR) : emit_node (NOT_WORD_CHAR));
2247 } else {
2248 REG_FAIL (U"internal error #105 `shortcut_escape\'");
2249 }
2250 break;
2251 /*
2252 Since the delimiter table is not available at regex compile time,
2253 \B, \y and \Y can only generate a node. At run time, the delimiter table
2254 will be available for these nodes to use.
2255 */
2256 case 'y':
2257 if (emit == EMIT_NODE) {
2258 ret_val = emit_node (IS_DELIM);
2259 } else {
2260 REG_FAIL (U"internal error #5 `shortcut_escape\'");
2261 }
2262 break;
2263 case 'Y':
2264 if (emit == EMIT_NODE) {
2265 ret_val = emit_node (NOT_DELIM);
2266 } else {
2267 REG_FAIL (U"internal error #6 `shortcut_escape\'");
2268 }
2269 break;
2270 case 'B':
2271 if (emit == EMIT_NODE) {
2272 ret_val = emit_node (NOT_BOUNDARY);
2273 } else {
2274 REG_FAIL (U"internal error #7 `shortcut_escape\'");
2275 }
2276 break;
2277 default:
2278 /*
2279 We get here if there isn't a case for every character in the string "codes".
2280 */
2281 REG_FAIL (U"internal error #8 `shortcut_escape\'");
2282 }
2283
2284 if (emit == EMIT_NODE && c != 'B') {
2285 *flag_param |= (HAS_WIDTH | SIMPLE);
2286 }
2287
2288 if (klas) {
2289 /*
2290 Emit bytes within a character class operand.
2291 */
2292 while (*klas != U'\0')
2293 emit_byte (*klas ++);
2294 }
2295
2296 return ret_val;
2297 }
2298
2299 /*--------------------------------------------------------------------*
2300 * numeric_escape
2301 *
2302 * Implements hex and octal numeric escape sequence syntax.
2303 *
2304 * Hexadecimal Escape: \x## Max of two digits Must have leading 'x'.
2305 * Octal Escape: \0### Max of three digits and not greater
2306 * than 377 octal. Must have leading zero.
2307 *
2308 * Returns the actual character value or NULL if not a valid hex or
2309 * octal escape. REG_FAIL is called if \x0, \x00, \0, \00, \000, or
2310 * \0000 is specified.
2311 *--------------------------------------------------------------------*/
2312
numeric_escape(char32 c,char32 ** parse)2313 static char32 numeric_escape (
2314 char32 c,
2315 char32 **parse) {
2316
2317 static char32 digits [] = { 'f', 'e', 'd', 'c', 'b', 'a', 'F', 'E', 'D', 'C', 'B', 'A', '9', '8', '7', '6', '5', '4', '3', '2', '1', '0', '\0' };
2318
2319 static unsigned int digit_val [] = {
2320 15, 14, 13, 12, 11, 10, /* Lower case Hex digits */
2321 15, 14, 13, 12, 11, 10, /* Upper case Hex digits */
2322 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2323 }; /* Decimal Digits */
2324
2325 char32 *scan;
2326 char32 *pos_ptr;
2327 char32 *digit_str;
2328 unsigned int value = 0;
2329 unsigned int radix = 8;
2330 int width = 3; /* Can not be bigger than \0377 */
2331 int pos_delta = 14;
2332 int i, pos;
2333
2334 switch (c) {
2335 case U'0':
2336 digit_str = digits + pos_delta; /* Only use Octal digits, i.e. 0-7. */
2337 break;
2338
2339 case U'x':
2340 case U'X':
2341 width = 2; /* Can not be bigger than \0377 */
2342 radix = 16;
2343 pos_delta = 0;
2344 digit_str = digits; /* Use all of the digit characters. */
2345 break;
2346
2347 default:
2348 return U'\0'; /* Not a numeric escape */
2349 }
2350
2351 scan = *parse; scan ++; /* Only change *parse on success. */
2352
2353 pos_ptr = str32chr (digit_str, *scan);
2354
2355 for (i = 0; pos_ptr && i < width; i++) {
2356 pos = (pos_ptr - digit_str) + pos_delta;
2357 value = (value * radix) + digit_val [pos];
2358
2359 /* If this digit makes the value over 255, treat this digit as a literal
2360 character instead of part of the numeric escape. For example, \0777
2361 will be processed as \077 (an 'M') and a literal '7' character, NOT
2362 511 decimal which is > 255. */
2363
2364 if (value > 255) {
2365 /* Back out calculations for last digit processed. */
2366
2367 value -= digit_val [pos];
2368 value /= radix;
2369
2370 break; /* Note that scan will not be incremented and still points to
2371 the digit that caused overflow. It will be decremented by
2372 the "else" below to point to the last character that is
2373 considered to be part of the octal escape. */
2374 }
2375
2376 scan ++;
2377 pos_ptr = str32chr (digit_str, *scan);
2378 }
2379
2380 /* Handle the case of "\0" i.e. trying to specify a NULL character. */
2381
2382 if (value == 0) {
2383 if (c == U'0') {
2384 Melder_sprint (Error_Text,128, U"\\00 is an invalid octal escape");
2385 } else {
2386 Melder_sprint (Error_Text,128, U"\\", c, U"0 is an invalid hexadecimal escape");
2387 }
2388 } else {
2389 /* Point to the last character of the number on success. */
2390
2391 scan --;
2392 *parse = scan;
2393 }
2394
2395 return value;
2396 }
2397
2398 /*--------------------------------------------------------------------*
2399 * literal_escape
2400 *
2401 * Recognize escaped literal characters (prefixed with backslash),
2402 * and translate them into the corresponding character.
2403 *
2404 * Returns the proper character value or NULL if not a valid literal
2405 * escape.
2406 *--------------------------------------------------------------------*/
2407
literal_escape(char32 c)2408 static char32 literal_escape (char32 c) {
2409
2410 static char32 valid_escape [] = {
2411 'a', 'b',
2412 'e',
2413 'f', 'n', 'r', 't', 'v', '(', ')', '-', '[', ']',
2414 '<', '>', '{', '}', '.', '\\', '|', '^', '$', '*',
2415 '+', '?', '&', '\0'
2416 };
2417
2418 static char32 value [] = {
2419 '\a', '\b',
2420 0x1B, /* Escape character in Unicode character set. */
2421 '\f', '\n', '\r', '\t', '\v', '(', ')', '-', '[', ']',
2422 '<', '>', '{', '}', '.', '\\', '|', '^', '$', '*',
2423 '+', '?', '&', '\0'
2424 };
2425
2426 for (int i = 0; valid_escape [i] != '\0'; i++) {
2427 if (c == valid_escape [i]) {
2428 return value [i];
2429 }
2430 }
2431
2432 return '\0';
2433 }
2434
2435 /*--------------------------------------------------------------------*
2436 * back_ref
2437 *
2438 * Process a request to match a previous parenthesized thing.
2439 * Parenthetical entities are numbered beginning at 1 by counting
2440 * opening parentheses from left to to right. \0 would represent
2441 * whole match, but would confuse numeric_escape as an octal escape,
2442 * so it is forbidden.
2443 *
2444 * Constructs of the form \~1, \~2, etc. are cross-regex back
2445 * references and are used in syntax highlighting patterns to match
2446 * text previously matched by another regex. *** IMPLEMENT LATER ***
2447 *--------------------------------------------------------------------*/
2448
back_ref(char32 * c,int * flag_param,int emit)2449 static char32 *back_ref (
2450 char32 *c,
2451 int *flag_param,
2452 int emit) {
2453
2454 int paren_no, c_offset = 0, is_cross_regex = 0;
2455
2456 char32 *ret_val;
2457
2458 /* Implement cross regex backreferences later. */
2459
2460 /* if (*c == (regularExp_CHAR) ('~')) {
2461 c_offset++;
2462 is_cross_regex++;
2463 } */
2464
2465 paren_no = (int) (* (c + c_offset) - (char32) ('0'));
2466
2467 if (! Melder_isAsciiDecimalNumber (* (c + c_offset)) || /* Only \1, \2, ... \9 are supported. */
2468 paren_no == 0) { /* Should be caught by numeric_escape. */
2469
2470 return NULL;
2471 }
2472
2473 /* Make sure parentheses for requested back-reference are complete. */
2474
2475 if (! is_cross_regex && ! TEST_BIT (Closed_Parens, paren_no)) {
2476 Melder_sprint (Error_Text,128, U"\\", paren_no, U" is an illegal back reference");
2477 return NULL;
2478 }
2479
2480 if (emit == EMIT_NODE) {
2481 if (is_cross_regex) {
2482 Reg_Parse ++; /* Skip past the '~' in a cross regex back reference.
2483 We only do this if we are emitting code. */
2484
2485 if (Is_Case_Insensitive) {
2486 ret_val = emit_node (X_REGEX_BR_CI);
2487 } else {
2488 ret_val = emit_node (X_REGEX_BR);
2489 }
2490 } else {
2491 if (Is_Case_Insensitive) {
2492 ret_val = emit_node (BACK_REF_CI);
2493 } else {
2494 ret_val = emit_node (BACK_REF);
2495 }
2496 }
2497
2498 emit_byte ((char32) paren_no);
2499
2500 if (is_cross_regex || TEST_BIT (Paren_Has_Width, paren_no)) {
2501 *flag_param |= HAS_WIDTH;
2502 }
2503 } else if (emit == CHECK_ESCAPE) {
2504 ret_val = (char32 *) 1;
2505 } else {
2506 ret_val = nullptr;
2507 }
2508 return ret_val;
2509 }
2510
2511 /*======================================================================*
2512 * Regex execution related code
2513 *======================================================================*/
2514
2515 /* Global work variables for `ExecRE'. */
2516
2517 static char32 *Reg_Input; /* String-input pointer. */
2518 static const char32 *Start_Of_String; /* Beginning of input, for ^ */
2519 /* and < checks. */
2520 static const char32 *End_Of_String; /* Logical end of input (if supplied, till \0 otherwise) */
2521 static const char32 *Look_Behind_To; /* Position till were look behind can safely check back */
2522 static char32 **Start_Ptr_Ptr; /* Pointer to `startp' array. */
2523 static char32 **End_Ptr_Ptr; /* Ditto for `endp'. */
2524 static char32 *Extent_Ptr_FW; /* Forward extent pointer */
2525 static char32 *Extent_Ptr_BW; /* Backward extent pointer */
2526 static char32 *Back_Ref_Start [10]; /* Back_Ref_Start [0] and */
2527 static char32 *Back_Ref_End [10]; /* Back_Ref_End [0] are not */
2528 /* used. This simplifies */
2529 /* indexing. */
2530 /*
2531 * Measured recursion limits:
2532 * Linux: +/- 40 000 (up to 110 000)
2533 * Solaris: +/- 85 000
2534 * HP-UX 11: +/- 325 000
2535 *
2536 * So 10 000 ought to be safe.
2537 */
2538 #define REGEX_RECURSION_LIMIT 10000
2539 static int Recursion_Count; /* Recursion counter */
2540 static int Recursion_Limit_Exceeded; /* Recursion limit exceeded flag */
2541
2542 #define AT_END_OF_STRING(X) (*(X) == U'\0' || (End_Of_String != NULL && (X) >= End_Of_String))
2543
2544 /* static regexp *Cross_Regex_Backref; */
2545
2546 static bool Prev_Is_BOL;
2547 static bool Succ_Is_EOL;
2548 static bool Prev_Is_Delim;
2549 static bool Succ_Is_Delim;
2550
2551 /* Define a pointer to an array to hold general (...){m,n} counts. */
2552
2553 typedef struct brace_counts {
2554 unsigned long count [1]; /* More unwarranted chumminess with compiler. */
2555 } brace_counts;
2556
2557 static struct brace_counts *Brace;
2558
2559 /* Forward declarations of functions used by `ExecRE' */
2560
2561 static int attempt (regexp *, char32 *);
2562 static int match (char32 *, int *);
2563 static unsigned long greedy (char32 *, long);
2564 static void adjustcase (char32 *, int, char32);
2565
2566 /*
2567 * ExecRE - match a `regexp' structure against a string
2568 *
2569 * If `end' is non-NULL, matches may not BEGIN past end, but may extend past
2570 * it. If reverse is true, `end' should be specified, and searching begins at
2571 * `end'. "isbol" should be set to true if the beginning of the string is the
2572 * actual beginning of a line (since `ExecRE' can't look backwards from the
2573 * beginning to find whether there was a newline before). Likewise, "isbow"
2574 * asks whether the string is preceded by a word delimiter. End of string is
2575 * always treated as a word and line boundary (there may be cases where it
2576 * shouldn't be, in which case, this should be changed).
2577 * Look_behind_to indicates the position till where it is safe to
2578 * perform look-behind matches. If set, it should be smaller than or equal
2579 * to the start position of the search (pointed at by string). If it is NULL,
2580 * it defaults to the start position.
2581 * Finally, match_to indicates the logical end of the string, till where
2582 * matches are allowed to extend. Note that look-ahead patterns may look
2583 * past that boundary. If match_to is set to NULL, the terminating \0 is
2584 * assumed to correspond to the logical boundary. Match_to, if set, should be
2585 * larger than or equal to end, if set.
2586 */
2587
ExecRE(regexp * prog,regexp * cross_regex_backref,conststring32 string,const char32 * end,int reverse,char32 prev_char,char32 succ_char,conststring32 look_behind_to,conststring32 match_to)2588 int ExecRE (
2589 regexp *prog,
2590 regexp *cross_regex_backref,
2591 conststring32 string,
2592 const char32 *end,
2593 int reverse,
2594 char32 prev_char,
2595 char32 succ_char,
2596 conststring32 look_behind_to,
2597 conststring32 match_to) {
2598
2599 char32 *str;
2600 char32 **s_ptr;
2601 char32 **e_ptr;
2602 int ret_val = 0;
2603 int i;
2604 (void) cross_regex_backref;
2605
2606 s_ptr = (char32 **) prog->startp;
2607 e_ptr = (char32 **) prog->endp;
2608
2609 /* Check for valid parameters. */
2610
2611 if (! prog || ! string) {
2612 reg_error (U"NULL parameter to `ExecRE\'");
2613 goto SINGLE_RETURN;
2614 }
2615
2616 /* Check validity of program. */
2617
2618 if (U_CHAR_AT (prog->program) != MAGIC) {
2619 reg_error (U"corrupted program");
2620 goto SINGLE_RETURN;
2621 }
2622
2623 /* Remember the logical end of the string. */
2624
2625 End_Of_String = match_to;
2626
2627 if (reverse && ! end) {
2628 for (end = string; ! AT_END_OF_STRING (end); end ++) { }
2629 succ_char = U'\n';
2630 } else if (! end) {
2631 succ_char = U'\n';
2632 }
2633
2634 /* Remember the beginning of the string for matching BOL */
2635
2636 Start_Of_String = string;
2637 Look_Behind_To = look_behind_to ? look_behind_to : string;
2638
2639 Prev_Is_BOL = ( prev_char == U'\n' || prev_char == U'\0' );
2640 Succ_Is_EOL = ( succ_char == U'\n' || succ_char == U'\0' );
2641 Prev_Is_Delim = ! Melder_isWordCharacter (prev_char);
2642 Succ_Is_Delim = ! Melder_isWordCharacter (succ_char);
2643
2644 Total_Paren = (int) (prog->program [1]);
2645 Num_Braces = (int) (prog->program [2]);
2646
2647 /* Reset the recursion detection flag */
2648 Recursion_Limit_Exceeded = 0;
2649
2650 /* Cross_Regex_Backref = cross_regex_backref; */
2651
2652 /* Allocate memory for {m,n} construct counting variables if need be. */
2653
2654 if (Num_Braces > 0) {
2655 Brace =
2656 (brace_counts *) malloc (sizeof (brace_counts) * (size_t) Num_Braces);
2657
2658 if (Brace == NULL) {
2659 reg_error (U"out of memory in `ExecRE\'");
2660 goto SINGLE_RETURN;
2661 }
2662 } else {
2663 Brace = NULL;
2664 }
2665
2666 /* Initialize the first nine (9) capturing parentheses start and end
2667 pointers to point to the start of the search string. This is to prevent
2668 crashes when later trying to reference captured parens that do not exist
2669 in the compiled regex. We only need to do the first nine since users
2670 can only specify \1, \2, ... \9. */
2671
2672 for (i = 9; i > 0; i--) {
2673 *s_ptr++ = (char32 *) string;
2674 *e_ptr++ = (char32 *) string;
2675 }
2676
2677 if (!reverse) { /* Forward Search */
2678 if (prog->anchor) {
2679 /* Search is anchored at BOL */
2680
2681 if (attempt (prog, (char32 *) string)) {
2682 ret_val = 1;
2683 goto SINGLE_RETURN;
2684 }
2685
2686 for (str = (char32 *) string;
2687 !AT_END_OF_STRING (str) && str != (char32 *) end && !Recursion_Limit_Exceeded;
2688 str++) {
2689
2690 if (*str == '\n') {
2691 if (attempt (prog, str + 1)) {
2692 ret_val = 1;
2693 break;
2694 }
2695 }
2696 }
2697
2698 goto SINGLE_RETURN;
2699
2700 } else if (prog->match_start != '\0') {
2701 /* We know what char match must start with. */
2702
2703 for (str = (char32 *) string;
2704 !AT_END_OF_STRING (str) && str != (char32 *) end && !Recursion_Limit_Exceeded;
2705 str++) {
2706
2707 if (*str == (char32) prog->match_start) {
2708 if (attempt (prog, str)) {
2709 ret_val = 1;
2710 break;
2711 }
2712 }
2713 }
2714
2715 goto SINGLE_RETURN;
2716 } else {
2717 /* General case */
2718
2719 for (str = (char32 *) string;
2720 !AT_END_OF_STRING (str) && str != (char32 *) end && !Recursion_Limit_Exceeded;
2721 str++) {
2722
2723 if (attempt (prog, str)) {
2724 ret_val = 1;
2725 break;
2726 }
2727 }
2728
2729 /* Beware of a single $ matching \0 */
2730 if (!Recursion_Limit_Exceeded && !ret_val && AT_END_OF_STRING (str) && str != (char32 *) end) {
2731 if (attempt (prog, str)) {
2732 ret_val = 1;
2733 }
2734 }
2735
2736 goto SINGLE_RETURN;
2737 }
2738 } else { /* Search reverse, same as forward, but loops run backward */
2739
2740 /* Make sure that we don't start matching beyond the logical end */
2741 if (End_Of_String != NULL && (char32 *) end > End_Of_String) {
2742 end = (const char32 *) End_Of_String;
2743 }
2744
2745 if (prog->anchor) {
2746 /* Search is anchored at BOL */
2747
2748 for (str = (char32 *) (end - 1); str >= (char32 *) string && !Recursion_Limit_Exceeded; str--) {
2749
2750 if (*str == '\n') {
2751 if (attempt (prog, str + 1)) {
2752 ret_val = 1;
2753 goto SINGLE_RETURN;
2754 }
2755 }
2756 }
2757
2758 if (!Recursion_Limit_Exceeded && attempt (prog, (char32 *) string)) {
2759 ret_val = 1;
2760 goto SINGLE_RETURN;
2761 }
2762
2763 goto SINGLE_RETURN;
2764 } else if (prog->match_start != '\0') {
2765 /* We know what char match must start with. */
2766
2767 for (str = (char32 *) end; str >= (char32 *) string && !Recursion_Limit_Exceeded; str--) {
2768
2769 if (*str == (char32) prog->match_start) {
2770 if (attempt (prog, str)) {
2771 ret_val = 1;
2772 break;
2773 }
2774 }
2775 }
2776
2777 goto SINGLE_RETURN;
2778 } else {
2779 /* General case */
2780
2781 for (str = (char32 *) end; str >= (char32 *) string && !Recursion_Limit_Exceeded; str--) {
2782
2783 if (attempt (prog, str)) {
2784 ret_val = 1;
2785 break;
2786 }
2787 }
2788 }
2789 }
2790
2791 SINGLE_RETURN: if (Brace) {
2792 free (Brace);
2793 }
2794
2795 if (Recursion_Limit_Exceeded) {
2796 return (0);
2797 }
2798
2799 return (ret_val);
2800 }
2801
attempt(regexp * prog,char32 * string)2802 static int attempt (regexp *prog, char32 *string) {
2803
2804 int i;
2805 char32 **s_ptr;
2806 char32 **e_ptr;
2807 int branch_index = 0; /* Must be set to zero ! */
2808
2809 Reg_Input = string;
2810 Start_Ptr_Ptr = (char32 **) prog->startp;
2811 End_Ptr_Ptr = (char32 **) prog->endp;
2812 s_ptr = (char32 **) prog->startp;
2813 e_ptr = (char32 **) prog->endp;
2814
2815 /* Reset the recursion counter. */
2816 Recursion_Count = 0;
2817
2818 /* Overhead due to capturing parentheses. */
2819
2820 Extent_Ptr_BW = string;
2821 Extent_Ptr_FW = nullptr;
2822
2823 for (i = Total_Paren + 1; i > 0; i--) {
2824 *s_ptr ++ = nullptr;
2825 *e_ptr ++ = nullptr;
2826 }
2827
2828 if (match ( (char32 *) (prog->program + REGEX_START_OFFSET),
2829 &branch_index)) {
2830 prog->startp [0] = (char32 *) string;
2831 prog->endp [0] = (char32 *) Reg_Input; /* <-- One char AFTER */
2832 prog->extentpBW = (char32 *) Extent_Ptr_BW; /* matched string! */
2833 prog->extentpFW = (char32 *) Extent_Ptr_FW;
2834 prog->top_branch = branch_index;
2835
2836 return 1;
2837 } else {
2838 return 0;
2839 }
2840 }
2841
2842 /*----------------------------------------------------------------------*
2843 * match - main matching routine
2844 *
2845 * Conceptually the strategy is simple: check to see whether the
2846 * current node matches, call self recursively to see whether the rest
2847 * matches, and then act accordingly. In practice we make some effort
2848 * to avoid recursion, in particular by going through "ordinary" nodes
2849 * (that don't need to know whether the rest of the match failed) by a
2850 * loop instead of by recursion. Returns 0 failure, 1 success.
2851 *----------------------------------------------------------------------*/
2852 #define MATCH_RETURN(X)\
2853 { --Recursion_Count; return (X); }
2854 #define CHECK_RECURSION_LIMIT\
2855 if (Recursion_Limit_Exceeded) MATCH_RETURN (0);
2856
match(char32 * prog,int * branch_index_param)2857 static int match (char32 *prog, int *branch_index_param) {
2858
2859 char32 *scan; /* Current node. */
2860 char32 *next; /* Next node. */
2861 int next_ptr_offset; /* Used by the NEXT_PTR () macro */
2862
2863 if (++ Recursion_Count > REGEX_RECURSION_LIMIT) {
2864 if (! Recursion_Limit_Exceeded) { /* Prevent duplicate errors */
2865 reg_error (U"recursion limit exceeded, please respecify expression");
2866 }
2867 Recursion_Limit_Exceeded = 1;
2868 MATCH_RETURN (0)
2869 }
2870
2871
2872 scan = prog;
2873
2874 while (scan) {
2875 NEXT_PTR (scan, next);
2876
2877 switch (GET_OP_CODE (scan)) {
2878 case BRANCH: {
2879 char32 *save;
2880 int branch_index_local = 0;
2881
2882 if (GET_OP_CODE (next) != BRANCH) { /* No choice. */
2883 next = OPERAND (scan); /* Avoid recursion. */
2884 } else {
2885 do {
2886 save = Reg_Input;
2887
2888 if (match (OPERAND (scan), NULL)) {
2889 if (branch_index_param)
2890 *branch_index_param = branch_index_local;
2891 MATCH_RETURN (1)
2892 }
2893
2894 CHECK_RECURSION_LIMIT
2895
2896 ++branch_index_local;
2897
2898 Reg_Input = save; /* Backtrack. */
2899 NEXT_PTR (scan, scan);
2900 } while (scan != NULL && GET_OP_CODE (scan) == BRANCH);
2901
2902 MATCH_RETURN (0) // NOT REACHED
2903 }
2904 }
2905
2906 break;
2907
2908 case EXACTLY: {
2909 char32 *opnd = OPERAND (scan);
2910
2911 /* Inline the first character, for speed. */
2912
2913 if (*opnd != *Reg_Input)
2914 MATCH_RETURN (0)
2915 int len = str32len (opnd);
2916 if (End_Of_String != NULL && Reg_Input + len > End_Of_String)
2917 MATCH_RETURN (0)
2918 if (len > 1 && str32ncmp (opnd, Reg_Input, len) != 0)
2919 MATCH_RETURN (0)
2920 Reg_Input += len;
2921 }
2922
2923 break;
2924
2925 case SIMILAR: {
2926 char32 *opnd = OPERAND (scan);
2927 char32 test;
2928 /* Note: the SIMILAR operand was converted to lower case during
2929 regex compile. */
2930
2931 while ((test = *opnd++) != U'\0') {
2932 if (AT_END_OF_STRING (Reg_Input) || Melder_toLowerCase (*Reg_Input++) != test)
2933 MATCH_RETURN (0)
2934 }
2935 }
2936
2937 break;
2938
2939 case BOL: /* `^' (beginning of line anchor) */
2940 if (Reg_Input == Start_Of_String) {
2941 if (Prev_Is_BOL)
2942 break;
2943 } else if (* (Reg_Input - 1) == U'\n') {
2944 break;
2945 }
2946 MATCH_RETURN (0)
2947
2948 case EOL: /* `$' anchor matches end of line and end of string */
2949 if (*Reg_Input == U'\n' || (AT_END_OF_STRING (Reg_Input) && Succ_Is_EOL))
2950 break;
2951 MATCH_RETURN (0)
2952
2953 case BOWORD: /* `<' (beginning of word anchor) */
2954 /* Check to see if the current character is not a delimiter
2955 and the preceding character is. */
2956 {
2957 bool prev_is_delim;
2958 if (Reg_Input == Start_Of_String) {
2959 prev_is_delim = Prev_Is_Delim;
2960 } else {
2961 prev_is_delim = ! Melder_isWordCharacter (*(Reg_Input - 1));
2962 }
2963 if (prev_is_delim) {
2964 bool current_is_delim;
2965 if (AT_END_OF_STRING (Reg_Input)) {
2966 current_is_delim = Succ_Is_Delim;
2967 } else {
2968 current_is_delim = ! Melder_isWordCharacter (*Reg_Input);
2969 }
2970 if (! current_is_delim)
2971 break;
2972 }
2973 }
2974 MATCH_RETURN (0)
2975
2976 case EOWORD: /* `>' (end of word anchor) */
2977 /* Check to see if the current character is a delimiter
2978 and the preceding character is not. */
2979 {
2980 bool prev_is_delim;
2981 if (Reg_Input == Start_Of_String) {
2982 prev_is_delim = Prev_Is_Delim;
2983 } else {
2984 prev_is_delim = ! Melder_isWordCharacter (*(Reg_Input - 1));
2985 }
2986 if (! prev_is_delim) {
2987 bool current_is_delim;
2988 if (AT_END_OF_STRING (Reg_Input)) {
2989 current_is_delim = Succ_Is_Delim;
2990 } else {
2991 current_is_delim = ! Melder_isWordCharacter (*Reg_Input);
2992 }
2993 if (current_is_delim)
2994 break;
2995 }
2996 }
2997 MATCH_RETURN (0)
2998
2999 case NOT_BOUNDARY: { /* \B (NOT a word boundary) */
3000 bool prev_is_delim;
3001 bool current_is_delim;
3002 if (Reg_Input == Start_Of_String) {
3003 prev_is_delim = Prev_Is_Delim;
3004 } else {
3005 prev_is_delim = ! Melder_isWordCharacter (*(Reg_Input - 1));
3006 }
3007 if (AT_END_OF_STRING (Reg_Input)) {
3008 current_is_delim = Succ_Is_Delim;
3009 } else {
3010 current_is_delim = ! Melder_isWordCharacter (*Reg_Input);
3011 }
3012 if (! (prev_is_delim ^ current_is_delim))
3013 break;
3014 }
3015 MATCH_RETURN (0)
3016 case IS_DELIM: /* \y (A word delimiter character.) */
3017 if (! Melder_isWordCharacter (*Reg_Input) && ! AT_END_OF_STRING (Reg_Input)) {
3018 Reg_Input++; break;
3019 }
3020 MATCH_RETURN (0)
3021 case NOT_DELIM: /* \Y (NOT a word delimiter character.) */
3022 if (Melder_isWordCharacter (*Reg_Input) && ! AT_END_OF_STRING (Reg_Input)) {
3023 Reg_Input ++;
3024 break;
3025 }
3026 MATCH_RETURN (0)
3027 case WORD_CHAR: /* \w (word character; alpha-numeric or underscore) */
3028 if (Melder_isWordCharacter (*Reg_Input) && ! AT_END_OF_STRING (Reg_Input)) {
3029 Reg_Input ++;
3030 break;
3031 }
3032 MATCH_RETURN (0)
3033 case NOT_WORD_CHAR:/* \W (NOT a word character) */
3034 if (Melder_isWordCharacter (*Reg_Input) || *Reg_Input == U'\n' || AT_END_OF_STRING (Reg_Input))
3035 MATCH_RETURN (0)
3036 Reg_Input ++;
3037 break;
3038 case ANY: /* `.' (matches any character EXCEPT newline) */
3039 if (AT_END_OF_STRING (Reg_Input) || *Reg_Input == U'\n')
3040 MATCH_RETURN (0)
3041 Reg_Input ++;
3042 break;
3043 case EVERY: /* `.' (matches any character INCLUDING newline) */
3044 if (AT_END_OF_STRING (Reg_Input))
3045 MATCH_RETURN (0)
3046 Reg_Input ++;
3047 break;
3048 case DIGIT: /* \d; for ASCII, use [0-9] */
3049 if (! Melder_isDecimalNumber (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
3050 MATCH_RETURN (0)
3051 Reg_Input ++;
3052 break;
3053 case NOT_DIGIT: /* \D; for ASCII, use [^0-9] */
3054 if (Melder_isDecimalNumber (*Reg_Input) || *Reg_Input == U'\n' || AT_END_OF_STRING (Reg_Input))
3055 MATCH_RETURN (0)
3056 Reg_Input ++;
3057 break;
3058 case LETTER: /* \l; for ASCII, use [a-zA-Z] */
3059 if (! Melder_isLetter (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
3060 MATCH_RETURN (0)
3061 Reg_Input ++;
3062 break;
3063 case NOT_LETTER: /* \L; for ASCII, use [^a-zA-Z] */
3064 if (Melder_isLetter (*Reg_Input) || *Reg_Input == '\n' || AT_END_OF_STRING (Reg_Input))
3065 MATCH_RETURN (0)
3066 Reg_Input ++;
3067 break;
3068 case SPACE: /* \s; for ASCII, use [ \t] */
3069 if (! Melder_isHorizontalSpace (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
3070 MATCH_RETURN (0)
3071 Reg_Input ++;
3072 break;
3073 case SPACE_NL: /* \s; for ASCII, use [\n \t\r\f\v] */
3074 if (! Melder_isHorizontalOrVerticalSpace (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
3075 MATCH_RETURN (0)
3076 Reg_Input ++;
3077 break;
3078 case NOT_SPACE: /* \S; for ASCII, use [^\n \t\r\f\v] */
3079 if (Melder_isHorizontalOrVerticalSpace (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
3080 MATCH_RETURN (0)
3081 Reg_Input ++;
3082 break;
3083 case NOT_SPACE_NL: /* \S; for ASCII, use [^ \t\r\f\v] */
3084 if (Melder_isHorizontalSpace (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
3085 MATCH_RETURN (0)
3086 Reg_Input ++;
3087 break;
3088 case ANY_OF: /* [...] character class. */
3089 if (AT_END_OF_STRING (Reg_Input))
3090 MATCH_RETURN (0)
3091 /* Needed because strchr ()
3092 considers \0 as a member
3093 of the character set. */
3094
3095 if (! str32chr (OPERAND (scan), *Reg_Input))
3096 MATCH_RETURN (0)
3097 Reg_Input ++;
3098 break;
3099 case ANY_BUT: /* [^...] Negated character class-- does NOT normally
3100 match newline (\n added usually to operand at compile
3101 time.) */
3102
3103 if (AT_END_OF_STRING (Reg_Input)) {
3104 MATCH_RETURN (0); /* See comment for ANY_OF. */
3105 }
3106 if (str32chr (OPERAND (scan), *Reg_Input) != nullptr)
3107 MATCH_RETURN (0)
3108 Reg_Input ++;
3109 break;
3110 case NOTHING:
3111 case BACK:
3112 break;
3113 case STAR:
3114 case PLUS:
3115 case QUESTION:
3116 case BRACE:
3117
3118 case LAZY_STAR:
3119 case LAZY_PLUS:
3120 case LAZY_QUESTION:
3121 case LAZY_BRACE: {
3122 unsigned long num_matched = REG_ZERO;
3123 unsigned long min = ULONG_MAX, max = REG_ZERO;
3124 char32 *save;
3125 char32 next_char;
3126 char32 *next_op;
3127 int lazy = 0;
3128
3129 /* Lookahead (when possible) to avoid useless match attempts
3130 when we know what character comes next. */
3131
3132 if (GET_OP_CODE (next) == EXACTLY) {
3133 next_char = *OPERAND (next);
3134 } else {
3135 next_char = U'\0';/* i.e. Don't know what next character is. */
3136 }
3137
3138 next_op = OPERAND (scan);
3139
3140 switch (GET_OP_CODE (scan)) {
3141 case LAZY_STAR:
3142 lazy = 1;
3143 case STAR:
3144 min = REG_ZERO;
3145 max = ULONG_MAX;
3146 break;
3147
3148 case LAZY_PLUS:
3149 lazy = 1;
3150 case PLUS:
3151 min = REG_ONE;
3152 max = ULONG_MAX;
3153 break;
3154
3155 case LAZY_QUESTION:
3156 lazy = 1;
3157 case QUESTION:
3158 min = REG_ZERO;
3159 max = REG_ONE;
3160 break;
3161
3162 case LAZY_BRACE:
3163 lazy = 1;
3164 case BRACE:
3165 min = (unsigned long)
3166 GET_OFFSET (scan + NEXT_PTR_SIZE);
3167
3168 max = (unsigned long)
3169 GET_OFFSET (scan + (2 * NEXT_PTR_SIZE));
3170
3171 if (max <= REG_INFINITY) {
3172 max = ULONG_MAX;
3173 }
3174
3175 next_op = OPERAND (scan + (2 * NEXT_PTR_SIZE));
3176 }
3177
3178 save = Reg_Input;
3179
3180 if (lazy) {
3181 if (min > REG_ZERO) {
3182 num_matched = greedy (next_op, min);
3183 }
3184 } else {
3185 num_matched = greedy (next_op, max);
3186 }
3187
3188 while (min <= num_matched && num_matched <= max) {
3189 if (next_char == U'\0' || next_char == *Reg_Input) {
3190 if (match (next, NULL)) {
3191 MATCH_RETURN (1);
3192 }
3193
3194 CHECK_RECURSION_LIMIT
3195 }
3196
3197 /* Couldn't or didn't match. */
3198
3199 if (lazy) {
3200 if (!greedy (next_op, 1)) {
3201 MATCH_RETURN (0);
3202 }
3203
3204 num_matched ++; /* Inch forward. */
3205 } else if (num_matched > REG_ZERO) {
3206 num_matched --; /* Back up. */
3207 } else if (min == REG_ZERO && num_matched == REG_ZERO) {
3208 break;
3209 }
3210
3211 Reg_Input = save + num_matched;
3212 }
3213
3214 MATCH_RETURN (0);
3215 }
3216
3217 break;
3218
3219 case END:
3220 if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3221 Extent_Ptr_FW = Reg_Input;
3222 }
3223
3224 MATCH_RETURN (1) // Success!
3225
3226 break;
3227
3228 case INIT_COUNT:
3229 Brace->count [*OPERAND (scan)] = REG_ZERO;
3230
3231 break;
3232
3233 case INC_COUNT:
3234 Brace->count [*OPERAND (scan)] ++;
3235
3236 break;
3237
3238 case TEST_COUNT:
3239 if (Brace->count [*OPERAND (scan)] <
3240 (unsigned long) GET_OFFSET (scan + NEXT_PTR_SIZE + INDEX_SIZE)) {
3241
3242 next = scan + NODE_SIZE + INDEX_SIZE + NEXT_PTR_SIZE;
3243 }
3244
3245 break;
3246
3247 case BACK_REF:
3248 case BACK_REF_CI:
3249 /* case X_REGEX_BR: */
3250 /* case X_REGEX_BR_CI: *** IMPLEMENT LATER */
3251 {
3252 char32 *captured, *finish;
3253 int paren_no;
3254
3255 paren_no = (int) * OPERAND (scan);
3256
3257 /* if (GET_OP_CODE (scan) == X_REGEX_BR ||
3258 GET_OP_CODE (scan) == X_REGEX_BR_CI) {
3259
3260 if (Cross_Regex_Backref == NULL) MATCH_RETURN (0);
3261
3262 captured =
3263 (regularExp_CHAR *) Cross_Regex_Backref->startp [paren_no];
3264
3265 finish =
3266 (regularExp_CHAR *) Cross_Regex_Backref->endp [paren_no];
3267 } else { */
3268 captured = Back_Ref_Start [paren_no];
3269 finish = Back_Ref_End [paren_no];
3270 /* } */
3271
3272 if (captured && finish) {
3273 if (captured > finish) {
3274 MATCH_RETURN (0);
3275 }
3276
3277 if (GET_OP_CODE (scan) == BACK_REF_CI /* ||
3278 GET_OP_CODE (scan) == X_REGEX_BR_CI*/) {
3279
3280 while (captured < finish) {
3281 if (AT_END_OF_STRING (Reg_Input) || Melder_toLowerCase (*captured++) != Melder_toLowerCase (*Reg_Input++)) // TODO: should casefold
3282 MATCH_RETURN (0)
3283 }
3284 } else {
3285 while (captured < finish) {
3286 if (AT_END_OF_STRING (Reg_Input) || *captured++ != *Reg_Input++)
3287 MATCH_RETURN (0)
3288 }
3289 }
3290 break;
3291 } else {
3292 MATCH_RETURN (0)
3293 }
3294 }
3295
3296 case POS_AHEAD_OPEN:
3297 case NEG_AHEAD_OPEN:
3298 {
3299 char32 *save = Reg_Input;
3300
3301 /* Temporarily ignore the logical end of the string, to allow
3302 lookahead past the end. */
3303 const char32 *saved_end = End_Of_String;
3304 End_Of_String = NULL;
3305
3306 int answer = match (next, NULL); /* Does the look-ahead regex match? */
3307
3308 CHECK_RECURSION_LIMIT
3309
3310 if ( (GET_OP_CODE (scan) == POS_AHEAD_OPEN) ? answer : ! answer) {
3311 /* Remember the last (most to the right) character position
3312 that we consume in the input for a successful match. This
3313 is info that may be needed should an attempt be made to
3314 match the exact same text at the exact same place. Since
3315 look-aheads backtrack, a regex with a trailing look-ahead
3316 may need more text than it matches to accomplish a
3317 re-match. */
3318
3319 if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3320 Extent_Ptr_FW = Reg_Input;
3321 }
3322
3323 Reg_Input = save; /* Backtrack to look-ahead start. */
3324 End_Of_String = saved_end; /* Restore logical end. */
3325
3326 /* Jump to the node just after the (?=...) or (?!...)
3327 Construct. */
3328
3329 next = next_ptr (OPERAND (scan)); /* Skip 1st branch */
3330 /* Skip the chain of branches inside the look-ahead */
3331 while (GET_OP_CODE (next) == BRANCH) {
3332 next = next_ptr (next);
3333 }
3334 next = next_ptr (next); /* Skip the LOOK_AHEAD_CLOSE */
3335 } else {
3336 Reg_Input = save; /* Backtrack to look-ahead start. */
3337 End_Of_String = saved_end; /* Restore logical end. */
3338
3339 MATCH_RETURN (0)
3340 }
3341 }
3342
3343 break;
3344
3345 case POS_BEHIND_OPEN:
3346 case NEG_BEHIND_OPEN: {
3347 char32 *save;
3348 int answer;
3349 int offset, upper;
3350 int lower;
3351 int found = 0;
3352 const char32 *saved_end;
3353
3354 save = Reg_Input;
3355 saved_end = End_Of_String;
3356
3357 /* Prevent overshoot (greedy matching could end past the
3358 current position) by tightening the matching boundary.
3359 Lookahead inside lookbehind can still cross that boundary. */
3360 End_Of_String = Reg_Input;
3361
3362 lower = GET_LOWER (scan);
3363 upper = GET_UPPER (scan);
3364
3365 /* Start with the shortest match first. This is the most
3366 efficient direction in general.
3367 Note! Negative look behind is _very_ tricky when the length
3368 is not constant: we have to make sure the expression doesn't
3369 match for _any_ of the starting positions. */
3370 for (offset = lower; offset <= upper; ++offset) {
3371 Reg_Input = save - offset;
3372
3373 if (Reg_Input < Look_Behind_To) {
3374 /* No need to look any further */
3375 break;
3376 }
3377
3378 answer = match (next, NULL); /* Does the look-behind regex match? */
3379
3380 CHECK_RECURSION_LIMIT
3381
3382 /* The match must have ended at the current position;
3383 otherwise it is invalid */
3384 if (answer && Reg_Input == save) {
3385 /* It matched, exactly far enough */
3386 found = 1;
3387
3388 /* Remember the last (most to the left) character position
3389 that we consume in the input for a successful match.
3390 This is info that may be needed should an attempt be
3391 made to match the exact same text at the exact same
3392 place. Since look-behind backtracks, a regex with a
3393 leading look-behind may need more text than it matches
3394 to accomplish a re-match. */
3395
3396 if (Extent_Ptr_BW == NULL ||
3397 (Extent_Ptr_BW - (save - offset)) > 0) {
3398 Extent_Ptr_BW = save - offset;
3399 }
3400
3401 break;
3402 }
3403 }
3404
3405 /* Always restore the position and the logical string end. */
3406 Reg_Input = save;
3407 End_Of_String = saved_end;
3408
3409 if ( (GET_OP_CODE (scan) == POS_BEHIND_OPEN) ? found : !found) {
3410 /* The look-behind matches, so we must jump to the next
3411 node. The look-behind node is followed by a chain of
3412 branches (contents of the look-behind expression), and
3413 terminated by a look-behind-close node. */
3414 next = next_ptr (OPERAND (scan) + LENGTH_SIZE); /* 1st branch */
3415 /* Skip the chained branches inside the look-ahead */
3416 while (GET_OP_CODE (next) == BRANCH) {
3417 next = next_ptr (next);
3418 }
3419 next = next_ptr (next); /* Skip LOOK_BEHIND_CLOSE */
3420 } else {
3421 /* Not a match */
3422 MATCH_RETURN (0)
3423 }
3424 }
3425 break;
3426
3427 case LOOK_AHEAD_CLOSE:
3428 case LOOK_BEHIND_CLOSE:
3429 MATCH_RETURN (1); /* We have reached the end of the look-ahead or
3430 look-behind which implies that we matched it,
3431 so return `true`. */
3432 default:
3433 if ( (GET_OP_CODE (scan) > OPEN) &&
3434 (GET_OP_CODE (scan) < OPEN + NSUBEXP)) {
3435
3436 int no;
3437 char32 *save;
3438
3439 no = GET_OP_CODE (scan) - OPEN;
3440 save = Reg_Input;
3441
3442 if (no < 10) {
3443 Back_Ref_Start [no] = save;
3444 Back_Ref_End [no] = NULL;
3445 }
3446
3447 if (match (next, NULL)) {
3448 /* Do not set `Start_Ptr_Ptr' if some later invocation (think
3449 recursion) of the same parentheses already has. */
3450
3451 if (Start_Ptr_Ptr [no] == NULL) {
3452 Start_Ptr_Ptr [no] = save;
3453 }
3454
3455 MATCH_RETURN (1)
3456 } else {
3457 MATCH_RETURN (0)
3458 }
3459 } else if ( (GET_OP_CODE (scan) > CLOSE) &&
3460 (GET_OP_CODE (scan) < CLOSE + NSUBEXP)) {
3461
3462 int no;
3463 char32 *save;
3464
3465 no = GET_OP_CODE (scan) - CLOSE;
3466 save = Reg_Input;
3467
3468 if (no < 10) {
3469 Back_Ref_End [no] = save;
3470 }
3471
3472 if (match (next, NULL)) {
3473 /* Do not set `End_Ptr_Ptr' if some later invocation of the
3474 same parentheses already has. */
3475
3476 if (End_Ptr_Ptr [no] == NULL) {
3477 End_Ptr_Ptr [no] = save;
3478 }
3479 MATCH_RETURN (1)
3480 } else {
3481 MATCH_RETURN (0)
3482 }
3483 } else {
3484 reg_error (U"memory corruption, `match\'");
3485 MATCH_RETURN (0)
3486 }
3487
3488 break;
3489 }
3490
3491 scan = next;
3492 }
3493
3494 /* We get here only if there's trouble -- normally "case END" is
3495 the terminating point. */
3496
3497 reg_error (U"corrupted pointers, `match\'");
3498 MATCH_RETURN (0)
3499 }
3500
3501 /*----------------------------------------------------------------------*
3502 * greedy
3503 *
3504 * Repeatedly match something simple up to "max" times. If max <= 0
3505 * then match as much as possible (max = infinity). Uses unsigned long
3506 * variables to maximize the amount of text matchable for unbounded
3507 * qualifiers like '*' and '+'. This will allow at least 4,294,967,295
3508 * matches (4 Gig!) for an ANSI C compliant compiler. If you are
3509 * applying a regex to something bigger than that, you shouldn't be
3510 * using NEdit!
3511 *
3512 * Returns the actual number of matches.
3513 *----------------------------------------------------------------------*/
3514
greedy(char32 * p,long max)3515 static unsigned long greedy (char32 *p, long max) {
3516
3517 char32 *input_str;
3518 char32 *operand;
3519 unsigned long count = REG_ZERO;
3520 unsigned long max_cmp;
3521
3522 input_str = Reg_Input;
3523 operand = OPERAND (p); /* Literal char or start of class characters. */
3524 max_cmp = (max > 0) ? (unsigned long) max : ULONG_MAX;
3525
3526 switch (GET_OP_CODE (p))
3527 {
3528 case ANY: /* Race to the end of the line or string. Dot DOESN'T match newline. */
3529 while (count < max_cmp && *input_str != '\n' && ! AT_END_OF_STRING (input_str)) {
3530 count ++;
3531 input_str ++;
3532 }
3533 break;
3534
3535 case EVERY: /* Race to the end of the line or string. Dot DOES match newline. */
3536 while (count < max_cmp && ! AT_END_OF_STRING (input_str)) {
3537 count ++;
3538 input_str ++;
3539 }
3540 break;
3541
3542 case EXACTLY: /* Count occurrences of single character operand. */
3543 while (count < max_cmp && *operand == *input_str && ! AT_END_OF_STRING (input_str)) {
3544 count ++;
3545 input_str ++;
3546 }
3547 break;
3548
3549 case SIMILAR: /* Case-insensitive version of EXACTLY */
3550 while (count < max_cmp && *operand == Melder_toLowerCase (*input_str) && ! AT_END_OF_STRING (input_str)) {
3551 count ++;
3552 input_str ++;
3553 }
3554 break;
3555
3556 case ANY_OF: /* [...] character class. */
3557 while (count < max_cmp && str32chr (operand, *input_str) != NULL && ! AT_END_OF_STRING (input_str)) {
3558 count ++;
3559 input_str ++;
3560 }
3561 break;
3562
3563 case ANY_BUT: /* [^...] Negated character class- does NOT normally
3564 match newline (\n added usually to operand at compile
3565 time.) */
3566
3567 while (count < max_cmp && str32chr (operand, *input_str) == NULL && ! AT_END_OF_STRING (input_str)) {
3568 count ++;
3569 input_str ++;
3570 }
3571 break;
3572
3573 case IS_DELIM: /* \y (not a word delimiter char)
3574 NOTE: '\n' and '\0' are always word delimiters. */
3575
3576 while (count < max_cmp && ! Melder_isWordCharacter (*input_str) && ! AT_END_OF_STRING (input_str)) {
3577 count ++;
3578 input_str ++;
3579 }
3580 break;
3581
3582 case NOT_DELIM: /* \Y (not a word delimiter char)
3583 NOTE: '\n' and '\0' are always word delimiters. */
3584
3585 while (count < max_cmp && Melder_isWordCharacter (*input_str) && ! AT_END_OF_STRING (input_str)) {
3586 count ++;
3587 input_str ++;
3588 }
3589 break;
3590
3591 case WORD_CHAR: /* \w (a word character: letter, number, mark or connector punctuation) */
3592 while (count < max_cmp && Melder_isWordCharacter (*input_str) && ! AT_END_OF_STRING (input_str)) {
3593 count ++;
3594 input_str ++;
3595 }
3596 break;
3597
3598 case NOT_WORD_CHAR: /* \W (NOT a word character) */
3599 while (count < max_cmp && ! Melder_isWordCharacter (*input_str) && *input_str != U'\n' && ! AT_END_OF_STRING (input_str)) {
3600 count ++;
3601 input_str ++;
3602 }
3603 break;
3604
3605 case DIGIT: /* for ASCII, use [0-9] */
3606 while (count < max_cmp && Melder_isDecimalNumber (*input_str) && ! AT_END_OF_STRING (input_str)) {
3607 count ++;
3608 input_str ++;
3609 }
3610 break;
3611
3612 case NOT_DIGIT: /* for ASCII, use [^0-9] */
3613 while (count < max_cmp && ! Melder_isDecimalNumber (*input_str) && *input_str != U'\n' && ! AT_END_OF_STRING (input_str)) {
3614 count ++;
3615 input_str ++;
3616 }
3617 break;
3618
3619 case SPACE: /* for ASCII, use [ \t\r\f\v]-- doesn't match newline. */
3620 while (count < max_cmp && Melder_isHorizontalSpace (*input_str) && ! AT_END_OF_STRING (input_str)) {
3621 count ++;
3622 input_str ++;
3623 }
3624 break;
3625
3626 case SPACE_NL: /* for ASCII, use [\n \t\r\f\v]-- matches newline. */
3627 while (count < max_cmp && Melder_isHorizontalOrVerticalSpace (*input_str) && ! AT_END_OF_STRING (input_str)) {
3628 count ++;
3629 input_str ++;
3630 }
3631 break;
3632
3633 case NOT_SPACE: /* for ASCII, use [^\n \t\r\f\v]-- doesn't match newline. */
3634 while (count < max_cmp && ! Melder_isHorizontalOrVerticalSpace (*input_str) && ! AT_END_OF_STRING (input_str)) {
3635 count ++;
3636 input_str ++;
3637 }
3638 break;
3639
3640 case NOT_SPACE_NL: /* for ASCII, use [^ \t\r\f\v]-- matches newline. */
3641 while (count < max_cmp && ! Melder_isHorizontalSpace (*input_str) && ! AT_END_OF_STRING (input_str)) {
3642 count ++;
3643 input_str ++;
3644 }
3645 break;
3646
3647 case LETTER: /* for ASCII, use [a-zA-Z] */
3648 while (count < max_cmp && Melder_isLetter (*input_str) && ! AT_END_OF_STRING (input_str)) {
3649 count ++;
3650 input_str ++;
3651 }
3652 break;
3653
3654 case NOT_LETTER: /* for ASCII, use [^a-zA-Z] */
3655 while (count < max_cmp && ! Melder_isLetter (*input_str) && *input_str != '\n' && ! AT_END_OF_STRING (input_str)) {
3656 count ++;
3657 input_str ++;
3658 }
3659 break;
3660
3661 default:
3662 /* Called inappropriately. Only atoms that are SIMPLE should
3663 generate a call to greedy. The above cases should cover
3664 all the atoms that are SIMPLE. */
3665
3666 reg_error (U"internal error #10 `greedy\'");
3667 count = 0U; /* Best we can do. */
3668 }
3669
3670 /* Point to character just after last matched character. */
3671
3672 Reg_Input = input_str;
3673
3674 return (count);
3675 }
3676
3677 /*----------------------------------------------------------------------*
3678 * next_ptr - compute the address of a node's "NEXT" pointer.
3679 * Note: a simplified inline version is available via the NEXT_PTR() macro,
3680 * but that one is only to be used at time-critical places (see the
3681 * description of the macro).
3682 *----------------------------------------------------------------------*/
3683
next_ptr(char32 * ptr)3684 static char32 *next_ptr (char32 *ptr) {
3685 if (ptr == & Compute_Size)
3686 return nullptr;
3687 int offset = GET_OFFSET (ptr);
3688 if (offset == 0)
3689 return nullptr;
3690 if (GET_OP_CODE (ptr) == BACK) {
3691 return ptr - offset;
3692 } else {
3693 return ptr + offset;
3694 }
3695 }
3696
3697 /*
3698 ** SubstituteRE - Perform substitutions after a `regexp' match.
3699 **
3700 ** This function cleanly shortens results of more than max length to max.
3701 ** To give the caller a chance to react to this the function returns False
3702 ** on any error. The substitution will still be executed.
3703 */
SubstituteRE(const regexp * prog,conststring32 source,mutablestring32 dest,int max,int * errorType)3704 int SubstituteRE (const regexp *prog, conststring32 source, mutablestring32 dest, int max, int *errorType) {
3705
3706 const char32 *src;
3707 const char32 *src_alias;
3708 char32 *dst;
3709 char32 c;
3710 char32 test;
3711 int paren_no;
3712 int len;
3713 char32 chgcase;
3714 bool anyWarnings = false;
3715
3716 *errorType = 0;
3717 if (! prog || ! source || ! dest) {
3718 reg_error (U"NULL parm to `SubstituteRE\'");
3719 *errorType = 2;
3720 return false;
3721 }
3722
3723 if (U_CHAR_AT (prog->program) != MAGIC) {
3724 *errorType = 3;
3725 reg_error (U"damaged regexp passed to `SubstituteRE\'");
3726 return false;
3727 }
3728
3729 src = source;
3730 dst = dest;
3731
3732 while ( (c = *src++) != U'\0') {
3733 chgcase = U'\0';
3734 paren_no = -1;
3735
3736 if (c == U'\\') {
3737 /* Process any case-altering tokens, i.e \u, \U, \l, \L. */
3738
3739 if (*src == U'u' || *src == U'U' || *src == U'l' || *src == U'L') {
3740 chgcase = *src;
3741 src ++;
3742 c = *src ++;
3743 if (c == U'\0')
3744 break;
3745 }
3746 }
3747
3748 if (c == U'&') {
3749 paren_no = 0;
3750 } else if (c == U'\\') {
3751 /* Can not pass register variable `&src' to function `numeric_escape'
3752 so make a non-register copy that we can take the address of. */
3753
3754 src_alias = src;
3755
3756 if (U'1' <= *src && *src <= U'9') {
3757 paren_no = (int) * src++ - (int) '0';
3758
3759 } else if ( (test = literal_escape (*src)) != U'\0') {
3760 c = test; src++;
3761
3762 } else if ( (test = numeric_escape (*src, (char32 **) &src_alias)) != U'\0') {
3763 c = test;
3764 src = src_alias;
3765 src ++;
3766
3767 /* NOTE: if an octal escape for zero is attempted (e.g. \000), it
3768 will be treated as a literal string. */
3769 } else if (*src == U'\0') {
3770 /* If '\' is the last character of the replacement string, it is
3771 interpreted as a literal backslash. */
3772
3773 c = U'\\';
3774 } else {
3775 c = *src ++; /* Allow any escape sequence (This is */
3776 } /* INCONSISTENT with the `CompileRE' */
3777 } /* mind set of issuing an error! */
3778
3779 if (paren_no < 0) { /* Ordinary character. */
3780 if ( (dst - dest) >= (max - 1)) {
3781 *errorType = 1;
3782 reg_error (U"replacing expression in `SubstituteRE\' too long; truncating");
3783 anyWarnings = true;
3784 break;
3785 } else {
3786 *dst++ = c;
3787 }
3788 } else if (prog->startp [paren_no] != NULL &&
3789 prog->endp [paren_no] != NULL) {
3790
3791 len = prog->endp [paren_no] - prog->startp [paren_no];
3792
3793 if ( (dst + len - dest) >= max - 1) {
3794 *errorType = 1;
3795 reg_error (U"replacing expression in `SubstituteRE\' too long; truncating");
3796 anyWarnings = true;
3797 len = max - (dst - dest) - 1;
3798 }
3799
3800 (void) str32ncpy (dst, prog->startp [paren_no], len);
3801
3802 if (chgcase != U'\0') {
3803 adjustcase (dst, len, chgcase);
3804 }
3805
3806 dst += len;
3807
3808 if (len != 0 && * (dst - 1) == U'\0') { /* strncpy hit NUL. */
3809 *errorType = 3;
3810 reg_error (U"damaged match string in `SubstituteRE\'");
3811 anyWarnings = true;
3812 }
3813 }
3814 }
3815
3816 *dst = U'\0';
3817
3818 return ! anyWarnings;
3819 }
3820
adjustcase(char32 * str,int len,char32 chgcase)3821 static void adjustcase (char32 *str, int len, char32 chgcase) {
3822
3823 char32 *string = str;
3824
3825 /* The tokens \u and \l only modify the first character while the tokens
3826 \U and \L modify the entire string. */
3827
3828 if (iswlower ((int) chgcase) && len > 0)
3829 len = 1;
3830 switch (chgcase) {
3831 case U'u':
3832 case U'U':
3833 for (integer i = 0; i < len; i ++)
3834 * (string + i) = Melder_toUpperCase (* (string + i));
3835 break;
3836 case U'l':
3837 case U'L':
3838 for (integer i = 0; i < len; i ++)
3839 * (string + i) = Melder_toLowerCase (* (string + i));
3840 break;
3841 }
3842 }
3843
3844 /*----------------------------------------------------------------------*
3845 * reg_error
3846 *----------------------------------------------------------------------*/
3847
reg_error(conststring32 str)3848 static void reg_error (conststring32 str) {
3849 Melder_appendError (U"Internal error processing regular expression: ", str);
3850 }
3851
EnableCountingQuantifier(int is_enabled)3852 void EnableCountingQuantifier (int is_enabled) {
3853 Enable_Counting_Quantifier = is_enabled;
3854 }
3855
3856 /* End of file regularExp.cpp */
3857