1 /*------------------------------------------------------------------------*
2  * static const char CVSID[] = "$Id: regularExp.c,v 1.30 2006/08/13 21:47:45 yooden Exp $";
3  * `CompileRE', `ExecRE', and `substituteRE' -- regular expression parsing
4  *
5  * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp' and
6  * `regexec' code adapted for NEdit.
7  *
8  * .-------------------------------------------------------------------.
9  * | ORIGINAL COPYRIGHT NOTICE:                                        |
10  * |                                                                   |
11  * | Copyright (c) 1986 by University of Toronto.                      |
12  * | Written by Henry Spencer.  Not derived from licensed software.    |
13  * |                                                                   |
14  * | Permission is granted to anyone to use this software for any      |
15  * | purpose on any computer system, and to redistribute it freely,    |
16  * | subject to the following restrictions:                            |
17  * |                                                                   |
18  * | 1. The author is not responsible for the consequences of use of   |
19  * |      this software, no matter how awful, even if they arise       |
20  * |      from defects in it.                                          |
21  * |                                                                   |
22  * | 2. The origin of this software must not be misrepresented, either |
23  * |      by explicit claim or by omission.                            |
24  * |                                                                   |
25  * | 3. Altered versions should be plainly marked as such, and must not  |
26  * |      be misrepresented as being the original software.            |
27  * `-------------------------------------------------------------------'
28  *
29  * This is free software; you can redistribute it and/or modify it under the
30  * terms of the GNU General Public License as published by the Free Software
31  * Foundation; either version 2 of the License, or (at your option) any later
32  * version. In addition, you may distribute version of this program linked to
33  * Motif or Open Motif. See README for details.
34  *
35  * This software is distributed in the hope that it will be useful, but WITHOUT
36  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
37  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
38  * for more details.
39  *
40  * You should have received a copy of the GNU General Public License along with
41  * software; if not, write to the Free Software Foundation, Inc., 59 Temple
42  * Place, Suite 330, Boston, MA  02111-1307 USA
43  *
44  *
45  * BEWARE that some of this code is subtly aware of the way operator
46  * precedence is structured in regular expressions.  Serious changes in
47  * regular-expression syntax might require a total rethink.
48  *   -- Henry Spencer
49  * (Yes, it did!) -- Christopher Conrad, Dec. 1999
50  *
51  * January, 1994, Mark Edel
52  *    Consolidated files, changed names of external functions to avoid
53  *    potential conflicts with native regcomp and regexec functions, changed
54  *    error reporting to NEdit form, added multi-line and reverse searching,
55  *    and added \n \t \u \U \l \L.
56  *
57  * June, 1996, Mark Edel
58  *    Bug in NEXT macro, didn't work for expressions which compiled to over
59  *    256 bytes.
60  *
61  * December, 1999, Christopher Conrad
62  *    Reformatted code for readability, improved error output, added octal and
63  *    hexadecimal escapes, added back-references (\1-\9), added positive look
64  *    ahead: (?=...), added negative lookahead: (?!...),  added non-capturing
65  *    parentheses: (?:...), added case insensitive constructs (?i...) and
66  *    (?I...), added newline matching constructs (?n...) and (?N...), added
67  *    regex comments: (?#...), added shortcut escapes: \d\D\l\L\s\S\w\W\y\Y.
68  *    Added "not a word boundary" anchor \B.
69  *
70  * July, 2002, Eddy De Greef
71  *    Added look behind, both positive (?<=...) and negative (?<!...) for
72  *    bounded-length patterns.
73  *
74  * November, 2004, Eddy De Greef
75  *    Added constrained matching (allowing specification of the logical end
76  *    of the string iso. matching till \0), and fixed several (probably
77  *    very old) string overrun errors that could easily result in crashes,
78  *    especially in client code.
79  *
80  * June, 2008, David Weenink
81  *    Adaptation for use in the Praat program.
82  *    Changed reg_error to call Melder_error.
83  *    Extra parameter for SubstituteRE to allow error differentiation
84  *    (not enough memory can be repaired upstream).
85  *
86  * April, 2018, Paul Boersma
87  *    Added Unicode-based definitions of letters, digits, spaces and word boundaries.
88  *    Thereby removed shortcut escapes from classes.
89  */
90 
91 #include <limits.h>
92 #include "melder.h"
93 
94 /* The first byte of the regexp internal `program' is a magic number to help
95    guard against corrupted data; the compiled regex code really begins in the
96    second byte. */
97 
98 #define MAGIC   0234
99 
100 /* The "internal use only" fields in `regexp.h' are present to pass info from
101  * `CompileRE' to `ExecRE' which permits the execute phase to run lots faster on
102  * simple cases.  They are:
103  *
104  *   match_start     Character that must begin a match; '\0' if none obvious.
105  *   anchor          Is the match anchored (at beginning-of-line only)?
106  *
107  * `match_start' and `anchor' permit very fast decisions on suitable starting
108  * points for a match, considerably reducing the work done by ExecRE. */
109 
110 /* STRUCTURE FOR A REGULAR EXPRESSION (regex) `PROGRAM'.
111  *
112  * This is essentially a linear encoding of a nondeterministic finite-state
113  * machine or NFA (aka syntax charts or `railroad normal form' in parsing
114  * technology).  Each node is an opcode plus a NEXT pointer, possibly
115  * followed by operands.  NEXT pointers of all nodes except BRANCH implement
116  * concatenation; a NEXT pointer with a BRANCH on both ends of it is
117  * connecting two alternatives.  (Here we have one of the subtle syntax
118  * dependencies:  an individual BRANCH (as opposed to a collection of them) is
119  * never concatenated with anything because of operator precedence.)  The
120  * operand of some types of nodes is a literal string; for others, it is a node
121  * leading into a sub-FSM.  In particular, the operand of a BRANCH node is the
122  * first node of the branch. (NB this is _NOT_ a tree structure:  the tail of
123  * the branch connects to the thing following the set of BRANCHes.)
124  *
125  * The opcodes are: */
126 
127 /* DEFINITION            VALUE  MEANING */
128 
129 #define END                1  /* End of program. */
130 
131 /* Zero width positional assertions. */
132 
133 #define BOL                2  /* Match position at beginning of line. */
134 #define EOL                3  /* Match position at end of line. */
135 #define BOWORD             4  /* Match "" representing word delimiter or BOL */
136 #define EOWORD             5  /* Match "" representing word delimiter or EOL */
137 #define NOT_BOUNDARY       6  /* Not word boundary (\B, opposite of < and >) */
138 
139 /* Op codes with null terminated string operands. */
140 
141 #define EXACTLY            7  /* Match this string. */
142 #define SIMILAR            8  /* Match this case insensitive string */
143 #define ANY_OF             9  /* Match any character in the set. */
144 #define ANY_BUT           10  /* Match any character not in the set. */
145 
146 /* Op codes to match any character. */
147 
148 #define ANY               11  /* Match any one character (implements '.') */
149 #define EVERY             12  /* Same as ANY but matches newline. */
150 
151 /* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
152 
153 #define DIGIT             13  /* Match any digit, i.e. [0123456789] */
154 #define NOT_DIGIT         14  /* Match any non-digit, i.e. [^0123456789] */
155 #define LETTER            15  /* Match any letter character [a-zA-Z] */
156 #define NOT_LETTER        16  /* Match any non-letter character [^a-zA-Z] */
157 #define SPACE             17  /* Match any whitespace character EXCEPT \n */
158 #define SPACE_NL          18  /* Match any whitespace character INCLUDING \n */
159 #define NOT_SPACE         19  /* Match any non-whitespace character */
160 #define NOT_SPACE_NL      20  /* Same as NOT_SPACE but matches newline. */
161 #define WORD_CHAR         21  /* Match any word character [a-zA-Z0-9_] */
162 #define NOT_WORD_CHAR     22  /* Match any non-word character [^a-zA-Z0-9_] */
163 #define IS_DELIM          23  /* Match any character that's a word delimiter */
164 #define NOT_DELIM         24  /* Match any character NOT a word delimiter */
165 
166 /* Quantifier nodes. (Only applied to SIMPLE nodes.  Quantifiers applied to non
167    SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
168 
169 #define STAR              25  /* Match this (simple) thing 0 or more times. */
170 #define LAZY_STAR         26  /* Minimal matching STAR */
171 #define QUESTION          27  /* Match this (simple) thing 0 or 1 times. */
172 #define LAZY_QUESTION     28  /* Minimal matching QUESTION */
173 #define PLUS              29  /* Match this (simple) thing 1 or more times. */
174 #define LAZY_PLUS         30  /* Minimal matching PLUS */
175 #define BRACE             31  /* Match this (simple) thing m to n times. */
176 #define LAZY_BRACE        32  /* Minimal matching BRACE */
177 
178 /* Nodes used to build complex constructs. */
179 
180 #define NOTHING           33  /* Match empty string (always matches) */
181 #define BRANCH            34  /* Match this alternative, or the next... */
182 #define BACK              35  /* Always matches, NEXT ptr points backward. */
183 #define INIT_COUNT        36  /* Initialize {m,n} counter to zero */
184 #define INC_COUNT         37  /* Increment {m,n} counter by one */
185 #define TEST_COUNT        38  /* Test {m,n} counter against operand */
186 
187 /* Back Reference nodes. */
188 
189 #define BACK_REF          39  /* Match latest matched parenthesized text */
190 #define BACK_REF_CI       40  /* Case insensitive version of BACK_REF */
191 #define X_REGEX_BR        41  /* Cross-Regex Back-Ref for syntax highlighting */
192 #define X_REGEX_BR_CI     42  /* Case insensitive version of X_REGEX_BR_CI */
193 
194 /* Various nodes used to implement parenthetical constructs. */
195 
196 #define POS_AHEAD_OPEN    43  /* Begin positive look ahead */
197 #define NEG_AHEAD_OPEN    44  /* Begin negative look ahead */
198 #define LOOK_AHEAD_CLOSE  45  /* End positive or negative look ahead */
199 
200 #define POS_BEHIND_OPEN   46  /* Begin positive look behind */
201 #define NEG_BEHIND_OPEN   47  /* Begin negative look behind */
202 #define LOOK_BEHIND_CLOSE 48  /* Close look behind */
203 
204 #define OPEN              49  /* Open for capturing parentheses. */
205 
206 /*  OPEN+1 is number 1, etc. */
207 #define CLOSE       (OPEN + NSUBEXP)  /* Close for capturing parentheses. */
208 
209 #define LAST_PAREN  (CLOSE + NSUBEXP)
210 
211 #if (LAST_PAREN > UCHAR_MAX)
212 #error "Too many parentheses for storage in an unsigned char (LAST_PAREN too big.)"
213 #endif
214 
215 /* The next_ptr () function can consume up to 30% of the time during matching
216    because it is called an immense number of times (an average of 25
217    next_ptr() calls per match() call was witnessed for Perl syntax
218    highlighting). Therefore it is well worth removing some of the function
219    call overhead by selectively inlining the next_ptr() calls. Moreover,
220    the inlined code can be simplified for matching because one of the tests,
221    only necessary during compilation, can be left out.
222    The net result of using this inlined version at two critical places is
223    a 25% speedup (again, witnesses on Perl syntax highlighting). */
224 
225 #define NEXT_PTR(in_ptr, out_ptr)\
226    next_ptr_offset = GET_OFFSET (in_ptr);\
227    if (next_ptr_offset == 0)\
228        out_ptr = NULL;\
229    else {\
230        if (GET_OP_CODE (in_ptr) == BACK)\
231            out_ptr = in_ptr - next_ptr_offset;\
232        else \
233            out_ptr = in_ptr + next_ptr_offset;\
234    }
235 
236 /* OPCODE NOTES:
237    ------------
238 
239    All nodes consist of an 8 bit op code followed by 2 bytes that make up a 16
240    bit NEXT pointer.  Some nodes have a null terminated character string operand
241    following the NEXT pointer.  Other nodes may have an 8 bit index operand.
242    The TEST_COUNT node has an index operand followed by a 16 bit test value.
243    The BRACE and LAZY_BRACE nodes have two 16 bit values for min and max but no
244    index value.
245 
246    SIMILAR
247       Operand(s): null terminated string
248 
249       Implements a case insensitive match of a string.  Mostly intended for use
250       in syntax highlighting patterns for keywords of languages like FORTRAN
251       and Ada that are case insensitive.  The regex text in this node is
252       converted to lower case during regex compile.
253 
254    DIGIT, NOT_DIGIT, LETTER, NOT_LETTER, SPACE, NOT_SPACE, WORD_CHAR,
255    NOT_WORD_CHAR
256       Operand(s): None
257 
258       Implements shortcut escapes \d, \D, \l, \L, \s, \S, \w, \W.
259       The Unicode-aware functions Melder_isDecimalNumber(), Melder_isLetter(),
260       Melder_isHorizontalSpace(), and Melder_isWordCharacter are
261       used to implement these in the hopes of increasing portability.
262 
263    NOT_BOUNDARY
264       Operand(s): None
265 
266       Implements \B as a zero width assertion that the current character is
267       NOT on a word boundary. Word boundaries are defined to be the position
268       between two characters where one of those characters is
269       a Unicode word character and the other character is not.
270 
271    IS_DELIM
272       Operand(s): None
273 
274       Implements \y as any character that is a Unicode word delimiter.
275 
276    NOT_DELIM
277       Operand(s): None
278 
279       Implements \Y as any character that is NOT a Unicode word delimiter.
280 
281    STAR, PLUS, QUESTION, and complex '*', '+', and '?'
282       Operand(s): None (Note: NEXT pointer is usually zero.  The code that
283                         processes this node skips over it.)
284 
285       Complex (parenthesized) versions implemented as circular BRANCH
286       structures using BACK.  SIMPLE versions (one character per match) are
287       implemented separately for speed and to minimize recursion.
288 
289    BRACE, LAZY_BRACE
290       Operand(s): minimum value (2 bytes), maximum value (2 bytes)
291 
292       Implements the {m,n} construct for atoms that are SIMPLE.
293 
294    BRANCH
295       Operand(s): None
296 
297       The set of branches constituting a single choice are hooked together
298       with their NEXT pointers, since precedence prevents anything being
299       concatenated to any individual branch.  The NEXT pointer of the last
300       BRANCH in a choice points to the thing following the whole choice.  This
301       is also where the final NEXT pointer of each individual branch points;
302       each branch starts with the operand node of a BRANCH node.
303 
304    BACK
305       Operand(s): None
306 
307       Normal NEXT pointers all implicitly point forward.  Back implicitly
308       points backward.  BACK exists to make loop structures possible.
309 
310    INIT_COUNT
311       Operand(s): index (1 byte)
312 
313       Initializes the count array element referenced by the index operand.
314       This node is used to build general (i.e. parenthesized) {m,n} constructs.
315 
316    INC_COUNT
317       Operand(s): index (1 byte)
318 
319       Increments the count array element referenced by the index operand.
320       This node is used to build general (i.e. parenthesized) {m,n} constructs.
321 
322    TEST_COUNT
323       Operand(s): index (1 byte), test value (2 bytes)
324 
325       Tests the current value of the count array element specified by the
326       index operand against the test value.  If the current value is less than
327       the test value, control passes to the node after that TEST_COUNT node.
328       Otherwise control passes to the node referenced by the NEXT pointer for
329       the TEST_COUNT node.  This node is used to build general (i.e.
330       parenthesized) {m,n} constructs.
331 
332    BACK_REF, BACK_REF_CI
333       Operand(s): index (1 byte, value 1-9)
334 
335       Implements back references.  This node will attempt to match whatever text
336       was most recently captured by the index'th set of parentheses.
337       BACK_REF_CI is case insensitive version.
338 
339    X_REGEX_BR, X_REGEX_BR_CI
340       (NOT IMPLEMENTED YET)
341 
342       Operand(s): index (1 byte, value 1-9)
343 
344       Implements back references into a previously matched but separate regular
345       expression.  This is used by syntax highlighting patterns. This node will
346       attempt to match whatever text was most captured by the index'th set of
347       parentheses of the separate regex passed to ExecRE. X_REGEX_BR_CI is case
348       insensitive version.
349 
350    POS_AHEAD_OPEN, NEG_AHEAD_OPEN, LOOK_AHEAD_CLOSE
351 
352       Operand(s): None
353 
354       Implements positive and negative look ahead.  Look ahead is an assertion
355       that something is either there or not there.   Once this is determined the
356       regex engine backtracks to where it was just before the look ahead was
357       encountered, i.e. look ahead is a zero width assertion.
358 
359    POS_BEHIND_OPEN, NEG_BEHIND_OPEN, LOOK_BEHIND_CLOSE
360 
361       Operand(s): 2x2 bytes for OPEN (match boundaries), None for CLOSE
362 
363       Implements positive and negative look behind.  Look behind is an assertion
364       that something is either there or not there in front of the current
365       position.  Look behind is a zero width assertion, with the additional
366       constraint that it must have a bounded length (for complexity and
367       efficiency reasons; note that most other implementation even impose
368       fixed length).
369 
370    OPEN, CLOSE
371 
372       Operand(s): None
373 
374       OPEN  + n = Start of parenthesis 'n', CLOSE + n = Close of parenthesis
375       'n', and are numbered at compile time.
376  */
377 
378 /* A node is one char of opcode followed by two chars of NEXT pointer plus
379  * any operands.  NEXT pointers are stored as two 8-bit pieces, high order
380  * first.  The value is a positive offset from the opcode of the node
381  * containing it.  An operand, if any, simply follows the node.  (Note that
382  * much of the code generation knows about this implicit relationship.)
383  *
384  * Using two bytes for NEXT_PTR_SIZE is vast overkill for most things,
385  * but allows patterns to get big without disasters. */
386 
387 #define OP_CODE_SIZE    1
388 #define NEXT_PTR_SIZE   2
389 #define INDEX_SIZE      1
390 #define LENGTH_SIZE	4
391 #define NODE_SIZE       (NEXT_PTR_SIZE + OP_CODE_SIZE)
392 
393 #define GET_OP_CODE(p)  (*(char32 *)(p))
394 #define OPERAND(p)      ((p) + NODE_SIZE)
395 #define GET_OFFSET(p)   ((( *((p) + 1) & 0377) << 8) + (( *((p) + 2)) & 0377))
396 #define PUT_OFFSET_L(v) (char32)(((v) >> 8) & 0377)
397 #define PUT_OFFSET_R(v) (char32) ((v)       & 0377)
398 #define GET_LOWER(p)    ((( *((p) + NODE_SIZE) & 0377) << 8) + \
399                          (( *((p) + NODE_SIZE+1)) & 0377))
400 #define GET_UPPER(p)    ((( *((p) + NODE_SIZE+2) & 0377) << 8) + \
401                          (( *((p) + NODE_SIZE+3)) & 0377))
402 
403 /* Utility definitions. */
404 
405 #define REG_FAIL(m)      {*Error_Ptr = (m); return (NULL);}
406 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || \
407                           (c) == '?' || (c) == Brace_Char)
408 #define SET_BIT(i,n)     ((i) |= (1 << ((n) - 1)))
409 #define TEST_BIT(i,n)    ((i) &  (1 << ((n) - 1)))
410 #define U_CHAR_AT(p)     ((unsigned int) *(char32 *)(p))
411 
412 /* Flags to be passed up and down via function parameters during compile. */
413 
414 #define WORST             0  /* Worst case. No assumptions can be made.*/
415 #define HAS_WIDTH         1  /* Known never to match null string. */
416 #define SIMPLE            2  /* Simple enough to be STAR/PLUS operand. */
417 
418 #define NO_PAREN          0  /* Only set by initial call to "chunk". */
419 #define PAREN             1  /* Used for normal capturing parentheses. */
420 #define NO_CAPTURE        2  /* Non-capturing parentheses (grouping only). */
421 #define INSENSITIVE       3  /* Case insensitive parenthetical construct */
422 #define SENSITIVE         4  /* Case sensitive parenthetical construct */
423 #define NEWLINE           5  /* Construct to match newlines in most cases */
424 #define NO_NEWLINE        6  /* Construct to match newlines normally */
425 
426 #define REG_INFINITY    0UL
427 #define REG_ZERO        0UL
428 #define REG_ONE         1UL
429 
430 /* Flags for function shortcut_escape() */
431 
432 #define CHECK_ESCAPE       0  /* Check an escape sequence for validity only. */
433 #define CHECK_CLASS_ESCAPE 1  /* Check the validity of an escape within a character class */
434 #define EMIT_NODE          2  /* Emit the appropriate node. */
435 
436        /* Number of bytes to offset from the beginning of the regex program to the start
437           of the actual compiled regex code, i.e. skipping over the MAGIC number and
438           the two counters at the front.  */
439 
440 #define REGEX_START_OFFSET 3
441 
442 #define MAX_COMPILED_SIZE  32767UL  /* Largest size a compiled regex can be.
443 	       Probably could be 65535UL. */
444 
445 /* Global work variables for `CompileRE'. */
446 
447 static char32 *Reg_Parse;       /* Input scan ptr (scans user's regex) */
448 static int            Total_Paren;     /* Parentheses, (),  counter. */
449 static int            Num_Braces;      /* Number of general {m,n} constructs.
450                                           {m,n} quantifiers of SIMPLE atoms are
451                                           not included in this count. */
452 static int            Closed_Parens;   /* Bit flags indicating () closure. */
453 static int            Paren_Has_Width; /* Bit flags indicating ()'s that are
454                                           known to not match the empty string */
455 static char32  Compute_Size;    /* Address of this used as flag. */
456 static char32 *Code_Emit_Ptr;   /* When Code_Emit_Ptr is set to
457                                           &Compute_Size no code is emitted.
458                                           Instead, the size of code that WOULD
459                                           have been generated is accumulated in
460                                           Reg_Size.  Otherwise, Code_Emit_Ptr
461                                           points to where compiled regex code is
462                                           to be written. */
463 static unsigned long  Reg_Size;        /* Size of compiled regex code. */
464 static const char32         **Error_Ptr;       /* Place to store error messages so
465                                           they can be returned by `CompileRE' */
466 static char32           Error_Text [128];/* Sting to build error messages in. */
467 
468 static int            Is_Case_Insensitive;
469 static int            Match_Newline;
470 
471 static int            Enable_Counting_Quantifier = 1;
472 static char32  Brace_Char;
473 static char32  Default_Meta_Char [] = { '{', '.', '*', '+', '?', '[', '(', '|', ')', '^', '<', '>', '$', '\0' };
474 static char32 *Meta_Char;
475 
476 typedef struct {
477 	long lower;
478 	long upper;
479 } len_range;
480 
481 /* Forward declarations for functions used by `CompileRE'. */
482 
483 static char32 *alternative (int *flag_param, len_range *range_param);
484 static char32 *back_ref (char32 *c, int *flag_param, int emit);
485 static char32 *chunk (int paren, int *flag_param, len_range *range_param);
486 static void emit_byte (char32 c);
487 static void emit_class_byte (char32 c);
488 static char32 *emit_node (int op_code);
489 static char32 *emit_special (char32 op_code, unsigned long test_val, int index);
490 static char32 literal_escape (char32 c);
491 static char32 numeric_escape (char32 c, char32 **parse);
492 static char32 *atom (int *flag_param, len_range *range_param);
493 static void reg_error (const char32_t *str);
494 static char32 *insert (char32 op, char32 *opnd, long min, long max, int index);
495 static char32 *next_ptr (char32 *ptr);
496 static void offset_tail (char32 *ptr, int offset, char32 *val);
497 static void branch_tail (char32 *ptr, int offset, char32 *val);
498 static char32 *piece (int *flag_param, len_range *range_param);
499 static void tail (char32 *search_from, char32 *point_t);
500 static char32 *shortcut_escape (char32 c, int *flag_param, int emit);
501 
502 /*----------------------------------------------------------------------*
503  * CompileRE
504  *
505  * Compiles a regular expression into the internal format used by
506  * `ExecRE'.
507  *
508  * The default behaviour wrt. case sensitivity and newline matching can
509  * be controlled through the defaultFlags argument (Markus Schwarzenberg).
510  * Future extensions are possible by using other flag bits.
511  * Note that currently only the case sensitivity flag is effectively used.
512  *
513  * Beware that the optimization and preparation code in here knows about
514  * some of the structure of the compiled regexp.
515  *----------------------------------------------------------------------*/
516 
CompileRE_throwable(conststring32 expression,int defaultFlags)517 regexp *CompileRE_throwable (conststring32 expression, int defaultFlags) {
518 	conststring32 compileMessage;
519 	regexp *compiledRE = CompileRE (expression, & compileMessage, defaultFlags);
520 	if (compiledRE == NULL)
521 		Melder_throw (U"Regular expression: ", compileMessage, U" (", expression, U").");
522 	return compiledRE;
523 }
524 
CompileRE(conststring32 exp,conststring32 * errorText,int defaultFlags)525 regexp *CompileRE (conststring32 exp, conststring32 *errorText, int defaultFlags) {
526 
527 	regexp *comp_regex = NULL;
528 	char32 *scan;
529 	int flags_local, pass;
530 	len_range range_local;
531 
532 	if (Enable_Counting_Quantifier) {
533 		Brace_Char  = '{';
534 		Meta_Char   = &Default_Meta_Char [0];
535 	} else {
536 		Brace_Char  = '*';                    /* Bypass the '{' in */
537 		Meta_Char   = &Default_Meta_Char [1]; /* Default_Meta_Char */
538 	}
539 
540 	/* Set up errorText to receive failure reports. */
541 
542 	Error_Ptr = errorText;
543 	*Error_Ptr = U"";
544 
545 	if (exp == NULL) {
546 		REG_FAIL (U"NULL argument, `CompileRE\'");
547 	}
548 
549 	Code_Emit_Ptr = &Compute_Size;
550 	Reg_Size      = 0UL;
551 
552 	/* We can't allocate space until we know how big the compiled form will be,
553 	   but we can't compile it (and thus know how big it is) until we've got a
554 	   place to put the code.  So we cheat: we compile it twice, once with code
555 	   generation turned off and size counting turned on, and once "for real".
556 	   This also means that we don't allocate space until we are sure that the
557 	   thing really will compile successfully, and we never have to move the
558 	   code and thus invalidate pointers into it.  (Note that it has to be in
559 	   one piece because free() should be able to free it all.) */
560 
561 	for (pass = 1; pass <= 2; pass++) {
562 		/*-------------------------------------------*
563 		 * FIRST  PASS: Determine size and legality. *
564 		 * SECOND PASS: Emit code.                   *
565 		 *-------------------------------------------*/
566 
567 		/*  Schwarzenberg:
568 		 *  If defaultFlags = 0 use standard defaults:
569 		 *    Is_Case_Insensitive: Case sensitive is the default
570 		 *    Match_Newline:       Newlines are NOT matched by default
571 		 *                         in character classes
572 		 */
573 		Is_Case_Insensitive = ( (defaultFlags & REDFLT_CASE_INSENSITIVE) ? 1 : 0);
574 		Match_Newline = 0;  /* ((defaultFlags & REDFLT_MATCH_NEWLINE)   ? 1 : 0);
575                              Currently not used. Uncomment if needed. */
576 
577 		Reg_Parse       = (char32 *) exp;
578 		Total_Paren     = 1;
579 		Num_Braces      = 0;
580 		Closed_Parens   = 0;
581 		Paren_Has_Width = 0;
582 
583 		emit_byte (MAGIC);
584 		emit_byte ('%');  /* Placeholder for num of capturing parentheses.    */
585 		emit_byte ('%');  /* Placeholder for num of general {m,n} constructs. */
586 
587 		if (chunk (NO_PAREN, &flags_local, &range_local) == NULL) {
588 			return (NULL);    /* Something went wrong */
589 		}
590 		if (pass == 1) {
591 			if (Reg_Size >= MAX_COMPILED_SIZE) {
592 				/* Too big for NEXT pointers NEXT_PTR_SIZE bytes long to span.
593 				   This is a real issue since the first BRANCH node usually points
594 				   to the end of the compiled regex code. */
595 
596 				Melder_sprint (Error_Text,128, U"regexp > ", MAX_COMPILED_SIZE, U" bytes");
597 				REG_FAIL (Error_Text);
598 			}
599 
600 			/* Allocate memory. */
601 
602 			comp_regex = (regexp *) malloc (sizeof (regexp) + Reg_Size * sizeof (char32));
603 
604 			if (comp_regex == NULL) {
605 				REG_FAIL (U"out of memory in `CompileRE\'");
606 			}
607 
608 			Code_Emit_Ptr = (char32 *) comp_regex->program;
609 		}
610 	}
611 
612 	comp_regex->program [1] = (char32) Total_Paren - 1;
613 	comp_regex->program [2] = (char32) Num_Braces;
614 
615 	/*----------------------------------------*
616 	 * Dig out information for optimizations. *
617 	 *----------------------------------------*/
618 
619 	comp_regex->match_start = '\0';   /* Worst-case defaults. */
620 	comp_regex->anchor      =   0;
621 
622 	/* First BRANCH. */
623 
624 	scan = (char32 *) (comp_regex->program + REGEX_START_OFFSET);
625 
626 	if (GET_OP_CODE (next_ptr (scan)) == END) { /* Only one top-level choice. */
627 		scan = OPERAND (scan);
628 
629 		/* Starting-point info. */
630 
631 		if (GET_OP_CODE (scan) == EXACTLY) {
632 			comp_regex->match_start = *OPERAND (scan);
633 
634 		} else if (PLUS <= GET_OP_CODE (scan) &&
635 		           GET_OP_CODE (scan) <= LAZY_PLUS) {
636 
637 			/* Allow x+ or x+? at the start of the regex to be
638 			   optimized. */
639 
640 			if (GET_OP_CODE (scan + NODE_SIZE) == EXACTLY) {
641 				comp_regex->match_start = *OPERAND (scan + NODE_SIZE);
642 			}
643 		} else if (GET_OP_CODE (scan) == BOL) {
644 			comp_regex->anchor++;
645 		}
646 	}
647 
648 	return (comp_regex);
649 }
650 
651 /*----------------------------------------------------------------------*
652  * chunk                                                                *
653  *                                                                      *
654  * Process main body of regex or process a parenthesized "thing".       *
655  *                                                                      *
656  * Caller must absorb opening parenthesis.                              *
657  *                                                                      *
658  * Combining parenthesis handling with the base level of regular        *
659  * expression is a trifle forced, but the need to tie the tails of the  *
660  * branches to what follows makes it hard to avoid.                     *
661  *----------------------------------------------------------------------*/
662 
chunk(int paren,int * flag_param,len_range * range_param)663 static char32 *chunk (int paren, int *flag_param, len_range *range_param) {
664 
665 	char32 *ret_val = NULL;
666 	char32 *this_branch;
667 	char32 *ender = NULL;
668 	int   this_paren = 0;
669 	int   flags_local, first = 1, zero_width, i;
670 	int   old_sensitive = Is_Case_Insensitive;
671 	int   old_newline   = Match_Newline;
672 	len_range range_local;
673 	int   look_only = 0;
674 	char32 *emit_look_behind_bounds = NULL;
675 
676 
677 	*flag_param = HAS_WIDTH;  /* Tentatively. */
678 	range_param->lower = 0;   /* Idem */
679 	range_param->upper = 0;
680 
681 	/* Make an OPEN node, if parenthesized. */
682 
683 	if (paren == PAREN) {
684 		if (Total_Paren >= NSUBEXP) {
685 			Melder_sprint (Error_Text,128, U"number of ()'s > ", NSUBEXP);
686 			REG_FAIL (Error_Text)
687 		}
688 
689 		this_paren = Total_Paren; Total_Paren++;
690 		ret_val    = emit_node (OPEN + this_paren);
691 	} else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
692 		*flag_param = WORST;  /* Look ahead is zero width. */
693 		look_only   = 1;
694 		ret_val     = emit_node (paren);
695 	} else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
696 		*flag_param = WORST;  /* Look behind is zero width. */
697 		look_only   = 1;
698 		/* We'll overwrite the zero length later on, so we save the ptr */
699 		ret_val 	  = emit_special (paren, 0, 0);
700 		emit_look_behind_bounds = ret_val + NODE_SIZE;
701 	} else if (paren == INSENSITIVE) {
702 		Is_Case_Insensitive = 1;
703 	} else if (paren == SENSITIVE) {
704 		Is_Case_Insensitive = 0;
705 	} else if (paren == NEWLINE) {
706 		Match_Newline = 1;
707 	} else if (paren == NO_NEWLINE) {
708 		Match_Newline = 0;
709 	}
710 
711 	/* Pick up the branches, linking them together. */
712 
713 	do {
714 		this_branch = alternative (&flags_local, &range_local);
715 
716 		if (this_branch == NULL) {
717 			return (NULL);
718 		}
719 
720 		if (first) {
721 			first = 0;
722 			*range_param = range_local;
723 			if (ret_val == NULL) {
724 				ret_val = this_branch;
725 			}
726 		} else if (range_param->lower >= 0) {
727 			if (range_local.lower >= 0) {
728 				if (range_local.lower < range_param->lower) {
729 					range_param->lower = range_local.lower;
730 				}
731 				if (range_local.upper > range_param->upper) {
732 					range_param->upper = range_local.upper;
733 				}
734 			} else {
735 				range_param->lower = -1; /* Branches have different lengths */
736 				range_param->upper = -1;
737 			}
738 		}
739 
740 		tail (ret_val, this_branch);  /* Connect BRANCH -> BRANCH. */
741 
742 		/* If any alternative could be zero width, consider the whole
743 		   parenthisized thing to be zero width. */
744 
745 		if (! (flags_local & HAS_WIDTH)) {
746 			*flag_param &= ~HAS_WIDTH;
747 		}
748 
749 		/* Are there more alternatives to process? */
750 
751 		if (*Reg_Parse != '|') {
752 			break;
753 		}
754 
755 		Reg_Parse++;
756 	} while (1);
757 
758 	/* Make a closing node, and hook it on the end. */
759 
760 	if (paren == PAREN) {
761 		ender = emit_node (CLOSE + this_paren);
762 
763 	} else if (paren == NO_PAREN) {
764 		ender = emit_node (END);
765 
766 	} else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
767 		ender = emit_node (LOOK_AHEAD_CLOSE);
768 
769 	} else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
770 		ender = emit_node (LOOK_BEHIND_CLOSE);
771 
772 	} else {
773 		ender = emit_node (NOTHING);
774 	}
775 
776 	tail (ret_val, ender);
777 
778 	/* Hook the tails of the branch alternatives to the closing node. */
779 
780 	for (this_branch = ret_val; this_branch != NULL;) {
781 		branch_tail (this_branch, NODE_SIZE, ender);
782 		this_branch = next_ptr (this_branch);
783 	}
784 
785 	/* Check for proper termination. */
786 
787 	if (paren != NO_PAREN && *Reg_Parse++ != ')') {
788 		REG_FAIL (U"missing right parenthesis \')\'");
789 	} else if (paren == NO_PAREN && *Reg_Parse != '\0') {
790 		if (*Reg_Parse == ')') {
791 			REG_FAIL (U"missing left parenthesis \'(\'");
792 		} else {
793 			REG_FAIL (U"junk on end");  /* "Can't happen" - NOTREACHED */
794 		}
795 	}
796 
797 	/* Check whether look behind has a fixed size */
798 
799 	if (emit_look_behind_bounds) {
800 		if (range_param->lower < 0) {
801 			REG_FAIL (U"look-behind does not have a bounded size");
802 		}
803 		if (range_param->upper > 65535L) {
804 			REG_FAIL (U"max. look-behind size is too large (>65535)")
805 		}
806 		if (Code_Emit_Ptr != &Compute_Size) {
807 			*emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->lower);
808 			*emit_look_behind_bounds++ = PUT_OFFSET_R (range_param->lower);
809 			*emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->upper);
810 			*emit_look_behind_bounds   = PUT_OFFSET_R (range_param->upper);
811 		}
812 	}
813 
814 	/* For look ahead/behind, the length should be set to zero again */
815 	if (look_only) {
816 		range_param->lower = 0;
817 		range_param->upper = 0;
818 	}
819 
820 	zero_width = 0;
821 
822 	/* Set a bit in Closed_Parens to let future calls to function `back_ref'
823 	   know that we have closed this set of parentheses. */
824 
825 	if (paren == PAREN && this_paren <= (int) sizeof (Closed_Parens) * CHAR_BIT) {
826 		SET_BIT (Closed_Parens, this_paren);
827 
828 		/* Determine if a parenthesized expression is modified by a quantifier
829 		   that can have zero width. */
830 
831 		if (* (Reg_Parse) == '?' || * (Reg_Parse) == '*') {
832 			zero_width++;
833 		} else if (* (Reg_Parse) == '{' && Brace_Char == '{') {
834 			if (* (Reg_Parse + 1) == ',' || * (Reg_Parse + 1) == '}') {
835 				zero_width++;
836 			} else if (* (Reg_Parse + 1) == '0') {
837 				i = 2;
838 
839 				while (* (Reg_Parse + i) == '0') {
840 					i++;
841 				}
842 
843 				if (* (Reg_Parse + i) == ',') {
844 					zero_width++;
845 				}
846 			}
847 		}
848 	}
849 
850 	/* If this set of parentheses is known to never match the empty string, set
851 	   a bit in Paren_Has_Width to let future calls to function back_ref know
852 	   that this set of parentheses has non-zero width.  This will allow star
853 	   (*) or question (?) quantifiers to be aplied to a back-reference that
854 	   refers to this set of parentheses. */
855 
856 	if ( (*flag_param & HAS_WIDTH)  &&
857 	        paren == PAREN            &&
858 	        !zero_width               &&
859 	        this_paren <= (int) (sizeof (Paren_Has_Width) * CHAR_BIT)) {
860 
861 		SET_BIT (Paren_Has_Width, this_paren);
862 	}
863 
864 	Is_Case_Insensitive = old_sensitive;
865 	Match_Newline       = old_newline;
866 
867 	return (ret_val);
868 }
869 
870 /*----------------------------------------------------------------------*
871  * alternative
872  *
873  * Processes one alternative of an '|' operator.  Connects the NEXT
874  * pointers of each regex atom together sequentialy.
875  *----------------------------------------------------------------------*/
876 
alternative(int * flag_param,len_range * range_param)877 static char32 *alternative (int *flag_param, len_range *range_param) {
878 
879 	char32 *ret_val;
880 	char32 *chain;
881 	char32 *latest;
882 	int flags_local;
883 	len_range range_local;
884 
885 	*flag_param = WORST;  /* Tentatively. */
886 	range_param->lower = 0; /* Idem */
887 	range_param->upper = 0;
888 
889 	ret_val = emit_node (BRANCH);
890 	chain   = NULL;
891 
892 	/* Loop until we hit the start of the next alternative, the end of this set
893 	   of alternatives (end of parentheses), or the end of the regex. */
894 
895 	while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
896 		latest = piece (&flags_local, &range_local);
897 
898 		if (latest == NULL) {
899 			return (NULL);    /* Something went wrong. */
900 		}
901 
902 		*flag_param |= flags_local & HAS_WIDTH;
903 		if (range_local.lower < 0) {
904 			/* Not a fixed length */
905 			range_param->lower = -1;
906 			range_param->upper = -1;
907 		} else if (range_param->lower >= 0) {
908 			range_param->lower += range_local.lower;
909 			range_param->upper += range_local.upper;
910 		}
911 
912 		if (chain != NULL) { /* Connect the regex atoms together sequentialy. */
913 			tail (chain, latest);
914 		}
915 
916 		chain = latest;
917 	}
918 
919 	if (chain == NULL) {  /* Loop ran zero times. */
920 		(void) emit_node (NOTHING);
921 	}
922 
923 	return (ret_val);
924 }
925 
926 /*----------------------------------------------------------------------*
927  * piece - something followed by possible '*', '+', '?', or "{m,n}"
928  *
929  * Note that the branching code sequences used for the general cases of
930  * *, +. ?, and {m,n} are somewhat optimized:  they use the same
931  * NOTHING node as both the endmarker for their branch list and the
932  * body of the last branch. It might seem that this node could be
933  * dispensed with entirely, but the endmarker role is not redundant.
934  *----------------------------------------------------------------------*/
935 
piece(int * flag_param,len_range * range_param)936 static char32 *piece (int *flag_param, len_range *range_param) {
937 
938 	char32 *ret_val;
939 	char32 *next;
940 	char32 op_code;
941 	unsigned long  min_max [2] = {REG_ZERO, REG_INFINITY};
942 	int            flags_local, i, brace_present = 0;
943 	int            lazy = 0, comma_present = 0;
944 	int            digit_present [2] = {0, 0};
945 	len_range      range_local;
946 
947 	ret_val = atom (&flags_local, &range_local);
948 
949 	if (ret_val == NULL) {
950 		return (NULL);    /* Something went wrong. */
951 	}
952 
953 	op_code = *Reg_Parse;
954 
955 	if (!IS_QUANTIFIER (op_code)) {
956 		*flag_param = flags_local;
957 		*range_param = range_local;
958 		return (ret_val);
959 	} else if (op_code == '{') { /* {n,m} quantifier present */
960 		brace_present++;
961 		Reg_Parse++;
962 
963 		/* This code will allow specifying a counting range in any of the
964 		   following forms:
965 
966 		   {m,n}  between m and n.
967 		   {,n}   same as {0,n} or between 0 and infinity.
968 		   {m,}   same as {m,0} or between m and infinity.
969 		   {m}    same as {m,m} or exactly m.
970 		   {,}    same as {0,0} or between 0 and infinity or just '*'.
971 		   {}     same as {0,0} or between 0 and infinity or just '*'.
972 
973 		   Note that specifying a max of zero, {m,0} is not allowed in the regex
974 		   itself, but it is implemented internally that way to support '*', '+',
975 		   and {min,} constructs and signals an unlimited number. */
976 
977 		for (i = 0; i < 2; i++) {
978 			/* Look for digits of number and convert as we go.  The numeric maximum
979 			   value for max and min of 65,535 is due to using 2 bytes to store
980 			   each value in the compiled regex code. */
981 
982 			while (Melder_isAsciiDecimalNumber (*Reg_Parse)) {
983 				/* (6553 * 10 + 6) > 65535 (16 bit max) */
984 
985 				if ( (min_max [i] == 6553UL && (*Reg_Parse - '0') <= 5) ||
986 				        (min_max [i] <= 6552UL)) {
987 
988 					min_max [i] = (min_max [i] * 10UL) +
989 					              (unsigned long) (*Reg_Parse - '0');
990 					Reg_Parse++;
991 
992 					digit_present [i]++;
993 				} else {
994 					if (i == 0) {
995 						Melder_sprint (Error_Text,128, U"min operand of {", min_max [0], *Reg_Parse, U",???} > 65535");
996 					} else {
997 						Melder_sprint (Error_Text,128, U"max operand of {", min_max [0], U",", min_max [1], *Reg_Parse, U"} > 65535");
998 					}
999 
1000 					REG_FAIL (Error_Text);
1001 				}
1002 			}
1003 
1004 			if (!comma_present && *Reg_Parse == ',') {
1005 				comma_present++;
1006 				Reg_Parse++;
1007 			}
1008 		}
1009 
1010 		/* A max of zero cannot be specified directly in the regex since it would
1011 		   signal a max of infinity.  This code specifically disallows `{0,0}',
1012 		   `{,0}', and `{0}' which really means nothing to humans but would be
1013 		   interpreted as `{0,infinity}' or `*' if we didn't make this check. */
1014 
1015 		if (digit_present [0] && (min_max [0] == REG_ZERO) && !comma_present) {
1016 
1017 			REG_FAIL (U"{0} is an invalid range");
1018 		} else if (digit_present [0] && (min_max [0] == REG_ZERO) &&
1019 		           digit_present [1] && (min_max [1] == REG_ZERO)) {
1020 
1021 			REG_FAIL (U"{0,0} is an invalid range");
1022 		} else if (digit_present [1] && (min_max [1] == REG_ZERO)) {
1023 			if (digit_present [0]) {
1024 				Melder_sprint (Error_Text,128, U"{", min_max [0], U",0} is an invalid range");
1025 				REG_FAIL (Error_Text);
1026 			} else {
1027 				REG_FAIL (U"{,0} is an invalid range");
1028 			}
1029 		}
1030 
1031 		if (!comma_present) {
1032 			min_max [1] = min_max [0];
1033 		} /* {x} means {x,x} */
1034 
1035 		if (*Reg_Parse != '}') {
1036 			REG_FAIL (U"{m,n} specification missing right \'}\'");
1037 
1038 		} else if (min_max [1] != REG_INFINITY && min_max [0] > min_max [1]) {
1039 			/* Disallow a backward range. */
1040 
1041 			Melder_sprint (Error_Text,128, U"{", min_max [0], U",", min_max [1], U"} is an invalid range");
1042 			REG_FAIL (Error_Text);
1043 		}
1044 	}
1045 
1046 	Reg_Parse++;
1047 
1048 	/* Check for a minimal matching (non-greedy or "lazy") specification. */
1049 
1050 	if (*Reg_Parse == '?') {
1051 		lazy = 1;
1052 		Reg_Parse++;
1053 	}
1054 
1055 	/* Avoid overhead of counting if possible */
1056 
1057 	if (op_code == '{') {
1058 		if (min_max [0] == REG_ZERO && min_max [1] == REG_INFINITY) {
1059 			op_code = '*';
1060 		} else if (min_max [0] == REG_ONE  && min_max [1] == REG_INFINITY) {
1061 			op_code = '+';
1062 		} else if (min_max [0] == REG_ZERO && min_max [1] == REG_ONE) {
1063 			op_code = '?';
1064 		} else if (min_max [0] == REG_ONE  && min_max [1] == REG_ONE) {
1065 			/* "x{1,1}" is the same as "x".  No need to pollute the compiled
1066 			    regex with such nonsense. */
1067 
1068 			*flag_param = flags_local;
1069 			*range_param = range_local;
1070 			return (ret_val);
1071 		} else if (Num_Braces > (int) UCHAR_MAX) {
1072 			Melder_sprint (Error_Text,128, U"number of {m,n} constructs > ", UCHAR_MAX);
1073 			REG_FAIL (Error_Text);
1074 		}
1075 	}
1076 
1077 	if (op_code == '+') {
1078 		min_max [0] = REG_ONE;
1079 	}
1080 	if (op_code == '?') {
1081 		min_max [1] = REG_ONE;
1082 	}
1083 
1084 	/* It is dangerous to apply certain quantifiers to a possibly zero width
1085 	   item. */
1086 
1087 	if (! (flags_local & HAS_WIDTH)) {
1088 		if (brace_present) {
1089 			Melder_sprint (Error_Text,128, U"{", min_max [0], U",", min_max [1], U"} operand could be empty");
1090 		} else {
1091 			Melder_sprint (Error_Text,128, op_code, U" operand could be empty");
1092 		}
1093 
1094 		REG_FAIL (Error_Text);
1095 	}
1096 
1097 	*flag_param = (min_max [0] > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
1098 	if (range_local.lower >= 0) {
1099 		if (min_max[1] != REG_INFINITY) {
1100 			range_param->lower = range_local.lower * min_max[0];
1101 			range_param->upper = range_local.upper * min_max[1];
1102 		} else {
1103 			range_param->lower = -1; /* Not a fixed-size length */
1104 			range_param->upper = -1;
1105 		}
1106 	} else {
1107 		range_param->lower = -1; /* Not a fixed-size length */
1108 		range_param->upper = -1;
1109 	}
1110 
1111 	/*---------------------------------------------------------------------*
1112 	 *          Symbol  Legend  For  Node  Structure  Diagrams
1113 	 *---------------------------------------------------------------------*
1114 	 * (...) = general grouped thing
1115 	 * B     = (B)ranch,  K = bac(K),  N = (N)othing
1116 	 * I     = (I)nitialize count,     C = Increment (C)ount
1117 	 * T~m   = (T)est against mini(m)um- go to NEXT pointer if >= operand
1118 	 * T~x   = (T)est against ma(x)imum- go to NEXT pointer if >= operand
1119 	 * '~'   = NEXT pointer, \___| = forward pointer, |___/ = Backward pointer
1120 	 *---------------------------------------------------------------------*/
1121 
1122 	if (op_code == '*' && (flags_local & SIMPLE)) {
1123 		insert ( (lazy ? LAZY_STAR : STAR), ret_val, 0UL, 0UL, 0);
1124 
1125 	} else if (op_code == '+' && (flags_local & SIMPLE)) {
1126 		insert (lazy ? LAZY_PLUS : PLUS, ret_val, 0UL, 0UL, 0);
1127 
1128 	} else if (op_code == '?' && (flags_local & SIMPLE)) {
1129 		insert (lazy ? LAZY_QUESTION : QUESTION, ret_val, 0UL, 0UL, 0);
1130 
1131 	} else if (op_code == '{' && (flags_local & SIMPLE)) {
1132 		insert (lazy ? LAZY_BRACE : BRACE, ret_val, min_max [0], min_max [1], 0);
1133 
1134 	} else if ( (op_code == '*' || op_code == '+') && lazy) {
1135 		/*  Node structure for (x)*?    Node structure for (x)+? construct.
1136 		 *  construct.                  (Same as (x)*? except for initial
1137 		 *                              forward jump into parenthesis.)
1138 		 *
1139 		 *                                  ___6____
1140 		 *   _______5_______               /________|______
1141 		 *  | _4__        1_\             /| ____   |     _\
1142 		 *  |/    |       / |\           / |/    |  |    / |\
1143 		 *  B~ N~ B~ (...)~ K~ N~       N~ B~ N~ B~ (...)~ K~ N~
1144 		 *      \  \___2_______|               \  \___________|
1145 		 *       \_____3_______|                \_____________|
1146 		 *
1147 		 */
1148 
1149 		tail (ret_val, emit_node (BACK));                 /* 1 */
1150 		(void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);    /* 2,4 */
1151 		(void) insert (NOTHING, ret_val, 0UL, 0UL, 0);    /* 3 */
1152 
1153 		next = emit_node (NOTHING);                       /* 2,3 */
1154 
1155 		offset_tail (ret_val, NODE_SIZE, next);           /* 2 */
1156 		tail (ret_val, next);                             /* 3 */
1157 		insert (BRANCH, ret_val, 0UL, 0UL, 0);            /* 4,5 */
1158 		tail (ret_val, ret_val + (2 * NODE_SIZE));        /* 4 */
1159 		offset_tail (ret_val, 3 * NODE_SIZE, ret_val);    /* 5 */
1160 
1161 		if (op_code == '+') {
1162 			insert (NOTHING, ret_val, 0UL, 0UL, 0);        /* 6 */
1163 			tail (ret_val, ret_val + (4 * NODE_SIZE));     /* 6 */
1164 		}
1165 	} else if (op_code == '*') {
1166 		/* Node structure for (x)* construct.
1167 		 *      ____1_____
1168 		 *     |          \
1169 		 *     B~ (...)~ K~ B~ N~
1170 		 *      \      \_|2 |\_|
1171 		 *       \__3_______|  4
1172 		 */
1173 
1174 		insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 1,3 */
1175 		offset_tail (ret_val, NODE_SIZE, emit_node (BACK)); /* 2 */
1176 		offset_tail (ret_val, NODE_SIZE, ret_val);          /* 1 */
1177 		tail (ret_val, emit_node (BRANCH));                 /* 3 */
1178 		tail (ret_val, emit_node (NOTHING));                /* 4 */
1179 	} else if (op_code == '+') {
1180 		/* Node structure for (x)+ construct.
1181 		 *
1182 		 *      ____2_____
1183 		 *     |          \
1184 		 *     (...)~ B~ K~ B~ N~
1185 		 *          \_|\____|\_|
1186 		 *          1     3    4
1187 		 */
1188 
1189 		next = emit_node (BRANCH);            /* 1 */
1190 
1191 		tail (ret_val, next);                 /* 1 */
1192 		tail (emit_node (BACK), ret_val);     /* 2 */
1193 		tail (next, emit_node (BRANCH));      /* 3 */
1194 		tail (ret_val, emit_node (NOTHING));  /* 4 */
1195 	} else if (op_code == '?' && lazy) {
1196 		/* Node structure for (x)?? construct.
1197 		 *       _4__        1_
1198 		 *      /    |       / |
1199 		 *     B~ N~ B~ (...)~ N~
1200 		 *         \  \___2____|
1201 		 *          \_____3____|
1202 		 */
1203 
1204 		(void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);       /* 2,4 */
1205 		(void) insert (NOTHING, ret_val, 0UL, 0UL, 0);       /* 3 */
1206 
1207 		next = emit_node (NOTHING);                          /* 1,2,3 */
1208 
1209 		offset_tail (ret_val, 2 * NODE_SIZE, next);          /* 1 */
1210 		offset_tail (ret_val,     NODE_SIZE, next);          /* 2 */
1211 		tail (ret_val, next);                                /* 3 */
1212 		insert (BRANCH,  ret_val, 0UL, 0UL, 0);              /* 4 */
1213 		tail (ret_val, (ret_val + (2 * NODE_SIZE)));         /* 4 */
1214 
1215 	} else if (op_code == '?') {
1216 		/* Node structure for (x)? construct.
1217 		 *       ___1____  _2
1218 		 *      /        |/ |
1219 		 *     B~ (...)~ B~ N~
1220 		 *             \__3_|
1221 		 */
1222 
1223 		insert (BRANCH, ret_val, 0UL, 0UL, 0);   /* 1 */
1224 		tail (ret_val, emit_node (BRANCH));      /* 1 */
1225 
1226 		next = emit_node (NOTHING);              /* 2,3 */
1227 
1228 		tail (ret_val, next);                    /* 2 */
1229 		offset_tail (ret_val, NODE_SIZE, next);  /* 3 */
1230 	} else if (op_code == '{' && min_max [0] == min_max [1]) {
1231 		/* Node structure for (x){m}, (x){m}?, (x){m,m}, or (x){m,m}? constructs.
1232 		 * Note that minimal and maximal matching mean the same thing when we
1233 		 * specify the minimum and maximum to be the same value.
1234 		 *       _______3_____
1235 		 *      |    1_  _2   \
1236 		 *      |    / |/ |    \
1237 		 *   I~ (...)~ C~ T~m K~ N~
1238 		 *    \_|          \_____|
1239 		 *     5              4
1240 		 */
1241 
1242 		tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces));         /* 1 */
1243 		tail (ret_val, emit_special (TEST_COUNT, min_max [0], Num_Braces));/* 2 */
1244 		tail (emit_node (BACK), ret_val);                                  /* 3 */
1245 		tail (ret_val, emit_node (NOTHING));                               /* 4 */
1246 
1247 		next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);         /* 5 */
1248 
1249 		tail (ret_val, next);                                              /* 5 */
1250 
1251 		Num_Braces++;
1252 	} else if (op_code == '{' && lazy) {
1253 		if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1254 			/* Node structure for (x){0,n}? or {,n}? construct.
1255 			 *       _________3____________
1256 			 *    8_| _4__        1_  _2   \
1257 			 *    / |/    |       / |/ |    \
1258 			 *   I~ B~ N~ B~ (...)~ C~ T~x K~ N~
1259 			 *          \  \            \__7__|
1260 			 *           \  \_________6_______|
1261 			 *            \______5____________|
1262 			 */
1263 
1264 			tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1265 
1266 			next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,7 */
1267 
1268 			tail (ret_val, next);                                      /* 2 */
1269 			(void) insert (BRANCH,  ret_val, 0UL, 0UL, Num_Braces);    /* 4,6 */
1270 			(void) insert (NOTHING, ret_val, 0UL, 0UL, Num_Braces);    /* 5 */
1271 			(void) insert (BRANCH,  ret_val, 0UL, 0UL, Num_Braces);    /* 3,4,8 */
1272 			tail (emit_node (BACK), ret_val);                          /* 3 */
1273 			tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 4 */
1274 
1275 			next = emit_node (NOTHING);                                /* 5,6,7 */
1276 
1277 			offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1278 			offset_tail (ret_val, 2 * NODE_SIZE, next);                /* 6 */
1279 			offset_tail (ret_val, 3 * NODE_SIZE, next);                /* 7 */
1280 
1281 			next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1282 
1283 			tail (ret_val, next);                                      /* 8 */
1284 
1285 		} else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1286 			/* Node structure for (x){m,}? construct.
1287 			 *       ______8_________________
1288 			 *      |         _______3_____  \
1289 			 *      | _7__   |    1_  _2   \  \
1290 			 *      |/    |  |    / |/ |    \  \
1291 			 *   I~ B~ N~ B~ (...)~ C~ T~m K~ K~ N~
1292 			 *    \_____\__\_|          \_4___|  |
1293 			 *       9   \  \_________5__________|
1294 			 *            \_______6______________|
1295 			 */
1296 
1297 			tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1298 
1299 			next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,4 */
1300 
1301 			tail (ret_val, next);                                      /* 2 */
1302 			tail (emit_node (BACK), ret_val);                          /* 3 */
1303 			tail (ret_val, emit_node (BACK));                          /* 4 */
1304 			(void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 5,7 */
1305 			(void) insert (NOTHING, ret_val, 0UL, 0UL, 0);             /* 6 */
1306 
1307 			next = emit_node (NOTHING);                                /* 5,6 */
1308 
1309 			offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1310 			tail (ret_val, next);                                      /* 6 */
1311 			(void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);             /* 7,8 */
1312 			tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 7 */
1313 			offset_tail (ret_val, 3 * NODE_SIZE, ret_val);             /* 8 */
1314 			(void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1315 			tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE));    /* 9 */
1316 
1317 		} else {
1318 			/* Node structure for (x){m,n}? construct.
1319 			 *       ______9_____________________
1320 			 *      |         _____________3___  \
1321 			 *      | __8_   |    1_  _2       \  \
1322 			 *      |/    |  |    / |/ |        \  \
1323 			 *   I~ B~ N~ B~ (...)~ C~ T~x T~m K~ K~ N~
1324 			 *    \_____\__\_|          \   \__4__|  |
1325 			 *      10   \  \            \_7_________|
1326 			 *            \  \_________6_____________|
1327 			 *             \_______5_________________|
1328 			 */
1329 
1330 			tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1331 
1332 			next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,7 */
1333 
1334 			tail (ret_val, next);                                      /* 2 */
1335 
1336 			next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1337 
1338 			tail (emit_node (BACK), ret_val);                          /* 3 */
1339 			tail (next, emit_node (BACK));                             /* 4 */
1340 			(void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 6,8 */
1341 			(void) insert (NOTHING, ret_val, 0UL, 0UL, 0);             /* 5 */
1342 			(void) insert (BRANCH,  ret_val, 0UL, 0UL, 0);             /* 8,9 */
1343 
1344 			next = emit_node (NOTHING);                                /* 5,6,7 */
1345 
1346 			offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1347 			offset_tail (ret_val, 2 * NODE_SIZE, next);                /* 6 */
1348 			offset_tail (ret_val, 3 * NODE_SIZE, next);                /* 7 */
1349 			tail (ret_val, ret_val + (2 * NODE_SIZE));                 /* 8 */
1350 			offset_tail (next, -NODE_SIZE, ret_val);                   /* 9 */
1351 			insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);        /* 10 */
1352 			tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE));    /* 10 */
1353 		}
1354 
1355 		Num_Braces++;
1356 	} else if (op_code == '{') {
1357 		if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1358 			/* Node structure for (x){0,n} or (x){,n} construct.
1359 			 *
1360 			 *       ___3____________
1361 			 *      |       1_  _2   \   5_
1362 			 *      |       / |/ |    \  / |
1363 			 *   I~ B~ (...)~ C~ T~x K~ B~ N~
1364 			 *    \_|\            \_6___|__|
1365 			 *    7   \________4________|
1366 			 */
1367 
1368 			tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1369 
1370 			next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,6 */
1371 
1372 			tail (ret_val, next);                                      /* 2 */
1373 			(void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 3,4,7 */
1374 			tail (emit_node (BACK), ret_val);                          /* 3 */
1375 
1376 			next = emit_node (BRANCH);                                 /* 4,5 */
1377 
1378 			tail (ret_val, next);                                      /* 4 */
1379 			tail (next, emit_node (NOTHING));                          /* 5,6 */
1380 			offset_tail (ret_val, NODE_SIZE, next);                    /* 6 */
1381 
1382 			next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 7 */
1383 
1384 			tail (ret_val, next);                                      /* 7 */
1385 
1386 		} else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1387 			/* Node structure for (x){m,} construct.
1388 			 *       __________4________
1389 			 *      |    __3__________  \
1390 			 *     _|___|    1_  _2   \  \    _7
1391 			 *    / | 8 |    / |/ |    \  \  / |
1392 			 *   I~ B~  (...)~ C~ T~m K~ K~ B~ N~
1393 			 *       \             \_5___|  |
1394 			 *        \__________6__________|
1395 			 */
1396 
1397 			tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1398 
1399 			next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2 */
1400 
1401 			tail (ret_val, next);                                      /* 2 */
1402 			tail (emit_node (BACK), ret_val);                          /* 3 */
1403 			(void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 4,6 */
1404 
1405 			next = emit_node (BACK);                                   /* 4 */
1406 
1407 			tail (next, ret_val);                                      /* 4 */
1408 			offset_tail (ret_val, NODE_SIZE, next);                    /* 5 */
1409 			tail (ret_val, emit_node (BRANCH));                        /* 6 */
1410 			tail (ret_val, emit_node (NOTHING));                       /* 7 */
1411 
1412 			insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces);        /* 8 */
1413 
1414 			tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE));    /* 8 */
1415 
1416 		} else {
1417 			/* Node structure for (x){m,n} construct.
1418 			 *       _____6________________
1419 			 *      |   _____________3___  \
1420 			 *    9_|__|    1_  _2       \  \    _8
1421 			 *    / |  |    / |/ |        \  \  / |
1422 			 *   I~ B~ (...)~ C~ T~x T~m K~ K~ B~ N~
1423 			 *       \            \   \__4__|  |  |
1424 			 *        \            \_7_________|__|
1425 			 *         \_________5_____________|
1426 			 */
1427 
1428 			tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1429 
1430 			next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,4 */
1431 
1432 			tail (ret_val, next);                                      /* 2 */
1433 
1434 			next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1435 
1436 			tail (emit_node (BACK), ret_val);                          /* 3 */
1437 			tail (next, emit_node (BACK));                             /* 4 */
1438 			(void) insert (BRANCH, ret_val, 0UL, 0UL, 0);              /* 5,6 */
1439 
1440 			next = emit_node (BRANCH);                                 /* 5,8 */
1441 
1442 			tail (ret_val, next);                                      /* 5 */
1443 			offset_tail (next, -NODE_SIZE, ret_val);                   /* 6 */
1444 
1445 			next = emit_node (NOTHING);                                /* 7,8 */
1446 
1447 			offset_tail (ret_val, NODE_SIZE, next);                    /* 7 */
1448 
1449 			offset_tail (next, -NODE_SIZE, next);                      /* 8 */
1450 			(void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1451 			tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE));    /* 9 */
1452 		}
1453 
1454 		Num_Braces++;
1455 	} else {
1456 		/* We get here if the IS_QUANTIFIER macro is not coordinated properly
1457 		   with this function. */
1458 
1459 		REG_FAIL (U"internal error #2, `piece\'");
1460 	}
1461 
1462 	if (IS_QUANTIFIER (*Reg_Parse)) {
1463 		if (op_code == '{') {
1464 			Melder_sprint (Error_Text,128, U"nested quantifiers, {m,n}", *Reg_Parse);
1465 		} else {
1466 			Melder_sprint (Error_Text,128, U"nested quantifiers, ", op_code, *Reg_Parse);
1467 		}
1468 
1469 		REG_FAIL (Error_Text);
1470 	}
1471 
1472 	return (ret_val);
1473 }
1474 
1475 /*----------------------------------------------------------------------*
1476  * atom
1477  *
1478  * Process one regex item at the lowest level
1479  *
1480  * OPTIMIZATION:  Lumps a continuous sequence of ordinary characters
1481  * together so that it can turn them into a single EXACTLY node, which
1482  * is smaller to store and faster to run.
1483  *----------------------------------------------------------------------*/
1484 
atom(int * flag_param,len_range * range_param)1485 static char32 *atom (int *flag_param, len_range *range_param) {
1486 
1487 	char32 *ret_val;
1488 	char32  test;
1489 	int            flags_local;
1490 	len_range      range_local;
1491 
1492 	*flag_param = WORST;  /* Tentatively. */
1493 	range_param->lower = 0; /* Idem */
1494 	range_param->upper = 0;
1495 
1496 	/* Process any regex comments, e.g. `(?# match next token->)'.  The
1497 	   terminating right parenthesis cannot be escaped.  The comment stops at
1498 	   the first right parenthesis encountered (or the end of the regex
1499 	   string)... period.  Handles multiple sequential comments,
1500 	   e.g. `(?# one)(?# two)...'  */
1501 
1502 	while (*Reg_Parse      == '(' &&
1503 	        * (Reg_Parse + 1) == '?' &&
1504 	        * (Reg_Parse + 2) == '#') {
1505 
1506 		Reg_Parse += 3;
1507 
1508 		while (*Reg_Parse != ')' && *Reg_Parse != '\0') {
1509 			Reg_Parse++;
1510 		}
1511 
1512 		if (*Reg_Parse == ')') {
1513 			Reg_Parse++;
1514 		}
1515 
1516 		if (*Reg_Parse == ')' || *Reg_Parse == '|' || *Reg_Parse == '\0') {
1517 			/* Hit end of regex string or end of parenthesized regex; have to
1518 			 return "something" (i.e. a NOTHING node) to avoid generating an
1519 			 error. */
1520 
1521 			ret_val = emit_node (NOTHING);
1522 
1523 			return (ret_val);
1524 		}
1525 	}
1526 
1527 	switch (*Reg_Parse ++) {
1528 		case U'^':
1529 			ret_val = emit_node (BOL);
1530 			break;
1531 		case U'$':
1532 			ret_val = emit_node (EOL);
1533 			break;
1534 		case U'<':
1535 			ret_val = emit_node (BOWORD);
1536 			break;
1537 		case U'>':
1538 			ret_val = emit_node (EOWORD);
1539 			break;
1540 		case U'.':
1541 			if (Match_Newline) {
1542 				ret_val = emit_node (EVERY);
1543 			} else {
1544 				ret_val = emit_node (ANY);
1545 			}
1546 
1547 			*flag_param |= (HAS_WIDTH | SIMPLE);
1548 			range_param->lower = 1;
1549 			range_param->upper = 1;
1550 			break;
1551 		case U'(':
1552 			if (*Reg_Parse == U'?') {  /* Special parenthetical expression */
1553 				Reg_Parse++;
1554 				range_local.lower = 0; /* Make sure it is always used */
1555 				range_local.upper = 0;
1556 
1557 				if (*Reg_Parse == U':') {
1558 					Reg_Parse ++;
1559 					ret_val = chunk (NO_CAPTURE, &flags_local, &range_local);
1560 				} else if (*Reg_Parse == U'=') {
1561 					Reg_Parse ++;
1562 					ret_val = chunk (POS_AHEAD_OPEN, &flags_local, &range_local);
1563 				} else if (*Reg_Parse == U'!') {
1564 					Reg_Parse ++;
1565 					ret_val = chunk (NEG_AHEAD_OPEN, &flags_local, &range_local);
1566 				} else if (*Reg_Parse == U'i') {
1567 					Reg_Parse ++;
1568 					ret_val = chunk (INSENSITIVE, &flags_local, &range_local);
1569 				} else if (*Reg_Parse == U'I') {
1570 					Reg_Parse ++;
1571 					ret_val = chunk (SENSITIVE, &flags_local, &range_local);
1572 				} else if (*Reg_Parse == U'n') {
1573 					Reg_Parse ++;
1574 					ret_val = chunk (NEWLINE, &flags_local, &range_local);
1575 				} else if (*Reg_Parse == U'N') {
1576 					Reg_Parse ++;
1577 					ret_val = chunk (NO_NEWLINE, &flags_local, &range_local);
1578 				} else if (*Reg_Parse == U'<') {
1579 					Reg_Parse ++;
1580 					if (*Reg_Parse == U'=') {
1581 						Reg_Parse ++;
1582 						ret_val = chunk (POS_BEHIND_OPEN, &flags_local, &range_local);
1583 					} else if (*Reg_Parse == U'!') {
1584 						Reg_Parse ++;
1585 						ret_val = chunk (NEG_BEHIND_OPEN, &flags_local, &range_local);
1586 					} else {
1587 						Melder_sprint (Error_Text,128, U"invalid look-behind syntax, \"(?<", *Reg_Parse, U"...)\"");
1588 						REG_FAIL (Error_Text)
1589 					}
1590 				} else {
1591 					Melder_sprint (Error_Text,128, U"invalid grouping syntax, \"(?", *Reg_Parse, U"...)\"");
1592 					REG_FAIL (Error_Text)
1593 				}
1594 			} else { /* Normal capturing parentheses */
1595 				ret_val = chunk (PAREN, & flags_local, & range_local);
1596 			}
1597 			if (! ret_val)
1598 				return nullptr;   // something went wrong
1599 
1600 			/* Add HAS_WIDTH flag if it was set by call to chunk. */
1601 
1602 			*flag_param |= flags_local & HAS_WIDTH;
1603 			*range_param = range_local;
1604 
1605 			break;
1606 
1607 		case U'\0':
1608 		case U'|':
1609 		case U')':
1610 			REG_FAIL (U"internal error #3, `atom\'")   // supposed to be caught earlier
1611 		case U'?':
1612 		case U'+':
1613 		case U'*':
1614 			Melder_sprint (Error_Text,128, * (Reg_Parse - 1), U" follows nothing");
1615 			REG_FAIL (Error_Text)
1616 
1617 		case U'{':
1618 			if (Enable_Counting_Quantifier) {
1619 				REG_FAIL (U"{m,n} follows nothing")
1620 			} else {
1621 				ret_val = emit_node (EXACTLY); /* Treat braces as literals. */
1622 				emit_byte (U'{');
1623 				emit_byte (U'\0');
1624 				range_param->lower = 1;
1625 				range_param->upper = 1;
1626 			}
1627 
1628 			break;
1629 
1630 		case U'[': {
1631 			char32 last_emit = U'\0';
1632 
1633 			/* Handle characters that can only occur at the start of a class. */
1634 
1635 			if (*Reg_Parse == U'^') { /* Complement of range. */
1636 				ret_val = emit_node (ANY_BUT);
1637 				Reg_Parse ++;
1638 
1639 				/* All negated classes include newline unless escaped with
1640 				   a "(?n)" switch. */
1641 
1642 				if (!Match_Newline) {
1643 					emit_byte (U'\n');
1644 				}
1645 			} else {
1646 				ret_val = emit_node (ANY_OF);
1647 			}
1648 
1649 			if (*Reg_Parse == U']' || *Reg_Parse == U'-') {
1650 				/* If '-' or ']' is the first character in a class,
1651 				   it is a literal character in the class. */
1652 
1653 				last_emit = *Reg_Parse;
1654 				emit_byte (*Reg_Parse);
1655 				Reg_Parse ++;
1656 			}
1657 
1658 			/* Handle the rest of the class characters. */
1659 
1660 			while (*Reg_Parse != U'\0' && *Reg_Parse != U']') {
1661 				if (*Reg_Parse == U'-') { /* Process a range, e.g [a-z]. */
1662 					Reg_Parse ++;
1663 
1664 					if (*Reg_Parse == U']' || *Reg_Parse == U'\0') {
1665 						/* If '-' is the last character in a class it is a literal
1666 						   character.  If `Reg_Parse' points to the end of the
1667 						   regex string, an error will be generated later. */
1668 
1669 						emit_byte (U'-');
1670 						last_emit = U'-';
1671 					} else {
1672 						/* We must get the range starting character value from the
1673 						   emitted code since it may have been an escaped
1674 						   character.  `second_value' is set one larger than the
1675 						   just emitted character value.  This is done since
1676 						   `second_value' is used as the start value for the loop
1677 						   that emits the values in the range.  Since we have
1678 						   already emitted the first character of the class, we do
1679 						   not want to emit it again. */
1680 
1681 						char32 second_value = last_emit + 1, last_value;
1682 
1683 						if (*Reg_Parse == U'\\') {
1684 							/* Handle escaped characters within a class range.
1685 							   Specifically disallow shortcut escapes as the end of
1686 							   a class range.  To allow this would be ambiguous
1687 							   since shortcut escapes represent a set of characters,
1688 							   and it would not be clear which character of the
1689 							   class should be treated as the "last" character. */
1690 
1691 							Reg_Parse++;
1692 
1693 							if ((test = numeric_escape (*Reg_Parse, &Reg_Parse)) != U'\0') {
1694 								last_value = test;
1695 							} else if ((test = literal_escape (*Reg_Parse)) != U'\0') {
1696 								last_value = test;
1697 							} else if (shortcut_escape (*Reg_Parse, NULL, CHECK_CLASS_ESCAPE)) {
1698 								Melder_sprint (Error_Text, 128, U"\\", *Reg_Parse, U" is not allowed as range operand");
1699 								REG_FAIL (Error_Text)
1700 							} else {
1701 								Melder_sprint (Error_Text, 128, U"\\", *Reg_Parse, U" is an invalid char class escape sequence");
1702 								REG_FAIL (Error_Text)
1703 							}
1704 						} else {
1705 							last_value = U_CHAR_AT (Reg_Parse);
1706 						}
1707 
1708 						if (Is_Case_Insensitive) {
1709 							second_value = Melder_toLowerCase (second_value);   // TODO: this should do casefolding
1710 							last_value = Melder_toLowerCase (last_value);
1711 						}
1712 
1713 						/* For case insensitive, something like [A-_] will
1714 						   generate an error here since ranges are converted to
1715 						   lower case. */
1716 
1717 						if (second_value - 1 > last_value)
1718 							REG_FAIL (U"invalid [] range")
1719 
1720 						/* If only one character in range (e.g [a-a]) then this
1721 						   loop is not run since the first character of any range
1722 						   was emitted by the previous iteration of while loop. */
1723 
1724 						for (; second_value <= last_value; second_value++)
1725 							emit_class_byte (second_value);
1726 						last_emit = (char32) last_value;
1727 						Reg_Parse ++;
1728 					} /* End class character range code. */
1729 				} else if (*Reg_Parse == '\\') {
1730 					Reg_Parse++;
1731 
1732 					if ( (test = numeric_escape (*Reg_Parse, &Reg_Parse)) != '\0') {
1733 						emit_class_byte (test);
1734 						last_emit = test;
1735 					} else if ( (test = literal_escape (*Reg_Parse)) != '\0') {
1736 						emit_byte (test);
1737 						last_emit = test;
1738 					} else if (shortcut_escape (*Reg_Parse,
1739 					                            NULL,
1740 					                            CHECK_CLASS_ESCAPE)) {
1741 
1742 						if (* (Reg_Parse + 1) == '-') {
1743 							/* Specifically disallow shortcut escapes as the start
1744 							   of a character class range (see comment above.) */
1745 
1746 							Melder_sprint (Error_Text,128, U"\\", *Reg_Parse, U" not allowed as range operand");
1747 
1748 							REG_FAIL (Error_Text);
1749 						} else {
1750 							/* Emit the bytes that are part of the shortcut
1751 							   escape sequence's range (e.g. \d = 0123456789) */
1752 
1753 							Melder_sprint (Error_Text,128, U"Cannot have escape sequences such as \\", *Reg_Parse, U" inside a class");
1754 						}
1755 					} else {
1756 						Melder_sprint (Error_Text,128, U"\\", *Reg_Parse, U" is an invalid char class escape sequence");
1757 
1758 						REG_FAIL (Error_Text);
1759 					}
1760 
1761 					Reg_Parse++;
1762 
1763 					/* End of class escaped sequence code */
1764 				} else {
1765 					emit_class_byte (*Reg_Parse); /* Ordinary class character. */
1766 
1767 					last_emit = *Reg_Parse;
1768 					Reg_Parse++;
1769 				}
1770 			} /* End of while (*Reg_Parse != U'\0' && *Reg_Parse != U']') */
1771 
1772 			if (*Reg_Parse != U']') {
1773 				REG_FAIL (U"missing right \']\'");
1774 			}
1775 
1776 			emit_byte (U'\0');
1777 
1778 			/* NOTE: it is impossible to specify an empty class.  This is
1779 			   because [] would be interpreted as "begin character class"
1780 			   followed by a literal ']' character and no "end character class"
1781 			   delimiter (']').  Because of this, it is always safe to assume
1782 			   that a class HAS_WIDTH. */
1783 
1784 			Reg_Parse ++;
1785 			*flag_param |= HAS_WIDTH | SIMPLE;
1786 			range_param->lower = 1;
1787 			range_param->upper = 1;
1788 		}
1789 
1790 		break; /* End of character class code. */
1791 
1792 		case U'\\':
1793 			/* Force Error_Text to have a length of zero.  This way we can tell if
1794 			   either of the calls to shortcut_escape() or back_ref() fill
1795 			   Error_Text with an error message. */
1796 
1797 			Error_Text [0] = U'\0';
1798 
1799 			if ( (ret_val = shortcut_escape (*Reg_Parse, flag_param, EMIT_NODE))) {
1800 				Reg_Parse ++;
1801 				range_param->lower = 1;
1802 				range_param->upper = 1;
1803 				break;
1804 			} else if ( (ret_val = back_ref (Reg_Parse, flag_param, EMIT_NODE))) {
1805 				/* Can't make any assumptions about a back-reference as to SIMPLE
1806 				   or HAS_WIDTH.  For example (^|<) is neither simple nor has
1807 				   width.  So we don't flip bits in flag_param here. */
1808 
1809 				Reg_Parse++;
1810 				/* Back-references always have an unknown length */
1811 				range_param->lower = -1;
1812 				range_param->upper = -1;
1813 				break;
1814 			}
1815 			if (Error_Text [0] != U'\0')
1816 				REG_FAIL (Error_Text)
1817 
1818 			/* At this point it is apparent that the escaped character is not a
1819 			   shortcut escape or back-reference.  Back up one character to allow
1820 			   the default code to include it as an ordinary character. */
1821 
1822 			/* Fall through to Default case to handle literal escapes and numeric
1823 			   escapes. */
1824 
1825 		default:
1826 			Reg_Parse --; /* If we fell through from the above code, we are now
1827                          pointing at the back slash (\) character. */
1828 			{
1829 				char32 *parse_save;
1830 				int   len = 0;
1831 
1832 				if (Is_Case_Insensitive) {
1833 					ret_val = emit_node (SIMILAR);
1834 				} else {
1835 					ret_val = emit_node (EXACTLY);
1836 				}
1837 
1838 				/* Loop until we find a meta character, shortcut escape, back
1839 				   reference, or end of regex string. */
1840 
1841 				for (; *Reg_Parse != '\0' && ! str32chr (Meta_Char, *Reg_Parse); len++) {
1842 
1843 					/* Save where we are in case we have to back
1844 					   this character out. */
1845 
1846 					parse_save = Reg_Parse;
1847 
1848 					if (*Reg_Parse == U'\\') {
1849 						Reg_Parse ++;   // point to escaped character
1850 
1851 						Error_Text [0] = U'\0';   // see comment above
1852 
1853 						if ( (test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1854 							if (Is_Case_Insensitive) {
1855 								emit_byte (Melder_toLowerCase (test));
1856 							} else {
1857 								emit_byte (test);
1858 							}
1859 						} else if ( (test = literal_escape (*Reg_Parse))) {
1860 							emit_byte (test);
1861 						} else if (back_ref (Reg_Parse, NULL, CHECK_ESCAPE)) {
1862 							/* Leave back reference for next `atom' call */
1863 
1864 							Reg_Parse --; break;
1865 						} else if (shortcut_escape (*Reg_Parse, NULL, CHECK_ESCAPE)) {
1866 							/* Leave shortcut escape for next `atom' call */
1867 
1868 							Reg_Parse --; break;
1869 						} else {
1870 							if (Error_Text [0] != U'\0') {
1871 								/* None of the above calls generated an error message
1872 								   so generate our own here. */
1873 
1874 								Melder_sprint (Error_Text,128, U"\\", *Reg_Parse, U" is an invalid escape sequence");
1875 							}
1876 							REG_FAIL (Error_Text)
1877 						}
1878 						Reg_Parse ++;
1879 					} else {
1880 						/* Ordinary character */
1881 
1882 						if (Is_Case_Insensitive) {
1883 							emit_byte (Melder_toLowerCase (*Reg_Parse));
1884 						} else {
1885 							emit_byte (*Reg_Parse);
1886 						}
1887 						Reg_Parse ++;
1888 					}
1889 
1890 					/* If next regex token is a quantifier (?, +. *, or {m,n}) and
1891 					   our EXACTLY node so far is more than one character, leave the
1892 					   last character to be made into an EXACTLY node one character
1893 					   wide for the multiplier to act on.  For example 'abcd* would
1894 					   have an EXACTLY node with an 'abc' operand followed by a STAR
1895 					   node followed by another EXACTLY node with a 'd' operand. */
1896 
1897 					if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
1898 						Reg_Parse = parse_save; /* Point to previous regex token. */
1899 						if (Code_Emit_Ptr == &Compute_Size) {
1900 							Reg_Size--;
1901 						} else {
1902 							Code_Emit_Ptr--; /* Write over previously emitted byte. */
1903 						}
1904 						break;
1905 					}
1906 				}
1907 				if (len <= 0)
1908 					REG_FAIL (U"internal error #4, `atom\'")
1909 
1910 				*flag_param |= HAS_WIDTH;
1911 
1912 				if (len == 1)
1913 					*flag_param |= SIMPLE;
1914 
1915 				range_param->lower = len;
1916 				range_param->upper = len;
1917 
1918 				emit_byte (U'\0');
1919 			}
1920 	} /* END switch (*Reg_Parse++) */
1921 
1922 	return (ret_val);
1923 }
1924 
1925 /*----------------------------------------------------------------------*
1926  * emit_node
1927  *
1928  * Emit (if appropriate) the op code for a regex node atom.
1929  *
1930  * The NEXT pointer is initialized to NULL.
1931  *
1932  * Returns a pointer to the START of the emitted node.
1933  *----------------------------------------------------------------------*/
1934 
emit_node(int op_code)1935 static char32 *emit_node (int op_code) {
1936 	char32 *ret_val = Code_Emit_Ptr; /* Return address of start of node */
1937 	if (ret_val == & Compute_Size) {
1938 		Reg_Size += NODE_SIZE;
1939 	} else {
1940 		char32 *ptr = ret_val;
1941 		*ptr ++ = op_code;
1942 		*ptr ++ = U'\0';   // null "NEXT" pointer
1943 		*ptr ++ = U'\0';
1944 		Code_Emit_Ptr = ptr;
1945 	}
1946 	return ret_val;
1947 }
1948 
1949 /*----------------------------------------------------------------------*
1950  * emit_byte
1951  *
1952  * Emit (if appropriate) a byte of code (usually part of an operand.)
1953  *----------------------------------------------------------------------*/
1954 
emit_byte(char32 c)1955 static void emit_byte (char32 c) {
1956 	if (Code_Emit_Ptr == & Compute_Size) {
1957 		Reg_Size ++;
1958 	} else {
1959 		*Code_Emit_Ptr ++ = c;
1960 	}
1961 }
1962 
1963 /*----------------------------------------------------------------------*
1964  * emit_class_byte
1965  *
1966  * Emit (if appropriate) a byte of code (usually part of a character
1967  * class operand.)
1968  *----------------------------------------------------------------------*/
1969 
emit_class_byte(char32 c)1970 static void emit_class_byte (char32 c) {
1971 	if (Code_Emit_Ptr == & Compute_Size) {
1972 		Reg_Size ++;
1973 		if (Is_Case_Insensitive && Melder_isLetter (c)) {
1974 			Reg_Size ++;
1975 		}
1976 	} else if (Is_Case_Insensitive && Melder_isLetter (c)) {
1977 		/* For case-insensitive character classes, emit both upper and lower case
1978 		   versions of alphabetical characters. */
1979 
1980 		*Code_Emit_Ptr ++ = Melder_toLowerCase (c);
1981 		*Code_Emit_Ptr ++ = Melder_toUpperCase (c);
1982 	} else {
1983 		*Code_Emit_Ptr ++ = c;
1984 	}
1985 }
1986 
1987 /*----------------------------------------------------------------------*
1988  * emit_special
1989  *
1990  * Emit nodes that need special processing.
1991  *----------------------------------------------------------------------*/
1992 
emit_special(char32 op_code,unsigned long test_val,int index)1993 static char32 *emit_special (
1994     char32 op_code,
1995     unsigned long test_val,
1996     int index)
1997 {
1998 	char32 *ret_val = & Compute_Size;
1999 	char32 *ptr;
2000 
2001 	if (Code_Emit_Ptr == & Compute_Size) {
2002 		switch (op_code) {
2003 			case POS_BEHIND_OPEN:
2004 			case NEG_BEHIND_OPEN:
2005 				Reg_Size += LENGTH_SIZE;   /* Length of the look-behind match */
2006 				Reg_Size += NODE_SIZE;     /* Make room for the node */
2007 				break;
2008 			case TEST_COUNT:
2009 				Reg_Size += NEXT_PTR_SIZE; /* Make room for a test value. */
2010 				// ppgb FALLTHROUGH (both test and inc require index)
2011 			case INC_COUNT:
2012 				Reg_Size += INDEX_SIZE;    /* Make room for an index value. */
2013 				// ppgb FALLTHROUGH (everything requires node)
2014 			default:
2015 				Reg_Size += NODE_SIZE;     /* Make room for the node. */
2016 		}
2017 	} else {
2018 		ret_val = emit_node (op_code); /* Return the address for start of node. */
2019 		ptr     = Code_Emit_Ptr;
2020 		if (op_code == INC_COUNT || op_code == TEST_COUNT) {
2021 			*ptr ++ = (char32) index;
2022 			if (op_code == TEST_COUNT) {
2023 				*ptr ++ = PUT_OFFSET_L (test_val);
2024 				*ptr ++ = PUT_OFFSET_R (test_val);
2025 			}
2026 		} else if (op_code == POS_BEHIND_OPEN || op_code == NEG_BEHIND_OPEN) {
2027 			*ptr ++ = PUT_OFFSET_L (test_val);
2028 			*ptr ++ = PUT_OFFSET_R (test_val);
2029 			*ptr ++ = PUT_OFFSET_L (test_val);
2030 			*ptr ++ = PUT_OFFSET_R (test_val);
2031 		}
2032 		Code_Emit_Ptr = ptr;
2033 	}
2034 	return ret_val;
2035 }
2036 
2037 /*----------------------------------------------------------------------*
2038  * insert
2039  *
2040  * Insert a node in front of already emitted node(s).  Means relocating
2041  * the operand.  Code_Emit_Ptr points one byte past the just emitted
2042  * node and operand.  The parameter `insert_pos' points to the location
2043  * where the new node is to be inserted.
2044  *----------------------------------------------------------------------*/
2045 
insert(char32 op,char32 * insert_pos,long min,long max,int index)2046 static char32 *insert (
2047     char32 op,
2048     char32 *insert_pos,
2049     long min,
2050     long max,
2051     int index) {
2052 
2053 	char32 *src;
2054 	char32 *dst;
2055 	char32 *place;
2056 	int insert_size = NODE_SIZE;
2057 
2058 	if (op == BRACE || op == LAZY_BRACE) {
2059 		/* Make room for the min and max values. */
2060 
2061 		insert_size += (2 * NEXT_PTR_SIZE);
2062 	} else if (op == INIT_COUNT) {
2063 		/* Make room for an index value . */
2064 
2065 		insert_size += INDEX_SIZE;
2066 	}
2067 
2068 	if (Code_Emit_Ptr == &Compute_Size) {
2069 		Reg_Size += insert_size;
2070 		return &Compute_Size;
2071 	}
2072 
2073 	src            = Code_Emit_Ptr;
2074 	Code_Emit_Ptr += insert_size;
2075 	dst            = Code_Emit_Ptr;
2076 
2077 	/* Relocate the existing emitted code to make room for the new node. */
2078 
2079 	while (src > insert_pos) {
2080 		*--dst = *--src;
2081 	}
2082 
2083 	place   = insert_pos;   /* Where operand used to be. */
2084 	*place++ = op;           /* Inserted operand. */
2085 	*place++ = '\0';         /* NEXT pointer for inserted operand. */
2086 	*place++ = '\0';
2087 
2088 	if (op == BRACE || op == LAZY_BRACE) {
2089 		*place++ = PUT_OFFSET_L (min);
2090 		*place++ = PUT_OFFSET_R (min);
2091 
2092 		*place++ = PUT_OFFSET_L (max);
2093 		*place++ = PUT_OFFSET_R (max);
2094 	} else if (op == INIT_COUNT) {
2095 		*place++ = (char32) index;
2096 	}
2097 
2098 	return place; /* Return a pointer to the start of the code moved. */
2099 }
2100 
2101 /*----------------------------------------------------------------------*
2102  * tail - Set the next-pointer at the end of a node chain.
2103  *----------------------------------------------------------------------*/
2104 
tail(char32 * search_from,char32 * point_to)2105 static void tail (char32 *search_from, char32 *point_to) {
2106 
2107 	char32 *scan;
2108 	char32 *next;
2109 	int offset;
2110 
2111 	if (search_from == &Compute_Size) {
2112 		return;
2113 	}
2114 
2115 	/* Find the last node in the chain (node with a null NEXT pointer) */
2116 
2117 	scan = search_from;
2118 
2119 	for (;;) {
2120 		next = next_ptr (scan);
2121 
2122 		if (!next) {
2123 			break;
2124 		}
2125 
2126 		scan = next;
2127 	}
2128 
2129 	if (GET_OP_CODE (scan) == BACK) {
2130 		offset = scan - point_to;
2131 	} else {
2132 		offset = point_to - scan;
2133 	}
2134 
2135 	/* Set NEXT pointer */
2136 
2137 	* (scan + 1) = PUT_OFFSET_L (offset);
2138 	* (scan + 2) = PUT_OFFSET_R (offset);
2139 }
2140 
2141 /*--------------------------------------------------------------------*
2142  * offset_tail
2143  *
2144  * Perform a tail operation on (ptr + offset).
2145  *--------------------------------------------------------------------*/
2146 
offset_tail(char32 * ptr,int offset,char32 * val)2147 static void offset_tail (char32 *ptr, int offset, char32 *val) {
2148 	if (ptr == & Compute_Size || ! ptr)
2149 		return;
2150 	tail (ptr + offset, val);
2151 }
2152 
2153 /*--------------------------------------------------------------------*
2154  * branch_tail
2155  *
2156  * Perform a tail operation on (ptr + offset) but only if `ptr' is a
2157  * BRANCH node.
2158  *--------------------------------------------------------------------*/
2159 
branch_tail(char32 * ptr,int offset,char32 * val)2160 static void branch_tail (char32 *ptr, int offset, char32 *val) {
2161 	if (ptr == & Compute_Size || ! ptr || GET_OP_CODE (ptr) != BRANCH)
2162 		return;
2163 	tail (ptr + offset, val);
2164 }
2165 
2166 /*--------------------------------------------------------------------*
2167  * shortcut_escape
2168  *
2169  * Implements convenient escape sequences that represent entire
2170  * character classes or special location assertions (similar to escapes
2171  * supported by Perl)
2172  *                                                  _
2173  *    \d     Digits                  [0-9]           |
2174  *    \D     NOT a digit             [^0-9]          | (Examples
2175  *    \l     Letters                 [a-zA-Z]        |  at left
2176  *    \L     NOT a Letter            [^a-zA-Z]       |    are
2177  *    \s     Whitespace              [ \t\n\r\f\v]   |    for
2178  *    \S     NOT Whitespace          [^ \t\n\r\f\v]  |   ASCII)
2179  *    \w     "Word" character        [a-zA-Z0-9_]    |
2180  *    \W     NOT a "Word" character  [^a-zA-Z0-9_]  _|
2181  *
2182  *    \B     Matches any character that is NOT a word-delimiter
2183  *
2184  *    Codes for the "emit" parameter:
2185  *
2186  *    EMIT_NODE
2187  *       Emit a shortcut node.  Shortcut nodes have an implied set of
2188  *       class characters.  This helps keep the compiled regex string
2189  *       small.
2190  *
2191  *    CHECK_ESCAPE
2192  *       Only verify that this is a valid shortcut escape.
2193  *
2194  *    CHECK_CLASS_ESCAPE
2195  *       Same as CHECK_ESCAPE but only allows characters valid within
2196  *       a klas.
2197  *
2198  *--------------------------------------------------------------------*/
2199 
shortcut_escape(char32 c,int * flag_param,int emit)2200 static char32 *shortcut_escape (
2201     char32  c,
2202     int           *flag_param,
2203     int            emit) {
2204 
2205 	char32 *klas   = NULL;
2206 	static const conststring32 codes = U"ByYwWdDlLsS";
2207 	char32 *ret_val = (char32 *) 1;   // assume success
2208 	conststring32 valid_codes;
2209 
2210 	if (emit == CHECK_CLASS_ESCAPE) {
2211 		valid_codes = codes + 5; /* \B, \y, \Y, \w and \W are not allowed in classes */
2212 	} else {
2213 		valid_codes = codes;
2214 	}
2215 
2216 	if (! str32chr (valid_codes, c)) {
2217 		return nullptr;   // not a valid shortcut escape sequence
2218 	} else if (emit == CHECK_ESCAPE || emit == CHECK_CLASS_ESCAPE) {
2219 		return ret_val;   // just checking if this is a valid shortcut escape
2220 	}
2221 
2222 	switch (c) {
2223 		case U'd':
2224 		case U'D':
2225 			if (emit == EMIT_NODE)
2226 				ret_val = (iswlower ((int) c) ? emit_node (DIGIT) : emit_node (NOT_DIGIT));
2227 			break;
2228 		case U'l':
2229 		case U'L':
2230 			if (emit == EMIT_NODE)
2231 				ret_val = (iswlower ((int) c) ? emit_node (LETTER) : emit_node (NOT_LETTER));
2232 			break;
2233 		case U's':
2234 		case U'S':
2235 			if (emit == EMIT_NODE) {
2236 				if (Match_Newline) {
2237 					ret_val = (iswlower ((int) c) ? emit_node (SPACE_NL) : emit_node (NOT_SPACE_NL));
2238 				} else {
2239 					ret_val = (iswlower ((int) c) ? emit_node (SPACE) : emit_node (NOT_SPACE));
2240 				}
2241 			}
2242 			break;
2243 		case U'w':
2244 		case U'W':
2245 			if (emit == EMIT_NODE) {
2246 				ret_val = (iswlower ((int) c) ? emit_node (WORD_CHAR) : emit_node (NOT_WORD_CHAR));
2247 			} else {
2248 				REG_FAIL (U"internal error #105 `shortcut_escape\'");
2249 			}
2250 			break;
2251 			/*
2252 				Since the delimiter table is not available at regex compile time,
2253 				\B, \y and \Y can only generate a node. At run time, the delimiter table
2254 				will be available for these nodes to use.
2255 			*/
2256 		case 'y':
2257 			if (emit == EMIT_NODE) {
2258 				ret_val = emit_node (IS_DELIM);
2259 			} else {
2260 				REG_FAIL (U"internal error #5 `shortcut_escape\'");
2261 			}
2262 			break;
2263 		case 'Y':
2264 			if (emit == EMIT_NODE) {
2265 				ret_val = emit_node (NOT_DELIM);
2266 			} else {
2267 				REG_FAIL (U"internal error #6 `shortcut_escape\'");
2268 			}
2269 			break;
2270 		case 'B':
2271 			if (emit == EMIT_NODE) {
2272 				ret_val = emit_node (NOT_BOUNDARY);
2273 			} else {
2274 				REG_FAIL (U"internal error #7 `shortcut_escape\'");
2275 			}
2276 			break;
2277 		default:
2278 			/*
2279 				We get here if there isn't a case for every character in the string "codes".
2280 			*/
2281 			REG_FAIL (U"internal error #8 `shortcut_escape\'");
2282 	}
2283 
2284 	if (emit == EMIT_NODE  &&  c != 'B') {
2285 		*flag_param |= (HAS_WIDTH | SIMPLE);
2286 	}
2287 
2288 	if (klas) {
2289 		/*
2290 			Emit bytes within a character class operand.
2291 		*/
2292 		while (*klas != U'\0')
2293 			emit_byte (*klas ++);
2294 	}
2295 
2296 	return ret_val;
2297 }
2298 
2299 /*--------------------------------------------------------------------*
2300  * numeric_escape
2301  *
2302  * Implements hex and octal numeric escape sequence syntax.
2303  *
2304  * Hexadecimal Escape: \x##    Max of two digits  Must have leading 'x'.
2305  * Octal Escape:       \0###   Max of three digits and not greater
2306  *                             than 377 octal.  Must have leading zero.
2307  *
2308  * Returns the actual character value or NULL if not a valid hex or
2309  * octal escape.  REG_FAIL is called if \x0, \x00, \0, \00, \000, or
2310  * \0000 is specified.
2311  *--------------------------------------------------------------------*/
2312 
numeric_escape(char32 c,char32 ** parse)2313 static char32 numeric_escape (
2314     char32 c,
2315     char32 **parse) {
2316 
2317 	static char32 digits [] = { 'f', 'e', 'd', 'c', 'b', 'a', 'F', 'E', 'D', 'C', 'B', 'A', '9', '8', '7', '6', '5', '4', '3', '2', '1', '0', '\0' };
2318 
2319 	static unsigned int digit_val [] = {
2320 		15, 14, 13, 12, 11, 10,                  /* Lower case Hex digits */
2321 		15, 14, 13, 12, 11, 10,                  /* Upper case Hex digits */
2322 		9,  8,  7,  6,  5,  4,  3,  2,  1,  0
2323 	}; /* Decimal Digits */
2324 
2325 	char32 *scan;
2326 	char32 *pos_ptr;
2327 	char32 *digit_str;
2328 	unsigned int   value     =  0;
2329 	unsigned int   radix     =  8;
2330 	int   width     =  3; /* Can not be bigger than \0377 */
2331 	int   pos_delta = 14;
2332 	int   i, pos;
2333 
2334 	switch (c) {
2335 		case U'0':
2336 			digit_str = digits + pos_delta; /* Only use Octal digits, i.e. 0-7. */
2337 			break;
2338 
2339 		case U'x':
2340 		case U'X':
2341 			width     =  2;     /* Can not be bigger than \0377 */
2342 			radix     = 16;
2343 			pos_delta =  0;
2344 			digit_str = digits; /* Use all of the digit characters. */
2345 			break;
2346 
2347 		default:
2348 			return U'\0'; /* Not a numeric escape */
2349 	}
2350 
2351 	scan = *parse; scan ++; /* Only change *parse on success. */
2352 
2353 	pos_ptr = str32chr (digit_str, *scan);
2354 
2355 	for (i = 0; pos_ptr && i < width; i++) {
2356 		pos   = (pos_ptr - digit_str) + pos_delta;
2357 		value = (value * radix) + digit_val [pos];
2358 
2359 		/* If this digit makes the value over 255, treat this digit as a literal
2360 		   character instead of part of the numeric escape.  For example, \0777
2361 		   will be processed as \077 (an 'M') and a literal '7' character, NOT
2362 		   511 decimal which is > 255. */
2363 
2364 		if (value > 255) {
2365 			/* Back out calculations for last digit processed. */
2366 
2367 			value -= digit_val [pos];
2368 			value /= radix;
2369 
2370 			break; /* Note that scan will not be incremented and still points to
2371                    the digit that caused overflow.  It will be decremented by
2372                    the "else" below to point to the last character that is
2373                    considered to be part of the octal escape. */
2374 		}
2375 
2376 		scan ++;
2377 		pos_ptr = str32chr (digit_str, *scan);
2378 	}
2379 
2380 	/* Handle the case of "\0" i.e. trying to specify a NULL character. */
2381 
2382 	if (value == 0) {
2383 		if (c == U'0') {
2384 			Melder_sprint (Error_Text,128, U"\\00 is an invalid octal escape");
2385 		} else {
2386 			Melder_sprint (Error_Text,128, U"\\", c, U"0 is an invalid hexadecimal escape");
2387 		}
2388 	} else {
2389 		/* Point to the last character of the number on success. */
2390 
2391 		scan --;
2392 		*parse = scan;
2393 	}
2394 
2395 	return value;
2396 }
2397 
2398 /*--------------------------------------------------------------------*
2399  * literal_escape
2400  *
2401  * Recognize escaped literal characters (prefixed with backslash),
2402  * and translate them into the corresponding character.
2403  *
2404  * Returns the proper character value or NULL if not a valid literal
2405  * escape.
2406  *--------------------------------------------------------------------*/
2407 
literal_escape(char32 c)2408 static char32 literal_escape (char32 c) {
2409 
2410 	static char32 valid_escape [] =  {
2411 		'a',   'b',
2412 		'e',
2413 		'f',   'n',   'r',   't',   'v',   '(',    ')',   '-',   '[',   ']',
2414 		'<',   '>',   '{',   '}',   '.',   '\\',   '|',   '^',   '$',   '*',
2415 		'+',   '?',   '&',   '\0'
2416 	};
2417 
2418 	static char32 value [] = {
2419 		'\a',  '\b',
2420 		0x1B,  /* Escape character in Unicode character set. */
2421 		'\f',  '\n',  '\r',  '\t',  '\v',  '(',    ')',   '-',   '[',   ']',
2422 		'<',   '>',   '{',   '}',   '.',   '\\',   '|',   '^',   '$',   '*',
2423 		'+',   '?',   '&',   '\0'
2424 	};
2425 
2426 	for (int i = 0; valid_escape [i] != '\0'; i++) {
2427 		if (c == valid_escape [i]) {
2428 			return value [i];
2429 		}
2430 	}
2431 
2432 	return '\0';
2433 }
2434 
2435 /*--------------------------------------------------------------------*
2436  * back_ref
2437  *
2438  * Process a request to match a previous parenthesized thing.
2439  * Parenthetical entities are numbered beginning at 1 by counting
2440  * opening parentheses from left to to right.  \0 would represent
2441  * whole match, but would confuse numeric_escape as an octal escape,
2442  * so it is forbidden.
2443  *
2444  * Constructs of the form \~1, \~2, etc. are cross-regex back
2445  * references and are used in syntax highlighting patterns to match
2446  * text previously matched by another regex. *** IMPLEMENT LATER ***
2447  *--------------------------------------------------------------------*/
2448 
back_ref(char32 * c,int * flag_param,int emit)2449 static char32 *back_ref (
2450     char32 *c,
2451     int           *flag_param,
2452     int            emit) {
2453 
2454 	int  paren_no, c_offset = 0, is_cross_regex = 0;
2455 
2456 	char32 *ret_val;
2457 
2458 	/* Implement cross regex backreferences later. */
2459 
2460 	/* if (*c == (regularExp_CHAR) ('~')) {
2461 	   c_offset++;
2462 	   is_cross_regex++;
2463 	} */
2464 
2465 	paren_no = (int) (* (c + c_offset) - (char32) ('0'));
2466 
2467 	if (! Melder_isAsciiDecimalNumber (* (c + c_offset)) || /* Only \1, \2, ... \9 are supported.  */
2468 	        paren_no == 0) {              /* Should be caught by numeric_escape. */
2469 
2470 		return NULL;
2471 	}
2472 
2473 	/* Make sure parentheses for requested back-reference are complete. */
2474 
2475 	if (! is_cross_regex && ! TEST_BIT (Closed_Parens, paren_no)) {
2476 		Melder_sprint (Error_Text,128, U"\\", paren_no, U" is an illegal back reference");
2477 		return NULL;
2478 	}
2479 
2480 	if (emit == EMIT_NODE) {
2481 		if (is_cross_regex) {
2482 			Reg_Parse ++; /* Skip past the '~' in a cross regex back reference.
2483                          We only do this if we are emitting code. */
2484 
2485 			if (Is_Case_Insensitive) {
2486 				ret_val = emit_node (X_REGEX_BR_CI);
2487 			} else {
2488 				ret_val = emit_node (X_REGEX_BR);
2489 			}
2490 		} else {
2491 			if (Is_Case_Insensitive) {
2492 				ret_val = emit_node (BACK_REF_CI);
2493 			} else {
2494 				ret_val = emit_node (BACK_REF);
2495 			}
2496 		}
2497 
2498 		emit_byte ((char32) paren_no);
2499 
2500 		if (is_cross_regex || TEST_BIT (Paren_Has_Width, paren_no)) {
2501 			*flag_param |= HAS_WIDTH;
2502 		}
2503 	} else if (emit == CHECK_ESCAPE) {
2504 		ret_val = (char32 *) 1;
2505 	} else {
2506 		ret_val = nullptr;
2507 	}
2508 	return ret_val;
2509 }
2510 
2511 /*======================================================================*
2512  *  Regex execution related code
2513  *======================================================================*/
2514 
2515 /* Global work variables for `ExecRE'. */
2516 
2517 static char32  *Reg_Input;           /* String-input pointer.         */
2518 static const char32  *Start_Of_String;     /* Beginning of input, for ^     */
2519 /* and < checks.                 */
2520 static const char32  *End_Of_String;       /* Logical end of input (if supplied, till \0 otherwise)  */
2521 static const char32  *Look_Behind_To;      /* Position till were look behind can safely check back   */
2522 static char32 **Start_Ptr_Ptr;       /* Pointer to `startp' array.    */
2523 static char32 **End_Ptr_Ptr;         /* Ditto for `endp'.             */
2524 static char32  *Extent_Ptr_FW;       /* Forward extent pointer        */
2525 static char32  *Extent_Ptr_BW;       /* Backward extent pointer       */
2526 static char32  *Back_Ref_Start [10]; /* Back_Ref_Start [0] and        */
2527 static char32  *Back_Ref_End   [10]; /* Back_Ref_End [0] are not      */
2528 /* used. This simplifies         */
2529 /* indexing.                     */
2530 /*
2531  * Measured recursion limits:
2532  *    Linux:      +/-  40 000 (up to 110 000)
2533  *    Solaris:    +/-  85 000
2534  *    HP-UX 11:   +/- 325 000
2535  *
2536  * So 10 000 ought to be safe.
2537  */
2538 #define REGEX_RECURSION_LIMIT 10000
2539 static int Recursion_Count;          /* Recursion counter */
2540 static int Recursion_Limit_Exceeded; /* Recursion limit exceeded flag */
2541 
2542 #define AT_END_OF_STRING(X) (*(X) == U'\0' || (End_Of_String != NULL && (X) >= End_Of_String))
2543 
2544 /* static regexp *Cross_Regex_Backref; */
2545 
2546 static bool Prev_Is_BOL;
2547 static bool Succ_Is_EOL;
2548 static bool Prev_Is_Delim;
2549 static bool Succ_Is_Delim;
2550 
2551 /* Define a pointer to an array to hold general (...){m,n} counts. */
2552 
2553 typedef struct brace_counts {
2554 	unsigned long count [1]; /* More unwarranted chumminess with compiler. */
2555 } brace_counts;
2556 
2557 static struct brace_counts *Brace;
2558 
2559 /* Forward declarations of functions used by `ExecRE' */
2560 
2561 static int             attempt (regexp *, char32 *);
2562 static int             match (char32 *, int *);
2563 static unsigned long   greedy (char32 *, long);
2564 static void            adjustcase (char32 *, int, char32);
2565 
2566 /*
2567  * ExecRE - match a `regexp' structure against a string
2568  *
2569  * If `end' is non-NULL, matches may not BEGIN past end, but may extend past
2570  * it.  If reverse is true, `end' should be specified, and searching begins at
2571  * `end'.  "isbol" should be set to true if the beginning of the string is the
2572  * actual beginning of a line (since `ExecRE' can't look backwards from the
2573  * beginning to find whether there was a newline before).  Likewise, "isbow"
2574  * asks whether the string is preceded by a word delimiter.  End of string is
2575  * always treated as a word and line boundary (there may be cases where it
2576  * shouldn't be, in which case, this should be changed).
2577  * Look_behind_to indicates the position till where it is safe to
2578  * perform look-behind matches. If set, it should be smaller than or equal
2579  * to the start position of the search (pointed at by string). If it is NULL,
2580  * it defaults to the start position.
2581  * Finally, match_to indicates the logical end of the string, till where
2582  * matches are allowed to extend. Note that look-ahead patterns may look
2583  * past that boundary. If match_to is set to NULL, the terminating \0 is
2584  * assumed to correspond to the logical boundary. Match_to, if set, should be
2585  * larger than or equal to end, if set.
2586  */
2587 
ExecRE(regexp * prog,regexp * cross_regex_backref,conststring32 string,const char32 * end,int reverse,char32 prev_char,char32 succ_char,conststring32 look_behind_to,conststring32 match_to)2588 int ExecRE (
2589     regexp *prog,
2590     regexp *cross_regex_backref,
2591     conststring32 string,
2592     const char32 *end,
2593     int     reverse,
2594     char32    prev_char,
2595     char32    succ_char,
2596     conststring32 look_behind_to,
2597     conststring32 match_to) {
2598 
2599 	char32 *str;
2600 	char32 **s_ptr;
2601 	char32 **e_ptr;
2602 	int    ret_val = 0;
2603 	int    i;
2604 	(void) cross_regex_backref;
2605 
2606 	s_ptr = (char32 **) prog->startp;
2607 	e_ptr = (char32 **) prog->endp;
2608 
2609 	/* Check for valid parameters. */
2610 
2611 	if (! prog || ! string) {
2612 		reg_error (U"NULL parameter to `ExecRE\'");
2613 		goto SINGLE_RETURN;
2614 	}
2615 
2616 	/* Check validity of program. */
2617 
2618 	if (U_CHAR_AT (prog->program) != MAGIC) {
2619 		reg_error (U"corrupted program");
2620 		goto SINGLE_RETURN;
2621 	}
2622 
2623 	/* Remember the logical end of the string. */
2624 
2625 	End_Of_String = match_to;
2626 
2627 	if (reverse && ! end) {
2628 		for (end = string; ! AT_END_OF_STRING (end); end ++) { }
2629 		succ_char = U'\n';
2630 	} else if (! end) {
2631 		succ_char = U'\n';
2632 	}
2633 
2634 	/* Remember the beginning of the string for matching BOL */
2635 
2636 	Start_Of_String    = string;
2637 	Look_Behind_To     = look_behind_to ? look_behind_to : string;
2638 
2639 	Prev_Is_BOL        = ( prev_char == U'\n' || prev_char == U'\0' );
2640 	Succ_Is_EOL        = ( succ_char == U'\n' || succ_char == U'\0' );
2641 	Prev_Is_Delim      = ! Melder_isWordCharacter (prev_char);
2642 	Succ_Is_Delim      = ! Melder_isWordCharacter (succ_char);
2643 
2644 	Total_Paren        = (int) (prog->program [1]);
2645 	Num_Braces         = (int) (prog->program [2]);
2646 
2647 	/* Reset the recursion detection flag */
2648 	Recursion_Limit_Exceeded = 0;
2649 
2650 	/*   Cross_Regex_Backref = cross_regex_backref; */
2651 
2652 	/* Allocate memory for {m,n} construct counting variables if need be. */
2653 
2654 	if (Num_Braces > 0) {
2655 		Brace =
2656 		    (brace_counts *) malloc (sizeof (brace_counts) * (size_t) Num_Braces);
2657 
2658 		if (Brace == NULL) {
2659 			reg_error (U"out of memory in `ExecRE\'");
2660 			goto SINGLE_RETURN;
2661 		}
2662 	} else {
2663 		Brace = NULL;
2664 	}
2665 
2666 	/* Initialize the first nine (9) capturing parentheses start and end
2667 	   pointers to point to the start of the search string.  This is to prevent
2668 	   crashes when later trying to reference captured parens that do not exist
2669 	   in the compiled regex.  We only need to do the first nine since users
2670 	   can only specify \1, \2, ... \9. */
2671 
2672 	for (i = 9; i > 0; i--) {
2673 		*s_ptr++ = (char32 *) string;
2674 		*e_ptr++ = (char32 *) string;
2675 	}
2676 
2677 	if (!reverse) { /* Forward Search */
2678 		if (prog->anchor) {
2679 			/* Search is anchored at BOL */
2680 
2681 			if (attempt (prog, (char32 *) string)) {
2682 				ret_val = 1;
2683 				goto SINGLE_RETURN;
2684 			}
2685 
2686 			for (str = (char32 *) string;
2687 			        !AT_END_OF_STRING (str) && str != (char32 *) end && !Recursion_Limit_Exceeded;
2688 			        str++) {
2689 
2690 				if (*str == '\n') {
2691 					if (attempt (prog, str + 1)) {
2692 						ret_val = 1;
2693 						break;
2694 					}
2695 				}
2696 			}
2697 
2698 			goto SINGLE_RETURN;
2699 
2700 		} else if (prog->match_start != '\0') {
2701 			/* We know what char match must start with. */
2702 
2703 			for (str = (char32 *) string;
2704 			        !AT_END_OF_STRING (str) && str != (char32 *) end && !Recursion_Limit_Exceeded;
2705 			        str++) {
2706 
2707 				if (*str == (char32) prog->match_start) {
2708 					if (attempt (prog, str)) {
2709 						ret_val = 1;
2710 						break;
2711 					}
2712 				}
2713 			}
2714 
2715 			goto SINGLE_RETURN;
2716 		} else {
2717 			/* General case */
2718 
2719 			for (str = (char32 *) string;
2720 			        !AT_END_OF_STRING (str) && str != (char32 *) end && !Recursion_Limit_Exceeded;
2721 			        str++) {
2722 
2723 				if (attempt (prog, str)) {
2724 					ret_val = 1;
2725 					break;
2726 				}
2727 			}
2728 
2729 			/* Beware of a single $ matching \0 */
2730 			if (!Recursion_Limit_Exceeded && !ret_val && AT_END_OF_STRING (str) && str != (char32 *) end) {
2731 				if (attempt (prog, str)) {
2732 					ret_val = 1;
2733 				}
2734 			}
2735 
2736 			goto SINGLE_RETURN;
2737 		}
2738 	} else { /* Search reverse, same as forward, but loops run backward */
2739 
2740 		/* Make sure that we don't start matching beyond the logical end */
2741 		if (End_Of_String != NULL && (char32 *) end > End_Of_String) {
2742 			end = (const char32 *) End_Of_String;
2743 		}
2744 
2745 		if (prog->anchor) {
2746 			/* Search is anchored at BOL */
2747 
2748 			for (str = (char32 *) (end - 1); str >= (char32 *) string && !Recursion_Limit_Exceeded; str--) {
2749 
2750 				if (*str == '\n') {
2751 					if (attempt (prog, str + 1)) {
2752 						ret_val = 1;
2753 						goto SINGLE_RETURN;
2754 					}
2755 				}
2756 			}
2757 
2758 			if (!Recursion_Limit_Exceeded && attempt (prog, (char32 *) string)) {
2759 				ret_val = 1;
2760 				goto SINGLE_RETURN;
2761 			}
2762 
2763 			goto SINGLE_RETURN;
2764 		} else if (prog->match_start != '\0') {
2765 			/* We know what char match must start with. */
2766 
2767 			for (str = (char32 *) end; str >= (char32 *) string && !Recursion_Limit_Exceeded; str--) {
2768 
2769 				if (*str == (char32) prog->match_start) {
2770 					if (attempt (prog, str)) {
2771 						ret_val = 1;
2772 						break;
2773 					}
2774 				}
2775 			}
2776 
2777 			goto SINGLE_RETURN;
2778 		} else {
2779 			/* General case */
2780 
2781 			for (str = (char32 *) end; str >= (char32 *) string && !Recursion_Limit_Exceeded; str--) {
2782 
2783 				if (attempt (prog, str)) {
2784 					ret_val = 1;
2785 					break;
2786 				}
2787 			}
2788 		}
2789 	}
2790 
2791 SINGLE_RETURN: if (Brace) {
2792 		free (Brace);
2793 	}
2794 
2795 	if (Recursion_Limit_Exceeded) {
2796 		return (0);
2797 	}
2798 
2799 	return (ret_val);
2800 }
2801 
attempt(regexp * prog,char32 * string)2802 static int attempt (regexp *prog, char32 *string) {
2803 
2804 	int    i;
2805 	char32 **s_ptr;
2806 	char32 **e_ptr;
2807 	int    branch_index = 0; /* Must be set to zero ! */
2808 
2809 	Reg_Input      = string;
2810 	Start_Ptr_Ptr  = (char32 **) prog->startp;
2811 	End_Ptr_Ptr    = (char32 **) prog->endp;
2812 	s_ptr          = (char32 **) prog->startp;
2813 	e_ptr          = (char32 **) prog->endp;
2814 
2815 	/* Reset the recursion counter. */
2816 	Recursion_Count = 0;
2817 
2818 	/* Overhead due to capturing parentheses. */
2819 
2820 	Extent_Ptr_BW = string;
2821 	Extent_Ptr_FW = nullptr;
2822 
2823 	for (i = Total_Paren + 1; i > 0; i--) {
2824 		*s_ptr ++ = nullptr;
2825 		*e_ptr ++ = nullptr;
2826 	}
2827 
2828 	if (match ( (char32 *) (prog->program + REGEX_START_OFFSET),
2829 	            &branch_index)) {
2830 		prog->startp [0] = (char32 *) string;
2831 		prog->endp   [0] = (char32 *) Reg_Input;     /* <-- One char AFTER  */
2832 		prog->extentpBW  = (char32 *) Extent_Ptr_BW; /*     matched string! */
2833 		prog->extentpFW  = (char32 *) Extent_Ptr_FW;
2834 		prog->top_branch = branch_index;
2835 
2836 		return 1;
2837 	} else {
2838 		return 0;
2839 	}
2840 }
2841 
2842 /*----------------------------------------------------------------------*
2843  * match - main matching routine
2844  *
2845  * Conceptually the strategy is simple: check to see whether the
2846  * current node matches, call self recursively to see whether the rest
2847  * matches, and then act accordingly.  In practice we make some effort
2848  * to avoid recursion, in particular by going through "ordinary" nodes
2849  * (that don't need to know whether the rest of the match failed) by a
2850  * loop instead of by recursion.  Returns 0 failure, 1 success.
2851  *----------------------------------------------------------------------*/
2852 #define MATCH_RETURN(X)\
2853  { --Recursion_Count; return (X); }
2854 #define CHECK_RECURSION_LIMIT\
2855  if (Recursion_Limit_Exceeded) MATCH_RETURN (0);
2856 
match(char32 * prog,int * branch_index_param)2857 static int match (char32 *prog, int *branch_index_param) {
2858 
2859 	char32 *scan;  /* Current node. */
2860 	char32 *next;  /* Next node. */
2861 	int next_ptr_offset;  /* Used by the NEXT_PTR () macro */
2862 
2863 	if (++ Recursion_Count > REGEX_RECURSION_LIMIT) {
2864 		if (! Recursion_Limit_Exceeded) { /* Prevent duplicate errors */
2865 			reg_error (U"recursion limit exceeded, please respecify expression");
2866 		}
2867 		Recursion_Limit_Exceeded = 1;
2868 		MATCH_RETURN (0)
2869 	}
2870 
2871 
2872 	scan = prog;
2873 
2874 	while (scan) {
2875 		NEXT_PTR (scan, next);
2876 
2877 		switch (GET_OP_CODE (scan)) {
2878 			case BRANCH: {
2879 				char32 *save;
2880 				int branch_index_local = 0;
2881 
2882 				if (GET_OP_CODE (next) != BRANCH) {  /* No choice. */
2883 					next = OPERAND (scan);   /* Avoid recursion. */
2884 				} else {
2885 					do {
2886 						save = Reg_Input;
2887 
2888 						if (match (OPERAND (scan), NULL)) {
2889 							if (branch_index_param)
2890 								*branch_index_param = branch_index_local;
2891 							MATCH_RETURN (1)
2892 						}
2893 
2894 						CHECK_RECURSION_LIMIT
2895 
2896 						++branch_index_local;
2897 
2898 						Reg_Input = save; /* Backtrack. */
2899 						NEXT_PTR (scan, scan);
2900 					} while (scan != NULL && GET_OP_CODE (scan) == BRANCH);
2901 
2902 					MATCH_RETURN (0)   // NOT REACHED
2903 				}
2904 			}
2905 
2906 			break;
2907 
2908 			case EXACTLY: {
2909 				char32 *opnd = OPERAND (scan);
2910 
2911 				/* Inline the first character, for speed. */
2912 
2913 				if (*opnd != *Reg_Input)
2914 					MATCH_RETURN (0)
2915 				int len = str32len (opnd);
2916 				if (End_Of_String != NULL && Reg_Input + len > End_Of_String)
2917 					MATCH_RETURN (0)
2918 				if (len > 1  && str32ncmp (opnd, Reg_Input, len) != 0)
2919 					MATCH_RETURN (0)
2920 				Reg_Input += len;
2921 			}
2922 
2923 			break;
2924 
2925 			case SIMILAR: {
2926 				char32 *opnd = OPERAND (scan);
2927 				char32  test;
2928 				/* Note: the SIMILAR operand was converted to lower case during
2929 				   regex compile. */
2930 
2931 				while ((test = *opnd++) != U'\0') {
2932 					if (AT_END_OF_STRING (Reg_Input) || Melder_toLowerCase (*Reg_Input++) != test)
2933 						MATCH_RETURN (0)
2934 				}
2935 			}
2936 
2937 			break;
2938 
2939 			case BOL: /* `^' (beginning of line anchor) */
2940 				if (Reg_Input == Start_Of_String) {
2941 					if (Prev_Is_BOL)
2942 						break;
2943 				} else if (* (Reg_Input - 1) == U'\n') {
2944 					break;
2945 				}
2946 				MATCH_RETURN (0)
2947 
2948 			case EOL: /* `$' anchor matches end of line and end of string */
2949 				if (*Reg_Input == U'\n' || (AT_END_OF_STRING (Reg_Input) && Succ_Is_EOL))
2950 					break;
2951 				MATCH_RETURN (0)
2952 
2953 			case BOWORD: /* `<' (beginning of word anchor) */
2954 				/* Check to see if the current character is not a delimiter
2955 				   and the preceding character is. */
2956 			{
2957 				bool prev_is_delim;
2958 				if (Reg_Input == Start_Of_String) {
2959 					prev_is_delim = Prev_Is_Delim;
2960 				} else {
2961 					prev_is_delim = ! Melder_isWordCharacter (*(Reg_Input - 1));
2962 				}
2963 				if (prev_is_delim) {
2964 					bool current_is_delim;
2965 					if (AT_END_OF_STRING (Reg_Input)) {
2966 						current_is_delim = Succ_Is_Delim;
2967 					} else {
2968 						current_is_delim = ! Melder_isWordCharacter (*Reg_Input);
2969 					}
2970 					if (! current_is_delim)
2971 						break;
2972 				}
2973 			}
2974 			MATCH_RETURN (0)
2975 
2976 			case EOWORD: /* `>' (end of word anchor) */
2977 				/* Check to see if the current character is a delimiter
2978 				and the preceding character is not. */
2979 			{
2980 				bool prev_is_delim;
2981 				if (Reg_Input == Start_Of_String) {
2982 					prev_is_delim = Prev_Is_Delim;
2983 				} else {
2984 					prev_is_delim = ! Melder_isWordCharacter (*(Reg_Input - 1));
2985 				}
2986 				if (! prev_is_delim) {
2987 					bool current_is_delim;
2988 					if (AT_END_OF_STRING (Reg_Input)) {
2989 						current_is_delim = Succ_Is_Delim;
2990 					} else {
2991 						current_is_delim = ! Melder_isWordCharacter (*Reg_Input);
2992 					}
2993 					if (current_is_delim)
2994 						break;
2995 				}
2996 			}
2997 			MATCH_RETURN (0)
2998 
2999 			case NOT_BOUNDARY: { /* \B (NOT a word boundary) */
3000 				bool prev_is_delim;
3001 				bool current_is_delim;
3002 				if (Reg_Input == Start_Of_String) {
3003 					prev_is_delim = Prev_Is_Delim;
3004 				} else {
3005 					prev_is_delim = ! Melder_isWordCharacter (*(Reg_Input - 1));
3006 				}
3007 				if (AT_END_OF_STRING (Reg_Input)) {
3008 					current_is_delim = Succ_Is_Delim;
3009 				} else {
3010 					current_is_delim = ! Melder_isWordCharacter (*Reg_Input);
3011 				}
3012 				if (! (prev_is_delim ^ current_is_delim))
3013 					break;
3014 			}
3015 			MATCH_RETURN (0)
3016 			case IS_DELIM: /* \y (A word delimiter character.) */
3017 				if (! Melder_isWordCharacter (*Reg_Input) && ! AT_END_OF_STRING (Reg_Input)) {
3018 					Reg_Input++; break;
3019 				}
3020 				MATCH_RETURN (0)
3021 			case NOT_DELIM: /* \Y (NOT a word delimiter character.) */
3022 				if (Melder_isWordCharacter (*Reg_Input) && ! AT_END_OF_STRING (Reg_Input)) {
3023 					Reg_Input ++;
3024 					break;
3025 				}
3026 				MATCH_RETURN (0)
3027 			case WORD_CHAR: /* \w (word character; alpha-numeric or underscore) */
3028 				if (Melder_isWordCharacter (*Reg_Input) && ! AT_END_OF_STRING (Reg_Input)) {
3029 					Reg_Input ++;
3030 					break;
3031 				}
3032 				MATCH_RETURN (0)
3033 			case NOT_WORD_CHAR:/* \W (NOT a word character) */
3034 				if (Melder_isWordCharacter (*Reg_Input) || *Reg_Input == U'\n' || AT_END_OF_STRING (Reg_Input))
3035 					MATCH_RETURN (0)
3036 				Reg_Input ++;
3037 				break;
3038 			case ANY: /* `.' (matches any character EXCEPT newline) */
3039 				if (AT_END_OF_STRING (Reg_Input) || *Reg_Input == U'\n')
3040 					MATCH_RETURN (0)
3041 				Reg_Input ++;
3042 				break;
3043 			case EVERY: /* `.' (matches any character INCLUDING newline) */
3044 				if (AT_END_OF_STRING (Reg_Input))
3045 					MATCH_RETURN (0)
3046 				Reg_Input ++;
3047 				break;
3048 			case DIGIT: /* \d; for ASCII, use [0-9] */
3049 				if (! Melder_isDecimalNumber (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
3050 					MATCH_RETURN (0)
3051 				Reg_Input ++;
3052 				break;
3053 			case NOT_DIGIT: /* \D; for ASCII, use [^0-9] */
3054 				if (Melder_isDecimalNumber (*Reg_Input) || *Reg_Input == U'\n' || AT_END_OF_STRING (Reg_Input))
3055 					MATCH_RETURN (0)
3056 				Reg_Input ++;
3057 				break;
3058 			case LETTER: /* \l; for ASCII, use [a-zA-Z] */
3059 				if (! Melder_isLetter (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
3060 					MATCH_RETURN (0)
3061 				Reg_Input ++;
3062 				break;
3063 			case NOT_LETTER: /* \L; for ASCII, use [^a-zA-Z] */
3064 				if (Melder_isLetter (*Reg_Input) || *Reg_Input == '\n' || AT_END_OF_STRING (Reg_Input))
3065 					MATCH_RETURN (0)
3066 				Reg_Input ++;
3067 				break;
3068 			case SPACE: /* \s; for ASCII, use [ \t] */
3069 				if (! Melder_isHorizontalSpace (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
3070 					MATCH_RETURN (0)
3071 				Reg_Input ++;
3072 				break;
3073 			case SPACE_NL: /* \s; for ASCII, use [\n \t\r\f\v] */
3074 				if (! Melder_isHorizontalOrVerticalSpace (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
3075 					MATCH_RETURN (0)
3076 				Reg_Input ++;
3077 				break;
3078 			case NOT_SPACE: /* \S; for ASCII, use [^\n \t\r\f\v] */
3079 				if (Melder_isHorizontalOrVerticalSpace (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
3080 					MATCH_RETURN (0)
3081 				Reg_Input ++;
3082 				break;
3083 			case NOT_SPACE_NL: /* \S; for ASCII, use [^ \t\r\f\v] */
3084 				if (Melder_isHorizontalSpace (*Reg_Input) || AT_END_OF_STRING (Reg_Input))
3085 					MATCH_RETURN (0)
3086 				Reg_Input ++;
3087 				break;
3088 			case ANY_OF:  /* [...] character class. */
3089 				if (AT_END_OF_STRING (Reg_Input))
3090 					MATCH_RETURN (0)
3091 				/* Needed because strchr ()
3092                                     considers \0 as a member
3093                                     of the character set. */
3094 
3095 				if (! str32chr (OPERAND (scan), *Reg_Input))
3096 					MATCH_RETURN (0)
3097 				Reg_Input ++;
3098 				break;
3099 			case ANY_BUT: /* [^...] Negated character class-- does NOT normally
3100                        match newline (\n added usually to operand at compile
3101                        time.) */
3102 
3103 				if (AT_END_OF_STRING (Reg_Input)) {
3104 					MATCH_RETURN (0);    /* See comment for ANY_OF. */
3105 				}
3106 				if (str32chr (OPERAND (scan), *Reg_Input) != nullptr)
3107 					MATCH_RETURN (0)
3108 				Reg_Input ++;
3109 				break;
3110 			case NOTHING:
3111 			case BACK:
3112 				break;
3113 			case STAR:
3114 			case PLUS:
3115 			case QUESTION:
3116 			case BRACE:
3117 
3118 			case LAZY_STAR:
3119 			case LAZY_PLUS:
3120 			case LAZY_QUESTION:
3121 			case LAZY_BRACE: {
3122 				unsigned long  num_matched = REG_ZERO;
3123 				unsigned long  min = ULONG_MAX, max = REG_ZERO;
3124 				char32 *save;
3125 				char32 next_char;
3126 				char32 *next_op;
3127 				int            lazy = 0;
3128 
3129 				/* Lookahead (when possible) to avoid useless match attempts
3130 				   when we know what character comes next. */
3131 
3132 				if (GET_OP_CODE (next) == EXACTLY) {
3133 					next_char = *OPERAND (next);
3134 				} else {
3135 					next_char = U'\0';/* i.e. Don't know what next character is. */
3136 				}
3137 
3138 				next_op = OPERAND (scan);
3139 
3140 				switch (GET_OP_CODE (scan)) {
3141 					case LAZY_STAR:
3142 						lazy = 1;
3143 					case STAR:
3144 						min = REG_ZERO;
3145 						max = ULONG_MAX;
3146 						break;
3147 
3148 					case LAZY_PLUS:
3149 						lazy = 1;
3150 					case PLUS:
3151 						min = REG_ONE;
3152 						max = ULONG_MAX;
3153 						break;
3154 
3155 					case LAZY_QUESTION:
3156 						lazy = 1;
3157 					case QUESTION:
3158 						min = REG_ZERO;
3159 						max = REG_ONE;
3160 						break;
3161 
3162 					case LAZY_BRACE:
3163 						lazy = 1;
3164 					case BRACE:
3165 						min = (unsigned long)
3166 						      GET_OFFSET (scan + NEXT_PTR_SIZE);
3167 
3168 						max = (unsigned long)
3169 						      GET_OFFSET (scan + (2 * NEXT_PTR_SIZE));
3170 
3171 						if (max <= REG_INFINITY) {
3172 							max = ULONG_MAX;
3173 						}
3174 
3175 						next_op = OPERAND (scan + (2 * NEXT_PTR_SIZE));
3176 				}
3177 
3178 				save = Reg_Input;
3179 
3180 				if (lazy) {
3181 					if (min > REG_ZERO) {
3182 						num_matched = greedy (next_op, min);
3183 					}
3184 				} else {
3185 					num_matched = greedy (next_op, max);
3186 				}
3187 
3188 				while (min <= num_matched && num_matched <= max) {
3189 					if (next_char == U'\0' || next_char == *Reg_Input) {
3190 						if (match (next, NULL)) {
3191 							MATCH_RETURN (1);
3192 						}
3193 
3194 						CHECK_RECURSION_LIMIT
3195 					}
3196 
3197 					/* Couldn't or didn't match. */
3198 
3199 					if (lazy) {
3200 						if (!greedy (next_op, 1)) {
3201 							MATCH_RETURN (0);
3202 						}
3203 
3204 						num_matched ++; /* Inch forward. */
3205 					} else if (num_matched > REG_ZERO) {
3206 						num_matched --; /* Back up. */
3207 					} else if (min == REG_ZERO && num_matched == REG_ZERO) {
3208 						break;
3209 					}
3210 
3211 					Reg_Input = save + num_matched;
3212 				}
3213 
3214 				MATCH_RETURN (0);
3215 			}
3216 
3217 			break;
3218 
3219 			case END:
3220 				if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3221 					Extent_Ptr_FW = Reg_Input;
3222 				}
3223 
3224 				MATCH_RETURN (1)   // Success!
3225 
3226 				break;
3227 
3228 			case INIT_COUNT:
3229 				Brace->count [*OPERAND (scan)] = REG_ZERO;
3230 
3231 				break;
3232 
3233 			case INC_COUNT:
3234 				Brace->count [*OPERAND (scan)] ++;
3235 
3236 				break;
3237 
3238 			case TEST_COUNT:
3239 				if (Brace->count [*OPERAND (scan)] <
3240 				        (unsigned long) GET_OFFSET (scan + NEXT_PTR_SIZE + INDEX_SIZE)) {
3241 
3242 					next = scan + NODE_SIZE + INDEX_SIZE + NEXT_PTR_SIZE;
3243 				}
3244 
3245 				break;
3246 
3247 			case BACK_REF:
3248 			case BACK_REF_CI:
3249 				/* case X_REGEX_BR:    */
3250 				/* case X_REGEX_BR_CI: *** IMPLEMENT LATER */
3251 			{
3252 				char32 *captured, *finish;
3253 				int paren_no;
3254 
3255 				paren_no = (int) * OPERAND (scan);
3256 
3257 				/* if (GET_OP_CODE (scan) == X_REGEX_BR ||
3258 				    GET_OP_CODE (scan) == X_REGEX_BR_CI) {
3259 
3260 				   if (Cross_Regex_Backref == NULL) MATCH_RETURN (0);
3261 
3262 				   captured =
3263 				      (regularExp_CHAR *) Cross_Regex_Backref->startp [paren_no];
3264 
3265 				   finish =
3266 				      (regularExp_CHAR *) Cross_Regex_Backref->endp   [paren_no];
3267 				} else { */
3268 				captured = Back_Ref_Start [paren_no];
3269 				finish   = Back_Ref_End   [paren_no];
3270 				/* } */
3271 
3272 				if (captured && finish) {
3273 					if (captured > finish) {
3274 						MATCH_RETURN (0);
3275 					}
3276 
3277 					if (GET_OP_CODE (scan) == BACK_REF_CI  /* ||
3278                      GET_OP_CODE (scan) == X_REGEX_BR_CI*/) {
3279 
3280 						while (captured < finish) {
3281 							if (AT_END_OF_STRING (Reg_Input) || Melder_toLowerCase (*captured++) != Melder_toLowerCase (*Reg_Input++))   // TODO: should casefold
3282 								MATCH_RETURN (0)
3283 						}
3284 					} else {
3285 						while (captured < finish) {
3286 							if (AT_END_OF_STRING (Reg_Input) || *captured++ != *Reg_Input++)
3287 								MATCH_RETURN (0)
3288 						}
3289 					}
3290 					break;
3291 				} else {
3292 					MATCH_RETURN (0)
3293 				}
3294 			}
3295 
3296 			case POS_AHEAD_OPEN:
3297 			case NEG_AHEAD_OPEN:
3298 			{
3299 				char32 *save = Reg_Input;
3300 
3301 				/* Temporarily ignore the logical end of the string, to allow
3302 				   lookahead past the end. */
3303 				const char32 *saved_end = End_Of_String;
3304 				End_Of_String = NULL;
3305 
3306 				int answer = match (next, NULL); /* Does the look-ahead regex match? */
3307 
3308 				CHECK_RECURSION_LIMIT
3309 
3310 				if ( (GET_OP_CODE (scan) == POS_AHEAD_OPEN) ? answer : ! answer) {
3311 					/* Remember the last (most to the right) character position
3312 					   that we consume in the input for a successful match.  This
3313 					   is info that may be needed should an attempt be made to
3314 					   match the exact same text at the exact same place.  Since
3315 					   look-aheads backtrack, a regex with a trailing look-ahead
3316 					   may need more text than it matches to accomplish a
3317 					   re-match. */
3318 
3319 					if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3320 						Extent_Ptr_FW = Reg_Input;
3321 					}
3322 
3323 					Reg_Input = save; /* Backtrack to look-ahead start. */
3324 					End_Of_String = saved_end; /* Restore logical end. */
3325 
3326 					/* Jump to the node just after the (?=...) or (?!...)
3327 					   Construct. */
3328 
3329 					next = next_ptr (OPERAND (scan)); /* Skip 1st branch */
3330 					/* Skip the chain of branches inside the look-ahead */
3331 					while (GET_OP_CODE (next) == BRANCH) {
3332 						next = next_ptr (next);
3333 					}
3334 					next = next_ptr (next); /* Skip the LOOK_AHEAD_CLOSE */
3335 				} else {
3336 					Reg_Input = save; /* Backtrack to look-ahead start. */
3337 					End_Of_String = saved_end; /* Restore logical end. */
3338 
3339 					MATCH_RETURN (0)
3340 				}
3341 			}
3342 
3343 			break;
3344 
3345 			case POS_BEHIND_OPEN:
3346 			case NEG_BEHIND_OPEN: {
3347 				char32 *save;
3348 				int   answer;
3349 				int   offset, upper;
3350 				int   lower;
3351 				int   found = 0;
3352 				const char32 *saved_end;
3353 
3354 				save      = Reg_Input;
3355 				saved_end = End_Of_String;
3356 
3357 				/* Prevent overshoot (greedy matching could end past the
3358 				   current position) by tightening the matching boundary.
3359 				   Lookahead inside lookbehind can still cross that boundary. */
3360 				End_Of_String = Reg_Input;
3361 
3362 				lower = GET_LOWER (scan);
3363 				upper = GET_UPPER (scan);
3364 
3365 				/* Start with the shortest match first. This is the most
3366 				   efficient direction in general.
3367 				   Note! Negative look behind is _very_ tricky when the length
3368 				   is not constant: we have to make sure the expression doesn't
3369 				   match for _any_ of the starting positions. */
3370 				for (offset = lower; offset <= upper; ++offset) {
3371 					Reg_Input = save - offset;
3372 
3373 					if (Reg_Input < Look_Behind_To) {
3374 						/* No need to look any further */
3375 						break;
3376 					}
3377 
3378 					answer    = match (next, NULL); /* Does the look-behind regex match? */
3379 
3380 					CHECK_RECURSION_LIMIT
3381 
3382 					/* The match must have ended at the current position;
3383 					   otherwise it is invalid */
3384 					if (answer && Reg_Input == save) {
3385 						/* It matched, exactly far enough */
3386 						found = 1;
3387 
3388 						/* Remember the last (most to the left) character position
3389 						   that we consume in the input for a successful match.
3390 						   This is info that may be needed should an attempt be
3391 						   made to match the exact same text at the exact same
3392 						   place. Since look-behind backtracks, a regex with a
3393 						   leading look-behind may need more text than it matches
3394 						   to accomplish a re-match. */
3395 
3396 						if (Extent_Ptr_BW == NULL ||
3397 						        (Extent_Ptr_BW - (save - offset)) > 0) {
3398 							Extent_Ptr_BW = save - offset;
3399 						}
3400 
3401 						break;
3402 					}
3403 				}
3404 
3405 				/* Always restore the position and the logical string end. */
3406 				Reg_Input = save;
3407 				End_Of_String = saved_end;
3408 
3409 				if ( (GET_OP_CODE (scan) == POS_BEHIND_OPEN) ? found : !found) {
3410 					/* The look-behind matches, so we must jump to the next
3411 					   node. The look-behind node is followed by a chain of
3412 					   branches (contents of the look-behind expression), and
3413 					   terminated by a look-behind-close node. */
3414 					next = next_ptr (OPERAND (scan) + LENGTH_SIZE); /* 1st branch */
3415 					/* Skip the chained branches inside the look-ahead */
3416 					while (GET_OP_CODE (next) == BRANCH) {
3417 						next = next_ptr (next);
3418 					}
3419 					next = next_ptr (next); /* Skip LOOK_BEHIND_CLOSE */
3420 				} else {
3421 					/* Not a match */
3422 					MATCH_RETURN (0)
3423 				}
3424 			}
3425 			break;
3426 
3427 			case LOOK_AHEAD_CLOSE:
3428 			case LOOK_BEHIND_CLOSE:
3429 				MATCH_RETURN (1);  /* We have reached the end of the look-ahead or
3430 	                    look-behind which implies that we matched it,
3431 			    so return `true`. */
3432 			default:
3433 				if ( (GET_OP_CODE (scan) > OPEN) &&
3434 				        (GET_OP_CODE (scan) < OPEN + NSUBEXP)) {
3435 
3436 					int no;
3437 					char32 *save;
3438 
3439 					no   = GET_OP_CODE (scan) - OPEN;
3440 					save = Reg_Input;
3441 
3442 					if (no < 10) {
3443 						Back_Ref_Start [no] = save;
3444 						Back_Ref_End   [no] = NULL;
3445 					}
3446 
3447 					if (match (next, NULL)) {
3448 						/* Do not set `Start_Ptr_Ptr' if some later invocation (think
3449 						   recursion) of the same parentheses already has. */
3450 
3451 						if (Start_Ptr_Ptr [no] == NULL) {
3452 							Start_Ptr_Ptr [no] = save;
3453 						}
3454 
3455 						MATCH_RETURN (1)
3456 					} else {
3457 						MATCH_RETURN (0)
3458 					}
3459 				} else if ( (GET_OP_CODE (scan) > CLOSE) &&
3460 				            (GET_OP_CODE (scan) < CLOSE + NSUBEXP)) {
3461 
3462 					int no;
3463 					char32 *save;
3464 
3465 					no   = GET_OP_CODE (scan) - CLOSE;
3466 					save = Reg_Input;
3467 
3468 					if (no < 10) {
3469 						Back_Ref_End [no] = save;
3470 					}
3471 
3472 					if (match (next, NULL)) {
3473 						/* Do not set `End_Ptr_Ptr' if some later invocation of the
3474 						   same parentheses already has. */
3475 
3476 						if (End_Ptr_Ptr [no] == NULL) {
3477 							End_Ptr_Ptr [no] = save;
3478 						}
3479 						MATCH_RETURN (1)
3480 					} else {
3481 						MATCH_RETURN (0)
3482 					}
3483 				} else {
3484 					reg_error (U"memory corruption, `match\'");
3485 					MATCH_RETURN (0)
3486 				}
3487 
3488 				break;
3489 		}
3490 
3491 		scan = next;
3492 	}
3493 
3494 	/* We get here only if there's trouble -- normally "case END" is
3495 	   the terminating point. */
3496 
3497 	reg_error (U"corrupted pointers, `match\'");
3498 	MATCH_RETURN (0)
3499 }
3500 
3501 /*----------------------------------------------------------------------*
3502  * greedy
3503  *
3504  * Repeatedly match something simple up to "max" times. If max <= 0
3505  * then match as much as possible (max = infinity).  Uses unsigned long
3506  * variables to maximize the amount of text matchable for unbounded
3507  * qualifiers like '*' and '+'.  This will allow at least 4,294,967,295
3508  * matches (4 Gig!) for an ANSI C compliant compiler.  If you are
3509  * applying a regex to something bigger than that, you shouldn't be
3510  * using NEdit!
3511  *
3512  * Returns the actual number of matches.
3513  *----------------------------------------------------------------------*/
3514 
greedy(char32 * p,long max)3515 static unsigned long greedy (char32 *p, long max) {
3516 
3517 	char32 *input_str;
3518 	char32 *operand;
3519 	unsigned long  count = REG_ZERO;
3520 	unsigned long  max_cmp;
3521 
3522 	input_str = Reg_Input;
3523 	operand   = OPERAND (p); /* Literal char or start of class characters. */
3524 	max_cmp   = (max > 0) ? (unsigned long) max : ULONG_MAX;
3525 
3526 	switch (GET_OP_CODE (p))
3527 	{
3528 		case ANY: /* Race to the end of the line or string. Dot DOESN'T match newline. */
3529 			while (count < max_cmp && *input_str != '\n' && ! AT_END_OF_STRING (input_str)) {
3530 				count ++;
3531 				input_str ++;
3532 			}
3533 			break;
3534 
3535 		case EVERY: /* Race to the end of the line or string. Dot DOES match newline. */
3536 			while (count < max_cmp && ! AT_END_OF_STRING (input_str)) {
3537 				count ++;
3538 				input_str ++;
3539 			}
3540 			break;
3541 
3542 		case EXACTLY: /* Count occurrences of single character operand. */
3543 			while (count < max_cmp && *operand == *input_str && ! AT_END_OF_STRING (input_str)) {
3544 				count ++;
3545 				input_str ++;
3546 			}
3547 			break;
3548 
3549 		case SIMILAR: /* Case-insensitive version of EXACTLY */
3550 			while (count < max_cmp && *operand == Melder_toLowerCase (*input_str) && ! AT_END_OF_STRING (input_str)) {
3551 				count ++;
3552 				input_str ++;
3553 			}
3554 			break;
3555 
3556 		case ANY_OF:  /* [...] character class. */
3557 			while (count < max_cmp && str32chr (operand, *input_str) != NULL && ! AT_END_OF_STRING (input_str)) {
3558 				count ++;
3559 				input_str ++;
3560 			}
3561 			break;
3562 
3563 		case ANY_BUT: /* [^...] Negated character class- does NOT normally
3564                        match newline (\n added usually to operand at compile
3565                        time.) */
3566 
3567 			while (count < max_cmp && str32chr (operand, *input_str) == NULL && ! AT_END_OF_STRING (input_str)) {
3568 				count ++;
3569 				input_str ++;
3570 			}
3571 			break;
3572 
3573 		case IS_DELIM: /* \y (not a word delimiter char)
3574                          NOTE: '\n' and '\0' are always word delimiters. */
3575 
3576 			while (count < max_cmp && ! Melder_isWordCharacter (*input_str) && ! AT_END_OF_STRING (input_str)) {
3577 				count ++;
3578 				input_str ++;
3579 			}
3580 			break;
3581 
3582 		case NOT_DELIM: /* \Y (not a word delimiter char)
3583                          NOTE: '\n' and '\0' are always word delimiters. */
3584 
3585 			while (count < max_cmp && Melder_isWordCharacter (*input_str) && ! AT_END_OF_STRING (input_str)) {
3586 				count ++;
3587 				input_str ++;
3588 			}
3589 			break;
3590 
3591 		case WORD_CHAR: /* \w (a word character: letter, number, mark or connector punctuation) */
3592 			while (count < max_cmp && Melder_isWordCharacter (*input_str) && ! AT_END_OF_STRING (input_str)) {
3593 				count ++;
3594 				input_str ++;
3595 			}
3596 			break;
3597 
3598 		case NOT_WORD_CHAR: /* \W (NOT a word character) */
3599 			while (count < max_cmp && ! Melder_isWordCharacter (*input_str) && *input_str != U'\n' && ! AT_END_OF_STRING (input_str)) {
3600 				count ++;
3601 				input_str ++;
3602 			}
3603 			break;
3604 
3605 		case DIGIT: /* for ASCII, use [0-9] */
3606 			while (count < max_cmp && Melder_isDecimalNumber (*input_str) && ! AT_END_OF_STRING (input_str)) {
3607 				count ++;
3608 				input_str ++;
3609 			}
3610 			break;
3611 
3612 		case NOT_DIGIT: /* for ASCII, use [^0-9] */
3613 			while (count < max_cmp && ! Melder_isDecimalNumber (*input_str) && *input_str != U'\n' && ! AT_END_OF_STRING (input_str)) {
3614 				count ++;
3615 				input_str ++;
3616 			}
3617 			break;
3618 
3619 		case SPACE: /* for ASCII, use [ \t\r\f\v]-- doesn't match newline. */
3620 			while (count < max_cmp && Melder_isHorizontalSpace (*input_str) && ! AT_END_OF_STRING (input_str)) {
3621 				count ++;
3622 				input_str ++;
3623 			}
3624 			break;
3625 
3626 		case SPACE_NL: /* for ASCII, use [\n \t\r\f\v]-- matches newline. */
3627 			while (count < max_cmp && Melder_isHorizontalOrVerticalSpace (*input_str) && ! AT_END_OF_STRING (input_str)) {
3628 				count ++;
3629 				input_str ++;
3630 			}
3631 			break;
3632 
3633 		case NOT_SPACE: /* for ASCII, use [^\n \t\r\f\v]-- doesn't match newline. */
3634 			while (count < max_cmp && ! Melder_isHorizontalOrVerticalSpace (*input_str) && ! AT_END_OF_STRING (input_str)) {
3635 				count ++;
3636 				input_str ++;
3637 			}
3638 			break;
3639 
3640 		case NOT_SPACE_NL: /* for ASCII, use [^ \t\r\f\v]-- matches newline. */
3641 			while (count < max_cmp && ! Melder_isHorizontalSpace (*input_str) && ! AT_END_OF_STRING (input_str)) {
3642 				count ++;
3643 				input_str ++;
3644 			}
3645 			break;
3646 
3647 		case LETTER: /* for ASCII, use [a-zA-Z] */
3648 			while (count < max_cmp && Melder_isLetter (*input_str) && ! AT_END_OF_STRING (input_str)) {
3649 				count ++;
3650 				input_str ++;
3651 			}
3652 			break;
3653 
3654 		case NOT_LETTER: /* for ASCII, use [^a-zA-Z] */
3655 			while (count < max_cmp && ! Melder_isLetter (*input_str) && *input_str != '\n' && ! AT_END_OF_STRING (input_str)) {
3656 				count ++;
3657 				input_str ++;
3658 			}
3659 			break;
3660 
3661 		default:
3662 			/* Called inappropriately.  Only atoms that are SIMPLE should
3663 			   generate a call to greedy.  The above cases should cover
3664 			   all the atoms that are SIMPLE. */
3665 
3666 			reg_error (U"internal error #10 `greedy\'");
3667 			count = 0U;  /* Best we can do. */
3668 	}
3669 
3670 	/* Point to character just after last matched character. */
3671 
3672 	Reg_Input = input_str;
3673 
3674 	return (count);
3675 }
3676 
3677 /*----------------------------------------------------------------------*
3678  * next_ptr - compute the address of a node's "NEXT" pointer.
3679  * Note: a simplified inline version is available via the NEXT_PTR() macro,
3680  *       but that one is only to be used at time-critical places (see the
3681  *       description of the macro).
3682  *----------------------------------------------------------------------*/
3683 
next_ptr(char32 * ptr)3684 static char32 *next_ptr (char32 *ptr) {
3685 	if (ptr == & Compute_Size)
3686 		return nullptr;
3687 	int offset = GET_OFFSET (ptr);
3688 	if (offset == 0)
3689 		return nullptr;
3690 	if (GET_OP_CODE (ptr) == BACK) {
3691 		return ptr - offset;
3692 	} else {
3693 		return ptr + offset;
3694 	}
3695 }
3696 
3697 /*
3698 **  SubstituteRE - Perform substitutions after a `regexp' match.
3699 **
3700 **  This function cleanly shortens results of more than max length to max.
3701 **  To give the caller a chance to react to this the function returns False
3702 **  on any error. The substitution will still be executed.
3703 */
SubstituteRE(const regexp * prog,conststring32 source,mutablestring32 dest,int max,int * errorType)3704 int SubstituteRE (const regexp *prog, conststring32 source, mutablestring32 dest, int max, int *errorType) {
3705 
3706 	const char32 *src;
3707 	const char32 *src_alias;
3708 	char32 *dst;
3709 	char32 c;
3710 	char32 test;
3711 	int   paren_no;
3712 	int   len;
3713 	char32 chgcase;
3714 	bool anyWarnings = false;
3715 
3716 	*errorType = 0;
3717 	if (! prog || ! source || ! dest) {
3718 		reg_error (U"NULL parm to `SubstituteRE\'");
3719 		*errorType = 2;
3720 		return false;
3721 	}
3722 
3723 	if (U_CHAR_AT (prog->program) != MAGIC) {
3724 		*errorType = 3;
3725 		reg_error (U"damaged regexp passed to `SubstituteRE\'");
3726 		return false;
3727 	}
3728 
3729 	src = source;
3730 	dst = dest;
3731 
3732 	while ( (c = *src++) != U'\0') {
3733 		chgcase  = U'\0';
3734 		paren_no = -1;
3735 
3736 		if (c == U'\\') {
3737 			/* Process any case-altering tokens, i.e \u, \U, \l, \L. */
3738 
3739 			if (*src == U'u' || *src == U'U' || *src == U'l' || *src == U'L') {
3740 				chgcase = *src;
3741 				src ++;
3742 				c = *src ++;
3743 				if (c == U'\0')
3744 					break;
3745 			}
3746 		}
3747 
3748 		if (c == U'&') {
3749 			paren_no = 0;
3750 		} else if (c == U'\\') {
3751 			/* Can not pass register variable `&src' to function `numeric_escape'
3752 			   so make a non-register copy that we can take the address of. */
3753 
3754 			src_alias = src;
3755 
3756 			if (U'1' <= *src && *src <= U'9') {
3757 				paren_no = (int) * src++ - (int) '0';
3758 
3759 			} else if ( (test = literal_escape (*src)) != U'\0') {
3760 				c = test; src++;
3761 
3762 			} else if ( (test = numeric_escape (*src, (char32 **) &src_alias)) != U'\0') {
3763 				c   = test;
3764 				src = src_alias;
3765 				src ++;
3766 
3767 				/* NOTE: if an octal escape for zero is attempted (e.g. \000), it
3768 				   will be treated as a literal string. */
3769 			} else if (*src == U'\0') {
3770 				/* If '\' is the last character of the replacement string, it is
3771 				   interpreted as a literal backslash. */
3772 
3773 				c = U'\\';
3774 			} else {
3775 				c = *src ++; /* Allow any escape sequence (This is  */
3776 			}              /* INCONSISTENT with the `CompileRE'   */
3777 		}                 /* mind set of issuing an error!       */
3778 
3779 		if (paren_no < 0) { /* Ordinary character. */
3780 			if ( (dst - dest) >= (max - 1)) {
3781 				*errorType = 1;
3782 				reg_error (U"replacing expression in `SubstituteRE\' too long; truncating");
3783 				anyWarnings = true;
3784 				break;
3785 			} else {
3786 				*dst++ = c;
3787 			}
3788 		} else if (prog->startp [paren_no] != NULL &&
3789 		           prog->endp   [paren_no] != NULL) {
3790 
3791 			len = prog->endp [paren_no] - prog->startp [paren_no];
3792 
3793 			if ( (dst + len - dest) >= max - 1) {
3794 				*errorType = 1;
3795 				reg_error (U"replacing expression in `SubstituteRE\' too long; truncating");
3796 				anyWarnings = true;
3797 				len = max - (dst - dest) - 1;
3798 			}
3799 
3800 			(void) str32ncpy (dst, prog->startp [paren_no], len);
3801 
3802 			if (chgcase != U'\0') {
3803 				adjustcase (dst, len, chgcase);
3804 			}
3805 
3806 			dst += len;
3807 
3808 			if (len != 0 && * (dst - 1) == U'\0') { /* strncpy hit NUL. */
3809 				*errorType = 3;
3810 				reg_error (U"damaged match string in `SubstituteRE\'");
3811 				anyWarnings = true;
3812 			}
3813 		}
3814 	}
3815 
3816 	*dst = U'\0';
3817 
3818 	return ! anyWarnings;
3819 }
3820 
adjustcase(char32 * str,int len,char32 chgcase)3821 static void adjustcase (char32 *str, int len, char32 chgcase) {
3822 
3823 	char32 *string = str;
3824 
3825 	/* The tokens \u and \l only modify the first character while the tokens
3826 	   \U and \L modify the entire string. */
3827 
3828 	if (iswlower ((int) chgcase) && len > 0)
3829 		len = 1;
3830 	switch (chgcase) {
3831 		case U'u':
3832 		case U'U':
3833 			for (integer i = 0; i < len; i ++)
3834 				* (string + i) = Melder_toUpperCase (* (string + i));
3835 			break;
3836 		case U'l':
3837 		case U'L':
3838 			for (integer i = 0; i < len; i ++)
3839 				* (string + i) = Melder_toLowerCase (* (string + i));
3840 			break;
3841 	}
3842 }
3843 
3844 /*----------------------------------------------------------------------*
3845  * reg_error
3846  *----------------------------------------------------------------------*/
3847 
reg_error(conststring32 str)3848 static void reg_error (conststring32 str) {
3849 	Melder_appendError (U"Internal error processing regular expression: ", str);
3850 }
3851 
EnableCountingQuantifier(int is_enabled)3852 void EnableCountingQuantifier (int is_enabled) {
3853 	Enable_Counting_Quantifier = is_enabled;
3854 }
3855 
3856 /* End of file regularExp.cpp */
3857