1 // This is an open source non-commercial project. Dear PVS-Studio, please check
2 // it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
3 
4 // uncrustify:off
5 
6 /*
7  * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
8  *
9  * NOTICE:
10  *
11  * This is NOT the original regular expression code as written by Henry
12  * Spencer.  This code has been modified specifically for use with the VIM
13  * editor, and should not be used separately from Vim.  If you want a good
14  * regular expression library, get the original code.  The copyright notice
15  * that follows is from the original.
16  *
17  * END NOTICE
18  *
19  *	Copyright (c) 1986 by University of Toronto.
20  *	Written by Henry Spencer.  Not derived from licensed software.
21  *
22  *	Permission is granted to anyone to use this software for any
23  *	purpose on any computer system, and to redistribute it freely,
24  *	subject to the following restrictions:
25  *
26  *	1. The author is not responsible for the consequences of use of
27  *		this software, no matter how awful, even if they arise
28  *		from defects in it.
29  *
30  *	2. The origin of this software must not be misrepresented, either
31  *		by explicit claim or by omission.
32  *
33  *	3. Altered versions must be plainly marked as such, and must not
34  *		be misrepresented as being the original software.
35  *
36  * Beware that some of this code is subtly aware of the way operator
37  * precedence is structured in regular expressions.  Serious changes in
38  * regular-expression syntax might require a total rethink.
39  *
40  * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
41  * Webb, Ciaran McCreesh and Bram Moolenaar.
42  * Named character class support added by Walter Briscoe (1998 Jul 01)
43  */
44 
45 // By default: do not create debugging logs or files related to regular
46 // expressions, even when compiling with -DDEBUG.
47 // Uncomment the second line to get the regexp debugging.
48 // #undef REGEXP_DEBUG
49 // #define REGEXP_DEBUG
50 
51 #include <assert.h>
52 #include <inttypes.h>
53 #include <stdbool.h>
54 #include <string.h>
55 
56 #include "nvim/vim.h"
57 #include "nvim/ascii.h"
58 #include "nvim/regexp.h"
59 #include "nvim/charset.h"
60 #include "nvim/eval.h"
61 #include "nvim/eval/userfunc.h"
62 #include "nvim/ex_cmds2.h"
63 #include "nvim/mark.h"
64 #include "nvim/memline.h"
65 #include "nvim/memory.h"
66 #include "nvim/message.h"
67 #include "nvim/misc1.h"
68 #include "nvim/plines.h"
69 #include "nvim/garray.h"
70 #include "nvim/strings.h"
71 
72 #ifdef REGEXP_DEBUG
73 /* show/save debugging data when BT engine is used */
74 # define BT_REGEXP_DUMP
75 /* save the debugging data to a file instead of displaying it */
76 # define BT_REGEXP_LOG
77 # define BT_REGEXP_DEBUG_LOG
78 # define BT_REGEXP_DEBUG_LOG_NAME       "bt_regexp_debug.log"
79 #endif
80 
81 /*
82  * The "internal use only" fields in regexp_defs.h are present to pass info from
83  * compile to execute that permits the execute phase to run lots faster on
84  * simple cases.  They are:
85  *
86  * regstart	char that must begin a match; NUL if none obvious; Can be a
87  *		multi-byte character.
88  * reganch	is the match anchored (at beginning-of-line only)?
89  * regmust	string (pointer into program) that match must include, or NULL
90  * regmlen	length of regmust string
91  * regflags	RF_ values or'ed together
92  *
93  * Regstart and reganch permit very fast decisions on suitable starting points
94  * for a match, cutting down the work a lot.  Regmust permits fast rejection
95  * of lines that cannot possibly match.  The regmust tests are costly enough
96  * that vim_regcomp() supplies a regmust only if the r.e. contains something
97  * potentially expensive (at present, the only such thing detected is * or +
98  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
99  * supplied because the test in vim_regexec() needs it and vim_regcomp() is
100  * computing it anyway.
101  */
102 
103 /*
104  * Structure for regexp "program".  This is essentially a linear encoding
105  * of a nondeterministic finite-state machine (aka syntax charts or
106  * "railroad normal form" in parsing technology).  Each node is an opcode
107  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
108  * all nodes except BRANCH and BRACES_COMPLEX implement concatenation; a "next"
109  * pointer with a BRANCH on both ends of it is connecting two alternatives.
110  * (Here we have one of the subtle syntax dependencies:	an individual BRANCH
111  * (as opposed to a collection of them) is never concatenated with anything
112  * because of operator precedence).  The "next" pointer of a BRACES_COMPLEX
113  * node points to the node after the stuff to be repeated.
114  * The operand of some types of node is a literal string; for others, it is a
115  * node leading into a sub-FSM.  In particular, the operand of a BRANCH node
116  * is the first node of the branch.
117  * (NB this is *not* a tree structure: the tail of the branch connects to the
118  * thing following the set of BRANCHes.)
119  *
120  * pattern	is coded like:
121  *
122  *			  +-----------------+
123  *			  |		    V
124  * <aa>\|<bb>	BRANCH <aa> BRANCH <bb> --> END
125  *		     |	    ^	 |	    ^
126  *		     +------+	 +----------+
127  *
128  *
129  *		       +------------------+
130  *		       V		  |
131  * <aa>*	BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
132  *		     |	    |		    ^			   ^
133  *		     |	    +---------------+			   |
134  *		     +---------------------------------------------+
135  *
136  *
137  *		       +----------------------+
138  *		       V		      |
139  * <aa>\+	BRANCH <aa> --> BRANCH --> BACK  BRANCH --> NOTHING --> END
140  *		     |		     |		 ^			^
141  *		     |		     +-----------+			|
142  *		     +--------------------------------------------------+
143  *
144  *
145  *					+-------------------------+
146  *					V			  |
147  * <aa>\{}	BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK  END
148  *		     |				    |		     ^
149  *		     |				    +----------------+
150  *		     +-----------------------------------------------+
151  *
152  *
153  * <aa>\@!<bb>	BRANCH NOMATCH <aa> --> END  <bb> --> END
154  *		     |	     |		      ^       ^
155  *		     |	     +----------------+       |
156  *		     +--------------------------------+
157  *
158  *						      +---------+
159  *						      |		V
160  * \z[abc]	BRANCH BRANCH  a  BRANCH  b  BRANCH  c	BRANCH	NOTHING --> END
161  *		     |	    |	       |	  |	^		    ^
162  *		     |	    |	       |	  +-----+		    |
163  *		     |	    |	       +----------------+		    |
164  *		     |	    +---------------------------+		    |
165  *		     +------------------------------------------------------+
166  *
167  * They all start with a BRANCH for "\|" alternatives, even when there is only
168  * one alternative.
169  */
170 
171 /*
172  * The opcodes are:
173  */
174 
175 /* definition	number		   opnd?    meaning */
176 #define END             0       /*	End of program or NOMATCH operand. */
177 #define BOL             1       /*	Match "" at beginning of line. */
178 #define EOL             2       /*	Match "" at end of line. */
179 #define BRANCH          3       /* node Match this alternative, or the
180                                  *	next... */
181 #define BACK            4       /*	Match "", "next" ptr points backward. */
182 #define EXACTLY         5       /* str	Match this string. */
183 #define NOTHING         6       /*	Match empty string. */
184 #define STAR            7       /* node Match this (simple) thing 0 or more
185                                  *	times. */
186 #define PLUS            8       /* node Match this (simple) thing 1 or more
187                                  *	times. */
188 #define MATCH           9       /* node match the operand zero-width */
189 #define NOMATCH         10      /* node check for no match with operand */
190 #define BEHIND          11      /* node look behind for a match with operand */
191 #define NOBEHIND        12      /* node look behind for no match with operand */
192 #define SUBPAT          13      /* node match the operand here */
193 #define BRACE_SIMPLE    14      /* node Match this (simple) thing between m and
194                                  *	n times (\{m,n\}). */
195 #define BOW             15      /*	Match "" after [^a-zA-Z0-9_] */
196 #define EOW             16      /*	Match "" at    [^a-zA-Z0-9_] */
197 #define BRACE_LIMITS    17      /* nr nr  define the min & max for BRACE_SIMPLE
198                                  *	and BRACE_COMPLEX. */
199 #define NEWL            18      /*	Match line-break */
200 #define BHPOS           19      /*	End position for BEHIND or NOBEHIND */
201 
202 
203 /* character classes: 20-48 normal, 50-78 include a line-break */
204 #define ADD_NL          30
205 #define FIRST_NL        ANY + ADD_NL
206 #define ANY             20      /*	Match any one character. */
207 #define ANYOF           21      /* str	Match any character in this string. */
208 #define ANYBUT          22      /* str	Match any character not in this
209                                  *	string. */
210 #define IDENT           23      /*	Match identifier char */
211 #define SIDENT          24      /*	Match identifier char but no digit */
212 #define KWORD           25      /*	Match keyword char */
213 #define SKWORD          26      /*	Match word char but no digit */
214 #define FNAME           27      /*	Match file name char */
215 #define SFNAME          28      /*	Match file name char but no digit */
216 #define PRINT           29      /*	Match printable char */
217 #define SPRINT          30      /*	Match printable char but no digit */
218 #define WHITE           31      /*	Match whitespace char */
219 #define NWHITE          32      /*	Match non-whitespace char */
220 #define DIGIT           33      /*	Match digit char */
221 #define NDIGIT          34      /*	Match non-digit char */
222 #define HEX             35      /*	Match hex char */
223 #define NHEX            36      /*	Match non-hex char */
224 #define OCTAL           37      /*	Match octal char */
225 #define NOCTAL          38      /*	Match non-octal char */
226 #define WORD            39      /*	Match word char */
227 #define NWORD           40      /*	Match non-word char */
228 #define HEAD            41      /*	Match head char */
229 #define NHEAD           42      /*	Match non-head char */
230 #define ALPHA           43      /*	Match alpha char */
231 #define NALPHA          44      /*	Match non-alpha char */
232 #define LOWER           45      /*	Match lowercase char */
233 #define NLOWER          46      /*	Match non-lowercase char */
234 #define UPPER           47      /*	Match uppercase char */
235 #define NUPPER          48      /*	Match non-uppercase char */
236 #define LAST_NL         NUPPER + ADD_NL
237 // -V:WITH_NL:560
238 #define WITH_NL(op)     ((op) >= FIRST_NL && (op) <= LAST_NL)
239 
240 #define MOPEN           80   // -89 Mark this point in input as start of
241                              //     \( … \) subexpr.  MOPEN + 0 marks start of
242                              //     match.
243 #define MCLOSE          90   // -99 Analogous to MOPEN.  MCLOSE + 0 marks
244                              //     end of match.
245 #define BACKREF         100  // -109 node Match same string again \1-\9.
246 
247 # define ZOPEN          110  // -119 Mark this point in input as start of
248                              //  \z( … \) subexpr.
249 # define ZCLOSE         120  // -129 Analogous to ZOPEN.
250 # define ZREF           130  // -139 node Match external submatch \z1-\z9
251 
252 #define BRACE_COMPLEX   140 /* -149 node Match nodes between m & n times */
253 
254 #define NOPEN           150     // Mark this point in input as start of
255                                 // \%( subexpr.
256 #define NCLOSE          151     // Analogous to NOPEN.
257 
258 #define MULTIBYTECODE   200     /* mbc	Match one multi-byte character */
259 #define RE_BOF          201     /*	Match "" at beginning of file. */
260 #define RE_EOF          202     /*	Match "" at end of file. */
261 #define CURSOR          203     /*	Match location of cursor. */
262 
263 #define RE_LNUM         204     /* nr cmp  Match line number */
264 #define RE_COL          205     /* nr cmp  Match column number */
265 #define RE_VCOL         206     /* nr cmp  Match virtual column number */
266 
267 #define RE_MARK         207     /* mark cmp  Match mark position */
268 #define RE_VISUAL       208     /*	Match Visual area */
269 #define RE_COMPOSING    209     // any composing characters
270 
271 /*
272  * Magic characters have a special meaning, they don't match literally.
273  * Magic characters are negative.  This separates them from literal characters
274  * (possibly multi-byte).  Only ASCII characters can be Magic.
275  */
276 #define Magic(x)        ((int)(x) - 256)
277 #define un_Magic(x)     ((x) + 256)
278 #define is_Magic(x)     ((x) < 0)
279 
280 /*
281  * We should define ftpr as a pointer to a function returning a pointer to
282  * a function returning a pointer to a function ...
283  * This is impossible, so we declare a pointer to a function returning a
284  * pointer to a function returning void. This should work for all compilers.
285  */
286 typedef void (*(*fptr_T)(int *, int))(void);
287 
288 typedef struct {
289   char_u     *regparse;
290   int prevchr_len;
291   int curchr;
292   int prevchr;
293   int prevprevchr;
294   int nextchr;
295   int at_start;
296   int prev_at_start;
297   int regnpar;
298 } parse_state_T;
299 
300 /*
301  * Structure used to save the current input state, when it needs to be
302  * restored after trying a match.  Used by reg_save() and reg_restore().
303  * Also stores the length of "backpos".
304  */
305 typedef struct {
306   union {
307     char_u  *ptr;       ///< rex.input pointer, for single-line regexp
308     lpos_T pos;         ///< rex.input pos, for multi-line regexp
309   } rs_u;
310   int rs_len;
311 } regsave_T;
312 
313 /* struct to save start/end pointer/position in for \(\) */
314 typedef struct {
315   union {
316     char_u  *ptr;
317     lpos_T pos;
318   } se_u;
319 } save_se_T;
320 
321 /* used for BEHIND and NOBEHIND matching */
322 typedef struct regbehind_S {
323   regsave_T save_after;
324   regsave_T save_behind;
325   int save_need_clear_subexpr;
326   save_se_T save_start[NSUBEXP];
327   save_se_T save_end[NSUBEXP];
328 } regbehind_T;
329 
330 /* Values for rs_state in regitem_T. */
331 typedef enum regstate_E {
332   RS_NOPEN = 0          /* NOPEN and NCLOSE */
333   , RS_MOPEN            /* MOPEN + [0-9] */
334   , RS_MCLOSE           /* MCLOSE + [0-9] */
335   , RS_ZOPEN            /* ZOPEN + [0-9] */
336   , RS_ZCLOSE           /* ZCLOSE + [0-9] */
337   , RS_BRANCH           /* BRANCH */
338   , RS_BRCPLX_MORE      /* BRACE_COMPLEX and trying one more match */
339   , RS_BRCPLX_LONG      /* BRACE_COMPLEX and trying longest match */
340   , RS_BRCPLX_SHORT     /* BRACE_COMPLEX and trying shortest match */
341   , RS_NOMATCH          /* NOMATCH */
342   , RS_BEHIND1          /* BEHIND / NOBEHIND matching rest */
343   , RS_BEHIND2          /* BEHIND / NOBEHIND matching behind part */
344   , RS_STAR_LONG        /* STAR/PLUS/BRACE_SIMPLE longest match */
345   , RS_STAR_SHORT       /* STAR/PLUS/BRACE_SIMPLE shortest match */
346 } regstate_T;
347 
348 /*
349  * When there are alternatives a regstate_T is put on the regstack to remember
350  * what we are doing.
351  * Before it may be another type of item, depending on rs_state, to remember
352  * more things.
353  */
354 typedef struct regitem_S {
355   regstate_T rs_state;          // what we are doing, one of RS_ above
356   uint16_t   rs_no;             // submatch nr or BEHIND/NOBEHIND
357   char_u     *rs_scan;          // current node in program
358   union {
359     save_se_T sesave;
360     regsave_T regsave;
361   } rs_un;                      ///< room for saving rex.input
362 } regitem_T;
363 
364 
365 /* used for STAR, PLUS and BRACE_SIMPLE matching */
366 typedef struct regstar_S {
367   int nextb;                    /* next byte */
368   int nextb_ic;                 /* next byte reverse case */
369   long count;
370   long minval;
371   long maxval;
372 } regstar_T;
373 
374 /* used to store input position when a BACK was encountered, so that we now if
375  * we made any progress since the last time. */
376 typedef struct backpos_S {
377   char_u      *bp_scan;         /* "scan" where BACK was encountered */
378   regsave_T bp_pos;             /* last input position */
379 } backpos_T;
380 
381 typedef struct {
382   int a, b, c;
383 } decomp_T;
384 
385 
386 #ifdef INCLUDE_GENERATED_DECLARATIONS
387 # include "regexp.c.generated.h"
388 #endif
no_Magic(int x)389 static int no_Magic(int x)
390 {
391   if (is_Magic(x))
392     return un_Magic(x);
393   return x;
394 }
395 
toggle_Magic(int x)396 static int toggle_Magic(int x)
397 {
398   if (is_Magic(x))
399     return un_Magic(x);
400   return Magic(x);
401 }
402 
403 /*
404  * The first byte of the regexp internal "program" is actually this magic
405  * number; the start node begins in the second byte.  It's used to catch the
406  * most severe mutilation of the program by the caller.
407  */
408 
409 #define REGMAGIC        0234
410 
411 /*
412  * Opcode notes:
413  *
414  * BRANCH	The set of branches constituting a single choice are hooked
415  *		together with their "next" pointers, since precedence prevents
416  *		anything being concatenated to any individual branch.  The
417  *		"next" pointer of the last BRANCH in a choice points to the
418  *		thing following the whole choice.  This is also where the
419  *		final "next" pointer of each individual branch points; each
420  *		branch starts with the operand node of a BRANCH node.
421  *
422  * BACK		Normal "next" pointers all implicitly point forward; BACK
423  *		exists to make loop structures possible.
424  *
425  * STAR,PLUS	'=', and complex '*' and '+', are implemented as circular
426  *		BRANCH structures using BACK.  Simple cases (one character
427  *		per match) are implemented with STAR and PLUS for speed
428  *		and to minimize recursive plunges.
429  *
430  * BRACE_LIMITS	This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
431  *		node, and defines the min and max limits to be used for that
432  *		node.
433  *
434  * MOPEN,MCLOSE	...are numbered at compile time.
435  * ZOPEN,ZCLOSE	...ditto
436  */
437 
438 /*
439  * A node is one char of opcode followed by two chars of "next" pointer.
440  * "Next" pointers are stored as two 8-bit bytes, high order first.  The
441  * value is a positive offset from the opcode of the node containing it.
442  * An operand, if any, simply follows the node.  (Note that much of the
443  * code generation knows about this implicit relationship.)
444  *
445  * Using two bytes for the "next" pointer is vast overkill for most things,
446  * but allows patterns to get big without disasters.
447  */
448 #define OP(p)           ((int)*(p))
449 #define NEXT(p)         (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
450 #define OPERAND(p)      ((p) + 3)
451 /* Obtain an operand that was stored as four bytes, MSB first. */
452 #define OPERAND_MIN(p)  (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
453                          + ((long)(p)[5] << 8) + (long)(p)[6])
454 /* Obtain a second operand stored as four bytes. */
455 #define OPERAND_MAX(p)  OPERAND_MIN((p) + 4)
456 /* Obtain a second single-byte operand stored after a four bytes operand. */
457 #define OPERAND_CMP(p)  (p)[7]
458 
459 /*
460  * Utility definitions.
461  */
462 #define UCHARAT(p)      ((int)*(char_u *)(p))
463 
464 // Used for an error (down from) vim_regcomp(): give the error message, set
465 // rc_did_emsg and return NULL
466 #define EMSG_RET_NULL(m) return (emsg(m), rc_did_emsg = true, (void *)NULL)
467 #define IEMSG_RET_NULL(m) return (iemsg(m), rc_did_emsg = true, (void *)NULL)
468 #define EMSG_RET_FAIL(m) return (emsg(m), rc_did_emsg = true, FAIL)
469 #define EMSG2_RET_NULL(m, c) \
470     return (semsg((m), (c) ? "" : "\\"), rc_did_emsg = true, (void *)NULL)
471 #define EMSG2_RET_FAIL(m, c) \
472     return (semsg((m), (c) ? "" : "\\"), rc_did_emsg = true, FAIL)
473 #define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_( \
474     "E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
475 
476 #define MAX_LIMIT       (32767L << 16L)
477 
478 
479 #ifdef BT_REGEXP_DUMP
480 static void regdump(char_u *, bt_regprog_T *);
481 #endif
482 #ifdef REGEXP_DEBUG
483 static char_u   *regprop(char_u *);
484 #endif
485 
486 static char_u e_missingbracket[] = N_("E769: Missing ] after %s[");
487 static char_u e_reverse_range[] = N_("E944: Reverse range in character class");
488 static char_u e_large_class[] = N_("E945: Range too large in character class");
489 static char_u e_unmatchedpp[] = N_("E53: Unmatched %s%%(");
490 static char_u e_unmatchedp[] = N_("E54: Unmatched %s(");
491 static char_u e_unmatchedpar[] = N_("E55: Unmatched %s)");
492 static char_u e_z_not_allowed[] = N_("E66: \\z( not allowed here");
493 static char_u e_z1_not_allowed[] = N_("E67: \\z1 - \\z9 not allowed here");
494 static char_u e_missing_sb[] = N_("E69: Missing ] after %s%%[");
495 static char_u e_empty_sb[]  = N_("E70: Empty %s%%[]");
496 static char_u e_recursive[]  = N_("E956: Cannot use pattern recursively");
497 
498 #define NOT_MULTI       0
499 #define MULTI_ONE       1
500 #define MULTI_MULT      2
501 /*
502  * Return NOT_MULTI if c is not a "multi" operator.
503  * Return MULTI_ONE if c is a single "multi" operator.
504  * Return MULTI_MULT if c is a multi "multi" operator.
505  */
re_multi_type(int c)506 static int re_multi_type(int c)
507 {
508   if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
509     return MULTI_ONE;
510   if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
511     return MULTI_MULT;
512   return NOT_MULTI;
513 }
514 
515 /*
516  * Flags to be passed up and down.
517  */
518 #define HASWIDTH        0x1     /* Known never to match null string. */
519 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
520 #define SPSTART         0x4     /* Starts with * or +. */
521 #define HASNL           0x8     /* Contains some \n. */
522 #define HASLOOKBH       0x10    /* Contains "\@<=" or "\@<!". */
523 #define WORST           0       /* Worst case. */
524 
525 /*
526  * When regcode is set to this value, code is not emitted and size is computed
527  * instead.
528  */
529 #define JUST_CALC_SIZE  ((char_u *) -1)
530 
531 static char_u           *reg_prev_sub = NULL;
532 
533 /*
534  * REGEXP_INRANGE contains all characters which are always special in a []
535  * range after '\'.
536  * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
537  * These are:
538  *  \n	- New line (NL).
539  *  \r	- Carriage Return (CR).
540  *  \t	- Tab (TAB).
541  *  \e	- Escape (ESC).
542  *  \b	- Backspace (Ctrl_H).
543  *  \d  - Character code in decimal, eg \d123
544  *  \o	- Character code in octal, eg \o80
545  *  \x	- Character code in hex, eg \x4a
546  *  \u	- Multibyte character code, eg \u20ac
547  *  \U	- Long multibyte character code, eg \U12345678
548  */
549 static char_u REGEXP_INRANGE[] = "]^-n\\";
550 static char_u REGEXP_ABBR[] = "nrtebdoxuU";
551 
552 
553 /*
554  * Translate '\x' to its control character, except "\n", which is Magic.
555  */
backslash_trans(int c)556 static int backslash_trans(int c)
557 {
558   switch (c) {
559   case 'r':   return CAR;
560   case 't':   return TAB;
561   case 'e':   return ESC;
562   case 'b':   return BS;
563   }
564   return c;
565 }
566 
567 /*
568  * Check for a character class name "[:name:]".  "pp" points to the '['.
569  * Returns one of the CLASS_ items. CLASS_NONE means that no item was
570  * recognized.  Otherwise "pp" is advanced to after the item.
571  */
get_char_class(char_u ** pp)572 static int get_char_class(char_u **pp)
573 {
574   static const char *(class_names[]) =
575   {
576     "alnum:]",
577 #define CLASS_ALNUM 0
578     "alpha:]",
579 #define CLASS_ALPHA 1
580     "blank:]",
581 #define CLASS_BLANK 2
582     "cntrl:]",
583 #define CLASS_CNTRL 3
584     "digit:]",
585 #define CLASS_DIGIT 4
586     "graph:]",
587 #define CLASS_GRAPH 5
588     "lower:]",
589 #define CLASS_LOWER 6
590     "print:]",
591 #define CLASS_PRINT 7
592     "punct:]",
593 #define CLASS_PUNCT 8
594     "space:]",
595 #define CLASS_SPACE 9
596     "upper:]",
597 #define CLASS_UPPER 10
598     "xdigit:]",
599 #define CLASS_XDIGIT 11
600     "tab:]",
601 #define CLASS_TAB 12
602     "return:]",
603 #define CLASS_RETURN 13
604     "backspace:]",
605 #define CLASS_BACKSPACE 14
606     "escape:]",
607 #define CLASS_ESCAPE 15
608     "ident:]",
609 #define CLASS_IDENT 16
610     "keyword:]",
611 #define CLASS_KEYWORD 17
612     "fname:]",
613 #define CLASS_FNAME 18
614   };
615 #define CLASS_NONE 99
616   int i;
617 
618   if ((*pp)[1] == ':') {
619     for (i = 0; i < (int)ARRAY_SIZE(class_names); ++i)
620       if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0) {
621         *pp += STRLEN(class_names[i]) + 2;
622         return i;
623       }
624   }
625   return CLASS_NONE;
626 }
627 
628 /*
629  * Specific version of character class functions.
630  * Using a table to keep this fast.
631  */
632 static short class_tab[256];
633 
634 #define     RI_DIGIT    0x01
635 #define     RI_HEX      0x02
636 #define     RI_OCTAL    0x04
637 #define     RI_WORD     0x08
638 #define     RI_HEAD     0x10
639 #define     RI_ALPHA    0x20
640 #define     RI_LOWER    0x40
641 #define     RI_UPPER    0x80
642 #define     RI_WHITE    0x100
643 
init_class_tab(void)644 static void init_class_tab(void)
645 {
646   int i;
647   static int done = false;
648 
649   if (done)
650     return;
651 
652   for (i = 0; i < 256; ++i) {
653     if (i >= '0' && i <= '7')
654       class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
655     else if (i >= '8' && i <= '9')
656       class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
657     else if (i >= 'a' && i <= 'f')
658       class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
659     else if (i >= 'g' && i <= 'z')
660       class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
661     else if (i >= 'A' && i <= 'F')
662       class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
663     else if (i >= 'G' && i <= 'Z')
664       class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
665     else if (i == '_')
666       class_tab[i] = RI_WORD + RI_HEAD;
667     else
668       class_tab[i] = 0;
669   }
670   class_tab[' '] |= RI_WHITE;
671   class_tab['\t'] |= RI_WHITE;
672   done = true;
673 }
674 
675 # define ri_digit(c)    (c < 0x100 && (class_tab[c] & RI_DIGIT))
676 # define ri_hex(c)      (c < 0x100 && (class_tab[c] & RI_HEX))
677 # define ri_octal(c)    (c < 0x100 && (class_tab[c] & RI_OCTAL))
678 # define ri_word(c)     (c < 0x100 && (class_tab[c] & RI_WORD))
679 # define ri_head(c)     (c < 0x100 && (class_tab[c] & RI_HEAD))
680 # define ri_alpha(c)    (c < 0x100 && (class_tab[c] & RI_ALPHA))
681 # define ri_lower(c)    (c < 0x100 && (class_tab[c] & RI_LOWER))
682 # define ri_upper(c)    (c < 0x100 && (class_tab[c] & RI_UPPER))
683 # define ri_white(c)    (c < 0x100 && (class_tab[c] & RI_WHITE))
684 
685 /* flags for regflags */
686 #define RF_ICASE    1   /* ignore case */
687 #define RF_NOICASE  2   /* don't ignore case */
688 #define RF_HASNL    4   /* can match a NL */
689 #define RF_ICOMBINE 8   /* ignore combining characters */
690 #define RF_LOOKBH   16  /* uses "\@<=" or "\@<!" */
691 
692 // Global work variables for vim_regcomp().
693 
694 static char_u *regparse;        ///< Input-scan pointer.
695 static int prevchr_len;         ///< byte length of previous char
696 static int num_complex_braces;  ///< Complex \{...} count
697 static int regnpar;             ///< () count.
698 static bool wants_nfa;          ///< regex should use NFA engine
699 static int regnzpar;            ///< \z() count.
700 static int re_has_z;            ///< \z item detected
701 static char_u *regcode;         ///< Code-emit pointer, or JUST_CALC_SIZE
702 static long regsize;            ///< Code size.
703 static int reg_toolong;         ///< true when offset out of range
704 static char_u had_endbrace[NSUBEXP];  ///< flags, true if end of () found
705 static unsigned regflags;         ///< RF_ flags for prog
706 static long brace_min[10];        ///< Minimums for complex brace repeats
707 static long brace_max[10];        ///< Maximums for complex brace repeats
708 static int brace_count[10];       ///< Current counts for complex brace repeats
709 static int had_eol;               ///< true when EOL found by vim_regcomp()
710 static int one_exactly = false;   ///< only do one char for EXACTLY
711 
712 static int reg_magic;           /* magicness of the pattern: */
713 #define MAGIC_NONE      1       /* "\V" very unmagic */
714 #define MAGIC_OFF       2       /* "\M" or 'magic' off */
715 #define MAGIC_ON        3       /* "\m" or 'magic' */
716 #define MAGIC_ALL       4       /* "\v" very magic */
717 
718 static int reg_string;          // matching with a string instead of a buffer
719                                 // line
720 static int reg_strict;          // "[abc" is illegal
721 
722 /*
723  * META contains all characters that may be magic, except '^' and '$'.
724  */
725 
726 /* META[] is used often enough to justify turning it into a table. */
727 static char_u META_flags[] = {
728   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
729   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
730   /*		   %  &     (  )  *  +	      .    */
731   0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
732   /*     1  2  3	4  5  6  7  8  9	<  =  >  ? */
733   0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
734   /*  @  A     C	D     F     H  I     K	L  M	 O */
735   1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
736   /*  P	     S	   U  V  W  X	  Z  [		 _ */
737   1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
738   /*     a     c	d     f     h  i     k	l  m  n  o */
739   0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
740   /*  p	     s	   u  v  w  x	  z  {	|     ~    */
741   1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
742 };
743 
744 static int curchr;              // currently parsed character
745 // Previous character.  Note: prevchr is sometimes -1 when we are not at the
746 // start, eg in /[ ^I]^ the pattern was never found even if it existed,
747 // because ^ was taken to be magic -- webb
748 static int prevchr;
749 static int prevprevchr;         /* previous-previous character */
750 static int nextchr;             /* used for ungetchr() */
751 
752 /* arguments for reg() */
753 #define REG_NOPAREN     0       /* toplevel reg() */
754 #define REG_PAREN       1       /* \(\) */
755 #define REG_ZPAREN      2       /* \z(\) */
756 #define REG_NPAREN      3       /* \%(\) */
757 
758 /*
759  * Forward declarations for vim_regcomp()'s friends.
760  */
761 # define REGMBC(x) regmbc(x);
762 # define CASEMBC(x) case x:
763 
764 static regengine_T bt_regengine;
765 static regengine_T nfa_regengine;
766 
767 // Return true if compiled regular expression "prog" can match a line break.
re_multiline(const regprog_T * prog)768 int re_multiline(const regprog_T *prog)
769   FUNC_ATTR_NONNULL_ALL
770 {
771   return prog->regflags & RF_HASNL;
772 }
773 
774 /*
775  * Check for an equivalence class name "[=a=]".  "pp" points to the '['.
776  * Returns a character representing the class. Zero means that no item was
777  * recognized.  Otherwise "pp" is advanced to after the item.
778  */
get_equi_class(char_u ** pp)779 static int get_equi_class(char_u **pp)
780 {
781   int c;
782   int l = 1;
783   char_u      *p = *pp;
784 
785   if (p[1] == '=' && p[2] != NUL) {
786     l = utfc_ptr2len(p + 2);
787     if (p[l + 2] == '=' && p[l + 3] == ']') {
788       c = utf_ptr2char(p + 2);
789       *pp += l + 4;
790       return c;
791     }
792   }
793   return 0;
794 }
795 
796 
797 /*
798  * Produce the bytes for equivalence class "c".
799  * Currently only handles latin1, latin9 and utf-8.
800  * NOTE: When changing this function, also change nfa_emit_equi_class()
801  */
reg_equi_class(int c)802 static void reg_equi_class(int c)
803 {
804   {
805     switch (c) {
806       // Do not use '\300' style, it results in a negative number.
807     case 'A': case 0xc0: case 0xc1: case 0xc2:
808     case 0xc3: case 0xc4: case 0xc5:
809       CASEMBC(0x100) CASEMBC(0x102) CASEMBC(0x104) CASEMBC(0x1cd)
810       CASEMBC(0x1de) CASEMBC(0x1e0) CASEMBC(0x1ea2)
811       regmbc('A'); regmbc(0xc0); regmbc(0xc1);
812       regmbc(0xc2); regmbc(0xc3); regmbc(0xc4);
813       regmbc(0xc5);
814       REGMBC(0x100) REGMBC(0x102) REGMBC(0x104)
815       REGMBC(0x1cd) REGMBC(0x1de) REGMBC(0x1e0)
816       REGMBC(0x1ea2)
817       return;
818     case 'B': CASEMBC(0x1e02) CASEMBC(0x1e06)
819       regmbc('B'); REGMBC(0x1e02) REGMBC(0x1e06)
820       return;
821     case 'C': case 0xc7:
822       CASEMBC(0x106) CASEMBC(0x108) CASEMBC(0x10a) CASEMBC(0x10c)
823       regmbc('C'); regmbc(0xc7);
824       REGMBC(0x106) REGMBC(0x108) REGMBC(0x10a)
825       REGMBC(0x10c)
826       return;
827     case 'D': CASEMBC(0x10e) CASEMBC(0x110) CASEMBC(0x1e0a)
828       CASEMBC(0x1e0e) CASEMBC(0x1e10)
829       regmbc('D'); REGMBC(0x10e) REGMBC(0x110)
830       REGMBC(0x1e0a) REGMBC(0x1e0e) REGMBC(0x1e10)
831       return;
832     case 'E': case 0xc8: case 0xc9: case 0xca: case 0xcb:
833       CASEMBC(0x112) CASEMBC(0x114) CASEMBC(0x116) CASEMBC(0x118)
834       CASEMBC(0x11a) CASEMBC(0x1eba) CASEMBC(0x1ebc)
835       regmbc('E'); regmbc(0xc8); regmbc(0xc9);
836       regmbc(0xca); regmbc(0xcb);
837       REGMBC(0x112) REGMBC(0x114) REGMBC(0x116)
838       REGMBC(0x118) REGMBC(0x11a) REGMBC(0x1eba)
839       REGMBC(0x1ebc)
840       return;
841     case 'F': CASEMBC(0x1e1e)
842       regmbc('F'); REGMBC(0x1e1e)
843       return;
844     case 'G': CASEMBC(0x11c) CASEMBC(0x11e) CASEMBC(0x120)
845       CASEMBC(0x122) CASEMBC(0x1e4) CASEMBC(0x1e6) CASEMBC(0x1f4)
846       CASEMBC(0x1e20)
847       regmbc('G'); REGMBC(0x11c) REGMBC(0x11e)
848       REGMBC(0x120) REGMBC(0x122) REGMBC(0x1e4)
849       REGMBC(0x1e6) REGMBC(0x1f4) REGMBC(0x1e20)
850       return;
851     case 'H': CASEMBC(0x124) CASEMBC(0x126) CASEMBC(0x1e22)
852       CASEMBC(0x1e26) CASEMBC(0x1e28)
853       regmbc('H'); REGMBC(0x124) REGMBC(0x126)
854       REGMBC(0x1e22) REGMBC(0x1e26) REGMBC(0x1e28)
855       return;
856     case 'I': case 0xcc: case 0xcd: case 0xce: case 0xcf:
857       CASEMBC(0x128) CASEMBC(0x12a) CASEMBC(0x12c) CASEMBC(0x12e)
858       CASEMBC(0x130) CASEMBC(0x1cf) CASEMBC(0x1ec8)
859       regmbc('I'); regmbc(0xcc); regmbc(0xcd);
860       regmbc(0xce); regmbc(0xcf);
861       REGMBC(0x128) REGMBC(0x12a) REGMBC(0x12c)
862       REGMBC(0x12e) REGMBC(0x130) REGMBC(0x1cf)
863       REGMBC(0x1ec8)
864       return;
865     case 'J': CASEMBC(0x134)
866       regmbc('J'); REGMBC(0x134)
867       return;
868     case 'K': CASEMBC(0x136) CASEMBC(0x1e8) CASEMBC(0x1e30)
869       CASEMBC(0x1e34)
870       regmbc('K'); REGMBC(0x136) REGMBC(0x1e8)
871       REGMBC(0x1e30) REGMBC(0x1e34)
872       return;
873     case 'L': CASEMBC(0x139) CASEMBC(0x13b) CASEMBC(0x13d)
874       CASEMBC(0x13f) CASEMBC(0x141) CASEMBC(0x1e3a)
875       regmbc('L'); REGMBC(0x139) REGMBC(0x13b)
876       REGMBC(0x13d) REGMBC(0x13f) REGMBC(0x141)
877       REGMBC(0x1e3a)
878       return;
879     case 'M': CASEMBC(0x1e3e) CASEMBC(0x1e40)
880       regmbc('M'); REGMBC(0x1e3e) REGMBC(0x1e40)
881       return;
882     case 'N': case 0xd1:
883       CASEMBC(0x143) CASEMBC(0x145) CASEMBC(0x147) CASEMBC(0x1e44)
884       CASEMBC(0x1e48)
885       regmbc('N'); regmbc(0xd1);
886       REGMBC(0x143) REGMBC(0x145) REGMBC(0x147)
887       REGMBC(0x1e44) REGMBC(0x1e48)
888       return;
889     case 'O': case 0xd2: case 0xd3: case 0xd4: case 0xd5:
890     case 0xd6: case 0xd8:
891       CASEMBC(0x14c) CASEMBC(0x14e) CASEMBC(0x150) CASEMBC(0x1a0)
892       CASEMBC(0x1d1) CASEMBC(0x1ea) CASEMBC(0x1ec) CASEMBC(0x1ece)
893       regmbc('O'); regmbc(0xd2); regmbc(0xd3);
894       regmbc(0xd4); regmbc(0xd5); regmbc(0xd6);
895       regmbc(0xd8);
896       REGMBC(0x14c) REGMBC(0x14e) REGMBC(0x150)
897       REGMBC(0x1a0) REGMBC(0x1d1) REGMBC(0x1ea)
898       REGMBC(0x1ec) REGMBC(0x1ece)
899       return;
900     case 'P': case 0x1e54: case 0x1e56:
901       regmbc('P'); REGMBC(0x1e54) REGMBC(0x1e56)
902       return;
903     case 'R': CASEMBC(0x154) CASEMBC(0x156) CASEMBC(0x158)
904       CASEMBC(0x1e58) CASEMBC(0x1e5e)
905       regmbc('R'); REGMBC(0x154) REGMBC(0x156) REGMBC(0x158)
906       REGMBC(0x1e58) REGMBC(0x1e5e)
907       return;
908     case 'S': CASEMBC(0x15a) CASEMBC(0x15c) CASEMBC(0x15e)
909       CASEMBC(0x160) CASEMBC(0x1e60)
910       regmbc('S'); REGMBC(0x15a) REGMBC(0x15c)
911       REGMBC(0x15e) REGMBC(0x160) REGMBC(0x1e60)
912       return;
913     case 'T': CASEMBC(0x162) CASEMBC(0x164) CASEMBC(0x166)
914       CASEMBC(0x1e6a) CASEMBC(0x1e6e)
915       regmbc('T'); REGMBC(0x162) REGMBC(0x164)
916       REGMBC(0x166) REGMBC(0x1e6a) REGMBC(0x1e6e)
917       return;
918     case 'U': case 0xd9: case 0xda: case 0xdb: case 0xdc:
919       CASEMBC(0x168) CASEMBC(0x16a) CASEMBC(0x16c) CASEMBC(0x16e)
920       CASEMBC(0x170) CASEMBC(0x172) CASEMBC(0x1af) CASEMBC(0x1d3)
921       CASEMBC(0x1ee6)
922       regmbc('U'); regmbc(0xd9); regmbc(0xda);
923       regmbc(0xdb); regmbc(0xdc);
924       REGMBC(0x168) REGMBC(0x16a) REGMBC(0x16c)
925       REGMBC(0x16e) REGMBC(0x170) REGMBC(0x172)
926       REGMBC(0x1af) REGMBC(0x1d3) REGMBC(0x1ee6)
927       return;
928     case 'V': CASEMBC(0x1e7c)
929       regmbc('V'); REGMBC(0x1e7c)
930       return;
931     case 'W': CASEMBC(0x174) CASEMBC(0x1e80) CASEMBC(0x1e82)
932       CASEMBC(0x1e84) CASEMBC(0x1e86)
933       regmbc('W'); REGMBC(0x174) REGMBC(0x1e80)
934       REGMBC(0x1e82) REGMBC(0x1e84) REGMBC(0x1e86)
935       return;
936     case 'X': CASEMBC(0x1e8a) CASEMBC(0x1e8c)
937       regmbc('X'); REGMBC(0x1e8a) REGMBC(0x1e8c)
938       return;
939     case 'Y': case 0xdd:
940       CASEMBC(0x176) CASEMBC(0x178) CASEMBC(0x1e8e) CASEMBC(0x1ef2)
941       CASEMBC(0x1ef6) CASEMBC(0x1ef8)
942       regmbc('Y'); regmbc(0xdd);
943       REGMBC(0x176) REGMBC(0x178) REGMBC(0x1e8e)
944       REGMBC(0x1ef2) REGMBC(0x1ef6) REGMBC(0x1ef8)
945       return;
946     case 'Z': CASEMBC(0x179) CASEMBC(0x17b) CASEMBC(0x17d)
947       CASEMBC(0x1b5) CASEMBC(0x1e90) CASEMBC(0x1e94)
948       regmbc('Z'); REGMBC(0x179) REGMBC(0x17b)
949       REGMBC(0x17d) REGMBC(0x1b5) REGMBC(0x1e90)
950       REGMBC(0x1e94)
951       return;
952     case 'a': case 0xe0: case 0xe1: case 0xe2:
953     case 0xe3: case 0xe4: case 0xe5:
954       CASEMBC(0x101) CASEMBC(0x103) CASEMBC(0x105) CASEMBC(0x1ce)
955       CASEMBC(0x1df) CASEMBC(0x1e1) CASEMBC(0x1ea3)
956       regmbc('a'); regmbc(0xe0); regmbc(0xe1);
957       regmbc(0xe2); regmbc(0xe3); regmbc(0xe4);
958       regmbc(0xe5);
959       REGMBC(0x101) REGMBC(0x103) REGMBC(0x105)
960       REGMBC(0x1ce) REGMBC(0x1df) REGMBC(0x1e1)
961       REGMBC(0x1ea3)
962       return;
963     case 'b': CASEMBC(0x1e03) CASEMBC(0x1e07)
964       regmbc('b'); REGMBC(0x1e03) REGMBC(0x1e07)
965       return;
966     case 'c': case 0xe7:
967       CASEMBC(0x107) CASEMBC(0x109) CASEMBC(0x10b) CASEMBC(0x10d)
968       regmbc('c'); regmbc(0xe7);
969       REGMBC(0x107) REGMBC(0x109) REGMBC(0x10b)
970       REGMBC(0x10d)
971       return;
972     case 'd': CASEMBC(0x10f) CASEMBC(0x111) CASEMBC(0x1e0b)
973       CASEMBC(0x1e0f) CASEMBC(0x1e11)
974       regmbc('d'); REGMBC(0x10f) REGMBC(0x111)
975       REGMBC(0x1e0b) REGMBC(0x1e0f) REGMBC(0x1e11)
976       return;
977     case 'e': case 0xe8: case 0xe9: case 0xea: case 0xeb:
978       CASEMBC(0x113) CASEMBC(0x115) CASEMBC(0x117) CASEMBC(0x119)
979       CASEMBC(0x11b) CASEMBC(0x1ebb) CASEMBC(0x1ebd)
980       regmbc('e'); regmbc(0xe8); regmbc(0xe9);
981       regmbc(0xea); regmbc(0xeb);
982       REGMBC(0x113) REGMBC(0x115) REGMBC(0x117)
983       REGMBC(0x119) REGMBC(0x11b) REGMBC(0x1ebb)
984       REGMBC(0x1ebd)
985       return;
986     case 'f': CASEMBC(0x1e1f)
987       regmbc('f'); REGMBC(0x1e1f)
988       return;
989     case 'g': CASEMBC(0x11d) CASEMBC(0x11f) CASEMBC(0x121)
990       CASEMBC(0x123) CASEMBC(0x1e5) CASEMBC(0x1e7) CASEMBC(0x1f5)
991       CASEMBC(0x1e21)
992       regmbc('g'); REGMBC(0x11d) REGMBC(0x11f)
993       REGMBC(0x121) REGMBC(0x123) REGMBC(0x1e5)
994       REGMBC(0x1e7) REGMBC(0x1f5) REGMBC(0x1e21)
995       return;
996     case 'h': CASEMBC(0x125) CASEMBC(0x127) CASEMBC(0x1e23)
997       CASEMBC(0x1e27) CASEMBC(0x1e29) CASEMBC(0x1e96)
998       regmbc('h'); REGMBC(0x125) REGMBC(0x127)
999       REGMBC(0x1e23) REGMBC(0x1e27) REGMBC(0x1e29)
1000       REGMBC(0x1e96)
1001       return;
1002     case 'i': case 0xec: case 0xed: case 0xee: case 0xef:
1003       CASEMBC(0x129) CASEMBC(0x12b) CASEMBC(0x12d) CASEMBC(0x12f)
1004       CASEMBC(0x1d0) CASEMBC(0x1ec9)
1005       regmbc('i'); regmbc(0xec); regmbc(0xed);
1006       regmbc(0xee); regmbc(0xef);
1007       REGMBC(0x129) REGMBC(0x12b) REGMBC(0x12d)
1008       REGMBC(0x12f) REGMBC(0x1d0) REGMBC(0x1ec9)
1009       return;
1010     case 'j': CASEMBC(0x135) CASEMBC(0x1f0)
1011       regmbc('j'); REGMBC(0x135) REGMBC(0x1f0)
1012       return;
1013     case 'k': CASEMBC(0x137) CASEMBC(0x1e9) CASEMBC(0x1e31)
1014       CASEMBC(0x1e35)
1015       regmbc('k'); REGMBC(0x137) REGMBC(0x1e9)
1016       REGMBC(0x1e31) REGMBC(0x1e35)
1017       return;
1018     case 'l': CASEMBC(0x13a) CASEMBC(0x13c) CASEMBC(0x13e)
1019       CASEMBC(0x140) CASEMBC(0x142) CASEMBC(0x1e3b)
1020       regmbc('l'); REGMBC(0x13a) REGMBC(0x13c)
1021       REGMBC(0x13e) REGMBC(0x140) REGMBC(0x142)
1022       REGMBC(0x1e3b)
1023       return;
1024     case 'm': CASEMBC(0x1e3f) CASEMBC(0x1e41)
1025       regmbc('m'); REGMBC(0x1e3f) REGMBC(0x1e41)
1026       return;
1027     case 'n': case 0xf1:
1028       CASEMBC(0x144) CASEMBC(0x146) CASEMBC(0x148) CASEMBC(0x149)
1029       CASEMBC(0x1e45) CASEMBC(0x1e49)
1030       regmbc('n'); regmbc(0xf1);
1031       REGMBC(0x144) REGMBC(0x146) REGMBC(0x148)
1032       REGMBC(0x149) REGMBC(0x1e45) REGMBC(0x1e49)
1033       return;
1034     case 'o': case 0xf2: case 0xf3: case 0xf4: case 0xf5:
1035     case 0xf6: case 0xf8:
1036       CASEMBC(0x14d) CASEMBC(0x14f) CASEMBC(0x151) CASEMBC(0x1a1)
1037       CASEMBC(0x1d2) CASEMBC(0x1eb) CASEMBC(0x1ed) CASEMBC(0x1ecf)
1038       regmbc('o'); regmbc(0xf2); regmbc(0xf3);
1039       regmbc(0xf4); regmbc(0xf5); regmbc(0xf6);
1040       regmbc(0xf8);
1041       REGMBC(0x14d) REGMBC(0x14f) REGMBC(0x151)
1042       REGMBC(0x1a1) REGMBC(0x1d2) REGMBC(0x1eb)
1043       REGMBC(0x1ed) REGMBC(0x1ecf)
1044       return;
1045     case 'p': CASEMBC(0x1e55) CASEMBC(0x1e57)
1046       regmbc('p'); REGMBC(0x1e55) REGMBC(0x1e57)
1047       return;
1048     case 'r': CASEMBC(0x155) CASEMBC(0x157) CASEMBC(0x159)
1049       CASEMBC(0x1e59) CASEMBC(0x1e5f)
1050       regmbc('r'); REGMBC(0x155) REGMBC(0x157) REGMBC(0x159)
1051       REGMBC(0x1e59) REGMBC(0x1e5f)
1052       return;
1053     case 's': CASEMBC(0x15b) CASEMBC(0x15d) CASEMBC(0x15f)
1054       CASEMBC(0x161) CASEMBC(0x1e61)
1055       regmbc('s'); REGMBC(0x15b) REGMBC(0x15d)
1056       REGMBC(0x15f) REGMBC(0x161) REGMBC(0x1e61)
1057       return;
1058     case 't': CASEMBC(0x163) CASEMBC(0x165) CASEMBC(0x167)
1059       CASEMBC(0x1e6b) CASEMBC(0x1e6f) CASEMBC(0x1e97)
1060       regmbc('t'); REGMBC(0x163) REGMBC(0x165) REGMBC(0x167)
1061       REGMBC(0x1e6b) REGMBC(0x1e6f) REGMBC(0x1e97)
1062       return;
1063     case 'u': case 0xf9: case 0xfa: case 0xfb: case 0xfc:
1064       CASEMBC(0x169) CASEMBC(0x16b) CASEMBC(0x16d) CASEMBC(0x16f)
1065       CASEMBC(0x171) CASEMBC(0x173) CASEMBC(0x1b0) CASEMBC(0x1d4)
1066       CASEMBC(0x1ee7)
1067       regmbc('u'); regmbc(0xf9); regmbc(0xfa);
1068       regmbc(0xfb); regmbc(0xfc);
1069       REGMBC(0x169) REGMBC(0x16b) REGMBC(0x16d)
1070       REGMBC(0x16f) REGMBC(0x171) REGMBC(0x173)
1071       REGMBC(0x1b0) REGMBC(0x1d4) REGMBC(0x1ee7)
1072       return;
1073     case 'v': CASEMBC(0x1e7d)
1074       regmbc('v'); REGMBC(0x1e7d)
1075       return;
1076     case 'w': CASEMBC(0x175) CASEMBC(0x1e81) CASEMBC(0x1e83)
1077       CASEMBC(0x1e85) CASEMBC(0x1e87) CASEMBC(0x1e98)
1078       regmbc('w'); REGMBC(0x175) REGMBC(0x1e81)
1079       REGMBC(0x1e83) REGMBC(0x1e85) REGMBC(0x1e87)
1080       REGMBC(0x1e98)
1081       return;
1082     case 'x': CASEMBC(0x1e8b) CASEMBC(0x1e8d)
1083       regmbc('x'); REGMBC(0x1e8b) REGMBC(0x1e8d)
1084       return;
1085     case 'y': case 0xfd: case 0xff:
1086       CASEMBC(0x177) CASEMBC(0x1e8f) CASEMBC(0x1e99)
1087       CASEMBC(0x1ef3) CASEMBC(0x1ef7) CASEMBC(0x1ef9)
1088       regmbc('y'); regmbc(0xfd); regmbc(0xff);
1089       REGMBC(0x177) REGMBC(0x1e8f) REGMBC(0x1e99)
1090       REGMBC(0x1ef3) REGMBC(0x1ef7) REGMBC(0x1ef9)
1091       return;
1092     case 'z': CASEMBC(0x17a) CASEMBC(0x17c) CASEMBC(0x17e)
1093       CASEMBC(0x1b6) CASEMBC(0x1e91) CASEMBC(0x1e95)
1094       regmbc('z'); REGMBC(0x17a) REGMBC(0x17c)
1095       REGMBC(0x17e) REGMBC(0x1b6) REGMBC(0x1e91)
1096       REGMBC(0x1e95)
1097       return;
1098     }
1099   }
1100   regmbc(c);
1101 }
1102 
1103 /*
1104  * Check for a collating element "[.a.]".  "pp" points to the '['.
1105  * Returns a character. Zero means that no item was recognized.  Otherwise
1106  * "pp" is advanced to after the item.
1107  * Currently only single characters are recognized!
1108  */
get_coll_element(char_u ** pp)1109 static int get_coll_element(char_u **pp)
1110 {
1111   int c;
1112   int l = 1;
1113   char_u      *p = *pp;
1114 
1115   if (p[0] != NUL && p[1] == '.' && p[2] != NUL) {
1116     l = utfc_ptr2len(p + 2);
1117     if (p[l + 2] == '.' && p[l + 3] == ']') {
1118       c = utf_ptr2char(p + 2);
1119       *pp += l + 4;
1120       return c;
1121     }
1122   }
1123   return 0;
1124 }
1125 
1126 static int reg_cpo_lit; /* 'cpoptions' contains 'l' flag */
1127 
get_cpo_flags(void)1128 static void get_cpo_flags(void)
1129 {
1130   reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
1131 }
1132 
1133 /*
1134  * Skip over a "[]" range.
1135  * "p" must point to the character after the '['.
1136  * The returned pointer is on the matching ']', or the terminating NUL.
1137  */
skip_anyof(char_u * p)1138 static char_u *skip_anyof(char_u *p)
1139 {
1140   int l;
1141 
1142   if (*p == '^')        /* Complement of range. */
1143     ++p;
1144   if (*p == ']' || *p == '-')
1145     ++p;
1146   while (*p != NUL && *p != ']') {
1147     if ((l = utfc_ptr2len(p)) > 1) {
1148       p += l;
1149     } else if (*p == '-')  {
1150       p++;
1151       if (*p != ']' && *p != NUL) {
1152         MB_PTR_ADV(p);
1153       }
1154     } else if (*p == '\\'
1155                && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
1156                    || (!reg_cpo_lit
1157                        && vim_strchr(REGEXP_ABBR, p[1]) != NULL))) {
1158       p += 2;
1159     } else if (*p == '[') {
1160       if (get_char_class(&p) == CLASS_NONE
1161           && get_equi_class(&p) == 0
1162           && get_coll_element(&p) == 0
1163           && *p != NUL) {
1164         p++;          // It is not a class name and not NUL
1165       }
1166     } else {
1167       p++;
1168     }
1169   }
1170 
1171   return p;
1172 }
1173 
1174 /*
1175  * Skip past regular expression.
1176  * Stop at end of "startp" or where "dirc" is found ('/', '?', etc).
1177  * Take care of characters with a backslash in front of it.
1178  * Skip strings inside [ and ].
1179  * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
1180  * expression and change "\?" to "?".  If "*newp" is not NULL the expression
1181  * is changed in-place.
1182  */
skip_regexp(char_u * startp,int dirc,int magic,char_u ** newp)1183 char_u *skip_regexp(char_u *startp, int dirc, int magic, char_u **newp)
1184 {
1185   int mymagic;
1186   char_u      *p = startp;
1187 
1188   if (magic)
1189     mymagic = MAGIC_ON;
1190   else
1191     mymagic = MAGIC_OFF;
1192   get_cpo_flags();
1193 
1194   for (; p[0] != NUL; MB_PTR_ADV(p)) {
1195     if (p[0] == dirc) {         // found end of regexp
1196       break;
1197     }
1198     if ((p[0] == '[' && mymagic >= MAGIC_ON)
1199         || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF)) {
1200       p = skip_anyof(p + 1);
1201       if (p[0] == NUL)
1202         break;
1203     } else if (p[0] == '\\' && p[1] != NUL)   {
1204       if (dirc == '?' && newp != NULL && p[1] == '?') {
1205         /* change "\?" to "?", make a copy first. */
1206         if (*newp == NULL) {
1207           *newp = vim_strsave(startp);
1208           p = *newp + (p - startp);
1209         }
1210         STRMOVE(p, p + 1);
1211       } else
1212         ++p;            /* skip next character */
1213       if (*p == 'v')
1214         mymagic = MAGIC_ALL;
1215       else if (*p == 'V')
1216         mymagic = MAGIC_NONE;
1217     }
1218   }
1219   return p;
1220 }
1221 
1222 /// Return true if the back reference is legal. We must have seen the close
1223 /// brace.
1224 /// TODO(vim): Should also check that we don't refer to something repeated
1225 /// (+*=): what instance of the repetition should we match?
seen_endbrace(int refnum)1226 static int seen_endbrace(int refnum)
1227 {
1228   if (!had_endbrace[refnum]) {
1229       char_u *p;
1230 
1231       // Trick: check if "@<=" or "@<!" follows, in which case
1232       // the \1 can appear before the referenced match.
1233       for (p = regparse; *p != NUL; p++) {
1234         if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '=')) {
1235           break;
1236         }
1237       }
1238 
1239     if (*p == NUL) {
1240       emsg(_("E65: Illegal back reference"));
1241       rc_did_emsg = true;
1242       return false;
1243     }
1244   }
1245   return true;
1246 }
1247 
1248 /*
1249  * bt_regcomp() - compile a regular expression into internal code for the
1250  * traditional back track matcher.
1251  * Returns the program in allocated space.  Returns NULL for an error.
1252  *
1253  * We can't allocate space until we know how big the compiled form will be,
1254  * but we can't compile it (and thus know how big it is) until we've got a
1255  * place to put the code.  So we cheat:  we compile it twice, once with code
1256  * generation turned off and size counting turned on, and once "for real".
1257  * This also means that we don't allocate space until we are sure that the
1258  * thing really will compile successfully, and we never have to move the
1259  * code and thus invalidate pointers into it.  (Note that it has to be in
1260  * one piece because free() must be able to free it all.)
1261  *
1262  * Whether upper/lower case is to be ignored is decided when executing the
1263  * program, it does not matter here.
1264  *
1265  * Beware that the optimization-preparation code in here knows about some
1266  * of the structure of the compiled regexp.
1267  * "re_flags": RE_MAGIC and/or RE_STRING.
1268  */
bt_regcomp(char_u * expr,int re_flags)1269 static regprog_T *bt_regcomp(char_u *expr, int re_flags)
1270 {
1271   char_u      *scan;
1272   char_u      *longest;
1273   int len;
1274   int flags;
1275 
1276   if (expr == NULL) {
1277     IEMSG_RET_NULL(_(e_null));
1278   }
1279 
1280   init_class_tab();
1281 
1282   /*
1283    * First pass: determine size, legality.
1284    */
1285   regcomp_start(expr, re_flags);
1286   regcode = JUST_CALC_SIZE;
1287   regc(REGMAGIC);
1288   if (reg(REG_NOPAREN, &flags) == NULL)
1289     return NULL;
1290 
1291   /* Allocate space. */
1292   bt_regprog_T *r = xmalloc(sizeof(bt_regprog_T) + regsize);
1293   r->re_in_use = false;
1294 
1295   /*
1296    * Second pass: emit code.
1297    */
1298   regcomp_start(expr, re_flags);
1299   regcode = r->program;
1300   regc(REGMAGIC);
1301   if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong) {
1302     xfree(r);
1303     if (reg_toolong)
1304       EMSG_RET_NULL(_("E339: Pattern too long"));
1305     return NULL;
1306   }
1307 
1308   /* Dig out information for optimizations. */
1309   r->regstart = NUL;            /* Worst-case defaults. */
1310   r->reganch = 0;
1311   r->regmust = NULL;
1312   r->regmlen = 0;
1313   r->regflags = regflags;
1314   if (flags & HASNL)
1315     r->regflags |= RF_HASNL;
1316   if (flags & HASLOOKBH)
1317     r->regflags |= RF_LOOKBH;
1318   /* Remember whether this pattern has any \z specials in it. */
1319   r->reghasz = re_has_z;
1320   scan = r->program + 1;        /* First BRANCH. */
1321   if (OP(regnext(scan)) == END) {   /* Only one top-level choice. */
1322     scan = OPERAND(scan);
1323 
1324     /* Starting-point info. */
1325     if (OP(scan) == BOL || OP(scan) == RE_BOF) {
1326       r->reganch++;
1327       scan = regnext(scan);
1328     }
1329 
1330     if (OP(scan) == EXACTLY) {
1331       r->regstart = utf_ptr2char(OPERAND(scan));
1332     } else if (OP(scan) == BOW
1333                || OP(scan) == EOW
1334                || OP(scan) == NOTHING
1335                || OP(scan) == MOPEN  + 0 || OP(scan) == NOPEN
1336                || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE) {
1337       char_u *regnext_scan = regnext(scan);
1338       if (OP(regnext_scan) == EXACTLY) {
1339         r->regstart = utf_ptr2char(OPERAND(regnext_scan));
1340       }
1341     }
1342 
1343     /*
1344      * If there's something expensive in the r.e., find the longest
1345      * literal string that must appear and make it the regmust.  Resolve
1346      * ties in favor of later strings, since the regstart check works
1347      * with the beginning of the r.e. and avoiding duplication
1348      * strengthens checking.  Not a strong reason, but sufficient in the
1349      * absence of others.
1350      */
1351     /*
1352      * When the r.e. starts with BOW, it is faster to look for a regmust
1353      * first. Used a lot for "#" and "*" commands. (Added by mool).
1354      */
1355     if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
1356         && !(flags & HASNL)) {
1357       longest = NULL;
1358       len = 0;
1359       for (; scan != NULL; scan = regnext(scan))
1360         if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len) {
1361           longest = OPERAND(scan);
1362           len = (int)STRLEN(OPERAND(scan));
1363         }
1364       r->regmust = longest;
1365       r->regmlen = len;
1366     }
1367   }
1368 #ifdef BT_REGEXP_DUMP
1369   regdump(expr, r);
1370 #endif
1371   r->engine = &bt_regengine;
1372   return (regprog_T *)r;
1373 }
1374 
1375 /*
1376  * Free a compiled regexp program, returned by bt_regcomp().
1377  */
bt_regfree(regprog_T * prog)1378 static void bt_regfree(regprog_T *prog)
1379 {
1380   xfree(prog);
1381 }
1382 
1383 /*
1384  * Setup to parse the regexp.  Used once to get the length and once to do it.
1385  */
1386 static void
regcomp_start(char_u * expr,int re_flags)1387 regcomp_start (
1388     char_u *expr,
1389     int re_flags                       /* see vim_regcomp() */
1390 )
1391 {
1392   initchr(expr);
1393   if (re_flags & RE_MAGIC)
1394     reg_magic = MAGIC_ON;
1395   else
1396     reg_magic = MAGIC_OFF;
1397   reg_string = (re_flags & RE_STRING);
1398   reg_strict = (re_flags & RE_STRICT);
1399   get_cpo_flags();
1400 
1401   num_complex_braces = 0;
1402   regnpar = 1;
1403   memset(had_endbrace, 0, sizeof(had_endbrace));
1404   regnzpar = 1;
1405   re_has_z = 0;
1406   regsize = 0L;
1407   reg_toolong = false;
1408   regflags = 0;
1409   had_eol = false;
1410 }
1411 
1412 /*
1413  * Check if during the previous call to vim_regcomp the EOL item "$" has been
1414  * found.  This is messy, but it works fine.
1415  */
vim_regcomp_had_eol(void)1416 int vim_regcomp_had_eol(void)
1417 {
1418   return had_eol;
1419 }
1420 
1421 // variables used for parsing
1422 static int at_start;       // True when on the first character
1423 static int prev_at_start;  // True when on the second character
1424 
1425 /*
1426  * Parse regular expression, i.e. main body or parenthesized thing.
1427  *
1428  * Caller must absorb opening parenthesis.
1429  *
1430  * Combining parenthesis handling with the base level of regular expression
1431  * is a trifle forced, but the need to tie the tails of the branches to what
1432  * follows makes it hard to avoid.
1433  */
1434 static char_u *
reg(int paren,int * flagp)1435 reg (
1436     int paren,              /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */
1437     int *flagp
1438 )
1439 {
1440   char_u      *ret;
1441   char_u      *br;
1442   char_u      *ender;
1443   int parno = 0;
1444   int flags;
1445 
1446   *flagp = HASWIDTH;            /* Tentatively. */
1447 
1448   if (paren == REG_ZPAREN) {
1449     /* Make a ZOPEN node. */
1450     if (regnzpar >= NSUBEXP)
1451       EMSG_RET_NULL(_("E50: Too many \\z("));
1452     parno = regnzpar;
1453     regnzpar++;
1454     ret = regnode(ZOPEN + parno);
1455   } else if (paren == REG_PAREN)    {
1456     /* Make a MOPEN node. */
1457     if (regnpar >= NSUBEXP)
1458       EMSG2_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
1459     parno = regnpar;
1460     ++regnpar;
1461     ret = regnode(MOPEN + parno);
1462   } else if (paren == REG_NPAREN)   {
1463     /* Make a NOPEN node. */
1464     ret = regnode(NOPEN);
1465   } else
1466     ret = NULL;
1467 
1468   /* Pick up the branches, linking them together. */
1469   br = regbranch(&flags);
1470   if (br == NULL)
1471     return NULL;
1472   if (ret != NULL)
1473     regtail(ret, br);           /* [MZ]OPEN -> first. */
1474   else
1475     ret = br;
1476   /* If one of the branches can be zero-width, the whole thing can.
1477    * If one of the branches has * at start or matches a line-break, the
1478    * whole thing can. */
1479   if (!(flags & HASWIDTH))
1480     *flagp &= ~HASWIDTH;
1481   *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1482   while (peekchr() == Magic('|')) {
1483     skipchr();
1484     br = regbranch(&flags);
1485     if (br == NULL || reg_toolong)
1486       return NULL;
1487     regtail(ret, br);           /* BRANCH -> BRANCH. */
1488     if (!(flags & HASWIDTH))
1489       *flagp &= ~HASWIDTH;
1490     *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1491   }
1492 
1493   /* Make a closing node, and hook it on the end. */
1494   ender = regnode(
1495       paren == REG_ZPAREN ? ZCLOSE + parno :
1496       paren == REG_PAREN ? MCLOSE + parno :
1497       paren == REG_NPAREN ? NCLOSE : END);
1498   regtail(ret, ender);
1499 
1500   /* Hook the tails of the branches to the closing node. */
1501   for (br = ret; br != NULL; br = regnext(br))
1502     regoptail(br, ender);
1503 
1504   /* Check for proper termination. */
1505   if (paren != REG_NOPAREN && getchr() != Magic(')')) {
1506     if (paren == REG_ZPAREN)
1507       EMSG_RET_NULL(_("E52: Unmatched \\z("));
1508     else if (paren == REG_NPAREN)
1509       EMSG2_RET_NULL(_(e_unmatchedpp), reg_magic == MAGIC_ALL);
1510     else
1511       EMSG2_RET_NULL(_(e_unmatchedp), reg_magic == MAGIC_ALL);
1512   } else if (paren == REG_NOPAREN && peekchr() != NUL) {
1513     if (curchr == Magic(')'))
1514       EMSG2_RET_NULL(_(e_unmatchedpar), reg_magic == MAGIC_ALL);
1515     else
1516       EMSG_RET_NULL(_(e_trailing));             /* "Can't happen". */
1517     /* NOTREACHED */
1518   }
1519   // Here we set the flag allowing back references to this set of
1520   // parentheses.
1521   if (paren == REG_PAREN) {
1522     had_endbrace[parno] = true;  // have seen the close paren
1523   }
1524   return ret;
1525 }
1526 
1527 /*
1528  * Parse one alternative of an | operator.
1529  * Implements the & operator.
1530  */
regbranch(int * flagp)1531 static char_u *regbranch(int *flagp)
1532 {
1533   char_u      *ret;
1534   char_u      *chain = NULL;
1535   char_u      *latest;
1536   int flags;
1537 
1538   *flagp = WORST | HASNL;               /* Tentatively. */
1539 
1540   ret = regnode(BRANCH);
1541   for (;; ) {
1542     latest = regconcat(&flags);
1543     if (latest == NULL)
1544       return NULL;
1545     /* If one of the branches has width, the whole thing has.  If one of
1546      * the branches anchors at start-of-line, the whole thing does.
1547      * If one of the branches uses look-behind, the whole thing does. */
1548     *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
1549     /* If one of the branches doesn't match a line-break, the whole thing
1550      * doesn't. */
1551     *flagp &= ~HASNL | (flags & HASNL);
1552     if (chain != NULL)
1553       regtail(chain, latest);
1554     if (peekchr() != Magic('&'))
1555       break;
1556     skipchr();
1557     regtail(latest, regnode(END));     /* operand ends */
1558     if (reg_toolong)
1559       break;
1560     reginsert(MATCH, latest);
1561     chain = latest;
1562   }
1563 
1564   return ret;
1565 }
1566 
1567 /*
1568  * Parse one alternative of an | or & operator.
1569  * Implements the concatenation operator.
1570  */
regconcat(int * flagp)1571 static char_u *regconcat(int *flagp)
1572 {
1573   char_u      *first = NULL;
1574   char_u      *chain = NULL;
1575   char_u      *latest;
1576   int flags;
1577   int cont = true;
1578 
1579   *flagp = WORST;               /* Tentatively. */
1580 
1581   while (cont) {
1582     switch (peekchr()) {
1583     case NUL:
1584     case Magic('|'):
1585     case Magic('&'):
1586     case Magic(')'):
1587       cont = false;
1588       break;
1589     case Magic('Z'):
1590       regflags |= RF_ICOMBINE;
1591       skipchr_keepstart();
1592       break;
1593     case Magic('c'):
1594       regflags |= RF_ICASE;
1595       skipchr_keepstart();
1596       break;
1597     case Magic('C'):
1598       regflags |= RF_NOICASE;
1599       skipchr_keepstart();
1600       break;
1601     case Magic('v'):
1602       reg_magic = MAGIC_ALL;
1603       skipchr_keepstart();
1604       curchr = -1;
1605       break;
1606     case Magic('m'):
1607       reg_magic = MAGIC_ON;
1608       skipchr_keepstart();
1609       curchr = -1;
1610       break;
1611     case Magic('M'):
1612       reg_magic = MAGIC_OFF;
1613       skipchr_keepstart();
1614       curchr = -1;
1615       break;
1616     case Magic('V'):
1617       reg_magic = MAGIC_NONE;
1618       skipchr_keepstart();
1619       curchr = -1;
1620       break;
1621     default:
1622       latest = regpiece(&flags);
1623       if (latest == NULL || reg_toolong)
1624         return NULL;
1625       *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
1626       if (chain == NULL)                        /* First piece. */
1627         *flagp |= flags & SPSTART;
1628       else
1629         regtail(chain, latest);
1630       chain = latest;
1631       if (first == NULL)
1632         first = latest;
1633       break;
1634     }
1635   }
1636   if (first == NULL)            /* Loop ran zero times. */
1637     first = regnode(NOTHING);
1638   return first;
1639 }
1640 
1641 /*
1642  * Parse something followed by possible [*+=].
1643  *
1644  * Note that the branching code sequences used for = and the general cases
1645  * of * and + are somewhat optimized:  they use the same NOTHING node as
1646  * both the endmarker for their branch list and the body of the last branch.
1647  * It might seem that this node could be dispensed with entirely, but the
1648  * endmarker role is not redundant.
1649  */
regpiece(int * flagp)1650 static char_u *regpiece(int *flagp)
1651 {
1652   char_u          *ret;
1653   int op;
1654   char_u          *next;
1655   int flags;
1656   long minval;
1657   long maxval;
1658 
1659   ret = regatom(&flags);
1660   if (ret == NULL)
1661     return NULL;
1662 
1663   op = peekchr();
1664   if (re_multi_type(op) == NOT_MULTI) {
1665     *flagp = flags;
1666     return ret;
1667   }
1668   /* default flags */
1669   *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
1670 
1671   skipchr();
1672   switch (op) {
1673   case Magic('*'):
1674     if (flags & SIMPLE)
1675       reginsert(STAR, ret);
1676     else {
1677       /* Emit x* as (x&|), where & means "self". */
1678       reginsert(BRANCH, ret);           /* Either x */
1679       regoptail(ret, regnode(BACK));            /* and loop */
1680       regoptail(ret, ret);              /* back */
1681       regtail(ret, regnode(BRANCH));            /* or */
1682       regtail(ret, regnode(NOTHING));           /* null. */
1683     }
1684     break;
1685 
1686   case Magic('+'):
1687     if (flags & SIMPLE)
1688       reginsert(PLUS, ret);
1689     else {
1690       /* Emit x+ as x(&|), where & means "self". */
1691       next = regnode(BRANCH);           /* Either */
1692       regtail(ret, next);
1693       regtail(regnode(BACK), ret);              /* loop back */
1694       regtail(next, regnode(BRANCH));           /* or */
1695       regtail(ret, regnode(NOTHING));           /* null. */
1696     }
1697     *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1698     break;
1699 
1700   case Magic('@'):
1701   {
1702     int lop = END;
1703     int64_t nr = getdecchrs();
1704 
1705     switch (no_Magic(getchr())) {
1706     case '=': lop = MATCH; break;                                 /* \@= */
1707     case '!': lop = NOMATCH; break;                               /* \@! */
1708     case '>': lop = SUBPAT; break;                                /* \@> */
1709     case '<': switch (no_Magic(getchr())) {
1710       case '=': lop = BEHIND; break;                               /* \@<= */
1711       case '!': lop = NOBEHIND; break;                             /* \@<! */
1712     }
1713     }
1714     if (lop == END)
1715       EMSG2_RET_NULL(_("E59: invalid character after %s@"),
1716           reg_magic == MAGIC_ALL);
1717     /* Look behind must match with behind_pos. */
1718     if (lop == BEHIND || lop == NOBEHIND) {
1719       regtail(ret, regnode(BHPOS));
1720       *flagp |= HASLOOKBH;
1721     }
1722     regtail(ret, regnode(END));             /* operand ends */
1723     if (lop == BEHIND || lop == NOBEHIND) {
1724       if (nr < 0)
1725         nr = 0;                 /* no limit is same as zero limit */
1726       reginsert_nr(lop, (uint32_t)nr, ret);
1727     } else
1728       reginsert(lop, ret);
1729     break;
1730   }
1731 
1732   case Magic('?'):
1733   case Magic('='):
1734     /* Emit x= as (x|) */
1735     reginsert(BRANCH, ret);                     /* Either x */
1736     regtail(ret, regnode(BRANCH));              /* or */
1737     next = regnode(NOTHING);                    /* null. */
1738     regtail(ret, next);
1739     regoptail(ret, next);
1740     break;
1741 
1742   case Magic('{'):
1743     if (!read_limits(&minval, &maxval))
1744       return NULL;
1745     if (flags & SIMPLE) {
1746       reginsert(BRACE_SIMPLE, ret);
1747       reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1748     } else {
1749       if (num_complex_braces >= 10)
1750         EMSG2_RET_NULL(_("E60: Too many complex %s{...}s"),
1751             reg_magic == MAGIC_ALL);
1752       reginsert(BRACE_COMPLEX + num_complex_braces, ret);
1753       regoptail(ret, regnode(BACK));
1754       regoptail(ret, ret);
1755       reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1756       ++num_complex_braces;
1757     }
1758     if (minval > 0 && maxval > 0)
1759       *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1760     break;
1761   }
1762   if (re_multi_type(peekchr()) != NOT_MULTI) {
1763     /* Can't have a multi follow a multi. */
1764     if (peekchr() == Magic('*'))
1765       sprintf((char *)IObuff, _("E61: Nested %s*"),
1766           reg_magic >= MAGIC_ON ? "" : "\\");
1767     else
1768       sprintf((char *)IObuff, _("E62: Nested %s%c"),
1769           reg_magic == MAGIC_ALL ? "" : "\\", no_Magic(peekchr()));
1770     EMSG_RET_NULL((char *)IObuff);
1771   }
1772 
1773   return ret;
1774 }
1775 
1776 /* When making changes to classchars also change nfa_classcodes. */
1777 static char_u   *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
1778 static int classcodes[] = {
1779   ANY, IDENT, SIDENT, KWORD, SKWORD,
1780   FNAME, SFNAME, PRINT, SPRINT,
1781   WHITE, NWHITE, DIGIT, NDIGIT,
1782   HEX, NHEX, OCTAL, NOCTAL,
1783   WORD, NWORD, HEAD, NHEAD,
1784   ALPHA, NALPHA, LOWER, NLOWER,
1785   UPPER, NUPPER
1786 };
1787 
1788 /*
1789  * Parse the lowest level.
1790  *
1791  * Optimization:  gobbles an entire sequence of ordinary characters so that
1792  * it can turn them into a single node, which is smaller to store and
1793  * faster to run.  Don't do this when one_exactly is set.
1794  */
regatom(int * flagp)1795 static char_u *regatom(int *flagp)
1796 {
1797   char_u          *ret;
1798   int flags;
1799   int c;
1800   char_u          *p;
1801   int extra = 0;
1802   int save_prev_at_start = prev_at_start;
1803 
1804   *flagp = WORST;               /* Tentatively. */
1805 
1806   c = getchr();
1807   switch (c) {
1808   case Magic('^'):
1809     ret = regnode(BOL);
1810     break;
1811 
1812   case Magic('$'):
1813     ret = regnode(EOL);
1814     had_eol = true;
1815     break;
1816 
1817   case Magic('<'):
1818     ret = regnode(BOW);
1819     break;
1820 
1821   case Magic('>'):
1822     ret = regnode(EOW);
1823     break;
1824 
1825   case Magic('_'):
1826     c = no_Magic(getchr());
1827     if (c == '^') {             /* "\_^" is start-of-line */
1828       ret = regnode(BOL);
1829       break;
1830     }
1831     if (c == '$') {             /* "\_$" is end-of-line */
1832       ret = regnode(EOL);
1833       had_eol = true;
1834       break;
1835     }
1836 
1837     extra = ADD_NL;
1838     *flagp |= HASNL;
1839 
1840     /* "\_[" is character range plus newline */
1841     if (c == '[')
1842       goto collection;
1843 
1844   // "\_x" is character class plus newline
1845   FALLTHROUGH;
1846 
1847   /*
1848    * Character classes.
1849    */
1850   case Magic('.'):
1851   case Magic('i'):
1852   case Magic('I'):
1853   case Magic('k'):
1854   case Magic('K'):
1855   case Magic('f'):
1856   case Magic('F'):
1857   case Magic('p'):
1858   case Magic('P'):
1859   case Magic('s'):
1860   case Magic('S'):
1861   case Magic('d'):
1862   case Magic('D'):
1863   case Magic('x'):
1864   case Magic('X'):
1865   case Magic('o'):
1866   case Magic('O'):
1867   case Magic('w'):
1868   case Magic('W'):
1869   case Magic('h'):
1870   case Magic('H'):
1871   case Magic('a'):
1872   case Magic('A'):
1873   case Magic('l'):
1874   case Magic('L'):
1875   case Magic('u'):
1876   case Magic('U'):
1877     p = vim_strchr(classchars, no_Magic(c));
1878     if (p == NULL)
1879       EMSG_RET_NULL(_("E63: invalid use of \\_"));
1880     /* When '.' is followed by a composing char ignore the dot, so that
1881      * the composing char is matched here. */
1882     if (c == Magic('.') && utf_iscomposing(peekchr())) {
1883       c = getchr();
1884       goto do_multibyte;
1885     }
1886     ret = regnode(classcodes[p - classchars] + extra);
1887     *flagp |= HASWIDTH | SIMPLE;
1888     break;
1889 
1890   case Magic('n'):
1891     if (reg_string) {
1892       /* In a string "\n" matches a newline character. */
1893       ret = regnode(EXACTLY);
1894       regc(NL);
1895       regc(NUL);
1896       *flagp |= HASWIDTH | SIMPLE;
1897     } else {
1898       /* In buffer text "\n" matches the end of a line. */
1899       ret = regnode(NEWL);
1900       *flagp |= HASWIDTH | HASNL;
1901     }
1902     break;
1903 
1904   case Magic('('):
1905     if (one_exactly)
1906       EMSG_ONE_RET_NULL;
1907     ret = reg(REG_PAREN, &flags);
1908     if (ret == NULL)
1909       return NULL;
1910     *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1911     break;
1912 
1913   case NUL:
1914   case Magic('|'):
1915   case Magic('&'):
1916   case Magic(')'):
1917     if (one_exactly)
1918       EMSG_ONE_RET_NULL;
1919     IEMSG_RET_NULL(_(e_internal));       // Supposed to be caught earlier.
1920   // NOTREACHED
1921 
1922   case Magic('='):
1923   case Magic('?'):
1924   case Magic('+'):
1925   case Magic('@'):
1926   case Magic('{'):
1927   case Magic('*'):
1928     c = no_Magic(c);
1929     sprintf((char *)IObuff, _("E64: %s%c follows nothing"),
1930         (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL)
1931         ? "" : "\\", c);
1932     EMSG_RET_NULL((char *)IObuff);
1933   /* NOTREACHED */
1934 
1935   case Magic('~'):              /* previous substitute pattern */
1936     if (reg_prev_sub != NULL) {
1937       char_u      *lp;
1938 
1939       ret = regnode(EXACTLY);
1940       lp = reg_prev_sub;
1941       while (*lp != NUL)
1942         regc(*lp++);
1943       regc(NUL);
1944       if (*reg_prev_sub != NUL) {
1945         *flagp |= HASWIDTH;
1946         if ((lp - reg_prev_sub) == 1)
1947           *flagp |= SIMPLE;
1948       }
1949     } else
1950       EMSG_RET_NULL(_(e_nopresub));
1951     break;
1952 
1953   case Magic('1'):
1954   case Magic('2'):
1955   case Magic('3'):
1956   case Magic('4'):
1957   case Magic('5'):
1958   case Magic('6'):
1959   case Magic('7'):
1960   case Magic('8'):
1961   case Magic('9'):
1962   {
1963     int refnum;
1964 
1965     refnum = c - Magic('0');
1966     if (!seen_endbrace(refnum)) {
1967       return NULL;
1968     }
1969     ret = regnode(BACKREF + refnum);
1970   }
1971   break;
1972 
1973   case Magic('z'):
1974   {
1975     c = no_Magic(getchr());
1976     switch (c) {
1977     case '(': if ((reg_do_extmatch & REX_SET) == 0)
1978         EMSG_RET_NULL(_(e_z_not_allowed));
1979       if (one_exactly)
1980         EMSG_ONE_RET_NULL;
1981       ret = reg(REG_ZPAREN, &flags);
1982       if (ret == NULL)
1983         return NULL;
1984       *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1985       re_has_z = REX_SET;
1986       break;
1987 
1988     case '1':
1989     case '2':
1990     case '3':
1991     case '4':
1992     case '5':
1993     case '6':
1994     case '7':
1995     case '8':
1996     case '9': if ((reg_do_extmatch & REX_USE) == 0)
1997         EMSG_RET_NULL(_(e_z1_not_allowed));
1998       ret = regnode(ZREF + c - '0');
1999       re_has_z = REX_USE;
2000       break;
2001 
2002     case 's': ret = regnode(MOPEN + 0);
2003       if (!re_mult_next("\\zs")) {
2004         return NULL;
2005       }
2006       break;
2007 
2008     case 'e': ret = regnode(MCLOSE + 0);
2009       if (!re_mult_next("\\ze")) {
2010         return NULL;
2011       }
2012       break;
2013 
2014     default:  EMSG_RET_NULL(_("E68: Invalid character after \\z"));
2015     }
2016   }
2017   break;
2018 
2019   case Magic('%'):
2020   {
2021     c = no_Magic(getchr());
2022     switch (c) {
2023     /* () without a back reference */
2024     case '(':
2025       if (one_exactly)
2026         EMSG_ONE_RET_NULL;
2027       ret = reg(REG_NPAREN, &flags);
2028       if (ret == NULL)
2029         return NULL;
2030       *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
2031       break;
2032 
2033     /* Catch \%^ and \%$ regardless of where they appear in the
2034      * pattern -- regardless of whether or not it makes sense. */
2035     case '^':
2036       ret = regnode(RE_BOF);
2037       break;
2038 
2039     case '$':
2040       ret = regnode(RE_EOF);
2041       break;
2042 
2043     case '#':
2044       ret = regnode(CURSOR);
2045       break;
2046 
2047     case 'V':
2048       ret = regnode(RE_VISUAL);
2049       break;
2050 
2051     case 'C':
2052       ret = regnode(RE_COMPOSING);
2053       break;
2054 
2055     /* \%[abc]: Emit as a list of branches, all ending at the last
2056      * branch which matches nothing. */
2057     case '[':
2058       if (one_exactly)                          /* doesn't nest */
2059         EMSG_ONE_RET_NULL;
2060       {
2061         char_u    *lastbranch;
2062         char_u    *lastnode = NULL;
2063         char_u    *br;
2064 
2065         ret = NULL;
2066         while ((c = getchr()) != ']') {
2067           if (c == NUL)
2068             EMSG2_RET_NULL(_(e_missing_sb),
2069                 reg_magic == MAGIC_ALL);
2070           br = regnode(BRANCH);
2071           if (ret == NULL) {
2072             ret = br;
2073           } else {
2074             regtail(lastnode, br);
2075             if (reg_toolong) {
2076               return NULL;
2077             }
2078           }
2079 
2080           ungetchr();
2081           one_exactly = true;
2082           lastnode = regatom(flagp);
2083           one_exactly = false;
2084           if (lastnode == NULL) {
2085             return NULL;
2086           }
2087         }
2088         if (ret == NULL)
2089           EMSG2_RET_NULL(_(e_empty_sb),
2090               reg_magic == MAGIC_ALL);
2091         lastbranch = regnode(BRANCH);
2092         br = regnode(NOTHING);
2093         if (ret != JUST_CALC_SIZE) {
2094           regtail(lastnode, br);
2095           regtail(lastbranch, br);
2096           /* connect all branches to the NOTHING
2097            * branch at the end */
2098           for (br = ret; br != lastnode; ) {
2099             if (OP(br) == BRANCH) {
2100               regtail(br, lastbranch);
2101               if (reg_toolong) {
2102                 return NULL;
2103               }
2104               br = OPERAND(br);
2105             } else
2106               br = regnext(br);
2107           }
2108         }
2109         *flagp &= ~(HASWIDTH | SIMPLE);
2110         break;
2111       }
2112 
2113     case 'd':               /* %d123 decimal */
2114     case 'o':               /* %o123 octal */
2115     case 'x':               /* %xab hex 2 */
2116     case 'u':               /* %uabcd hex 4 */
2117     case 'U':               /* %U1234abcd hex 8 */
2118     {
2119       int64_t i;
2120 
2121       switch (c) {
2122       case 'd': i = getdecchrs(); break;
2123       case 'o': i = getoctchrs(); break;
2124       case 'x': i = gethexchrs(2); break;
2125       case 'u': i = gethexchrs(4); break;
2126       case 'U': i = gethexchrs(8); break;
2127       default:  i = -1; break;
2128       }
2129 
2130       if (i < 0 || i > INT_MAX) {
2131         EMSG2_RET_NULL(_("E678: Invalid character after %s%%[dxouU]"),
2132                        reg_magic == MAGIC_ALL);
2133       }
2134       if (use_multibytecode(i)) {
2135         ret = regnode(MULTIBYTECODE);
2136       } else {
2137         ret = regnode(EXACTLY);
2138       }
2139       if (i == 0) {
2140         regc(0x0a);
2141       } else {
2142         regmbc(i);
2143       }
2144       regc(NUL);
2145       *flagp |= HASWIDTH;
2146       break;
2147     }
2148 
2149     default:
2150       if (ascii_isdigit(c) || c == '<' || c == '>'
2151           || c == '\'') {
2152         uint32_t n = 0;
2153         int cmp;
2154 
2155         cmp = c;
2156         if (cmp == '<' || cmp == '>')
2157           c = getchr();
2158         while (ascii_isdigit(c)) {
2159           n = n * 10 + (uint32_t)(c - '0');
2160           c = getchr();
2161         }
2162         if (c == '\'' && n == 0) {
2163           /* "\%'m", "\%<'m" and "\%>'m": Mark */
2164           c = getchr();
2165           ret = regnode(RE_MARK);
2166           if (ret == JUST_CALC_SIZE)
2167             regsize += 2;
2168           else {
2169             *regcode++ = c;
2170             *regcode++ = cmp;
2171           }
2172           break;
2173         } else if (c == 'l' || c == 'c' || c == 'v') {
2174           if (c == 'l') {
2175             ret = regnode(RE_LNUM);
2176             if (save_prev_at_start) {
2177               at_start = true;
2178             }
2179           } else if (c == 'c') {
2180             ret = regnode(RE_COL);
2181           } else {
2182             ret = regnode(RE_VCOL);
2183           }
2184           if (ret == JUST_CALC_SIZE) {
2185             regsize += 5;
2186           } else {
2187             // put the number and the optional
2188             // comparator after the opcode
2189             regcode = re_put_uint32(regcode, n);
2190             *regcode++ = cmp;
2191           }
2192           break;
2193         }
2194       }
2195 
2196       EMSG2_RET_NULL(_("E71: Invalid character after %s%%"),
2197           reg_magic == MAGIC_ALL);
2198     }
2199   }
2200   break;
2201 
2202   case Magic('['):
2203 collection:
2204     {
2205       char_u      *lp;
2206 
2207       /*
2208        * If there is no matching ']', we assume the '[' is a normal
2209        * character.  This makes 'incsearch' and ":help [" work.
2210        */
2211       lp = skip_anyof(regparse);
2212       if (*lp == ']') {         /* there is a matching ']' */
2213         int startc = -1;                /* > 0 when next '-' is a range */
2214         int endc;
2215 
2216         /*
2217          * In a character class, different parsing rules apply.
2218          * Not even \ is special anymore, nothing is.
2219          */
2220         if (*regparse == '^') {             /* Complement of range. */
2221           ret = regnode(ANYBUT + extra);
2222           regparse++;
2223         } else
2224           ret = regnode(ANYOF + extra);
2225 
2226         /* At the start ']' and '-' mean the literal character. */
2227         if (*regparse == ']' || *regparse == '-') {
2228           startc = *regparse;
2229           regc(*regparse++);
2230         }
2231 
2232         while (*regparse != NUL && *regparse != ']') {
2233           if (*regparse == '-') {
2234             ++regparse;
2235             /* The '-' is not used for a range at the end and
2236              * after or before a '\n'. */
2237             if (*regparse == ']' || *regparse == NUL
2238                 || startc == -1
2239                 || (regparse[0] == '\\' && regparse[1] == 'n')) {
2240               regc('-');
2241               startc = '-';                     /* [--x] is a range */
2242             } else {
2243               /* Also accept "a-[.z.]" */
2244               endc = 0;
2245               if (*regparse == '[')
2246                 endc = get_coll_element(&regparse);
2247               if (endc == 0) {
2248                 endc = mb_ptr2char_adv((const char_u **)&regparse);
2249               }
2250 
2251               /* Handle \o40, \x20 and \u20AC style sequences */
2252               if (endc == '\\' && !reg_cpo_lit)
2253                 endc = coll_get_char();
2254 
2255               if (startc > endc) {
2256                 EMSG_RET_NULL(_(e_reverse_range));
2257               }
2258               if (utf_char2len(startc) > 1
2259                   || utf_char2len(endc) > 1) {
2260                 // Limit to a range of 256 chars
2261                 if (endc > startc + 256) {
2262                   EMSG_RET_NULL(_(e_large_class));
2263                 }
2264                 while (++startc <= endc) {
2265                   regmbc(startc);
2266                 }
2267               } else {
2268                 while (++startc <= endc)
2269                   regc(startc);
2270               }
2271               startc = -1;
2272             }
2273           }
2274           /*
2275            * Only "\]", "\^", "\]" and "\\" are special in Vi.  Vim
2276            * accepts "\t", "\e", etc., but only when the 'l' flag in
2277            * 'cpoptions' is not included.
2278            */
2279           else if (*regparse == '\\'
2280                    && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
2281                        || (!reg_cpo_lit
2282                            && vim_strchr(REGEXP_ABBR,
2283                                regparse[1]) != NULL))) {
2284             regparse++;
2285             if (*regparse == 'n') {
2286               /* '\n' in range: also match NL */
2287               if (ret != JUST_CALC_SIZE) {
2288                 /* Using \n inside [^] does not change what
2289                  * matches. "[^\n]" is the same as ".". */
2290                 if (*ret == ANYOF) {
2291                   *ret = ANYOF + ADD_NL;
2292                   *flagp |= HASNL;
2293                 }
2294                 /* else: must have had a \n already */
2295               }
2296               regparse++;
2297               startc = -1;
2298             } else if (*regparse == 'd'
2299                        || *regparse == 'o'
2300                        || *regparse == 'x'
2301                        || *regparse == 'u'
2302                        || *regparse == 'U') {
2303               startc = coll_get_char();
2304               if (startc == 0)
2305                 regc(0x0a);
2306               else
2307                 regmbc(startc);
2308             } else {
2309               startc = backslash_trans(*regparse++);
2310               regc(startc);
2311             }
2312           } else if (*regparse == '[') {
2313             int c_class;
2314             int cu;
2315 
2316             c_class = get_char_class(&regparse);
2317             startc = -1;
2318             /* Characters assumed to be 8 bits! */
2319             switch (c_class) {
2320             case CLASS_NONE:
2321               c_class = get_equi_class(&regparse);
2322               if (c_class != 0) {
2323                 /* produce equivalence class */
2324                 reg_equi_class(c_class);
2325               } else if ((c_class =
2326                             get_coll_element(&regparse)) != 0) {
2327                 /* produce a collating element */
2328                 regmbc(c_class);
2329               } else {
2330                 /* literal '[', allow [[-x] as a range */
2331                 startc = *regparse++;
2332                 regc(startc);
2333               }
2334               break;
2335             case CLASS_ALNUM:
2336               for (cu = 1; cu < 128; cu++) {
2337                 if (isalnum(cu)) {
2338                   regmbc(cu);
2339                 }
2340               }
2341               break;
2342             case CLASS_ALPHA:
2343               for (cu = 1; cu < 128; cu++) {
2344                 if (isalpha(cu)) {
2345                   regmbc(cu);
2346                 }
2347               }
2348               break;
2349             case CLASS_BLANK:
2350               regc(' ');
2351               regc('\t');
2352               break;
2353             case CLASS_CNTRL:
2354               for (cu = 1; cu <= 127; cu++) {
2355                 if (iscntrl(cu)) {
2356                   regmbc(cu);
2357                 }
2358               }
2359               break;
2360             case CLASS_DIGIT:
2361               for (cu = 1; cu <= 127; cu++) {
2362                 if (ascii_isdigit(cu)) {
2363                   regmbc(cu);
2364                 }
2365               }
2366               break;
2367             case CLASS_GRAPH:
2368               for (cu = 1; cu <= 127; cu++) {
2369                 if (isgraph(cu)) {
2370                   regmbc(cu);
2371                 }
2372               }
2373               break;
2374             case CLASS_LOWER:
2375               for (cu = 1; cu <= 255; cu++) {
2376                 if (mb_islower(cu) && cu != 170 && cu != 186) {
2377                   regmbc(cu);
2378                 }
2379               }
2380               break;
2381             case CLASS_PRINT:
2382               for (cu = 1; cu <= 255; cu++) {
2383                 if (vim_isprintc(cu)) {
2384                   regmbc(cu);
2385                 }
2386               }
2387               break;
2388             case CLASS_PUNCT:
2389               for (cu = 1; cu < 128; cu++) {
2390                 if (ispunct(cu)) {
2391                   regmbc(cu);
2392                 }
2393               }
2394               break;
2395             case CLASS_SPACE:
2396               for (cu = 9; cu <= 13; cu++)
2397                 regc(cu);
2398               regc(' ');
2399               break;
2400             case CLASS_UPPER:
2401               for (cu = 1; cu <= 255; cu++) {
2402                 if (mb_isupper(cu)) {
2403                   regmbc(cu);
2404                 }
2405               }
2406               break;
2407             case CLASS_XDIGIT:
2408               for (cu = 1; cu <= 255; cu++) {
2409                 if (ascii_isxdigit(cu)) {
2410                   regmbc(cu);
2411                 }
2412               }
2413               break;
2414             case CLASS_TAB:
2415               regc('\t');
2416               break;
2417             case CLASS_RETURN:
2418               regc('\r');
2419               break;
2420             case CLASS_BACKSPACE:
2421               regc('\b');
2422               break;
2423             case CLASS_ESCAPE:
2424               regc(ESC);
2425               break;
2426             case CLASS_IDENT:
2427               for (cu = 1; cu <= 255; cu++) {
2428                 if (vim_isIDc(cu)) {
2429                   regmbc(cu);
2430                 }
2431               }
2432               break;
2433             case CLASS_KEYWORD:
2434               for (cu = 1; cu <= 255; cu++) {
2435                 if (reg_iswordc(cu)) {
2436                   regmbc(cu);
2437                 }
2438               }
2439               break;
2440             case CLASS_FNAME:
2441               for (cu = 1; cu <= 255; cu++) {
2442                 if (vim_isfilec(cu)) {
2443                   regmbc(cu);
2444                 }
2445               }
2446               break;
2447             }
2448           } else {
2449             // produce a multibyte character, including any
2450             // following composing characters.
2451             startc = utf_ptr2char(regparse);
2452             int len = utfc_ptr2len(regparse);
2453             if (utf_char2len(startc) != len) {
2454               // composing chars
2455               startc = -1;
2456             }
2457             while (--len >= 0) {
2458               regc(*regparse++);
2459             }
2460           }
2461         }
2462         regc(NUL);
2463         prevchr_len = 1;                /* last char was the ']' */
2464         if (*regparse != ']')
2465           EMSG_RET_NULL(_(e_toomsbra));                 /* Cannot happen? */
2466         skipchr();                  /* let's be friends with the lexer again */
2467         *flagp |= HASWIDTH | SIMPLE;
2468         break;
2469       } else if (reg_strict)
2470         EMSG2_RET_NULL(_(e_missingbracket), reg_magic > MAGIC_OFF);
2471     }
2472     FALLTHROUGH;
2473 
2474   default:
2475   {
2476     int len;
2477 
2478     /* A multi-byte character is handled as a separate atom if it's
2479      * before a multi and when it's a composing char. */
2480     if (use_multibytecode(c)) {
2481 do_multibyte:
2482       ret = regnode(MULTIBYTECODE);
2483       regmbc(c);
2484       *flagp |= HASWIDTH | SIMPLE;
2485       break;
2486     }
2487 
2488     ret = regnode(EXACTLY);
2489 
2490     /*
2491      * Append characters as long as:
2492      * - there is no following multi, we then need the character in
2493      *   front of it as a single character operand
2494      * - not running into a Magic character
2495      * - "one_exactly" is not set
2496      * But always emit at least one character.  Might be a Multi,
2497      * e.g., a "[" without matching "]".
2498      */
2499     for (len = 0; c != NUL && (len == 0
2500                                || (re_multi_type(peekchr()) == NOT_MULTI
2501                                    && !one_exactly
2502                                    && !is_Magic(c))); ++len) {
2503       c = no_Magic(c);
2504       {
2505         regmbc(c);
2506         {
2507           int l;
2508 
2509           /* Need to get composing character too. */
2510           for (;; ) {
2511             l = utf_ptr2len(regparse);
2512             if (!utf_composinglike(regparse, regparse + l)) {
2513               break;
2514             }
2515             regmbc(utf_ptr2char(regparse));
2516             skipchr();
2517           }
2518         }
2519       }
2520       c = getchr();
2521     }
2522     ungetchr();
2523 
2524     regc(NUL);
2525     *flagp |= HASWIDTH;
2526     if (len == 1)
2527       *flagp |= SIMPLE;
2528   }
2529   break;
2530   }
2531 
2532   return ret;
2533 }
2534 
2535 /// Used in a place where no * or \+ can follow.
re_mult_next(char * what)2536 static bool re_mult_next(char *what)
2537 {
2538   if (re_multi_type(peekchr()) == MULTI_MULT) {
2539     EMSG2_RET_FAIL(_("E888: (NFA regexp) cannot repeat %s"), what);
2540   }
2541   return true;
2542 }
2543 
2544 // Return true if MULTIBYTECODE should be used instead of EXACTLY for
2545 // character "c".
use_multibytecode(int c)2546 static bool use_multibytecode(int c)
2547 {
2548   return utf_char2len(c) > 1
2549          && (re_multi_type(peekchr()) != NOT_MULTI
2550              || utf_iscomposing(c));
2551 }
2552 
2553 /*
2554  * Emit a node.
2555  * Return pointer to generated code.
2556  */
regnode(int op)2557 static char_u *regnode(int op)
2558 {
2559   char_u  *ret;
2560 
2561   ret = regcode;
2562   if (ret == JUST_CALC_SIZE)
2563     regsize += 3;
2564   else {
2565     *regcode++ = op;
2566     *regcode++ = NUL;                   /* Null "next" pointer. */
2567     *regcode++ = NUL;
2568   }
2569   return ret;
2570 }
2571 
2572 /*
2573  * Emit (if appropriate) a byte of code
2574  */
regc(int b)2575 static void regc(int b)
2576 {
2577   if (regcode == JUST_CALC_SIZE)
2578     regsize++;
2579   else
2580     *regcode++ = b;
2581 }
2582 
2583 /*
2584  * Emit (if appropriate) a multi-byte character of code
2585  */
regmbc(int c)2586 static void regmbc(int c)
2587 {
2588   if (regcode == JUST_CALC_SIZE) {
2589     regsize += utf_char2len(c);
2590   } else {
2591     regcode += utf_char2bytes(c, regcode);
2592   }
2593 }
2594 
2595 /*
2596  * Insert an operator in front of already-emitted operand
2597  *
2598  * Means relocating the operand.
2599  */
reginsert(int op,char_u * opnd)2600 static void reginsert(int op, char_u *opnd)
2601 {
2602   char_u      *src;
2603   char_u      *dst;
2604   char_u      *place;
2605 
2606   if (regcode == JUST_CALC_SIZE) {
2607     regsize += 3;
2608     return;
2609   }
2610   src = regcode;
2611   regcode += 3;
2612   dst = regcode;
2613   while (src > opnd)
2614     *--dst = *--src;
2615 
2616   place = opnd;                 /* Op node, where operand used to be. */
2617   *place++ = op;
2618   *place++ = NUL;
2619   *place = NUL;
2620 }
2621 
2622 /*
2623  * Insert an operator in front of already-emitted operand.
2624  * Add a number to the operator.
2625  */
reginsert_nr(int op,long val,char_u * opnd)2626 static void reginsert_nr(int op, long val, char_u *opnd)
2627 {
2628   char_u      *src;
2629   char_u      *dst;
2630   char_u      *place;
2631 
2632   if (regcode == JUST_CALC_SIZE) {
2633     regsize += 7;
2634     return;
2635   }
2636   src = regcode;
2637   regcode += 7;
2638   dst = regcode;
2639   while (src > opnd)
2640     *--dst = *--src;
2641 
2642   place = opnd;                 /* Op node, where operand used to be. */
2643   *place++ = op;
2644   *place++ = NUL;
2645   *place++ = NUL;
2646   assert(val >= 0 && (uintmax_t)val <= UINT32_MAX);
2647   re_put_uint32(place, (uint32_t)val);
2648 }
2649 
2650 /*
2651  * Insert an operator in front of already-emitted operand.
2652  * The operator has the given limit values as operands.  Also set next pointer.
2653  *
2654  * Means relocating the operand.
2655  */
reginsert_limits(int op,long minval,long maxval,char_u * opnd)2656 static void reginsert_limits(int op, long minval, long maxval, char_u *opnd)
2657 {
2658   char_u      *src;
2659   char_u      *dst;
2660   char_u      *place;
2661 
2662   if (regcode == JUST_CALC_SIZE) {
2663     regsize += 11;
2664     return;
2665   }
2666   src = regcode;
2667   regcode += 11;
2668   dst = regcode;
2669   while (src > opnd)
2670     *--dst = *--src;
2671 
2672   place = opnd;                 /* Op node, where operand used to be. */
2673   *place++ = op;
2674   *place++ = NUL;
2675   *place++ = NUL;
2676   assert(minval >= 0 && (uintmax_t)minval <= UINT32_MAX);
2677   place = re_put_uint32(place, (uint32_t)minval);
2678   assert(maxval >= 0 && (uintmax_t)maxval <= UINT32_MAX);
2679   place = re_put_uint32(place, (uint32_t)maxval);
2680   regtail(opnd, place);
2681 }
2682 
2683 /*
2684  * Write a four bytes number at "p" and return pointer to the next char.
2685  */
re_put_uint32(char_u * p,uint32_t val)2686 static char_u *re_put_uint32(char_u *p, uint32_t val)
2687 {
2688   *p++ = (char_u) ((val >> 24) & 0377);
2689   *p++ = (char_u) ((val >> 16) & 0377);
2690   *p++ = (char_u) ((val >> 8) & 0377);
2691   *p++ = (char_u) (val & 0377);
2692   return p;
2693 }
2694 
2695 // Set the next-pointer at the end of a node chain.
regtail(char_u * p,char_u * val)2696 static void regtail(char_u *p, char_u *val)
2697 {
2698   int offset;
2699 
2700   if (p == JUST_CALC_SIZE) {
2701     return;
2702   }
2703 
2704   // Find last node.
2705   char_u *scan = p;
2706   for (;; ) {
2707     char_u *temp = regnext(scan);
2708     if (temp == NULL) {
2709       break;
2710     }
2711     scan = temp;
2712   }
2713 
2714   if (OP(scan) == BACK) {
2715     offset = (int)(scan - val);
2716   } else {
2717     offset = (int)(val - scan);
2718   }
2719   // When the offset uses more than 16 bits it can no longer fit in the two
2720   // bytes available.  Use a global flag to avoid having to check return
2721   // values in too many places.
2722   if (offset > 0xffff) {
2723     reg_toolong = true;
2724   } else {
2725     *(scan + 1) = (char_u)(((unsigned)offset >> 8) & 0377);
2726     *(scan + 2) = (char_u)(offset & 0377);
2727   }
2728 }
2729 
2730 /*
2731  * Like regtail, on item after a BRANCH; nop if none.
2732  */
regoptail(char_u * p,char_u * val)2733 static void regoptail(char_u *p, char_u *val)
2734 {
2735   /* When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless" */
2736   if (p == NULL || p == JUST_CALC_SIZE
2737       || (OP(p) != BRANCH
2738           && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
2739     return;
2740   regtail(OPERAND(p), val);
2741 }
2742 
2743 /*
2744  * Functions for getting characters from the regexp input.
2745  */
2746 
2747 /*
2748  * Start parsing at "str".
2749  */
initchr(char_u * str)2750 static void initchr(char_u *str)
2751 {
2752   regparse = str;
2753   prevchr_len = 0;
2754   curchr = prevprevchr = prevchr = nextchr = -1;
2755   at_start = true;
2756   prev_at_start = false;
2757 }
2758 
2759 /*
2760  * Save the current parse state, so that it can be restored and parsing
2761  * starts in the same state again.
2762  */
save_parse_state(parse_state_T * ps)2763 static void save_parse_state(parse_state_T *ps)
2764 {
2765   ps->regparse = regparse;
2766   ps->prevchr_len = prevchr_len;
2767   ps->curchr = curchr;
2768   ps->prevchr = prevchr;
2769   ps->prevprevchr = prevprevchr;
2770   ps->nextchr = nextchr;
2771   ps->at_start = at_start;
2772   ps->prev_at_start = prev_at_start;
2773   ps->regnpar = regnpar;
2774 }
2775 
2776 /*
2777  * Restore a previously saved parse state.
2778  */
restore_parse_state(parse_state_T * ps)2779 static void restore_parse_state(parse_state_T *ps)
2780 {
2781   regparse = ps->regparse;
2782   prevchr_len = ps->prevchr_len;
2783   curchr = ps->curchr;
2784   prevchr = ps->prevchr;
2785   prevprevchr = ps->prevprevchr;
2786   nextchr = ps->nextchr;
2787   at_start = ps->at_start;
2788   prev_at_start = ps->prev_at_start;
2789   regnpar = ps->regnpar;
2790 }
2791 
2792 
2793 /*
2794  * Get the next character without advancing.
2795  */
peekchr(void)2796 static int peekchr(void)
2797 {
2798   static int after_slash = false;
2799 
2800   if (curchr != -1) {
2801     return curchr;
2802   }
2803 
2804   switch (curchr = regparse[0]) {
2805   case '.':
2806   case '[':
2807   case '~':
2808     /* magic when 'magic' is on */
2809     if (reg_magic >= MAGIC_ON)
2810       curchr = Magic(curchr);
2811     break;
2812   case '(':
2813   case ')':
2814   case '{':
2815   case '%':
2816   case '+':
2817   case '=':
2818   case '?':
2819   case '@':
2820   case '!':
2821   case '&':
2822   case '|':
2823   case '<':
2824   case '>':
2825   case '#':           /* future ext. */
2826   case '"':           /* future ext. */
2827   case '\'':          /* future ext. */
2828   case ',':           /* future ext. */
2829   case '-':           /* future ext. */
2830   case ':':           /* future ext. */
2831   case ';':           /* future ext. */
2832   case '`':           /* future ext. */
2833   case '/':           /* Can't be used in / command */
2834     /* magic only after "\v" */
2835     if (reg_magic == MAGIC_ALL)
2836       curchr = Magic(curchr);
2837     break;
2838   case '*':
2839     // * is not magic as the very first character, eg "?*ptr", when
2840     // after '^', eg "/^*ptr" and when after "\(", "\|", "\&".  But
2841     // "\(\*" is not magic, thus must be magic if "after_slash"
2842     if (reg_magic >= MAGIC_ON
2843         && !at_start
2844         && !(prev_at_start && prevchr == Magic('^'))
2845         && (after_slash
2846             || (prevchr != Magic('(')
2847                 && prevchr != Magic('&')
2848                 && prevchr != Magic('|')))) {
2849       curchr = Magic('*');
2850     }
2851     break;
2852   case '^':
2853     // '^' is only magic as the very first character and if it's after
2854     // "\(", "\|", "\&' or "\n"
2855     if (reg_magic >= MAGIC_OFF
2856         && (at_start
2857             || reg_magic == MAGIC_ALL
2858             || prevchr == Magic('(')
2859             || prevchr == Magic('|')
2860             || prevchr == Magic('&')
2861             || prevchr == Magic('n')
2862             || (no_Magic(prevchr) == '('
2863                 && prevprevchr == Magic('%')))) {
2864       curchr = Magic('^');
2865       at_start = true;
2866       prev_at_start = false;
2867     }
2868     break;
2869   case '$':
2870     // '$' is only magic as the very last char and if it's in front of
2871     // either "\|", "\)", "\&", or "\n"
2872     if (reg_magic >= MAGIC_OFF) {
2873       char_u *p = regparse + 1;
2874       bool is_magic_all = (reg_magic == MAGIC_ALL);
2875 
2876       // ignore \c \C \m \M \v \V and \Z after '$'
2877       while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
2878                               || p[1] == 'm' || p[1] == 'M'
2879                               || p[1] == 'v' || p[1] == 'V'
2880                               || p[1] == 'Z')) {
2881         if (p[1] == 'v') {
2882           is_magic_all = true;
2883         } else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V') {
2884           is_magic_all = false;
2885         }
2886         p += 2;
2887       }
2888       if (p[0] == NUL
2889           || (p[0] == '\\'
2890               && (p[1] == '|' || p[1] == '&' || p[1] == ')'
2891                   || p[1] == 'n'))
2892           || (is_magic_all
2893               && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
2894           || reg_magic == MAGIC_ALL) {
2895         curchr = Magic('$');
2896       }
2897     }
2898     break;
2899   case '\\':
2900   {
2901     int c = regparse[1];
2902 
2903     if (c == NUL)
2904       curchr = '\\';                  /* trailing '\' */
2905     else if (
2906       c <= '~' && META_flags[c]
2907       ) {
2908       /*
2909        * META contains everything that may be magic sometimes,
2910        * except ^ and $ ("\^" and "\$" are only magic after
2911        * "\V").  We now fetch the next character and toggle its
2912        * magicness.  Therefore, \ is so meta-magic that it is
2913        * not in META.
2914        */
2915       curchr = -1;
2916       prev_at_start = at_start;
2917       at_start = false;  // be able to say "/\*ptr"
2918       regparse++;
2919       after_slash++;
2920       (void)peekchr();
2921       regparse--;
2922       after_slash--;
2923       curchr = toggle_Magic(curchr);
2924     } else if (vim_strchr(REGEXP_ABBR, c)) {
2925       /*
2926        * Handle abbreviations, like "\t" for TAB -- webb
2927        */
2928       curchr = backslash_trans(c);
2929     } else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
2930       curchr = toggle_Magic(c);
2931     else {
2932       /*
2933        * Next character can never be (made) magic?
2934        * Then backslashing it won't do anything.
2935        */
2936       curchr = utf_ptr2char(regparse + 1);
2937     }
2938     break;
2939   }
2940 
2941   default:
2942     curchr = utf_ptr2char(regparse);
2943   }
2944 
2945   return curchr;
2946 }
2947 
2948 /*
2949  * Eat one lexed character.  Do this in a way that we can undo it.
2950  */
skipchr(void)2951 static void skipchr(void)
2952 {
2953   /* peekchr() eats a backslash, do the same here */
2954   if (*regparse == '\\')
2955     prevchr_len = 1;
2956   else
2957     prevchr_len = 0;
2958   if (regparse[prevchr_len] != NUL) {
2959     // Exclude composing chars that utfc_ptr2len does include.
2960     prevchr_len += utf_ptr2len(regparse + prevchr_len);
2961   }
2962   regparse += prevchr_len;
2963   prev_at_start = at_start;
2964   at_start = false;
2965   prevprevchr = prevchr;
2966   prevchr = curchr;
2967   curchr = nextchr;         /* use previously unget char, or -1 */
2968   nextchr = -1;
2969 }
2970 
2971 /*
2972  * Skip a character while keeping the value of prev_at_start for at_start.
2973  * prevchr and prevprevchr are also kept.
2974  */
skipchr_keepstart(void)2975 static void skipchr_keepstart(void)
2976 {
2977   int as = prev_at_start;
2978   int pr = prevchr;
2979   int prpr = prevprevchr;
2980 
2981   skipchr();
2982   at_start = as;
2983   prevchr = pr;
2984   prevprevchr = prpr;
2985 }
2986 
2987 /*
2988  * Get the next character from the pattern. We know about magic and such, so
2989  * therefore we need a lexical analyzer.
2990  */
getchr(void)2991 static int getchr(void)
2992 {
2993   int chr = peekchr();
2994 
2995   skipchr();
2996   return chr;
2997 }
2998 
2999 /*
3000  * put character back.  Works only once!
3001  */
ungetchr(void)3002 static void ungetchr(void)
3003 {
3004   nextchr = curchr;
3005   curchr = prevchr;
3006   prevchr = prevprevchr;
3007   at_start = prev_at_start;
3008   prev_at_start = false;
3009 
3010   // Backup regparse, so that it's at the same position as before the
3011   // getchr().
3012   regparse -= prevchr_len;
3013 }
3014 
3015 /*
3016  * Get and return the value of the hex string at the current position.
3017  * Return -1 if there is no valid hex number.
3018  * The position is updated:
3019  *     blahblah\%x20asdf
3020  *	   before-^ ^-after
3021  * The parameter controls the maximum number of input characters. This will be
3022  * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
3023  */
gethexchrs(int maxinputlen)3024 static int64_t gethexchrs(int maxinputlen)
3025 {
3026   int64_t nr = 0;
3027   int c;
3028   int i;
3029 
3030   for (i = 0; i < maxinputlen; ++i) {
3031     c = regparse[0];
3032     if (!ascii_isxdigit(c))
3033       break;
3034     nr <<= 4;
3035     nr |= hex2nr(c);
3036     ++regparse;
3037   }
3038 
3039   if (i == 0)
3040     return -1;
3041   return nr;
3042 }
3043 
3044 /*
3045  * Get and return the value of the decimal string immediately after the
3046  * current position. Return -1 for invalid.  Consumes all digits.
3047  */
getdecchrs(void)3048 static int64_t getdecchrs(void)
3049 {
3050   int64_t nr = 0;
3051   int c;
3052   int i;
3053 
3054   for (i = 0;; ++i) {
3055     c = regparse[0];
3056     if (c < '0' || c > '9')
3057       break;
3058     nr *= 10;
3059     nr += c - '0';
3060     ++regparse;
3061     curchr = -1;     /* no longer valid */
3062   }
3063 
3064   if (i == 0)
3065     return -1;
3066   return nr;
3067 }
3068 
3069 /*
3070  * get and return the value of the octal string immediately after the current
3071  * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
3072  * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
3073  * treat 8 or 9 as recognised characters. Position is updated:
3074  *     blahblah\%o210asdf
3075  *	   before-^  ^-after
3076  */
getoctchrs(void)3077 static int64_t getoctchrs(void)
3078 {
3079   int64_t nr = 0;
3080   int c;
3081   int i;
3082 
3083   for (i = 0; i < 3 && nr < 040; i++) {  // -V536
3084     c = regparse[0];
3085     if (c < '0' || c > '7')
3086       break;
3087     nr <<= 3;
3088     nr |= hex2nr(c);
3089     ++regparse;
3090   }
3091 
3092   if (i == 0)
3093     return -1;
3094   return nr;
3095 }
3096 
3097 /*
3098  * Get a number after a backslash that is inside [].
3099  * When nothing is recognized return a backslash.
3100  */
coll_get_char(void)3101 static int coll_get_char(void)
3102 {
3103   int64_t nr = -1;
3104 
3105   switch (*regparse++) {
3106   case 'd': nr = getdecchrs(); break;
3107   case 'o': nr = getoctchrs(); break;
3108   case 'x': nr = gethexchrs(2); break;
3109   case 'u': nr = gethexchrs(4); break;
3110   case 'U': nr = gethexchrs(8); break;
3111   }
3112   if (nr < 0 || nr > INT_MAX) {
3113     // If getting the number fails be backwards compatible: the character
3114     // is a backslash.
3115     regparse--;
3116     nr = '\\';
3117   }
3118   return nr;
3119 }
3120 
3121 /*
3122  * read_limits - Read two integers to be taken as a minimum and maximum.
3123  * If the first character is '-', then the range is reversed.
3124  * Should end with 'end'.  If minval is missing, zero is default, if maxval is
3125  * missing, a very big number is the default.
3126  */
read_limits(long * minval,long * maxval)3127 static int read_limits(long *minval, long *maxval)
3128 {
3129   int reverse = false;
3130   char_u      *first_char;
3131   long tmp;
3132 
3133   if (*regparse == '-') {
3134     // Starts with '-', so reverse the range later.
3135     regparse++;
3136     reverse = true;
3137   }
3138   first_char = regparse;
3139   *minval = getdigits_long(&regparse, false, 0);
3140   if (*regparse == ',') {           // There is a comma.
3141     if (ascii_isdigit(*++regparse)) {
3142       *maxval = getdigits_long(&regparse, false, MAX_LIMIT);
3143     } else {
3144       *maxval = MAX_LIMIT;
3145     }
3146   } else if (ascii_isdigit(*first_char)) {
3147     *maxval = *minval;              // It was \{n} or \{-n}
3148   } else {
3149     *maxval = MAX_LIMIT;            // It was \{} or \{-}
3150   }
3151   if (*regparse == '\\') {
3152     regparse++;         // Allow either \{...} or \{...\}
3153   }
3154   if (*regparse != '}') {
3155     sprintf((char *)IObuff, _("E554: Syntax error in %s{...}"),
3156         reg_magic == MAGIC_ALL ? "" : "\\");
3157     EMSG_RET_FAIL((char *)IObuff);
3158   }
3159 
3160   /*
3161    * Reverse the range if there was a '-', or make sure it is in the right
3162    * order otherwise.
3163    */
3164   if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval)) {
3165     tmp = *minval;
3166     *minval = *maxval;
3167     *maxval = tmp;
3168   }
3169   skipchr();            /* let's be friends with the lexer again */
3170   return OK;
3171 }
3172 
3173 /*
3174  * vim_regexec and friends
3175  */
3176 
3177 /*
3178  * Global work variables for vim_regexec().
3179  */
3180 
3181 /* Save the sub-expressions before attempting a match. */
3182 #define save_se(savep, posp, pp) \
3183   REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp))
3184 
3185 /* After a failed match restore the sub-expressions. */
3186 #define restore_se(savep, posp, pp) { \
3187     if (REG_MULTI) \
3188       *(posp) = (savep)->se_u.pos; \
3189     else \
3190       *(pp) = (savep)->se_u.ptr; }
3191 
3192 
3193 #ifdef REGEXP_DEBUG
3194 int regnarrate = 0;
3195 #endif
3196 
3197 // Sometimes need to save a copy of a line.  Since alloc()/free() is very
3198 // slow, we keep one allocated piece of memory and only re-allocate it when
3199 // it's too small.  It's freed in bt_regexec_both() when finished.
3200 static char_u   *reg_tofree = NULL;
3201 static unsigned reg_tofreelen;
3202 
3203 // Structure used to store the execution state of the regex engine.
3204 // Which ones are set depends on whether a single-line or multi-line match is
3205 // done:
3206 //                      single-line             multi-line
3207 // reg_match            &regmatch_T             NULL
3208 // reg_mmatch           NULL                    &regmmatch_T
3209 // reg_startp           reg_match->startp       <invalid>
3210 // reg_endp             reg_match->endp         <invalid>
3211 // reg_startpos         <invalid>               reg_mmatch->startpos
3212 // reg_endpos           <invalid>               reg_mmatch->endpos
3213 // reg_win              NULL                    window in which to search
3214 // reg_buf              curbuf                  buffer in which to search
3215 // reg_firstlnum        <invalid>               first line in which to search
3216 // reg_maxline          0                       last line nr
3217 // reg_line_lbr         false or true           false
3218 typedef struct {
3219   regmatch_T *reg_match;
3220   regmmatch_T *reg_mmatch;
3221   char_u **reg_startp;
3222   char_u **reg_endp;
3223   lpos_T *reg_startpos;
3224   lpos_T *reg_endpos;
3225   win_T *reg_win;
3226   buf_T *reg_buf;
3227   linenr_T reg_firstlnum;
3228   linenr_T reg_maxline;
3229   bool reg_line_lbr;  // "\n" in string is line break
3230 
3231   // The current match-position is remembered with these variables:
3232   linenr_T lnum;  ///< line number, relative to first line
3233   char_u *line;   ///< start of current line
3234   char_u *input;  ///< current input, points into "regline"
3235 
3236   int need_clear_subexpr;   ///< subexpressions still need to be cleared
3237   int need_clear_zsubexpr;  ///< extmatch subexpressions still need to be
3238                             ///< cleared
3239 
3240 
3241   // Internal copy of 'ignorecase'.  It is set at each call to vim_regexec().
3242   // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
3243   // contains '\c' or '\C' the value is overruled.
3244   bool reg_ic;
3245 
3246   // Similar to "reg_ic", but only for 'combining' characters.  Set with \Z
3247   // flag in the regexp.  Defaults to false, always.
3248   bool reg_icombine;
3249 
3250   // Copy of "rmm_maxcol": maximum column to search for a match.  Zero when
3251   // there is no maximum.
3252   colnr_T reg_maxcol;
3253 
3254   // State for the NFA engine regexec.
3255   int nfa_has_zend;     ///< NFA regexp \ze operator encountered.
3256   int nfa_has_backref;  ///< NFA regexp \1 .. \9 encountered.
3257   int nfa_nsubexpr;     ///< Number of sub expressions actually being used
3258                         ///< during execution. 1 if only the whole match
3259                         ///< (subexpr 0) is used.
3260   // listid is global, so that it increases on recursive calls to
3261   // nfa_regmatch(), which means we don't have to clear the lastlist field of
3262   // all the states.
3263   int nfa_listid;
3264   int nfa_alt_listid;
3265 
3266   int nfa_has_zsubexpr;  ///< NFA regexp has \z( ), set zsubexpr.
3267 } regexec_T;
3268 
3269 static regexec_T rex;
3270 static bool rex_in_use = false;
3271 
3272 /*
3273  * "regstack" and "backpos" are used by regmatch().  They are kept over calls
3274  * to avoid invoking malloc() and free() often.
3275  * "regstack" is a stack with regitem_T items, sometimes preceded by regstar_T
3276  * or regbehind_T.
3277  * "backpos_T" is a table with backpos_T for BACK
3278  */
3279 static garray_T regstack = GA_EMPTY_INIT_VALUE;
3280 static garray_T backpos = GA_EMPTY_INIT_VALUE;
3281 
3282 /*
3283  * Both for regstack and backpos tables we use the following strategy of
3284  * allocation (to reduce malloc/free calls):
3285  * - Initial size is fairly small.
3286  * - When needed, the tables are grown bigger (8 times at first, double after
3287  *   that).
3288  * - After executing the match we free the memory only if the array has grown.
3289  *   Thus the memory is kept allocated when it's at the initial size.
3290  * This makes it fast while not keeping a lot of memory allocated.
3291  * A three times speed increase was observed when using many simple patterns.
3292  */
3293 #define REGSTACK_INITIAL        2048
3294 #define BACKPOS_INITIAL         64
3295 
3296 #if defined(EXITFREE)
free_regexp_stuff(void)3297 void free_regexp_stuff(void)
3298 {
3299   ga_clear(&regstack);
3300   ga_clear(&backpos);
3301   xfree(reg_tofree);
3302   xfree(reg_prev_sub);
3303 }
3304 
3305 #endif
3306 
3307 // Return true if character 'c' is included in 'iskeyword' option for
3308 // "reg_buf" buffer.
reg_iswordc(int c)3309 static bool reg_iswordc(int c)
3310 {
3311   return vim_iswordc_buf(c, rex.reg_buf);
3312 }
3313 
3314 /*
3315  * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
3316  */
reg_getline(linenr_T lnum)3317 static char_u *reg_getline(linenr_T lnum)
3318 {
3319   // when looking behind for a match/no-match lnum is negative.  But we
3320   // can't go before line 1
3321   if (rex.reg_firstlnum + lnum < 1) {
3322     return NULL;
3323   }
3324   if (lnum > rex.reg_maxline) {
3325     // Must have matched the "\n" in the last line.
3326     return (char_u *)"";
3327   }
3328   return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, false);
3329 }
3330 
3331 static regsave_T behind_pos;
3332 
3333 static char_u   *reg_startzp[NSUBEXP];  /* Workspace to mark beginning */
3334 static char_u   *reg_endzp[NSUBEXP];    /*   and end of \z(...\) matches */
3335 static lpos_T reg_startzpos[NSUBEXP];   /* idem, beginning pos */
3336 static lpos_T reg_endzpos[NSUBEXP];     /* idem, end pos */
3337 
3338 // true if using multi-line regexp.
3339 #define REG_MULTI       (rex.reg_match == NULL)
3340 
3341 /*
3342  * Match a regexp against a string.
3343  * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
3344  * Uses curbuf for line count and 'iskeyword'.
3345  * If "line_lbr" is true, consider a "\n" in "line" to be a line break.
3346  *
3347  * Returns 0 for failure, number of lines contained in the match otherwise.
3348  */
3349 static int
bt_regexec_nl(regmatch_T * rmp,char_u * line,colnr_T col,bool line_lbr)3350 bt_regexec_nl (
3351     regmatch_T *rmp,
3352     char_u *line,      /* string to match against */
3353     colnr_T col,       /* column to start looking for match */
3354     bool line_lbr
3355 )
3356 {
3357   rex.reg_match = rmp;
3358   rex.reg_mmatch = NULL;
3359   rex.reg_maxline = 0;
3360   rex.reg_line_lbr = line_lbr;
3361   rex.reg_buf = curbuf;
3362   rex.reg_win = NULL;
3363   rex.reg_ic = rmp->rm_ic;
3364   rex.reg_icombine = false;
3365   rex.reg_maxcol = 0;
3366 
3367   long r = bt_regexec_both(line, col, NULL, NULL);
3368   assert(r <= INT_MAX);
3369   return (int)r;
3370 }
3371 
3372 /// Wrapper around strchr which accounts for case-insensitive searches and
3373 /// non-ASCII characters.
3374 ///
3375 /// This function is used a lot for simple searches, keep it fast!
3376 ///
3377 /// @param  s  string to search
3378 /// @param  c  character to find in @a s
3379 ///
3380 /// @return  NULL if no match, otherwise pointer to the position in @a s
cstrchr(const char_u * const s,const int c)3381 static inline char_u *cstrchr(const char_u *const s, const int c)
3382   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
3383   FUNC_ATTR_ALWAYS_INLINE
3384 {
3385   if (!rex.reg_ic) {
3386     return vim_strchr(s, c);
3387   }
3388 
3389   // Use folded case for UTF-8, slow! For ASCII use libc strpbrk which is
3390   // expected to be highly optimized.
3391   if (c > 0x80) {
3392     const int folded_c = utf_fold(c);
3393     for (const char_u *p = s; *p != NUL; p += utfc_ptr2len(p)) {
3394       if (utf_fold(utf_ptr2char(p)) == folded_c) {
3395         return (char_u *)p;
3396       }
3397     }
3398     return NULL;
3399   }
3400 
3401   int cc;
3402   if (ASCII_ISUPPER(c)) {
3403     cc = TOLOWER_ASC(c);
3404   } else if (ASCII_ISLOWER(c)) {
3405     cc = TOUPPER_ASC(c);
3406   } else {
3407     return vim_strchr(s, c);
3408   }
3409 
3410   char tofind[] = { (char)c, (char)cc, NUL };
3411   return (char_u *)strpbrk((const char *)s, tofind);
3412 }
3413 
3414 /// Matches a regexp against multiple lines.
3415 /// "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
3416 /// Uses curbuf for line count and 'iskeyword'.
3417 ///
3418 /// @param win Window in which to search or NULL
3419 /// @param buf Buffer in which to search
3420 /// @param lnum Number of line to start looking for match
3421 /// @param col Column to start looking for match
3422 /// @param tm Timeout limit or NULL
3423 ///
3424 /// @return zero if there is no match and number of lines contained in the match
3425 ///         otherwise.
bt_regexec_multi(regmmatch_T * rmp,win_T * win,buf_T * buf,linenr_T lnum,colnr_T col,proftime_T * tm,int * timed_out)3426 static long bt_regexec_multi(regmmatch_T *rmp, win_T *win, buf_T *buf,
3427                              linenr_T lnum, colnr_T col,
3428                              proftime_T *tm, int *timed_out)
3429 {
3430   rex.reg_match = NULL;
3431   rex.reg_mmatch = rmp;
3432   rex.reg_buf = buf;
3433   rex.reg_win = win;
3434   rex.reg_firstlnum = lnum;
3435   rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
3436   rex.reg_line_lbr = false;
3437   rex.reg_ic = rmp->rmm_ic;
3438   rex.reg_icombine = false;
3439   rex.reg_maxcol = rmp->rmm_maxcol;
3440 
3441   return bt_regexec_both(NULL, col, tm, timed_out);
3442 }
3443 
3444 /// Match a regexp against a string ("line" points to the string) or multiple
3445 /// lines (if "line" is NULL, use reg_getline()).
3446 /// @return 0 for failure, or number of lines contained in the match.
bt_regexec_both(char_u * line,colnr_T col,proftime_T * tm,int * timed_out)3447 static long bt_regexec_both(char_u *line,
3448                             colnr_T col,      // column to start search
3449                             proftime_T *tm,   // timeout limit or NULL
3450                             int *timed_out)   // flag set on timeout or NULL
3451 {
3452   bt_regprog_T        *prog;
3453   char_u      *s;
3454   long retval = 0L;
3455 
3456   /* Create "regstack" and "backpos" if they are not allocated yet.
3457    * We allocate *_INITIAL amount of bytes first and then set the grow size
3458    * to much bigger value to avoid many malloc calls in case of deep regular
3459    * expressions.  */
3460   if (regstack.ga_data == NULL) {
3461     /* Use an item size of 1 byte, since we push different things
3462      * onto the regstack. */
3463     ga_init(&regstack, 1, REGSTACK_INITIAL);
3464     ga_grow(&regstack, REGSTACK_INITIAL);
3465     ga_set_growsize(&regstack, REGSTACK_INITIAL * 8);
3466   }
3467 
3468   if (backpos.ga_data == NULL) {
3469     ga_init(&backpos, sizeof(backpos_T), BACKPOS_INITIAL);
3470     ga_grow(&backpos, BACKPOS_INITIAL);
3471     ga_set_growsize(&backpos, BACKPOS_INITIAL * 8);
3472   }
3473 
3474   if (REG_MULTI) {
3475     prog = (bt_regprog_T *)rex.reg_mmatch->regprog;
3476     line = reg_getline((linenr_T)0);
3477     rex.reg_startpos = rex.reg_mmatch->startpos;
3478     rex.reg_endpos = rex.reg_mmatch->endpos;
3479   } else {
3480     prog = (bt_regprog_T *)rex.reg_match->regprog;
3481     rex.reg_startp = rex.reg_match->startp;
3482     rex.reg_endp = rex.reg_match->endp;
3483   }
3484 
3485   /* Be paranoid... */
3486   if (prog == NULL || line == NULL) {
3487     iemsg(_(e_null));
3488     goto theend;
3489   }
3490 
3491   /* Check validity of program. */
3492   if (prog_magic_wrong())
3493     goto theend;
3494 
3495   // If the start column is past the maximum column: no need to try.
3496   if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol) {
3497     goto theend;
3498   }
3499 
3500   // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
3501   if (prog->regflags & RF_ICASE) {
3502     rex.reg_ic = true;
3503   } else if (prog->regflags & RF_NOICASE) {
3504     rex.reg_ic = false;
3505   }
3506 
3507   // If pattern contains "\Z" overrule value of rex.reg_icombine
3508   if (prog->regflags & RF_ICOMBINE) {
3509     rex.reg_icombine = true;
3510   }
3511 
3512   /* If there is a "must appear" string, look for it. */
3513   if (prog->regmust != NULL) {
3514     int c = utf_ptr2char(prog->regmust);
3515     s = line + col;
3516 
3517     // This is used very often, esp. for ":global".  Use two versions of
3518     // the loop to avoid overhead of conditions.
3519     if (!rex.reg_ic) {
3520       while ((s = vim_strchr(s, c)) != NULL) {
3521         if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0) {
3522           break;  // Found it.
3523         }
3524         MB_PTR_ADV(s);
3525       }
3526     } else {
3527       while ((s = cstrchr(s, c)) != NULL) {
3528         if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0) {
3529           break;  // Found it.
3530         }
3531         MB_PTR_ADV(s);
3532       }
3533     }
3534     if (s == NULL) {  // Not present.
3535       goto theend;
3536     }
3537   }
3538 
3539   rex.line = line;
3540   rex.lnum = 0;
3541   reg_toolong = false;
3542 
3543   /* Simplest case: Anchored match need be tried only once. */
3544   if (prog->reganch) {
3545     int c = utf_ptr2char(rex.line + col);
3546     if (prog->regstart == NUL
3547         || prog->regstart == c
3548         || (rex.reg_ic
3549             && (utf_fold(prog->regstart) == utf_fold(c)
3550                 || (c < 255 && prog->regstart < 255
3551                     && mb_tolower(prog->regstart) == mb_tolower(c))))) {
3552       retval = regtry(prog, col, tm, timed_out);
3553     } else {
3554       retval = 0;
3555     }
3556   } else {
3557     int tm_count = 0;
3558     /* Messy cases:  unanchored match. */
3559     while (!got_int) {
3560       if (prog->regstart != NUL) {
3561         // Skip until the char we know it must start with.
3562         s = cstrchr(rex.line + col, prog->regstart);
3563         if (s == NULL) {
3564           retval = 0;
3565           break;
3566         }
3567         col = (int)(s - rex.line);
3568       }
3569 
3570       // Check for maximum column to try.
3571       if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol) {
3572         retval = 0;
3573         break;
3574       }
3575 
3576       retval = regtry(prog, col, tm, timed_out);
3577       if (retval > 0) {
3578         break;
3579       }
3580 
3581       // if not currently on the first line, get it again
3582       if (rex.lnum != 0) {
3583         rex.lnum = 0;
3584         rex.line = reg_getline((linenr_T)0);
3585       }
3586       if (rex.line[col] == NUL) {
3587         break;
3588       }
3589       col += utfc_ptr2len(rex.line + col);
3590       // Check for timeout once in a twenty times to avoid overhead.
3591       if (tm != NULL && ++tm_count == 20) {
3592         tm_count = 0;
3593         if (profile_passed_limit(*tm)) {
3594           if (timed_out != NULL) {
3595             *timed_out = true;
3596           }
3597           break;
3598         }
3599       }
3600     }
3601   }
3602 
3603 theend:
3604   /* Free "reg_tofree" when it's a bit big.
3605    * Free regstack and backpos if they are bigger than their initial size. */
3606   if (reg_tofreelen > 400) {
3607     XFREE_CLEAR(reg_tofree);
3608   }
3609   if (regstack.ga_maxlen > REGSTACK_INITIAL)
3610     ga_clear(&regstack);
3611   if (backpos.ga_maxlen > BACKPOS_INITIAL)
3612     ga_clear(&backpos);
3613 
3614   if (retval > 0) {
3615     // Make sure the end is never before the start.  Can happen when \zs
3616     // and \ze are used.
3617     if (REG_MULTI) {
3618       const lpos_T *const start = &rex.reg_mmatch->startpos[0];
3619       const lpos_T *const end = &rex.reg_mmatch->endpos[0];
3620 
3621       if (end->lnum < start->lnum
3622           || (end->lnum == start->lnum && end->col < start->col)) {
3623         rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
3624       }
3625     } else {
3626       if (rex.reg_match->endp[0] < rex.reg_match->startp[0]) {
3627         rex.reg_match->endp[0] = rex.reg_match->startp[0];
3628       }
3629     }
3630   }
3631 
3632   return retval;
3633 }
3634 
3635 
3636 /*
3637  * Create a new extmatch and mark it as referenced once.
3638  */
make_extmatch(void)3639 static reg_extmatch_T *make_extmatch(void)
3640   FUNC_ATTR_NONNULL_RET
3641 {
3642   reg_extmatch_T *em = xcalloc(1, sizeof(reg_extmatch_T));
3643   em->refcnt = 1;
3644   return em;
3645 }
3646 
3647 /*
3648  * Add a reference to an extmatch.
3649  */
ref_extmatch(reg_extmatch_T * em)3650 reg_extmatch_T *ref_extmatch(reg_extmatch_T *em)
3651 {
3652   if (em != NULL)
3653     em->refcnt++;
3654   return em;
3655 }
3656 
3657 /*
3658  * Remove a reference to an extmatch.  If there are no references left, free
3659  * the info.
3660  */
unref_extmatch(reg_extmatch_T * em)3661 void unref_extmatch(reg_extmatch_T *em)
3662 {
3663   int i;
3664 
3665   if (em != NULL && --em->refcnt <= 0) {
3666     for (i = 0; i < NSUBEXP; ++i)
3667       xfree(em->matches[i]);
3668     xfree(em);
3669   }
3670 }
3671 
3672 /// Try match of "prog" with at rex.line["col"].
3673 /// @returns 0 for failure, or number of lines contained in the match.
regtry(bt_regprog_T * prog,colnr_T col,proftime_T * tm,int * timed_out)3674 static long regtry(bt_regprog_T *prog,
3675                    colnr_T col,
3676                    proftime_T *tm,    // timeout limit or NULL
3677                    int *timed_out)    // flag set on timeout or NULL
3678 {
3679   rex.input = rex.line + col;
3680   rex.need_clear_subexpr = true;
3681   // Clear the external match subpointers if necessaey.
3682   rex.need_clear_zsubexpr = (prog->reghasz == REX_SET);
3683 
3684   if (regmatch(prog->program + 1, tm, timed_out) == 0) {
3685     return 0;
3686   }
3687 
3688   cleanup_subexpr();
3689   if (REG_MULTI) {
3690     if (rex.reg_startpos[0].lnum < 0) {
3691       rex.reg_startpos[0].lnum = 0;
3692       rex.reg_startpos[0].col = col;
3693     }
3694     if (rex.reg_endpos[0].lnum < 0) {
3695       rex.reg_endpos[0].lnum = rex.lnum;
3696       rex.reg_endpos[0].col = (int)(rex.input - rex.line);
3697     } else {
3698       // Use line number of "\ze".
3699       rex.lnum = rex.reg_endpos[0].lnum;
3700     }
3701   } else {
3702     if (rex.reg_startp[0] == NULL) {
3703       rex.reg_startp[0] = rex.line + col;
3704     }
3705     if (rex.reg_endp[0] == NULL) {
3706       rex.reg_endp[0] = rex.input;
3707     }
3708   }
3709   /* Package any found \z(...\) matches for export. Default is none. */
3710   unref_extmatch(re_extmatch_out);
3711   re_extmatch_out = NULL;
3712 
3713   if (prog->reghasz == REX_SET) {
3714     int i;
3715 
3716     cleanup_zsubexpr();
3717     re_extmatch_out = make_extmatch();
3718     for (i = 0; i < NSUBEXP; i++) {
3719       if (REG_MULTI) {
3720         /* Only accept single line matches. */
3721         if (reg_startzpos[i].lnum >= 0
3722             && reg_endzpos[i].lnum == reg_startzpos[i].lnum
3723             && reg_endzpos[i].col >= reg_startzpos[i].col) {
3724           re_extmatch_out->matches[i] =
3725             vim_strnsave(reg_getline(reg_startzpos[i].lnum)
3726                          + reg_startzpos[i].col,
3727                          reg_endzpos[i].col
3728                          - reg_startzpos[i].col);
3729         }
3730       } else {
3731         if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
3732           re_extmatch_out->matches[i] =
3733             vim_strnsave(reg_startzp[i], reg_endzp[i] - reg_startzp[i]);
3734       }
3735     }
3736   }
3737   return 1 + rex.lnum;
3738 }
3739 
3740 
3741 // Get class of previous character.
reg_prev_class(void)3742 static int reg_prev_class(void)
3743 {
3744   if (rex.input > rex.line) {
3745     return mb_get_class_tab(
3746         rex.input - 1 - utf_head_off(rex.line, rex.input - 1),
3747         rex.reg_buf->b_chartab);
3748   }
3749   return -1;
3750 }
3751 
3752 
3753 // Return true if the current rex.input position matches the Visual area.
reg_match_visual(void)3754 static bool reg_match_visual(void)
3755 {
3756   pos_T top, bot;
3757   linenr_T lnum;
3758   colnr_T col;
3759   win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
3760   int mode;
3761   colnr_T start, end;
3762   colnr_T start2, end2;
3763   colnr_T curswant;
3764 
3765   // Check if the buffer is the current buffer.
3766   if (rex.reg_buf != curbuf || VIsual.lnum == 0) {
3767     return false;
3768   }
3769 
3770   if (VIsual_active) {
3771     if (lt(VIsual, wp->w_cursor)) {
3772       top = VIsual;
3773       bot = wp->w_cursor;
3774     } else {
3775       top = wp->w_cursor;
3776       bot = VIsual;
3777     }
3778     mode = VIsual_mode;
3779     curswant = wp->w_curswant;
3780   } else {
3781     if (lt(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end)) {
3782       top = curbuf->b_visual.vi_start;
3783       bot = curbuf->b_visual.vi_end;
3784     } else {
3785       top = curbuf->b_visual.vi_end;
3786       bot = curbuf->b_visual.vi_start;
3787     }
3788     mode = curbuf->b_visual.vi_mode;
3789     curswant = curbuf->b_visual.vi_curswant;
3790   }
3791   lnum = rex.lnum + rex.reg_firstlnum;
3792   if (lnum < top.lnum || lnum > bot.lnum) {
3793     return false;
3794   }
3795 
3796   if (mode == 'v') {
3797     col = (colnr_T)(rex.input - rex.line);
3798     if ((lnum == top.lnum && col < top.col)
3799         || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e'))) {
3800       return false;
3801     }
3802   } else if (mode == Ctrl_V) {
3803     getvvcol(wp, &top, &start, NULL, &end);
3804     getvvcol(wp, &bot, &start2, NULL, &end2);
3805     if (start2 < start)
3806       start = start2;
3807     if (end2 > end)
3808       end = end2;
3809     if (top.col == MAXCOL || bot.col == MAXCOL || curswant == MAXCOL) {
3810       end = MAXCOL;
3811     }
3812     unsigned int cols_u = win_linetabsize(wp, rex.line,
3813                                           (colnr_T)(rex.input - rex.line));
3814     assert(cols_u <= MAXCOL);
3815     colnr_T cols = (colnr_T)cols_u;
3816     if (cols < start || cols > end - (*p_sel == 'e')) {
3817       return false;
3818     }
3819   }
3820   return true;
3821 }
3822 
3823 #define ADVANCE_REGINPUT() MB_PTR_ADV(rex.input)
3824 
3825 /*
3826  * The arguments from BRACE_LIMITS are stored here.  They are actually local
3827  * to regmatch(), but they are here to reduce the amount of stack space used
3828  * (it can be called recursively many times).
3829  */
3830 static long bl_minval;
3831 static long bl_maxval;
3832 
3833 /// Main matching routine
3834 ///
3835 /// Conceptually the strategy is simple: Check to see whether the current node
3836 /// matches, push an item onto the regstack and loop to see whether the rest
3837 /// matches, and then act accordingly.  In practice we make some effort to
3838 /// avoid using the regstack, in particular by going through "ordinary" nodes
3839 /// (that don't need to know whether the rest of the match failed) by a nested
3840 /// loop.
3841 ///
3842 /// Returns true when there is a match.  Leaves rex.input and rex.lnum
3843 /// just after the last matched character.
3844 /// Returns false when there is no match.  Leaves rex.input and rex.lnum in an
3845 /// undefined state!
regmatch(char_u * scan,proftime_T * tm,int * timed_out)3846 static bool regmatch(
3847     char_u *scan,               // Current node.
3848     proftime_T *tm,             // timeout limit or NULL
3849     int *timed_out              // flag set on timeout or NULL
3850 )
3851 {
3852   char_u        *next;          /* Next node. */
3853   int op;
3854   int c;
3855   regitem_T     *rp;
3856   int no;
3857   int status;                   // one of the RA_ values:
3858   int tm_count = 0;
3859 #define RA_FAIL         1       // something failed, abort
3860 #define RA_CONT         2       // continue in inner loop
3861 #define RA_BREAK        3       // break inner loop
3862 #define RA_MATCH        4       // successful match
3863 #define RA_NOMATCH      5       // didn't match
3864 
3865   // Make "regstack" and "backpos" empty.  They are allocated and freed in
3866   // bt_regexec_both() to reduce malloc()/free() calls.
3867   regstack.ga_len = 0;
3868   backpos.ga_len = 0;
3869 
3870   /*
3871    * Repeat until "regstack" is empty.
3872    */
3873   for (;; ) {
3874     /* Some patterns may take a long time to match, e.g., "\([a-z]\+\)\+Q".
3875      * Allow interrupting them with CTRL-C. */
3876     fast_breakcheck();
3877 
3878 #ifdef REGEXP_DEBUG
3879     if (scan != NULL && regnarrate) {
3880       mch_errmsg((char *)regprop(scan));
3881       mch_errmsg("(\n");
3882     }
3883 #endif
3884 
3885     /*
3886      * Repeat for items that can be matched sequentially, without using the
3887      * regstack.
3888      */
3889     for (;; ) {
3890       if (got_int || scan == NULL) {
3891         status = RA_FAIL;
3892         break;
3893       }
3894       // Check for timeout once in a 100 times to avoid overhead.
3895       if (tm != NULL && ++tm_count == 100) {
3896         tm_count = 0;
3897         if (profile_passed_limit(*tm)) {
3898           if (timed_out != NULL) {
3899             *timed_out = true;
3900           }
3901           status = RA_FAIL;
3902           break;
3903         }
3904       }
3905       status = RA_CONT;
3906 
3907 #ifdef REGEXP_DEBUG
3908       if (regnarrate) {
3909         mch_errmsg((char *)regprop(scan));
3910         mch_errmsg("...\n");
3911         if (re_extmatch_in != NULL) {
3912           int i;
3913 
3914           mch_errmsg(_("External submatches:\n"));
3915           for (i = 0; i < NSUBEXP; i++) {
3916             mch_errmsg("    \"");
3917             if (re_extmatch_in->matches[i] != NULL)
3918               mch_errmsg((char *)re_extmatch_in->matches[i]);
3919             mch_errmsg("\"\n");
3920           }
3921         }
3922       }
3923 #endif
3924       next = regnext(scan);
3925 
3926       op = OP(scan);
3927       // Check for character class with NL added.
3928       if (!rex.reg_line_lbr && WITH_NL(op) && REG_MULTI
3929           && *rex.input == NUL && rex.lnum <= rex.reg_maxline) {
3930         reg_nextline();
3931       } else if (rex.reg_line_lbr && WITH_NL(op) && *rex.input == '\n') {
3932         ADVANCE_REGINPUT();
3933       } else {
3934         if (WITH_NL(op)) {
3935           op -= ADD_NL;
3936         }
3937         c = utf_ptr2char(rex.input);
3938         switch (op) {
3939         case BOL:
3940           if (rex.input != rex.line) {
3941             status = RA_NOMATCH;
3942           }
3943           break;
3944 
3945         case EOL:
3946           if (c != NUL) {
3947             status = RA_NOMATCH;
3948           }
3949           break;
3950 
3951         case RE_BOF:
3952           // We're not at the beginning of the file when below the first
3953           // line where we started, not at the start of the line or we
3954           // didn't start at the first line of the buffer.
3955           if (rex.lnum != 0 || rex.input != rex.line
3956               || (REG_MULTI && rex.reg_firstlnum > 1)) {
3957             status = RA_NOMATCH;
3958           }
3959           break;
3960 
3961         case RE_EOF:
3962           if (rex.lnum != rex.reg_maxline || c != NUL) {
3963             status = RA_NOMATCH;
3964           }
3965           break;
3966 
3967         case CURSOR:
3968           // Check if the buffer is in a window and compare the
3969           // rex.reg_win->w_cursor position to the match position.
3970           if (rex.reg_win == NULL
3971               || (rex.lnum + rex.reg_firstlnum != rex.reg_win->w_cursor.lnum)
3972               || ((colnr_T)(rex.input - rex.line) !=
3973                   rex.reg_win->w_cursor.col)) {
3974             status = RA_NOMATCH;
3975           }
3976           break;
3977 
3978         case RE_MARK:
3979           /* Compare the mark position to the match position. */
3980         {
3981           int mark = OPERAND(scan)[0];
3982           int cmp = OPERAND(scan)[1];
3983           pos_T   *pos;
3984 
3985           pos = getmark_buf(rex.reg_buf, mark, false);
3986           if (pos == NULL                    // mark doesn't exist
3987               || pos->lnum <= 0) {           // mark isn't set in reg_buf
3988             status = RA_NOMATCH;
3989           } else {
3990             const colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
3991               && pos->col == MAXCOL
3992               ? (colnr_T)STRLEN(reg_getline(pos->lnum - rex.reg_firstlnum))
3993               : pos->col;
3994 
3995             if (pos->lnum == rex.lnum + rex.reg_firstlnum
3996                 ? (pos_col == (colnr_T)(rex.input - rex.line)
3997                    ? (cmp == '<' || cmp == '>')
3998                    : (pos_col < (colnr_T)(rex.input - rex.line)
3999                       ? cmp != '>'
4000                       : cmp != '<'))
4001                 : (pos->lnum < rex.lnum + rex.reg_firstlnum
4002                    ? cmp != '>'
4003                    : cmp != '<')) {
4004               status = RA_NOMATCH;
4005             }
4006           }
4007         }
4008         break;
4009 
4010         case RE_VISUAL:
4011           if (!reg_match_visual())
4012             status = RA_NOMATCH;
4013           break;
4014 
4015         case RE_LNUM:
4016           assert(rex.lnum + rex.reg_firstlnum >= 0
4017                  && (uintmax_t)(rex.lnum + rex.reg_firstlnum) <= UINT32_MAX);
4018           if (!REG_MULTI
4019               || !re_num_cmp((uint32_t)(rex.lnum + rex.reg_firstlnum), scan)) {
4020             status = RA_NOMATCH;
4021           }
4022           break;
4023 
4024         case RE_COL:
4025           assert(rex.input - rex.line + 1 >= 0
4026                  && (uintmax_t)(rex.input - rex.line + 1) <= UINT32_MAX);
4027           if (!re_num_cmp((uint32_t)(rex.input - rex.line + 1), scan)) {
4028             status = RA_NOMATCH;
4029           }
4030           break;
4031 
4032         case RE_VCOL:
4033           if (!re_num_cmp(win_linetabsize(rex.reg_win == NULL
4034                                           ? curwin : rex.reg_win,
4035                                           rex.line,
4036                                           (colnr_T)(rex.input - rex.line)) + 1,
4037                           scan)) {
4038             status = RA_NOMATCH;
4039           }
4040           break;
4041 
4042         case BOW:  // \<word; rex.input points to w
4043           if (c == NUL) {  // Can't match at end of line
4044             status = RA_NOMATCH;
4045           } else {
4046             // Get class of current and previous char (if it exists).
4047             const int this_class =
4048               mb_get_class_tab(rex.input, rex.reg_buf->b_chartab);
4049             if (this_class <= 1) {
4050               status = RA_NOMATCH;  // Not on a word at all.
4051             } else if (reg_prev_class() == this_class) {
4052               status = RA_NOMATCH;  // Previous char is in same word.
4053             }
4054           }
4055           break;
4056 
4057         case EOW:  // word\>; rex.input points after d
4058           if (rex.input == rex.line) {  // Can't match at start of line
4059             status = RA_NOMATCH;
4060           } else {
4061             int this_class, prev_class;
4062 
4063             // Get class of current and previous char (if it exists).
4064             this_class = mb_get_class_tab(rex.input, rex.reg_buf->b_chartab);
4065             prev_class = reg_prev_class();
4066             if (this_class == prev_class
4067                 || prev_class == 0 || prev_class == 1) {
4068               status = RA_NOMATCH;
4069             }
4070           }
4071           break;  // Matched with EOW
4072 
4073         case ANY:
4074           // ANY does not match new lines.
4075           if (c == NUL) {
4076             status = RA_NOMATCH;
4077           } else {
4078             ADVANCE_REGINPUT();
4079           }
4080           break;
4081 
4082         case IDENT:
4083           if (!vim_isIDc(c))
4084             status = RA_NOMATCH;
4085           else
4086             ADVANCE_REGINPUT();
4087           break;
4088 
4089         case SIDENT:
4090           if (ascii_isdigit(*rex.input) || !vim_isIDc(c)) {
4091             status = RA_NOMATCH;
4092           } else {
4093             ADVANCE_REGINPUT();
4094           }
4095           break;
4096 
4097         case KWORD:
4098           if (!vim_iswordp_buf(rex.input, rex.reg_buf)) {
4099             status = RA_NOMATCH;
4100           } else {
4101             ADVANCE_REGINPUT();
4102           }
4103           break;
4104 
4105         case SKWORD:
4106           if (ascii_isdigit(*rex.input)
4107               || !vim_iswordp_buf(rex.input, rex.reg_buf)) {
4108             status = RA_NOMATCH;
4109           } else {
4110             ADVANCE_REGINPUT();
4111           }
4112           break;
4113 
4114         case FNAME:
4115           if (!vim_isfilec(c)) {
4116             status = RA_NOMATCH;
4117           } else {
4118             ADVANCE_REGINPUT();
4119           }
4120           break;
4121 
4122         case SFNAME:
4123           if (ascii_isdigit(*rex.input) || !vim_isfilec(c)) {
4124             status = RA_NOMATCH;
4125           } else {
4126             ADVANCE_REGINPUT();
4127           }
4128           break;
4129 
4130         case PRINT:
4131           if (!vim_isprintc(utf_ptr2char(rex.input))) {
4132             status = RA_NOMATCH;
4133           } else {
4134             ADVANCE_REGINPUT();
4135           }
4136           break;
4137 
4138         case SPRINT:
4139           if (ascii_isdigit(*rex.input) || !vim_isprintc(utf_ptr2char(rex.input))) {
4140             status = RA_NOMATCH;
4141           } else {
4142             ADVANCE_REGINPUT();
4143           }
4144           break;
4145 
4146         case WHITE:
4147           if (!ascii_iswhite(c))
4148             status = RA_NOMATCH;
4149           else
4150             ADVANCE_REGINPUT();
4151           break;
4152 
4153         case NWHITE:
4154           if (c == NUL || ascii_iswhite(c))
4155             status = RA_NOMATCH;
4156           else
4157             ADVANCE_REGINPUT();
4158           break;
4159 
4160         case DIGIT:
4161           if (!ri_digit(c))
4162             status = RA_NOMATCH;
4163           else
4164             ADVANCE_REGINPUT();
4165           break;
4166 
4167         case NDIGIT:
4168           if (c == NUL || ri_digit(c))
4169             status = RA_NOMATCH;
4170           else
4171             ADVANCE_REGINPUT();
4172           break;
4173 
4174         case HEX:
4175           if (!ri_hex(c))
4176             status = RA_NOMATCH;
4177           else
4178             ADVANCE_REGINPUT();
4179           break;
4180 
4181         case NHEX:
4182           if (c == NUL || ri_hex(c))
4183             status = RA_NOMATCH;
4184           else
4185             ADVANCE_REGINPUT();
4186           break;
4187 
4188         case OCTAL:
4189           if (!ri_octal(c))
4190             status = RA_NOMATCH;
4191           else
4192             ADVANCE_REGINPUT();
4193           break;
4194 
4195         case NOCTAL:
4196           if (c == NUL || ri_octal(c))
4197             status = RA_NOMATCH;
4198           else
4199             ADVANCE_REGINPUT();
4200           break;
4201 
4202         case WORD:
4203           if (!ri_word(c))
4204             status = RA_NOMATCH;
4205           else
4206             ADVANCE_REGINPUT();
4207           break;
4208 
4209         case NWORD:
4210           if (c == NUL || ri_word(c))
4211             status = RA_NOMATCH;
4212           else
4213             ADVANCE_REGINPUT();
4214           break;
4215 
4216         case HEAD:
4217           if (!ri_head(c))
4218             status = RA_NOMATCH;
4219           else
4220             ADVANCE_REGINPUT();
4221           break;
4222 
4223         case NHEAD:
4224           if (c == NUL || ri_head(c))
4225             status = RA_NOMATCH;
4226           else
4227             ADVANCE_REGINPUT();
4228           break;
4229 
4230         case ALPHA:
4231           if (!ri_alpha(c))
4232             status = RA_NOMATCH;
4233           else
4234             ADVANCE_REGINPUT();
4235           break;
4236 
4237         case NALPHA:
4238           if (c == NUL || ri_alpha(c))
4239             status = RA_NOMATCH;
4240           else
4241             ADVANCE_REGINPUT();
4242           break;
4243 
4244         case LOWER:
4245           if (!ri_lower(c))
4246             status = RA_NOMATCH;
4247           else
4248             ADVANCE_REGINPUT();
4249           break;
4250 
4251         case NLOWER:
4252           if (c == NUL || ri_lower(c))
4253             status = RA_NOMATCH;
4254           else
4255             ADVANCE_REGINPUT();
4256           break;
4257 
4258         case UPPER:
4259           if (!ri_upper(c))
4260             status = RA_NOMATCH;
4261           else
4262             ADVANCE_REGINPUT();
4263           break;
4264 
4265         case NUPPER:
4266           if (c == NUL || ri_upper(c))
4267             status = RA_NOMATCH;
4268           else
4269             ADVANCE_REGINPUT();
4270           break;
4271 
4272         case EXACTLY:
4273         {
4274           int len;
4275           char_u  *opnd;
4276 
4277           opnd = OPERAND(scan);
4278           // Inline the first byte, for speed.
4279           if (*opnd != *rex.input
4280               && (!rex.reg_ic)) {
4281             status = RA_NOMATCH;
4282           } else if (*opnd == NUL) {
4283             // match empty string always works; happens when "~" is
4284             // empty.
4285           } else {
4286             if (opnd[1] == NUL && !rex.reg_ic) {
4287               len = 1;  // matched a single byte above
4288             } else {
4289               // Need to match first byte again for multi-byte.
4290               len = (int)STRLEN(opnd);
4291               if (cstrncmp(opnd, rex.input, &len) != 0) {
4292                 status = RA_NOMATCH;
4293               }
4294             }
4295             // Check for following composing character, unless %C
4296             // follows (skips over all composing chars).
4297             if (status != RA_NOMATCH
4298                 && utf_composinglike(rex.input, rex.input + len)
4299                 && !rex.reg_icombine
4300                 && OP(next) != RE_COMPOSING) {
4301               // raaron: This code makes a composing character get
4302               // ignored, which is the correct behavior (sometimes)
4303               // for voweled Hebrew texts.
4304               status = RA_NOMATCH;
4305             }
4306             if (status != RA_NOMATCH) {
4307               rex.input += len;
4308             }
4309           }
4310         }
4311         break;
4312 
4313         case ANYOF:
4314         case ANYBUT:
4315           if (c == NUL)
4316             status = RA_NOMATCH;
4317           else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
4318             status = RA_NOMATCH;
4319           else
4320             ADVANCE_REGINPUT();
4321           break;
4322 
4323         case MULTIBYTECODE:
4324           {
4325             int i, len;
4326 
4327             const char_u *opnd = OPERAND(scan);
4328             // Safety check (just in case 'encoding' was changed since
4329             // compiling the program).
4330             if ((len = utfc_ptr2len(opnd)) < 2) {
4331               status = RA_NOMATCH;
4332               break;
4333             }
4334             const int opndc = utf_ptr2char(opnd);
4335             if (utf_iscomposing(opndc)) {
4336               // When only a composing char is given match at any
4337               // position where that composing char appears.
4338               status = RA_NOMATCH;
4339               for (i = 0; rex.input[i] != NUL;
4340                    i += utf_ptr2len(rex.input + i)) {
4341                 const int inpc = utf_ptr2char(rex.input + i);
4342                 if (!utf_iscomposing(inpc)) {
4343                   if (i > 0) {
4344                     break;
4345                   }
4346                 } else if (opndc == inpc) {
4347                   // Include all following composing chars.
4348                   len = i + utfc_ptr2len(rex.input + i);
4349                   status = RA_MATCH;
4350                   break;
4351                 }
4352               }
4353             } else {
4354               for (i = 0; i < len; i++) {
4355                 if (opnd[i] != rex.input[i]) {
4356                   status = RA_NOMATCH;
4357                   break;
4358                 }
4359               }
4360             }
4361             rex.input += len;
4362           }
4363           break;
4364 
4365         case RE_COMPOSING:
4366           {
4367             // Skip composing characters.
4368             while (utf_iscomposing(utf_ptr2char(rex.input))) {
4369               MB_CPTR_ADV(rex.input);
4370             }
4371           }
4372           break;
4373 
4374         case NOTHING:
4375           break;
4376 
4377         case BACK:
4378         {
4379           int i;
4380 
4381           /*
4382            * When we run into BACK we need to check if we don't keep
4383            * looping without matching any input.  The second and later
4384            * times a BACK is encountered it fails if the input is still
4385            * at the same position as the previous time.
4386            * The positions are stored in "backpos" and found by the
4387            * current value of "scan", the position in the RE program.
4388            */
4389           backpos_T *bp = (backpos_T *)backpos.ga_data;
4390           for (i = 0; i < backpos.ga_len; ++i)
4391             if (bp[i].bp_scan == scan)
4392               break;
4393           if (i == backpos.ga_len) {
4394             backpos_T *p = GA_APPEND_VIA_PTR(backpos_T, &backpos);
4395             p->bp_scan = scan;
4396           } else if (reg_save_equal(&bp[i].bp_pos))
4397             /* Still at same position as last time, fail. */
4398             status = RA_NOMATCH;
4399 
4400           assert(status != RA_FAIL);
4401           if (status != RA_NOMATCH) {
4402             reg_save(&bp[i].bp_pos, &backpos);
4403           }
4404         }
4405         break;
4406 
4407         case MOPEN + 0:     /* Match start: \zs */
4408         case MOPEN + 1:     /* \( */
4409         case MOPEN + 2:
4410         case MOPEN + 3:
4411         case MOPEN + 4:
4412         case MOPEN + 5:
4413         case MOPEN + 6:
4414         case MOPEN + 7:
4415         case MOPEN + 8:
4416         case MOPEN + 9:
4417         {
4418           no = op - MOPEN;
4419           cleanup_subexpr();
4420           rp = regstack_push(RS_MOPEN, scan);
4421           if (rp == NULL)
4422             status = RA_FAIL;
4423           else {
4424             rp->rs_no = no;
4425             save_se(&rp->rs_un.sesave, &rex.reg_startpos[no],
4426                     &rex.reg_startp[no]);
4427             // We simply continue and handle the result when done.
4428           }
4429         }
4430         break;
4431 
4432         case NOPEN:         /* \%( */
4433         case NCLOSE:        /* \) after \%( */
4434           if (regstack_push(RS_NOPEN, scan) == NULL)
4435             status = RA_FAIL;
4436           /* We simply continue and handle the result when done. */
4437           break;
4438 
4439         case ZOPEN + 1:
4440         case ZOPEN + 2:
4441         case ZOPEN + 3:
4442         case ZOPEN + 4:
4443         case ZOPEN + 5:
4444         case ZOPEN + 6:
4445         case ZOPEN + 7:
4446         case ZOPEN + 8:
4447         case ZOPEN + 9:
4448         {
4449           no = op - ZOPEN;
4450           cleanup_zsubexpr();
4451           rp = regstack_push(RS_ZOPEN, scan);
4452           if (rp == NULL)
4453             status = RA_FAIL;
4454           else {
4455             rp->rs_no = no;
4456             save_se(&rp->rs_un.sesave, &reg_startzpos[no],
4457                 &reg_startzp[no]);
4458             /* We simply continue and handle the result when done. */
4459           }
4460         }
4461         break;
4462 
4463         case MCLOSE + 0:    /* Match end: \ze */
4464         case MCLOSE + 1:    /* \) */
4465         case MCLOSE + 2:
4466         case MCLOSE + 3:
4467         case MCLOSE + 4:
4468         case MCLOSE + 5:
4469         case MCLOSE + 6:
4470         case MCLOSE + 7:
4471         case MCLOSE + 8:
4472         case MCLOSE + 9:
4473         {
4474           no = op - MCLOSE;
4475           cleanup_subexpr();
4476           rp = regstack_push(RS_MCLOSE, scan);
4477           if (rp == NULL) {
4478             status = RA_FAIL;
4479           } else {
4480             rp->rs_no = no;
4481             save_se(&rp->rs_un.sesave, &rex.reg_endpos[no], &rex.reg_endp[no]);
4482             // We simply continue and handle the result when done.
4483           }
4484         }
4485         break;
4486 
4487         case ZCLOSE + 1:    /* \) after \z( */
4488         case ZCLOSE + 2:
4489         case ZCLOSE + 3:
4490         case ZCLOSE + 4:
4491         case ZCLOSE + 5:
4492         case ZCLOSE + 6:
4493         case ZCLOSE + 7:
4494         case ZCLOSE + 8:
4495         case ZCLOSE + 9:
4496         {
4497           no = op - ZCLOSE;
4498           cleanup_zsubexpr();
4499           rp = regstack_push(RS_ZCLOSE, scan);
4500           if (rp == NULL)
4501             status = RA_FAIL;
4502           else {
4503             rp->rs_no = no;
4504             save_se(&rp->rs_un.sesave, &reg_endzpos[no],
4505                 &reg_endzp[no]);
4506             /* We simply continue and handle the result when done. */
4507           }
4508         }
4509         break;
4510 
4511         case BACKREF + 1:
4512         case BACKREF + 2:
4513         case BACKREF + 3:
4514         case BACKREF + 4:
4515         case BACKREF + 5:
4516         case BACKREF + 6:
4517         case BACKREF + 7:
4518         case BACKREF + 8:
4519         case BACKREF + 9:
4520         {
4521           int len;
4522 
4523           no = op - BACKREF;
4524           cleanup_subexpr();
4525           if (!REG_MULTI) {  // Single-line regexp
4526             if (rex.reg_startp[no] == NULL || rex.reg_endp[no] == NULL) {
4527               // Backref was not set: Match an empty string.
4528               len = 0;
4529             } else {
4530               // Compare current input with back-ref in the same line.
4531               len = (int)(rex.reg_endp[no] - rex.reg_startp[no]);
4532               if (cstrncmp(rex.reg_startp[no], rex.input, &len) != 0) {
4533                 status = RA_NOMATCH;
4534               }
4535             }
4536           } else {  // Multi-line regexp
4537             if (rex.reg_startpos[no].lnum < 0 || rex.reg_endpos[no].lnum < 0) {
4538               // Backref was not set: Match an empty string.
4539               len = 0;
4540             } else {
4541               if (rex.reg_startpos[no].lnum == rex.lnum
4542                   && rex.reg_endpos[no].lnum == rex.lnum) {
4543                 // Compare back-ref within the current line.
4544                 len = rex.reg_endpos[no].col - rex.reg_startpos[no].col;
4545                 if (cstrncmp(rex.line + rex.reg_startpos[no].col,
4546                              rex.input, &len) != 0) {
4547                   status = RA_NOMATCH;
4548                 }
4549               } else {
4550                 // Messy situation: Need to compare between two lines.
4551                 int r = match_with_backref(rex.reg_startpos[no].lnum,
4552                                            rex.reg_startpos[no].col,
4553                                            rex.reg_endpos[no].lnum,
4554                                            rex.reg_endpos[no].col,
4555                                            &len);
4556                 if (r != RA_MATCH) {
4557                   status = r;
4558                 }
4559               }
4560             }
4561           }
4562 
4563           // Matched the backref, skip over it.
4564           rex.input += len;
4565         }
4566         break;
4567 
4568         case ZREF + 1:
4569         case ZREF + 2:
4570         case ZREF + 3:
4571         case ZREF + 4:
4572         case ZREF + 5:
4573         case ZREF + 6:
4574         case ZREF + 7:
4575         case ZREF + 8:
4576         case ZREF + 9:
4577         {
4578           cleanup_zsubexpr();
4579           no = op - ZREF;
4580           if (re_extmatch_in != NULL
4581               && re_extmatch_in->matches[no] != NULL) {
4582             int len = (int)STRLEN(re_extmatch_in->matches[no]);
4583             if (cstrncmp(re_extmatch_in->matches[no], rex.input, &len) != 0) {
4584               status = RA_NOMATCH;
4585             } else {
4586               rex.input += len;
4587             }
4588           } else {
4589             // Backref was not set: Match an empty string.
4590           }
4591         }
4592         break;
4593 
4594         case BRANCH:
4595         {
4596           if (OP(next) != BRANCH)       /* No choice. */
4597             next = OPERAND(scan);               /* Avoid recursion. */
4598           else {
4599             rp = regstack_push(RS_BRANCH, scan);
4600             if (rp == NULL)
4601               status = RA_FAIL;
4602             else
4603               status = RA_BREAK;                /* rest is below */
4604           }
4605         }
4606         break;
4607 
4608         case BRACE_LIMITS:
4609         {
4610           if (OP(next) == BRACE_SIMPLE) {
4611             bl_minval = OPERAND_MIN(scan);
4612             bl_maxval = OPERAND_MAX(scan);
4613           } else if (OP(next) >= BRACE_COMPLEX
4614                      && OP(next) < BRACE_COMPLEX + 10) {
4615             no = OP(next) - BRACE_COMPLEX;
4616             brace_min[no] = OPERAND_MIN(scan);
4617             brace_max[no] = OPERAND_MAX(scan);
4618             brace_count[no] = 0;
4619           } else {
4620             internal_error("BRACE_LIMITS");
4621             status = RA_FAIL;
4622           }
4623         }
4624         break;
4625 
4626         case BRACE_COMPLEX + 0:
4627         case BRACE_COMPLEX + 1:
4628         case BRACE_COMPLEX + 2:
4629         case BRACE_COMPLEX + 3:
4630         case BRACE_COMPLEX + 4:
4631         case BRACE_COMPLEX + 5:
4632         case BRACE_COMPLEX + 6:
4633         case BRACE_COMPLEX + 7:
4634         case BRACE_COMPLEX + 8:
4635         case BRACE_COMPLEX + 9:
4636         {
4637           no = op - BRACE_COMPLEX;
4638           ++brace_count[no];
4639 
4640           /* If not matched enough times yet, try one more */
4641           if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4642                                   ? brace_min[no] : brace_max[no])) {
4643             rp = regstack_push(RS_BRCPLX_MORE, scan);
4644             if (rp == NULL)
4645               status = RA_FAIL;
4646             else {
4647               rp->rs_no = no;
4648               reg_save(&rp->rs_un.regsave, &backpos);
4649               next = OPERAND(scan);
4650               /* We continue and handle the result when done. */
4651             }
4652             break;
4653           }
4654 
4655           /* If matched enough times, may try matching some more */
4656           if (brace_min[no] <= brace_max[no]) {
4657             /* Range is the normal way around, use longest match */
4658             if (brace_count[no] <= brace_max[no]) {
4659               rp = regstack_push(RS_BRCPLX_LONG, scan);
4660               if (rp == NULL)
4661                 status = RA_FAIL;
4662               else {
4663                 rp->rs_no = no;
4664                 reg_save(&rp->rs_un.regsave, &backpos);
4665                 next = OPERAND(scan);
4666                 /* We continue and handle the result when done. */
4667               }
4668             }
4669           } else {
4670             /* Range is backwards, use shortest match first */
4671             if (brace_count[no] <= brace_min[no]) {
4672               rp = regstack_push(RS_BRCPLX_SHORT, scan);
4673               if (rp == NULL)
4674                 status = RA_FAIL;
4675               else {
4676                 reg_save(&rp->rs_un.regsave, &backpos);
4677                 /* We continue and handle the result when done. */
4678               }
4679             }
4680           }
4681         }
4682         break;
4683 
4684         case BRACE_SIMPLE:
4685         case STAR:
4686         case PLUS:
4687         {
4688           regstar_T rst;
4689 
4690           /*
4691            * Lookahead to avoid useless match attempts when we know
4692            * what character comes next.
4693            */
4694           if (OP(next) == EXACTLY) {
4695             rst.nextb = *OPERAND(next);
4696             if (rex.reg_ic) {
4697               if (mb_isupper(rst.nextb)) {
4698                 rst.nextb_ic = mb_tolower(rst.nextb);
4699               } else {
4700                 rst.nextb_ic = mb_toupper(rst.nextb);
4701               }
4702             } else {
4703               rst.nextb_ic = rst.nextb;
4704             }
4705           } else {
4706             rst.nextb = NUL;
4707             rst.nextb_ic = NUL;
4708           }
4709           if (op != BRACE_SIMPLE) {
4710             rst.minval = (op == STAR) ? 0 : 1;
4711             rst.maxval = MAX_LIMIT;
4712           } else {
4713             rst.minval = bl_minval;
4714             rst.maxval = bl_maxval;
4715           }
4716 
4717           /*
4718            * When maxval > minval, try matching as much as possible, up
4719            * to maxval.  When maxval < minval, try matching at least the
4720            * minimal number (since the range is backwards, that's also
4721            * maxval!).
4722            */
4723           rst.count = regrepeat(OPERAND(scan), rst.maxval);
4724           if (got_int) {
4725             status = RA_FAIL;
4726             break;
4727           }
4728           if (rst.minval <= rst.maxval
4729               ? rst.count >= rst.minval : rst.count >= rst.maxval) {
4730             /* It could match.  Prepare for trying to match what
4731              * follows.  The code is below.  Parameters are stored in
4732              * a regstar_T on the regstack. */
4733             if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp) {
4734               emsg(_(e_maxmempat));
4735               status = RA_FAIL;
4736             } else {
4737               ga_grow(&regstack, sizeof(regstar_T));
4738               regstack.ga_len += sizeof(regstar_T);
4739               rp = regstack_push(rst.minval <= rst.maxval
4740                   ? RS_STAR_LONG : RS_STAR_SHORT, scan);
4741               if (rp == NULL)
4742                 status = RA_FAIL;
4743               else {
4744                 *(((regstar_T *)rp) - 1) = rst;
4745                 status = RA_BREAK;                  /* skip the restore bits */
4746               }
4747             }
4748           } else
4749             status = RA_NOMATCH;
4750 
4751         }
4752         break;
4753 
4754         case NOMATCH:
4755         case MATCH:
4756         case SUBPAT:
4757           rp = regstack_push(RS_NOMATCH, scan);
4758           if (rp == NULL)
4759             status = RA_FAIL;
4760           else {
4761             rp->rs_no = op;
4762             reg_save(&rp->rs_un.regsave, &backpos);
4763             next = OPERAND(scan);
4764             /* We continue and handle the result when done. */
4765           }
4766           break;
4767 
4768         case BEHIND:
4769         case NOBEHIND:
4770           /* Need a bit of room to store extra positions. */
4771           if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp) {
4772             emsg(_(e_maxmempat));
4773             status = RA_FAIL;
4774           } else {
4775             ga_grow(&regstack, sizeof(regbehind_T));
4776             regstack.ga_len += sizeof(regbehind_T);
4777             rp = regstack_push(RS_BEHIND1, scan);
4778             if (rp == NULL)
4779               status = RA_FAIL;
4780             else {
4781               /* Need to save the subexpr to be able to restore them
4782                * when there is a match but we don't use it. */
4783               save_subexpr(((regbehind_T *)rp) - 1);
4784 
4785               rp->rs_no = op;
4786               reg_save(&rp->rs_un.regsave, &backpos);
4787               /* First try if what follows matches.  If it does then we
4788                * check the behind match by looping. */
4789             }
4790           }
4791           break;
4792 
4793         case BHPOS:
4794           if (REG_MULTI) {
4795             if (behind_pos.rs_u.pos.col != (colnr_T)(rex.input - rex.line)
4796                 || behind_pos.rs_u.pos.lnum != rex.lnum) {
4797               status = RA_NOMATCH;
4798             }
4799           } else if (behind_pos.rs_u.ptr != rex.input) {
4800             status = RA_NOMATCH;
4801           }
4802           break;
4803 
4804         case NEWL:
4805           if ((c != NUL || !REG_MULTI || rex.lnum > rex.reg_maxline
4806                || rex.reg_line_lbr) && (c != '\n' || !rex.reg_line_lbr)) {
4807             status = RA_NOMATCH;
4808           } else if (rex.reg_line_lbr) {
4809             ADVANCE_REGINPUT();
4810           } else {
4811             reg_nextline();
4812           }
4813           break;
4814 
4815         case END:
4816           status = RA_MATCH;    /* Success! */
4817           break;
4818 
4819         default:
4820           iemsg(_(e_re_corr));
4821 #ifdef REGEXP_DEBUG
4822           printf("Illegal op code %d\n", op);
4823 #endif
4824           status = RA_FAIL;
4825           break;
4826         }
4827       }
4828 
4829       /* If we can't continue sequentially, break the inner loop. */
4830       if (status != RA_CONT)
4831         break;
4832 
4833       /* Continue in inner loop, advance to next item. */
4834       scan = next;
4835 
4836     } /* end of inner loop */
4837 
4838     /*
4839      * If there is something on the regstack execute the code for the state.
4840      * If the state is popped then loop and use the older state.
4841      */
4842     while (!GA_EMPTY(&regstack) && status != RA_FAIL) {
4843       rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
4844       switch (rp->rs_state) {
4845       case RS_NOPEN:
4846         /* Result is passed on as-is, simply pop the state. */
4847         regstack_pop(&scan);
4848         break;
4849 
4850       case RS_MOPEN:
4851         // Pop the state.  Restore pointers when there is no match.
4852         if (status == RA_NOMATCH) {
4853           restore_se(&rp->rs_un.sesave, &rex.reg_startpos[rp->rs_no],
4854                      &rex.reg_startp[rp->rs_no]);
4855         }
4856         regstack_pop(&scan);
4857         break;
4858 
4859       case RS_ZOPEN:
4860         /* Pop the state.  Restore pointers when there is no match. */
4861         if (status == RA_NOMATCH)
4862           restore_se(&rp->rs_un.sesave, &reg_startzpos[rp->rs_no],
4863               &reg_startzp[rp->rs_no]);
4864         regstack_pop(&scan);
4865         break;
4866 
4867       case RS_MCLOSE:
4868         // Pop the state.  Restore pointers when there is no match.
4869         if (status == RA_NOMATCH) {
4870           restore_se(&rp->rs_un.sesave, &rex.reg_endpos[rp->rs_no],
4871                      &rex.reg_endp[rp->rs_no]);
4872         }
4873         regstack_pop(&scan);
4874         break;
4875 
4876       case RS_ZCLOSE:
4877         /* Pop the state.  Restore pointers when there is no match. */
4878         if (status == RA_NOMATCH)
4879           restore_se(&rp->rs_un.sesave, &reg_endzpos[rp->rs_no],
4880               &reg_endzp[rp->rs_no]);
4881         regstack_pop(&scan);
4882         break;
4883 
4884       case RS_BRANCH:
4885         if (status == RA_MATCH)
4886           /* this branch matched, use it */
4887           regstack_pop(&scan);
4888         else {
4889           if (status != RA_BREAK) {
4890             /* After a non-matching branch: try next one. */
4891             reg_restore(&rp->rs_un.regsave, &backpos);
4892             scan = rp->rs_scan;
4893           }
4894           if (scan == NULL || OP(scan) != BRANCH) {
4895             /* no more branches, didn't find a match */
4896             status = RA_NOMATCH;
4897             regstack_pop(&scan);
4898           } else {
4899             /* Prepare to try a branch. */
4900             rp->rs_scan = regnext(scan);
4901             reg_save(&rp->rs_un.regsave, &backpos);
4902             scan = OPERAND(scan);
4903           }
4904         }
4905         break;
4906 
4907       case RS_BRCPLX_MORE:
4908         /* Pop the state.  Restore pointers when there is no match. */
4909         if (status == RA_NOMATCH) {
4910           reg_restore(&rp->rs_un.regsave, &backpos);
4911           --brace_count[rp->rs_no];             /* decrement match count */
4912         }
4913         regstack_pop(&scan);
4914         break;
4915 
4916       case RS_BRCPLX_LONG:
4917         /* Pop the state.  Restore pointers when there is no match. */
4918         if (status == RA_NOMATCH) {
4919           /* There was no match, but we did find enough matches. */
4920           reg_restore(&rp->rs_un.regsave, &backpos);
4921           --brace_count[rp->rs_no];
4922           /* continue with the items after "\{}" */
4923           status = RA_CONT;
4924         }
4925         regstack_pop(&scan);
4926         if (status == RA_CONT)
4927           scan = regnext(scan);
4928         break;
4929 
4930       case RS_BRCPLX_SHORT:
4931         /* Pop the state.  Restore pointers when there is no match. */
4932         if (status == RA_NOMATCH)
4933           /* There was no match, try to match one more item. */
4934           reg_restore(&rp->rs_un.regsave, &backpos);
4935         regstack_pop(&scan);
4936         if (status == RA_NOMATCH) {
4937           scan = OPERAND(scan);
4938           status = RA_CONT;
4939         }
4940         break;
4941 
4942       case RS_NOMATCH:
4943         /* Pop the state.  If the operand matches for NOMATCH or
4944         * doesn't match for MATCH/SUBPAT, we fail.  Otherwise backup,
4945         * except for SUBPAT, and continue with the next item. */
4946         if (status == (rp->rs_no == NOMATCH ? RA_MATCH : RA_NOMATCH))
4947           status = RA_NOMATCH;
4948         else {
4949           status = RA_CONT;
4950           if (rp->rs_no != SUBPAT)              /* zero-width */
4951             reg_restore(&rp->rs_un.regsave, &backpos);
4952         }
4953         regstack_pop(&scan);
4954         if (status == RA_CONT)
4955           scan = regnext(scan);
4956         break;
4957 
4958       case RS_BEHIND1:
4959         if (status == RA_NOMATCH) {
4960           regstack_pop(&scan);
4961           regstack.ga_len -= sizeof(regbehind_T);
4962         } else {
4963           /* The stuff after BEHIND/NOBEHIND matches.  Now try if
4964            * the behind part does (not) match before the current
4965            * position in the input.  This must be done at every
4966            * position in the input and checking if the match ends at
4967            * the current position. */
4968 
4969           /* save the position after the found match for next */
4970           reg_save(&(((regbehind_T *)rp) - 1)->save_after, &backpos);
4971 
4972           /* Start looking for a match with operand at the current
4973            * position.  Go back one character until we find the
4974            * result, hitting the start of the line or the previous
4975            * line (for multi-line matching).
4976            * Set behind_pos to where the match should end, BHPOS
4977            * will match it.  Save the current value. */
4978           (((regbehind_T *)rp) - 1)->save_behind = behind_pos;
4979           behind_pos = rp->rs_un.regsave;
4980 
4981           rp->rs_state = RS_BEHIND2;
4982 
4983           reg_restore(&rp->rs_un.regsave, &backpos);
4984           scan = OPERAND(rp->rs_scan) + 4;
4985         }
4986         break;
4987 
4988       case RS_BEHIND2:
4989         /*
4990          * Looping for BEHIND / NOBEHIND match.
4991          */
4992         if (status == RA_MATCH && reg_save_equal(&behind_pos)) {
4993           /* found a match that ends where "next" started */
4994           behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4995           if (rp->rs_no == BEHIND)
4996             reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4997                 &backpos);
4998           else {
4999             /* But we didn't want a match.  Need to restore the
5000              * subexpr, because what follows matched, so they have
5001              * been set. */
5002             status = RA_NOMATCH;
5003             restore_subexpr(((regbehind_T *)rp) - 1);
5004           }
5005           regstack_pop(&scan);
5006           regstack.ga_len -= sizeof(regbehind_T);
5007         } else {
5008           long limit;
5009 
5010           /* No match or a match that doesn't end where we want it: Go
5011            * back one character.  May go to previous line once. */
5012           no = OK;
5013           limit = OPERAND_MIN(rp->rs_scan);
5014           if (REG_MULTI) {
5015             if (limit > 0
5016                 && ((rp->rs_un.regsave.rs_u.pos.lnum
5017                      < behind_pos.rs_u.pos.lnum
5018                      ? (colnr_T)STRLEN(rex.line)
5019                      : behind_pos.rs_u.pos.col)
5020                     - rp->rs_un.regsave.rs_u.pos.col >= limit))
5021               no = FAIL;
5022             else if (rp->rs_un.regsave.rs_u.pos.col == 0) {
5023               if (rp->rs_un.regsave.rs_u.pos.lnum
5024                   < behind_pos.rs_u.pos.lnum
5025                   || reg_getline(
5026                       --rp->rs_un.regsave.rs_u.pos.lnum)
5027                   == NULL)
5028                 no = FAIL;
5029               else {
5030                 reg_restore(&rp->rs_un.regsave, &backpos);
5031                 rp->rs_un.regsave.rs_u.pos.col =
5032                   (colnr_T)STRLEN(rex.line);
5033               }
5034             } else {
5035               const char_u *const line =
5036                   reg_getline(rp->rs_un.regsave.rs_u.pos.lnum);
5037 
5038               rp->rs_un.regsave.rs_u.pos.col -=
5039                   utf_head_off(line,
5040                                line + rp->rs_un.regsave.rs_u.pos.col - 1)
5041                   + 1;
5042             }
5043           } else {
5044             if (rp->rs_un.regsave.rs_u.ptr == rex.line) {
5045               no = FAIL;
5046             } else {
5047               MB_PTR_BACK(rex.line, rp->rs_un.regsave.rs_u.ptr);
5048               if (limit > 0
5049                   && (long)(behind_pos.rs_u.ptr
5050                             - rp->rs_un.regsave.rs_u.ptr) > limit) {
5051                 no = FAIL;
5052               }
5053             }
5054           }
5055           if (no == OK) {
5056             /* Advanced, prepare for finding match again. */
5057             reg_restore(&rp->rs_un.regsave, &backpos);
5058             scan = OPERAND(rp->rs_scan) + 4;
5059             if (status == RA_MATCH) {
5060               /* We did match, so subexpr may have been changed,
5061                * need to restore them for the next try. */
5062               status = RA_NOMATCH;
5063               restore_subexpr(((regbehind_T *)rp) - 1);
5064             }
5065           } else {
5066             /* Can't advance.  For NOBEHIND that's a match. */
5067             behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
5068             if (rp->rs_no == NOBEHIND) {
5069               reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
5070                   &backpos);
5071               status = RA_MATCH;
5072             } else {
5073               /* We do want a proper match.  Need to restore the
5074                * subexpr if we had a match, because they may have
5075                * been set. */
5076               if (status == RA_MATCH) {
5077                 status = RA_NOMATCH;
5078                 restore_subexpr(((regbehind_T *)rp) - 1);
5079               }
5080             }
5081             regstack_pop(&scan);
5082             regstack.ga_len -= sizeof(regbehind_T);
5083           }
5084         }
5085         break;
5086 
5087       case RS_STAR_LONG:
5088       case RS_STAR_SHORT:
5089       {
5090         regstar_T           *rst = ((regstar_T *)rp) - 1;
5091 
5092         if (status == RA_MATCH) {
5093           regstack_pop(&scan);
5094           regstack.ga_len -= sizeof(regstar_T);
5095           break;
5096         }
5097 
5098         /* Tried once already, restore input pointers. */
5099         if (status != RA_BREAK)
5100           reg_restore(&rp->rs_un.regsave, &backpos);
5101 
5102         /* Repeat until we found a position where it could match. */
5103         for (;; ) {
5104           if (status != RA_BREAK) {
5105             /* Tried first position already, advance. */
5106             if (rp->rs_state == RS_STAR_LONG) {
5107               /* Trying for longest match, but couldn't or
5108                * didn't match -- back up one char. */
5109               if (--rst->count < rst->minval)
5110                 break;
5111               if (rex.input == rex.line) {
5112                 // backup to last char of previous line
5113                 rex.lnum--;
5114                 rex.line = reg_getline(rex.lnum);
5115                 // Just in case regrepeat() didn't count right.
5116                 if (rex.line == NULL) {
5117                   break;
5118                 }
5119                 rex.input = rex.line + STRLEN(rex.line);
5120                 fast_breakcheck();
5121               } else {
5122                 MB_PTR_BACK(rex.line, rex.input);
5123               }
5124             } else {
5125               /* Range is backwards, use shortest match first.
5126                * Careful: maxval and minval are exchanged!
5127                * Couldn't or didn't match: try advancing one
5128                * char. */
5129               if (rst->count == rst->minval
5130                   || regrepeat(OPERAND(rp->rs_scan), 1L) == 0)
5131                 break;
5132               ++rst->count;
5133             }
5134             if (got_int)
5135               break;
5136           } else
5137             status = RA_NOMATCH;
5138 
5139           // If it could match, try it.
5140           if (rst->nextb == NUL || *rex.input == rst->nextb
5141               || *rex.input == rst->nextb_ic) {
5142             reg_save(&rp->rs_un.regsave, &backpos);
5143             scan = regnext(rp->rs_scan);
5144             status = RA_CONT;
5145             break;
5146           }
5147         }
5148         if (status != RA_CONT) {
5149           /* Failed. */
5150           regstack_pop(&scan);
5151           regstack.ga_len -= sizeof(regstar_T);
5152           status = RA_NOMATCH;
5153         }
5154       }
5155       break;
5156       }
5157 
5158       /* If we want to continue the inner loop or didn't pop a state
5159        * continue matching loop */
5160       if (status == RA_CONT || rp == (regitem_T *)
5161           ((char *)regstack.ga_data + regstack.ga_len) - 1)
5162         break;
5163     }
5164 
5165     /* May need to continue with the inner loop, starting at "scan". */
5166     if (status == RA_CONT)
5167       continue;
5168 
5169     /*
5170      * If the regstack is empty or something failed we are done.
5171      */
5172     if (GA_EMPTY(&regstack) || status == RA_FAIL) {
5173       if (scan == NULL) {
5174         /*
5175          * We get here only if there's trouble -- normally "case END" is
5176          * the terminating point.
5177          */
5178         iemsg(_(e_re_corr));
5179 #ifdef REGEXP_DEBUG
5180         printf("Premature EOL\n");
5181 #endif
5182       }
5183       return status == RA_MATCH;
5184     }
5185 
5186   } /* End of loop until the regstack is empty. */
5187 
5188   /* NOTREACHED */
5189 }
5190 
5191 /*
5192  * Push an item onto the regstack.
5193  * Returns pointer to new item.  Returns NULL when out of memory.
5194  */
regstack_push(regstate_T state,char_u * scan)5195 static regitem_T *regstack_push(regstate_T state, char_u *scan)
5196 {
5197   regitem_T   *rp;
5198 
5199   if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp) {
5200     emsg(_(e_maxmempat));
5201     return NULL;
5202   }
5203   ga_grow(&regstack, sizeof(regitem_T));
5204 
5205   rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len);
5206   rp->rs_state = state;
5207   rp->rs_scan = scan;
5208 
5209   regstack.ga_len += sizeof(regitem_T);
5210   return rp;
5211 }
5212 
5213 /*
5214  * Pop an item from the regstack.
5215  */
regstack_pop(char_u ** scan)5216 static void regstack_pop(char_u **scan)
5217 {
5218   regitem_T   *rp;
5219 
5220   rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
5221   *scan = rp->rs_scan;
5222 
5223   regstack.ga_len -= sizeof(regitem_T);
5224 }
5225 
5226 /*
5227  * regrepeat - repeatedly match something simple, return how many.
5228  * Advances rex.input (and rex.lnum) to just after the matched chars.
5229  */
5230 static int
regrepeat(char_u * p,long maxcount)5231 regrepeat (
5232     char_u *p,
5233     long maxcount              /* maximum number of matches allowed */
5234 )
5235 {
5236   long count = 0;
5237   char_u      *opnd;
5238   int mask;
5239   int testval = 0;
5240 
5241   char_u *scan = rex.input;  // Make local copy of rex.input for speed.
5242   opnd = OPERAND(p);
5243   switch (OP(p)) {
5244   case ANY:
5245   case ANY + ADD_NL:
5246     while (count < maxcount) {
5247       /* Matching anything means we continue until end-of-line (or
5248        * end-of-file for ANY + ADD_NL), only limited by maxcount. */
5249       while (*scan != NUL && count < maxcount) {
5250         count++;
5251         MB_PTR_ADV(scan);
5252       }
5253       if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
5254           || rex.reg_line_lbr || count == maxcount) {
5255         break;
5256       }
5257       count++;  // count the line-break
5258       reg_nextline();
5259       scan = rex.input;
5260       if (got_int) {
5261         break;
5262       }
5263     }
5264     break;
5265 
5266   case IDENT:
5267   case IDENT + ADD_NL:
5268     testval = 1;
5269     FALLTHROUGH;
5270   case SIDENT:
5271   case SIDENT + ADD_NL:
5272     while (count < maxcount) {
5273       if (vim_isIDc(utf_ptr2char(scan)) && (testval || !ascii_isdigit(*scan))) {
5274         MB_PTR_ADV(scan);
5275       } else if (*scan == NUL) {
5276         if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
5277             || rex.reg_line_lbr) {
5278           break;
5279         }
5280         reg_nextline();
5281         scan = rex.input;
5282         if (got_int) {
5283           break;
5284         }
5285       } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
5286         scan++;
5287       } else {
5288         break;
5289       }
5290       ++count;
5291     }
5292     break;
5293 
5294   case KWORD:
5295   case KWORD + ADD_NL:
5296     testval = 1;
5297     FALLTHROUGH;
5298   case SKWORD:
5299   case SKWORD + ADD_NL:
5300     while (count < maxcount) {
5301       if (vim_iswordp_buf(scan, rex.reg_buf)
5302           && (testval || !ascii_isdigit(*scan))) {
5303         MB_PTR_ADV(scan);
5304       } else if (*scan == NUL) {
5305         if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
5306             || rex.reg_line_lbr) {
5307           break;
5308         }
5309         reg_nextline();
5310         scan = rex.input;
5311         if (got_int) {
5312           break;
5313         }
5314       } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
5315         scan++;
5316       } else {
5317         break;
5318       }
5319       count++;
5320     }
5321     break;
5322 
5323   case FNAME:
5324   case FNAME + ADD_NL:
5325     testval = 1;
5326     FALLTHROUGH;
5327   case SFNAME:
5328   case SFNAME + ADD_NL:
5329     while (count < maxcount) {
5330       if (vim_isfilec(utf_ptr2char(scan)) && (testval || !ascii_isdigit(*scan))) {
5331         MB_PTR_ADV(scan);
5332       } else if (*scan == NUL) {
5333         if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
5334             || rex.reg_line_lbr) {
5335           break;
5336         }
5337         reg_nextline();
5338         scan = rex.input;
5339         if (got_int) {
5340           break;
5341         }
5342       } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
5343         scan++;
5344       } else {
5345         break;
5346       }
5347       count++;
5348     }
5349     break;
5350 
5351   case PRINT:
5352   case PRINT + ADD_NL:
5353     testval = 1;
5354     FALLTHROUGH;
5355   case SPRINT:
5356   case SPRINT + ADD_NL:
5357     while (count < maxcount) {
5358       if (*scan == NUL) {
5359         if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
5360             || rex.reg_line_lbr) {
5361           break;
5362         }
5363         reg_nextline();
5364         scan = rex.input;
5365         if (got_int) {
5366           break;
5367         }
5368       } else if (vim_isprintc(utf_ptr2char(scan)) == 1
5369                  && (testval || !ascii_isdigit(*scan))) {
5370         MB_PTR_ADV(scan);
5371       } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
5372         scan++;
5373       } else {
5374         break;
5375       }
5376       count++;
5377     }
5378     break;
5379 
5380   case WHITE:
5381   case WHITE + ADD_NL:
5382     testval = mask = RI_WHITE;
5383 do_class:
5384     while (count < maxcount) {
5385       int l;
5386       if (*scan == NUL) {
5387         if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
5388             || rex.reg_line_lbr) {
5389           break;
5390         }
5391         reg_nextline();
5392         scan = rex.input;
5393         if (got_int) {
5394           break;
5395         }
5396       } else if ((l = utfc_ptr2len(scan)) > 1) {
5397         if (testval != 0) {
5398           break;
5399         }
5400         scan += l;
5401       } else if ((class_tab[*scan] & mask) == testval) {
5402         scan++;
5403       } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
5404         scan++;
5405       } else {
5406         break;
5407       }
5408       ++count;
5409     }
5410     break;
5411 
5412   case NWHITE:
5413   case NWHITE + ADD_NL:
5414     mask = RI_WHITE;
5415     goto do_class;
5416   case DIGIT:
5417   case DIGIT + ADD_NL:
5418     testval = mask = RI_DIGIT;
5419     goto do_class;
5420   case NDIGIT:
5421   case NDIGIT + ADD_NL:
5422     mask = RI_DIGIT;
5423     goto do_class;
5424   case HEX:
5425   case HEX + ADD_NL:
5426     testval = mask = RI_HEX;
5427     goto do_class;
5428   case NHEX:
5429   case NHEX + ADD_NL:
5430     mask = RI_HEX;
5431     goto do_class;
5432   case OCTAL:
5433   case OCTAL + ADD_NL:
5434     testval = mask = RI_OCTAL;
5435     goto do_class;
5436   case NOCTAL:
5437   case NOCTAL + ADD_NL:
5438     mask = RI_OCTAL;
5439     goto do_class;
5440   case WORD:
5441   case WORD + ADD_NL:
5442     testval = mask = RI_WORD;
5443     goto do_class;
5444   case NWORD:
5445   case NWORD + ADD_NL:
5446     mask = RI_WORD;
5447     goto do_class;
5448   case HEAD:
5449   case HEAD + ADD_NL:
5450     testval = mask = RI_HEAD;
5451     goto do_class;
5452   case NHEAD:
5453   case NHEAD + ADD_NL:
5454     mask = RI_HEAD;
5455     goto do_class;
5456   case ALPHA:
5457   case ALPHA + ADD_NL:
5458     testval = mask = RI_ALPHA;
5459     goto do_class;
5460   case NALPHA:
5461   case NALPHA + ADD_NL:
5462     mask = RI_ALPHA;
5463     goto do_class;
5464   case LOWER:
5465   case LOWER + ADD_NL:
5466     testval = mask = RI_LOWER;
5467     goto do_class;
5468   case NLOWER:
5469   case NLOWER + ADD_NL:
5470     mask = RI_LOWER;
5471     goto do_class;
5472   case UPPER:
5473   case UPPER + ADD_NL:
5474     testval = mask = RI_UPPER;
5475     goto do_class;
5476   case NUPPER:
5477   case NUPPER + ADD_NL:
5478     mask = RI_UPPER;
5479     goto do_class;
5480 
5481   case EXACTLY:
5482   {
5483     int cu, cl;
5484 
5485     // This doesn't do a multi-byte character, because a MULTIBYTECODE
5486     // would have been used for it.  It does handle single-byte
5487     // characters, such as latin1.
5488     if (rex.reg_ic) {
5489       cu = mb_toupper(*opnd);
5490       cl = mb_tolower(*opnd);
5491       while (count < maxcount && (*scan == cu || *scan == cl)) {
5492         count++;
5493         scan++;
5494       }
5495     } else {
5496       cu = *opnd;
5497       while (count < maxcount && *scan == cu) {
5498         count++;
5499         scan++;
5500       }
5501     }
5502     break;
5503   }
5504 
5505   case MULTIBYTECODE:
5506   {
5507     int i, len, cf = 0;
5508 
5509     /* Safety check (just in case 'encoding' was changed since
5510      * compiling the program). */
5511     if ((len = utfc_ptr2len(opnd)) > 1) {
5512       if (rex.reg_ic) {
5513         cf = utf_fold(utf_ptr2char(opnd));
5514       }
5515       while (count < maxcount && utfc_ptr2len(scan) >= len) {
5516         for (i = 0; i < len; i++) {
5517           if (opnd[i] != scan[i]) {
5518             break;
5519           }
5520         }
5521         if (i < len && (!rex.reg_ic
5522                         || utf_fold(utf_ptr2char(scan)) != cf)) {
5523           break;
5524         }
5525         scan += len;
5526         ++count;
5527       }
5528     }
5529   }
5530   break;
5531 
5532   case ANYOF:
5533   case ANYOF + ADD_NL:
5534     testval = 1;
5535     FALLTHROUGH;
5536 
5537   case ANYBUT:
5538   case ANYBUT + ADD_NL:
5539     while (count < maxcount) {
5540       int len;
5541       if (*scan == NUL) {
5542         if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
5543             || rex.reg_line_lbr) {
5544           break;
5545         }
5546         reg_nextline();
5547         scan = rex.input;
5548         if (got_int) {
5549           break;
5550         }
5551       } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
5552         scan++;
5553       } else if ((len = utfc_ptr2len(scan)) > 1) {
5554         if ((cstrchr(opnd, utf_ptr2char(scan)) == NULL) == testval) {
5555           break;
5556         }
5557         scan += len;
5558       } else {
5559         if ((cstrchr(opnd, *scan) == NULL) == testval)
5560           break;
5561         ++scan;
5562       }
5563       ++count;
5564     }
5565     break;
5566 
5567   case NEWL:
5568     while (count < maxcount
5569            && ((*scan == NUL && rex.lnum <= rex.reg_maxline && !rex.reg_line_lbr
5570                 && REG_MULTI) || (*scan == '\n' && rex.reg_line_lbr))) {
5571       count++;
5572       if (rex.reg_line_lbr) {
5573         ADVANCE_REGINPUT();
5574       } else {
5575         reg_nextline();
5576       }
5577       scan = rex.input;
5578       if (got_int) {
5579         break;
5580       }
5581     }
5582     break;
5583 
5584   default:  // Oh dear.  Called inappropriately.
5585     iemsg(_(e_re_corr));
5586 #ifdef REGEXP_DEBUG
5587     printf("Called regrepeat with op code %d\n", OP(p));
5588 #endif
5589     break;
5590   }
5591 
5592   rex.input = scan;
5593 
5594   return (int)count;
5595 }
5596 
5597 /*
5598  * regnext - dig the "next" pointer out of a node
5599  * Returns NULL when calculating size, when there is no next item and when
5600  * there is an error.
5601  */
regnext(char_u * p)5602 static char_u *regnext(char_u *p)
5603   FUNC_ATTR_NONNULL_ALL
5604 {
5605   int offset;
5606 
5607   if (p == JUST_CALC_SIZE || reg_toolong)
5608     return NULL;
5609 
5610   offset = NEXT(p);
5611   if (offset == 0)
5612     return NULL;
5613 
5614   if (OP(p) == BACK)
5615     return p - offset;
5616   else
5617     return p + offset;
5618 }
5619 
5620 /*
5621  * Check the regexp program for its magic number.
5622  * Return true if it's wrong.
5623  */
prog_magic_wrong(void)5624 static int prog_magic_wrong(void)
5625 {
5626   regprog_T   *prog;
5627 
5628   prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
5629   if (prog->engine == &nfa_regengine) {
5630     // For NFA matcher we don't check the magic
5631     return false;
5632   }
5633 
5634   if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC) {
5635     emsg(_(e_re_corr));
5636     return true;
5637   }
5638   return false;
5639 }
5640 
5641 /*
5642  * Cleanup the subexpressions, if this wasn't done yet.
5643  * This construction is used to clear the subexpressions only when they are
5644  * used (to increase speed).
5645  */
cleanup_subexpr(void)5646 static void cleanup_subexpr(void)
5647 {
5648   if (rex.need_clear_subexpr) {
5649     if (REG_MULTI) {
5650       // Use 0xff to set lnum to -1
5651       memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5652       memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5653     } else {
5654       memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
5655       memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
5656     }
5657     rex.need_clear_subexpr = false;
5658   }
5659 }
5660 
cleanup_zsubexpr(void)5661 static void cleanup_zsubexpr(void)
5662 {
5663   if (rex.need_clear_zsubexpr) {
5664     if (REG_MULTI) {
5665       /* Use 0xff to set lnum to -1 */
5666       memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5667       memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5668     } else {
5669       memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
5670       memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
5671     }
5672     rex.need_clear_zsubexpr = false;
5673   }
5674 }
5675 
5676 // Save the current subexpr to "bp", so that they can be restored
5677 // later by restore_subexpr().
save_subexpr(regbehind_T * bp)5678 static void save_subexpr(regbehind_T *bp)
5679   FUNC_ATTR_NONNULL_ALL
5680 {
5681   // When "rex.need_clear_subexpr" is set we don't need to save the values, only
5682   // remember that this flag needs to be set again when restoring.
5683   bp->save_need_clear_subexpr = rex.need_clear_subexpr;
5684   if (!rex.need_clear_subexpr) {
5685     for (int i = 0; i < NSUBEXP; i++) {
5686       if (REG_MULTI) {
5687         bp->save_start[i].se_u.pos = rex.reg_startpos[i];
5688         bp->save_end[i].se_u.pos = rex.reg_endpos[i];
5689       } else {
5690         bp->save_start[i].se_u.ptr = rex.reg_startp[i];
5691         bp->save_end[i].se_u.ptr = rex.reg_endp[i];
5692       }
5693     }
5694   }
5695 }
5696 
5697 // Restore the subexpr from "bp".
restore_subexpr(regbehind_T * bp)5698 static void restore_subexpr(regbehind_T *bp)
5699   FUNC_ATTR_NONNULL_ALL
5700 {
5701   // Only need to restore saved values when they are not to be cleared.
5702   rex.need_clear_subexpr = bp->save_need_clear_subexpr;
5703   if (!rex.need_clear_subexpr) {
5704     for (int i = 0; i < NSUBEXP; i++) {
5705       if (REG_MULTI) {
5706         rex.reg_startpos[i] = bp->save_start[i].se_u.pos;
5707         rex.reg_endpos[i] = bp->save_end[i].se_u.pos;
5708       } else {
5709         rex.reg_startp[i] = bp->save_start[i].se_u.ptr;
5710         rex.reg_endp[i] = bp->save_end[i].se_u.ptr;
5711       }
5712     }
5713   }
5714 }
5715 
5716 // Advance rex.lnum, rex.line and rex.input to the next line.
reg_nextline(void)5717 static void reg_nextline(void)
5718 {
5719   rex.line = reg_getline(++rex.lnum);
5720   rex.input = rex.line;
5721   fast_breakcheck();
5722 }
5723 
5724 // Save the input line and position in a regsave_T.
reg_save(regsave_T * save,garray_T * gap)5725 static void reg_save(regsave_T *save, garray_T *gap)
5726   FUNC_ATTR_NONNULL_ALL
5727 {
5728   if (REG_MULTI) {
5729     save->rs_u.pos.col = (colnr_T)(rex.input - rex.line);
5730     save->rs_u.pos.lnum = rex.lnum;
5731   } else {
5732     save->rs_u.ptr = rex.input;
5733   }
5734   save->rs_len = gap->ga_len;
5735 }
5736 
5737 // Restore the input line and position from a regsave_T.
reg_restore(regsave_T * save,garray_T * gap)5738 static void reg_restore(regsave_T *save, garray_T *gap)
5739   FUNC_ATTR_NONNULL_ALL
5740 {
5741   if (REG_MULTI) {
5742     if (rex.lnum != save->rs_u.pos.lnum) {
5743       // only call reg_getline() when the line number changed to save
5744       // a bit of time
5745       rex.lnum = save->rs_u.pos.lnum;
5746       rex.line = reg_getline(rex.lnum);
5747     }
5748     rex.input = rex.line + save->rs_u.pos.col;
5749   } else {
5750     rex.input = save->rs_u.ptr;
5751   }
5752   gap->ga_len = save->rs_len;
5753 }
5754 
5755 // Return true if current position is equal to saved position.
reg_save_equal(const regsave_T * save)5756 static bool reg_save_equal(const regsave_T *save)
5757   FUNC_ATTR_NONNULL_ALL
5758 {
5759   if (REG_MULTI) {
5760     return rex.lnum == save->rs_u.pos.lnum
5761            && rex.input == rex.line + save->rs_u.pos.col;
5762   }
5763   return rex.input == save->rs_u.ptr;
5764 }
5765 
5766 /*
5767  * Tentatively set the sub-expression start to the current position (after
5768  * calling regmatch() they will have changed).  Need to save the existing
5769  * values for when there is no match.
5770  * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
5771  * depending on REG_MULTI.
5772  */
save_se_multi(save_se_T * savep,lpos_T * posp)5773 static void save_se_multi(save_se_T *savep, lpos_T *posp)
5774 {
5775   savep->se_u.pos = *posp;
5776   posp->lnum = rex.lnum;
5777   posp->col = (colnr_T)(rex.input - rex.line);
5778 }
5779 
save_se_one(save_se_T * savep,char_u ** pp)5780 static void save_se_one(save_se_T *savep, char_u **pp)
5781 {
5782   savep->se_u.ptr = *pp;
5783   *pp = rex.input;
5784 }
5785 
5786 /*
5787  * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
5788  */
re_num_cmp(uint32_t val,char_u * scan)5789 static int re_num_cmp(uint32_t val, char_u *scan)
5790 {
5791   uint32_t n = (uint32_t)OPERAND_MIN(scan);
5792 
5793   if (OPERAND_CMP(scan) == '>')
5794     return val > n;
5795   if (OPERAND_CMP(scan) == '<')
5796     return val < n;
5797   return val == n;
5798 }
5799 
5800 /*
5801  * Check whether a backreference matches.
5802  * Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
5803  * If "bytelen" is not NULL, it is set to the byte length of the match in the
5804  * last line.
5805  */
match_with_backref(linenr_T start_lnum,colnr_T start_col,linenr_T end_lnum,colnr_T end_col,int * bytelen)5806 static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen)
5807 {
5808   linenr_T clnum = start_lnum;
5809   colnr_T ccol = start_col;
5810   int len;
5811   char_u      *p;
5812 
5813   if (bytelen != NULL)
5814     *bytelen = 0;
5815   for (;; ) {
5816     // Since getting one line may invalidate the other, need to make copy.
5817     // Slow!
5818     if (rex.line != reg_tofree) {
5819       len = (int)STRLEN(rex.line);
5820       if (reg_tofree == NULL || len >= (int)reg_tofreelen) {
5821         len += 50;              /* get some extra */
5822         xfree(reg_tofree);
5823         reg_tofree = xmalloc(len);
5824         reg_tofreelen = len;
5825       }
5826       STRCPY(reg_tofree, rex.line);
5827       rex.input = reg_tofree + (rex.input - rex.line);
5828       rex.line = reg_tofree;
5829     }
5830 
5831     /* Get the line to compare with. */
5832     p = reg_getline(clnum);
5833     assert(p);
5834 
5835     if (clnum == end_lnum)
5836       len = end_col - ccol;
5837     else
5838       len = (int)STRLEN(p + ccol);
5839 
5840     if (cstrncmp(p + ccol, rex.input, &len) != 0) {
5841       return RA_NOMATCH;  // doesn't match
5842     }
5843     if (bytelen != NULL) {
5844       *bytelen += len;
5845     }
5846     if (clnum == end_lnum) {
5847       break;  // match and at end!
5848     }
5849     if (rex.lnum >= rex.reg_maxline) {
5850       return RA_NOMATCH;  // text too short
5851     }
5852 
5853     /* Advance to next line. */
5854     reg_nextline();
5855     if (bytelen != NULL)
5856       *bytelen = 0;
5857     ++clnum;
5858     ccol = 0;
5859     if (got_int)
5860       return RA_FAIL;
5861   }
5862 
5863   // found a match!  Note that rex.line may now point to a copy of the line,
5864   // that should not matter.
5865   return RA_MATCH;
5866 }
5867 
5868 #ifdef BT_REGEXP_DUMP
5869 
5870 /*
5871  * regdump - dump a regexp onto stdout in vaguely comprehensible form
5872  */
regdump(char_u * pattern,bt_regprog_T * r)5873 static void regdump(char_u *pattern, bt_regprog_T *r)
5874 {
5875   char_u  *s;
5876   int op = EXACTLY;             /* Arbitrary non-END op. */
5877   char_u  *next;
5878   char_u  *end = NULL;
5879   FILE    *f;
5880 
5881 #ifdef BT_REGEXP_LOG
5882   f = fopen("bt_regexp_log.log", "a");
5883 #else
5884   f = stdout;
5885 #endif
5886   if (f == NULL)
5887     return;
5888   fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n",
5889       pattern);
5890 
5891   s = r->program + 1;
5892   /*
5893    * Loop until we find the END that isn't before a referred next (an END
5894    * can also appear in a NOMATCH operand).
5895    */
5896   while (op != END || s <= end) {
5897     op = OP(s);
5898     fprintf(f, "%2d%s", (int)(s - r->program), regprop(s));     /* Where, what. */
5899     next = regnext(s);
5900     if (next == NULL)           /* Next ptr. */
5901       fprintf(f, "(0)");
5902     else
5903       fprintf(f, "(%d)", (int)((s - r->program) + (next - s)));
5904     if (end < next)
5905       end = next;
5906     if (op == BRACE_LIMITS) {
5907       /* Two ints */
5908       fprintf(f, " minval %" PRId64 ", maxval %" PRId64,
5909               (int64_t)OPERAND_MIN(s), (int64_t)OPERAND_MAX(s));
5910       s += 8;
5911     } else if (op == BEHIND || op == NOBEHIND) {
5912       /* one int */
5913       fprintf(f, " count %" PRId64, (int64_t)OPERAND_MIN(s));
5914       s += 4;
5915     } else if (op == RE_LNUM || op == RE_COL || op == RE_VCOL) {
5916       // one int plus comparator
5917       fprintf(f, " count %" PRId64, (int64_t)OPERAND_MIN(s));
5918       s += 5;
5919     }
5920     s += 3;
5921     if (op == ANYOF || op == ANYOF + ADD_NL
5922         || op == ANYBUT || op == ANYBUT + ADD_NL
5923         || op == EXACTLY) {
5924       /* Literal string, where present. */
5925       fprintf(f, "\nxxxxxxxxx\n");
5926       while (*s != NUL)
5927         fprintf(f, "%c", *s++);
5928       fprintf(f, "\nxxxxxxxxx\n");
5929       s++;
5930     }
5931     fprintf(f, "\r\n");
5932   }
5933 
5934   /* Header fields of interest. */
5935   if (r->regstart != NUL)
5936     fprintf(f, "start `%s' 0x%x; ", r->regstart < 256
5937         ? (char *)transchar(r->regstart)
5938         : "multibyte", r->regstart);
5939   if (r->reganch)
5940     fprintf(f, "anchored; ");
5941   if (r->regmust != NULL)
5942     fprintf(f, "must have \"%s\"", r->regmust);
5943   fprintf(f, "\r\n");
5944 
5945 #ifdef BT_REGEXP_LOG
5946   fclose(f);
5947 #endif
5948 }
5949 #endif      /* BT_REGEXP_DUMP */
5950 
5951 #ifdef REGEXP_DEBUG
5952 /*
5953  * regprop - printable representation of opcode
5954  */
regprop(char_u * op)5955 static char_u *regprop(char_u *op)
5956 {
5957   char            *p;
5958   static char buf[50];
5959 
5960   STRCPY(buf, ":");
5961 
5962   switch ((int) OP(op)) {
5963   case BOL:
5964     p = "BOL";
5965     break;
5966   case EOL:
5967     p = "EOL";
5968     break;
5969   case RE_BOF:
5970     p = "BOF";
5971     break;
5972   case RE_EOF:
5973     p = "EOF";
5974     break;
5975   case CURSOR:
5976     p = "CURSOR";
5977     break;
5978   case RE_VISUAL:
5979     p = "RE_VISUAL";
5980     break;
5981   case RE_LNUM:
5982     p = "RE_LNUM";
5983     break;
5984   case RE_MARK:
5985     p = "RE_MARK";
5986     break;
5987   case RE_COL:
5988     p = "RE_COL";
5989     break;
5990   case RE_VCOL:
5991     p = "RE_VCOL";
5992     break;
5993   case BOW:
5994     p = "BOW";
5995     break;
5996   case EOW:
5997     p = "EOW";
5998     break;
5999   case ANY:
6000     p = "ANY";
6001     break;
6002   case ANY + ADD_NL:
6003     p = "ANY+NL";
6004     break;
6005   case ANYOF:
6006     p = "ANYOF";
6007     break;
6008   case ANYOF + ADD_NL:
6009     p = "ANYOF+NL";
6010     break;
6011   case ANYBUT:
6012     p = "ANYBUT";
6013     break;
6014   case ANYBUT + ADD_NL:
6015     p = "ANYBUT+NL";
6016     break;
6017   case IDENT:
6018     p = "IDENT";
6019     break;
6020   case IDENT + ADD_NL:
6021     p = "IDENT+NL";
6022     break;
6023   case SIDENT:
6024     p = "SIDENT";
6025     break;
6026   case SIDENT + ADD_NL:
6027     p = "SIDENT+NL";
6028     break;
6029   case KWORD:
6030     p = "KWORD";
6031     break;
6032   case KWORD + ADD_NL:
6033     p = "KWORD+NL";
6034     break;
6035   case SKWORD:
6036     p = "SKWORD";
6037     break;
6038   case SKWORD + ADD_NL:
6039     p = "SKWORD+NL";
6040     break;
6041   case FNAME:
6042     p = "FNAME";
6043     break;
6044   case FNAME + ADD_NL:
6045     p = "FNAME+NL";
6046     break;
6047   case SFNAME:
6048     p = "SFNAME";
6049     break;
6050   case SFNAME + ADD_NL:
6051     p = "SFNAME+NL";
6052     break;
6053   case PRINT:
6054     p = "PRINT";
6055     break;
6056   case PRINT + ADD_NL:
6057     p = "PRINT+NL";
6058     break;
6059   case SPRINT:
6060     p = "SPRINT";
6061     break;
6062   case SPRINT + ADD_NL:
6063     p = "SPRINT+NL";
6064     break;
6065   case WHITE:
6066     p = "WHITE";
6067     break;
6068   case WHITE + ADD_NL:
6069     p = "WHITE+NL";
6070     break;
6071   case NWHITE:
6072     p = "NWHITE";
6073     break;
6074   case NWHITE + ADD_NL:
6075     p = "NWHITE+NL";
6076     break;
6077   case DIGIT:
6078     p = "DIGIT";
6079     break;
6080   case DIGIT + ADD_NL:
6081     p = "DIGIT+NL";
6082     break;
6083   case NDIGIT:
6084     p = "NDIGIT";
6085     break;
6086   case NDIGIT + ADD_NL:
6087     p = "NDIGIT+NL";
6088     break;
6089   case HEX:
6090     p = "HEX";
6091     break;
6092   case HEX + ADD_NL:
6093     p = "HEX+NL";
6094     break;
6095   case NHEX:
6096     p = "NHEX";
6097     break;
6098   case NHEX + ADD_NL:
6099     p = "NHEX+NL";
6100     break;
6101   case OCTAL:
6102     p = "OCTAL";
6103     break;
6104   case OCTAL + ADD_NL:
6105     p = "OCTAL+NL";
6106     break;
6107   case NOCTAL:
6108     p = "NOCTAL";
6109     break;
6110   case NOCTAL + ADD_NL:
6111     p = "NOCTAL+NL";
6112     break;
6113   case WORD:
6114     p = "WORD";
6115     break;
6116   case WORD + ADD_NL:
6117     p = "WORD+NL";
6118     break;
6119   case NWORD:
6120     p = "NWORD";
6121     break;
6122   case NWORD + ADD_NL:
6123     p = "NWORD+NL";
6124     break;
6125   case HEAD:
6126     p = "HEAD";
6127     break;
6128   case HEAD + ADD_NL:
6129     p = "HEAD+NL";
6130     break;
6131   case NHEAD:
6132     p = "NHEAD";
6133     break;
6134   case NHEAD + ADD_NL:
6135     p = "NHEAD+NL";
6136     break;
6137   case ALPHA:
6138     p = "ALPHA";
6139     break;
6140   case ALPHA + ADD_NL:
6141     p = "ALPHA+NL";
6142     break;
6143   case NALPHA:
6144     p = "NALPHA";
6145     break;
6146   case NALPHA + ADD_NL:
6147     p = "NALPHA+NL";
6148     break;
6149   case LOWER:
6150     p = "LOWER";
6151     break;
6152   case LOWER + ADD_NL:
6153     p = "LOWER+NL";
6154     break;
6155   case NLOWER:
6156     p = "NLOWER";
6157     break;
6158   case NLOWER + ADD_NL:
6159     p = "NLOWER+NL";
6160     break;
6161   case UPPER:
6162     p = "UPPER";
6163     break;
6164   case UPPER + ADD_NL:
6165     p = "UPPER+NL";
6166     break;
6167   case NUPPER:
6168     p = "NUPPER";
6169     break;
6170   case NUPPER + ADD_NL:
6171     p = "NUPPER+NL";
6172     break;
6173   case BRANCH:
6174     p = "BRANCH";
6175     break;
6176   case EXACTLY:
6177     p = "EXACTLY";
6178     break;
6179   case NOTHING:
6180     p = "NOTHING";
6181     break;
6182   case BACK:
6183     p = "BACK";
6184     break;
6185   case END:
6186     p = "END";
6187     break;
6188   case MOPEN + 0:
6189     p = "MATCH START";
6190     break;
6191   case MOPEN + 1:
6192   case MOPEN + 2:
6193   case MOPEN + 3:
6194   case MOPEN + 4:
6195   case MOPEN + 5:
6196   case MOPEN + 6:
6197   case MOPEN + 7:
6198   case MOPEN + 8:
6199   case MOPEN + 9:
6200     sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
6201     p = NULL;
6202     break;
6203   case MCLOSE + 0:
6204     p = "MATCH END";
6205     break;
6206   case MCLOSE + 1:
6207   case MCLOSE + 2:
6208   case MCLOSE + 3:
6209   case MCLOSE + 4:
6210   case MCLOSE + 5:
6211   case MCLOSE + 6:
6212   case MCLOSE + 7:
6213   case MCLOSE + 8:
6214   case MCLOSE + 9:
6215     sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
6216     p = NULL;
6217     break;
6218   case BACKREF + 1:
6219   case BACKREF + 2:
6220   case BACKREF + 3:
6221   case BACKREF + 4:
6222   case BACKREF + 5:
6223   case BACKREF + 6:
6224   case BACKREF + 7:
6225   case BACKREF + 8:
6226   case BACKREF + 9:
6227     sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
6228     p = NULL;
6229     break;
6230   case NOPEN:
6231     p = "NOPEN";
6232     break;
6233   case NCLOSE:
6234     p = "NCLOSE";
6235     break;
6236   case ZOPEN + 1:
6237   case ZOPEN + 2:
6238   case ZOPEN + 3:
6239   case ZOPEN + 4:
6240   case ZOPEN + 5:
6241   case ZOPEN + 6:
6242   case ZOPEN + 7:
6243   case ZOPEN + 8:
6244   case ZOPEN + 9:
6245     sprintf(buf + STRLEN(buf), "ZOPEN%d", OP(op) - ZOPEN);
6246     p = NULL;
6247     break;
6248   case ZCLOSE + 1:
6249   case ZCLOSE + 2:
6250   case ZCLOSE + 3:
6251   case ZCLOSE + 4:
6252   case ZCLOSE + 5:
6253   case ZCLOSE + 6:
6254   case ZCLOSE + 7:
6255   case ZCLOSE + 8:
6256   case ZCLOSE + 9:
6257     sprintf(buf + STRLEN(buf), "ZCLOSE%d", OP(op) - ZCLOSE);
6258     p = NULL;
6259     break;
6260   case ZREF + 1:
6261   case ZREF + 2:
6262   case ZREF + 3:
6263   case ZREF + 4:
6264   case ZREF + 5:
6265   case ZREF + 6:
6266   case ZREF + 7:
6267   case ZREF + 8:
6268   case ZREF + 9:
6269     sprintf(buf + STRLEN(buf), "ZREF%d", OP(op) - ZREF);
6270     p = NULL;
6271     break;
6272   case STAR:
6273     p = "STAR";
6274     break;
6275   case PLUS:
6276     p = "PLUS";
6277     break;
6278   case NOMATCH:
6279     p = "NOMATCH";
6280     break;
6281   case MATCH:
6282     p = "MATCH";
6283     break;
6284   case BEHIND:
6285     p = "BEHIND";
6286     break;
6287   case NOBEHIND:
6288     p = "NOBEHIND";
6289     break;
6290   case SUBPAT:
6291     p = "SUBPAT";
6292     break;
6293   case BRACE_LIMITS:
6294     p = "BRACE_LIMITS";
6295     break;
6296   case BRACE_SIMPLE:
6297     p = "BRACE_SIMPLE";
6298     break;
6299   case BRACE_COMPLEX + 0:
6300   case BRACE_COMPLEX + 1:
6301   case BRACE_COMPLEX + 2:
6302   case BRACE_COMPLEX + 3:
6303   case BRACE_COMPLEX + 4:
6304   case BRACE_COMPLEX + 5:
6305   case BRACE_COMPLEX + 6:
6306   case BRACE_COMPLEX + 7:
6307   case BRACE_COMPLEX + 8:
6308   case BRACE_COMPLEX + 9:
6309     sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
6310     p = NULL;
6311     break;
6312   case MULTIBYTECODE:
6313     p = "MULTIBYTECODE";
6314     break;
6315   case NEWL:
6316     p = "NEWL";
6317     break;
6318   default:
6319     sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
6320     p = NULL;
6321     break;
6322   }
6323   if (p != NULL)
6324     STRCAT(buf, p);
6325   return (char_u *)buf;
6326 }
6327 #endif      /* REGEXP_DEBUG */
6328 
6329 
6330 
6331 /* 0xfb20 - 0xfb4f */
6332 static decomp_T decomp_table[0xfb4f-0xfb20+1] =
6333 {
6334   {0x5e2,0,0},                  /* 0xfb20	alt ayin */
6335   {0x5d0,0,0},                  /* 0xfb21	alt alef */
6336   {0x5d3,0,0},                  /* 0xfb22	alt dalet */
6337   {0x5d4,0,0},                  /* 0xfb23	alt he */
6338   {0x5db,0,0},                  /* 0xfb24	alt kaf */
6339   {0x5dc,0,0},                  /* 0xfb25	alt lamed */
6340   {0x5dd,0,0},                  /* 0xfb26	alt mem-sofit */
6341   {0x5e8,0,0},                  /* 0xfb27	alt resh */
6342   {0x5ea,0,0},                  /* 0xfb28	alt tav */
6343   {'+', 0, 0},                  /* 0xfb29	alt plus */
6344   {0x5e9, 0x5c1, 0},            /* 0xfb2a	shin+shin-dot */
6345   {0x5e9, 0x5c2, 0},            /* 0xfb2b	shin+sin-dot */
6346   {0x5e9, 0x5c1, 0x5bc},        /* 0xfb2c	shin+shin-dot+dagesh */
6347   {0x5e9, 0x5c2, 0x5bc},        /* 0xfb2d	shin+sin-dot+dagesh */
6348   {0x5d0, 0x5b7, 0},            /* 0xfb2e	alef+patah */
6349   {0x5d0, 0x5b8, 0},            /* 0xfb2f	alef+qamats */
6350   {0x5d0, 0x5b4, 0},            /* 0xfb30	alef+hiriq */
6351   {0x5d1, 0x5bc, 0},            /* 0xfb31	bet+dagesh */
6352   {0x5d2, 0x5bc, 0},            /* 0xfb32	gimel+dagesh */
6353   {0x5d3, 0x5bc, 0},            /* 0xfb33	dalet+dagesh */
6354   {0x5d4, 0x5bc, 0},            /* 0xfb34	he+dagesh */
6355   {0x5d5, 0x5bc, 0},            /* 0xfb35	vav+dagesh */
6356   {0x5d6, 0x5bc, 0},            /* 0xfb36	zayin+dagesh */
6357   {0xfb37, 0, 0},               /* 0xfb37 -- */
6358   {0x5d8, 0x5bc, 0},            /* 0xfb38	tet+dagesh */
6359   {0x5d9, 0x5bc, 0},            /* 0xfb39	yud+dagesh */
6360   {0x5da, 0x5bc, 0},            /* 0xfb3a	kaf sofit+dagesh */
6361   {0x5db, 0x5bc, 0},            /* 0xfb3b	kaf+dagesh */
6362   {0x5dc, 0x5bc, 0},            /* 0xfb3c	lamed+dagesh */
6363   {0xfb3d, 0, 0},               /* 0xfb3d -- */
6364   {0x5de, 0x5bc, 0},            /* 0xfb3e	mem+dagesh */
6365   {0xfb3f, 0, 0},               /* 0xfb3f -- */
6366   {0x5e0, 0x5bc, 0},            /* 0xfb40	nun+dagesh */
6367   {0x5e1, 0x5bc, 0},            /* 0xfb41	samech+dagesh */
6368   {0xfb42, 0, 0},               /* 0xfb42 -- */
6369   {0x5e3, 0x5bc, 0},            /* 0xfb43	pe sofit+dagesh */
6370   {0x5e4, 0x5bc,0},             /* 0xfb44	pe+dagesh */
6371   {0xfb45, 0, 0},               /* 0xfb45 -- */
6372   {0x5e6, 0x5bc, 0},            /* 0xfb46	tsadi+dagesh */
6373   {0x5e7, 0x5bc, 0},            /* 0xfb47	qof+dagesh */
6374   {0x5e8, 0x5bc, 0},            /* 0xfb48	resh+dagesh */
6375   {0x5e9, 0x5bc, 0},            /* 0xfb49	shin+dagesh */
6376   {0x5ea, 0x5bc, 0},            /* 0xfb4a	tav+dagesh */
6377   {0x5d5, 0x5b9, 0},            /* 0xfb4b	vav+holam */
6378   {0x5d1, 0x5bf, 0},            /* 0xfb4c	bet+rafe */
6379   {0x5db, 0x5bf, 0},            /* 0xfb4d	kaf+rafe */
6380   {0x5e4, 0x5bf, 0},            /* 0xfb4e	pe+rafe */
6381   {0x5d0, 0x5dc, 0}             /* 0xfb4f	alef-lamed */
6382 };
6383 
mb_decompose(int c,int * c1,int * c2,int * c3)6384 static void mb_decompose(int c, int *c1, int *c2, int *c3)
6385 {
6386   decomp_T d;
6387 
6388   if (c >= 0xfb20 && c <= 0xfb4f) {
6389     d = decomp_table[c - 0xfb20];
6390     *c1 = d.a;
6391     *c2 = d.b;
6392     *c3 = d.c;
6393   } else {
6394     *c1 = c;
6395     *c2 = *c3 = 0;
6396   }
6397 }
6398 
6399 // Compare two strings, ignore case if rex.reg_ic set.
6400 // Return 0 if strings match, non-zero otherwise.
6401 // Correct the length "*n" when composing characters are ignored.
cstrncmp(char_u * s1,char_u * s2,int * n)6402 static int cstrncmp(char_u *s1, char_u *s2, int *n)
6403 {
6404   int result;
6405 
6406   if (!rex.reg_ic) {
6407     result = STRNCMP(s1, s2, *n);
6408   } else {
6409     assert(*n >= 0);
6410     result = mb_strnicmp(s1, s2, (size_t)*n);
6411   }
6412 
6413   // if it failed and it's utf8 and we want to combineignore:
6414   if (result != 0 && rex.reg_icombine) {
6415     char_u  *str1, *str2;
6416     int c1, c2, c11, c12;
6417     int junk;
6418 
6419     // we have to handle the strcmp ourselves, since it is necessary to
6420     // deal with the composing characters by ignoring them:
6421     str1 = s1;
6422     str2 = s2;
6423     c1 = c2 = 0;
6424     while ((int)(str1 - s1) < *n) {
6425       c1 = mb_ptr2char_adv((const char_u **)&str1);
6426       c2 = mb_ptr2char_adv((const char_u **)&str2);
6427 
6428       /* decompose the character if necessary, into 'base' characters
6429        * because I don't care about Arabic, I will hard-code the Hebrew
6430        * which I *do* care about!  So sue me... */
6431       if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) {
6432         // decomposition necessary?
6433         mb_decompose(c1, &c11, &junk, &junk);
6434         mb_decompose(c2, &c12, &junk, &junk);
6435         c1 = c11;
6436         c2 = c12;
6437         if (c11 != c12 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12))) {
6438           break;
6439         }
6440       }
6441     }
6442     result = c2 - c1;
6443     if (result == 0)
6444       *n = (int)(str2 - s2);
6445   }
6446 
6447   return result;
6448 }
6449 
6450 ////////////////////////////////////////////////////////////////
6451 //                    regsub stuff                            //
6452 ////////////////////////////////////////////////////////////////
6453 
6454 /* This stuff below really confuses cc on an SGI -- webb */
6455 
6456 
6457 
do_upper(int * d,int c)6458 static fptr_T do_upper(int *d, int c)
6459 {
6460   *d = mb_toupper(c);
6461 
6462   return (fptr_T)NULL;
6463 }
6464 
do_Upper(int * d,int c)6465 static fptr_T do_Upper(int *d, int c)
6466 {
6467   *d = mb_toupper(c);
6468 
6469   return (fptr_T)do_Upper;
6470 }
6471 
do_lower(int * d,int c)6472 static fptr_T do_lower(int *d, int c)
6473 {
6474   *d = mb_tolower(c);
6475 
6476   return (fptr_T)NULL;
6477 }
6478 
do_Lower(int * d,int c)6479 static fptr_T do_Lower(int *d, int c)
6480 {
6481   *d = mb_tolower(c);
6482 
6483   return (fptr_T)do_Lower;
6484 }
6485 
6486 /*
6487  * regtilde(): Replace tildes in the pattern by the old pattern.
6488  *
6489  * Short explanation of the tilde: It stands for the previous replacement
6490  * pattern.  If that previous pattern also contains a ~ we should go back a
6491  * step further...  But we insert the previous pattern into the current one
6492  * and remember that.
6493  * This still does not handle the case where "magic" changes.  So require the
6494  * user to keep his hands off of "magic".
6495  *
6496  * The tildes are parsed once before the first call to vim_regsub().
6497  */
regtilde(char_u * source,int magic)6498 char_u *regtilde(char_u *source, int magic)
6499 {
6500   char_u      *newsub = source;
6501   char_u      *tmpsub;
6502   char_u      *p;
6503   int len;
6504   int prevlen;
6505 
6506   for (p = newsub; *p; ++p) {
6507     if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic)) {
6508       if (reg_prev_sub != NULL) {
6509         /* length = len(newsub) - 1 + len(prev_sub) + 1 */
6510         prevlen = (int)STRLEN(reg_prev_sub);
6511         tmpsub = xmalloc(STRLEN(newsub) + prevlen);
6512         /* copy prefix */
6513         len = (int)(p - newsub);              /* not including ~ */
6514         memmove(tmpsub, newsub, (size_t)len);
6515         /* interpret tilde */
6516         memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
6517         // copy postfix
6518         if (!magic) {
6519           p++;                                // back off backslash
6520         }
6521         STRCPY(tmpsub + len + prevlen, p + 1);
6522 
6523         if (newsub != source)                 /* already allocated newsub */
6524           xfree(newsub);
6525         newsub = tmpsub;
6526         p = newsub + len + prevlen;
6527       } else if (magic)
6528         STRMOVE(p, p + 1);              /* remove '~' */
6529       else
6530         STRMOVE(p, p + 2);              /* remove '\~' */
6531       --p;
6532     } else {
6533       if (*p == '\\' && p[1]) {         // skip escaped characters
6534         p++;
6535       }
6536       p += utfc_ptr2len(p) - 1;
6537     }
6538   }
6539 
6540   xfree(reg_prev_sub);
6541   if (newsub != source)         /* newsub was allocated, just keep it */
6542     reg_prev_sub = newsub;
6543   else                          /* no ~ found, need to save newsub  */
6544     reg_prev_sub = vim_strsave(newsub);
6545   return newsub;
6546 }
6547 
6548 static bool can_f_submatch = false;  // true when submatch() can be used
6549 
6550 // These pointers are used for reg_submatch().  Needed for when the
6551 // substitution string is an expression that contains a call to substitute()
6552 // and submatch().
6553 typedef struct {
6554   regmatch_T *sm_match;
6555   regmmatch_T *sm_mmatch;
6556   linenr_T sm_firstlnum;
6557   linenr_T sm_maxline;
6558   int sm_line_lbr;
6559 } regsubmatch_T;
6560 
6561 static regsubmatch_T rsm;  // can only be used when can_f_submatch is true
6562 
6563 /// Put the submatches in "argv[argskip]" which is a list passed into
6564 /// call_func() by vim_regsub_both().
fill_submatch_list(int argc FUNC_ATTR_UNUSED,typval_T * argv,int argskip,int argcount)6565 static int fill_submatch_list(int argc FUNC_ATTR_UNUSED, typval_T *argv,
6566                               int argskip, int argcount)
6567   FUNC_ATTR_NONNULL_ALL
6568 {
6569   typval_T *listarg = argv + argskip;
6570 
6571   if (argcount == argskip) {
6572     // called function doesn't take a submatches argument
6573     return argskip;
6574   }
6575 
6576   // Relies on sl_list to be the first item in staticList10_T.
6577   tv_list_init_static10((staticList10_T *)listarg->vval.v_list);
6578 
6579   // There are always 10 list items in staticList10_T.
6580   listitem_T *li = tv_list_first(listarg->vval.v_list);
6581   for (int i = 0; i < 10; i++) {
6582     char_u *s = rsm.sm_match->startp[i];
6583     if (s == NULL || rsm.sm_match->endp[i] == NULL) {
6584       s = NULL;
6585     } else {
6586       s = vim_strnsave(s, rsm.sm_match->endp[i] - s);
6587     }
6588     TV_LIST_ITEM_TV(li)->v_type = VAR_STRING;
6589     TV_LIST_ITEM_TV(li)->vval.v_string = s;
6590     li = TV_LIST_ITEM_NEXT(argv->vval.v_list, li);
6591   }
6592   return argskip + 1;
6593 }
6594 
clear_submatch_list(staticList10_T * sl)6595 static void clear_submatch_list(staticList10_T *sl)
6596 {
6597   TV_LIST_ITER(&sl->sl_list, li, {
6598     xfree(TV_LIST_ITEM_TV(li)->vval.v_string);
6599   });
6600 }
6601 
6602 /// vim_regsub() - perform substitutions after a vim_regexec() or
6603 /// vim_regexec_multi() match.
6604 ///
6605 /// If "copy" is true really copy into "dest".
6606 /// If "copy" is false nothing is copied, this is just to find out the length
6607 /// of the result.
6608 ///
6609 /// If "backslash" is true, a backslash will be removed later, need to double
6610 /// them to keep them, and insert a backslash before a CR to avoid it being
6611 /// replaced with a line break later.
6612 ///
6613 /// Note: The matched text must not change between the call of
6614 /// vim_regexec()/vim_regexec_multi() and vim_regsub()!  It would make the back
6615 /// references invalid!
6616 ///
6617 /// Returns the size of the replacement, including terminating NUL.
vim_regsub(regmatch_T * rmp,char_u * source,typval_T * expr,char_u * dest,int copy,int magic,int backslash)6618 int vim_regsub(regmatch_T *rmp, char_u *source, typval_T *expr, char_u *dest,
6619                int copy, int magic, int backslash)
6620 {
6621   regexec_T rex_save;
6622   bool rex_in_use_save = rex_in_use;
6623 
6624   if (rex_in_use) {
6625     // Being called recursively, save the state.
6626     rex_save = rex;
6627   }
6628   rex_in_use = true;
6629 
6630   rex.reg_match = rmp;
6631   rex.reg_mmatch = NULL;
6632   rex.reg_maxline = 0;
6633   rex.reg_buf = curbuf;
6634   rex.reg_line_lbr = true;
6635   int result = vim_regsub_both(source, expr, dest, copy, magic, backslash);
6636 
6637   rex_in_use = rex_in_use_save;
6638   if (rex_in_use) {
6639     rex = rex_save;
6640   }
6641 
6642   return result;
6643 }
6644 
vim_regsub_multi(regmmatch_T * rmp,linenr_T lnum,char_u * source,char_u * dest,int copy,int magic,int backslash)6645 int vim_regsub_multi(regmmatch_T *rmp, linenr_T lnum, char_u *source, char_u *dest, int copy, int magic, int backslash)
6646 {
6647   regexec_T rex_save;
6648   bool rex_in_use_save = rex_in_use;
6649 
6650   if (rex_in_use) {
6651     // Being called recursively, save the state.
6652     rex_save = rex;
6653   }
6654   rex_in_use = true;
6655 
6656   rex.reg_match = NULL;
6657   rex.reg_mmatch = rmp;
6658   rex.reg_buf = curbuf;  // always works on the current buffer!
6659   rex.reg_firstlnum = lnum;
6660   rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
6661   rex.reg_line_lbr = false;
6662   int result = vim_regsub_both(source, NULL, dest, copy, magic, backslash);
6663 
6664   rex_in_use = rex_in_use_save;
6665   if (rex_in_use) {
6666     rex = rex_save;
6667   }
6668 
6669   return result;
6670 }
6671 
vim_regsub_both(char_u * source,typval_T * expr,char_u * dest,int copy,int magic,int backslash)6672 static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest,
6673                            int copy, int magic, int backslash)
6674 {
6675   char_u      *src;
6676   char_u      *dst;
6677   char_u      *s;
6678   int c;
6679   int cc;
6680   int no = -1;
6681   fptr_T func_all = (fptr_T)NULL;
6682   fptr_T func_one = (fptr_T)NULL;
6683   linenr_T clnum = 0;           /* init for GCC */
6684   int len = 0;                  /* init for GCC */
6685   static char_u *eval_result = NULL;
6686 
6687   // We need to keep track of how many backslashes we escape, so that the byte
6688   // counts for `extmark_splice` are correct.
6689   int num_escaped = 0;
6690 
6691   // Be paranoid...
6692   if ((source == NULL && expr == NULL) || dest == NULL) {
6693     emsg(_(e_null));
6694     return 0;
6695   }
6696   if (prog_magic_wrong())
6697     return 0;
6698   src = source;
6699   dst = dest;
6700 
6701   // When the substitute part starts with "\=" evaluate it as an expression.
6702   if (expr != NULL || (source[0] == '\\' && source[1] == '=')) {
6703     // To make sure that the length doesn't change between checking the
6704     // length and copying the string, and to speed up things, the
6705     // resulting string is saved from the call with "copy" == false to the
6706     // call with "copy" == true.
6707     if (copy) {
6708       if (eval_result != NULL) {
6709         STRCPY(dest, eval_result);
6710         dst += STRLEN(eval_result);
6711         XFREE_CLEAR(eval_result);
6712       }
6713     } else {
6714       const bool prev_can_f_submatch = can_f_submatch;
6715       regsubmatch_T rsm_save;
6716 
6717       xfree(eval_result);
6718 
6719       // The expression may contain substitute(), which calls us
6720       // recursively.  Make sure submatch() gets the text from the first
6721       // level.
6722       if (can_f_submatch) {
6723         rsm_save = rsm;
6724       }
6725       can_f_submatch = true;
6726       rsm.sm_match = rex.reg_match;
6727       rsm.sm_mmatch = rex.reg_mmatch;
6728       rsm.sm_firstlnum = rex.reg_firstlnum;
6729       rsm.sm_maxline = rex.reg_maxline;
6730       rsm.sm_line_lbr = rex.reg_line_lbr;
6731 
6732       if (expr != NULL) {
6733         typval_T argv[2];
6734         typval_T rettv;
6735         staticList10_T matchList = TV_LIST_STATIC10_INIT;
6736         rettv.v_type = VAR_STRING;
6737         rettv.vval.v_string = NULL;
6738         argv[0].v_type = VAR_LIST;
6739         argv[0].vval.v_list = &matchList.sl_list;
6740         funcexe_T funcexe = FUNCEXE_INIT;
6741         funcexe.argv_func = fill_submatch_list;
6742         funcexe.evaluate = true;
6743         if (expr->v_type == VAR_FUNC) {
6744           s = expr->vval.v_string;
6745           call_func(s, -1, &rettv, 1, argv, &funcexe);
6746         } else if (expr->v_type == VAR_PARTIAL) {
6747           partial_T *partial = expr->vval.v_partial;
6748 
6749           s = partial_name(partial);
6750           funcexe.partial = partial;
6751           call_func(s, -1, &rettv, 1, argv, &funcexe);
6752         }
6753         if (tv_list_len(&matchList.sl_list) > 0) {
6754           // fill_submatch_list() was called.
6755           clear_submatch_list(&matchList);
6756         }
6757         if (rettv.v_type == VAR_UNKNOWN) {
6758           // something failed, no need to report another error
6759           eval_result = NULL;
6760         } else {
6761           char buf[NUMBUFLEN];
6762           eval_result = (char_u *)tv_get_string_buf_chk(&rettv, buf);
6763           if (eval_result != NULL) {
6764             eval_result = vim_strsave(eval_result);
6765           }
6766         }
6767         tv_clear(&rettv);
6768       } else {
6769         eval_result = eval_to_string(source + 2, NULL, true);
6770       }
6771 
6772       if (eval_result != NULL) {
6773         int had_backslash = false;
6774 
6775         for (s = eval_result; *s != NUL; MB_PTR_ADV(s)) {
6776           // Change NL to CR, so that it becomes a line break,
6777           // unless called from vim_regexec_nl().
6778           // Skip over a backslashed character.
6779           if (*s == NL && !rsm.sm_line_lbr) {
6780             *s = CAR;
6781           } else if (*s == '\\' && s[1] != NUL) {
6782             s++;
6783             /* Change NL to CR here too, so that this works:
6784              * :s/abc\\\ndef/\="aaa\\\nbbb"/  on text:
6785              *   abc\
6786              *   def
6787              * Not when called from vim_regexec_nl().
6788              */
6789             if (*s == NL && !rsm.sm_line_lbr) {
6790               *s = CAR;
6791             }
6792             had_backslash = true;
6793           }
6794         }
6795         if (had_backslash && backslash) {
6796           /* Backslashes will be consumed, need to double them. */
6797           s = vim_strsave_escaped(eval_result, (char_u *)"\\");
6798           xfree(eval_result);
6799           eval_result = s;
6800         }
6801 
6802         dst += STRLEN(eval_result);
6803       }
6804 
6805       can_f_submatch = prev_can_f_submatch;
6806       if (can_f_submatch) {
6807         rsm = rsm_save;
6808       }
6809     }
6810   } else
6811     while ((c = *src++) != NUL) {
6812       if (c == '&' && magic)
6813         no = 0;
6814       else if (c == '\\' && *src != NUL) {
6815         if (*src == '&' && !magic) {
6816           ++src;
6817           no = 0;
6818         } else if ('0' <= *src && *src <= '9') {
6819           no = *src++ - '0';
6820         } else if (vim_strchr((char_u *)"uUlLeE", *src)) {
6821           switch (*src++) {
6822           case 'u':   func_one = (fptr_T)do_upper;
6823             continue;
6824           case 'U':   func_all = (fptr_T)do_Upper;
6825             continue;
6826           case 'l':   func_one = (fptr_T)do_lower;
6827             continue;
6828           case 'L':   func_all = (fptr_T)do_Lower;
6829             continue;
6830           case 'e':
6831           case 'E':   func_one = func_all = (fptr_T)NULL;
6832             continue;
6833           }
6834         }
6835       }
6836       if (no < 0) {           /* Ordinary character. */
6837         if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL) {
6838           /* Copy a special key as-is. */
6839           if (copy) {
6840             *dst++ = c;
6841             *dst++ = *src++;
6842             *dst++ = *src++;
6843           } else {
6844             dst += 3;
6845             src += 2;
6846           }
6847           continue;
6848         }
6849 
6850         if (c == '\\' && *src != NUL) {
6851           // Check for abbreviations -- webb
6852           switch (*src) {
6853           case 'r':   c = CAR;        ++src;  break;
6854           case 'n':   c = NL;         ++src;  break;
6855           case 't':   c = TAB;        ++src;  break;
6856           // Oh no!  \e already has meaning in subst pat :-(
6857           // case 'e':   c = ESC;        ++src;  break;
6858           case 'b':   c = Ctrl_H;     ++src;  break;
6859 
6860           // If "backslash" is true the backslash will be removed
6861           // later.  Used to insert a literal CR.
6862           default:
6863             if (backslash) {
6864               num_escaped += 1;
6865               if (copy) {
6866                 *dst = '\\';
6867               }
6868               dst++;
6869             }
6870             c = *src++;
6871           }
6872         } else {
6873           c = utf_ptr2char(src - 1);
6874         }
6875         // Write to buffer, if copy is set.
6876         if (func_one != NULL) {
6877           func_one = (fptr_T)(func_one(&cc, c));
6878         } else if (func_all != NULL) {
6879           func_all = (fptr_T)(func_all(&cc, c));
6880         } else {
6881           // just copy
6882           cc = c;
6883         }
6884 
6885         int totlen = utfc_ptr2len(src - 1);
6886 
6887         if (copy) {
6888           utf_char2bytes(cc, dst);
6889         }
6890         dst += utf_char2len(cc) - 1;
6891         int clen = utf_ptr2len(src - 1);
6892 
6893         // If the character length is shorter than "totlen", there
6894         // are composing characters; copy them as-is.
6895         if (clen < totlen) {
6896           if (copy) {
6897             memmove(dst + 1, src - 1 + clen, (size_t)(totlen - clen));
6898           }
6899           dst += totlen - clen;
6900         }
6901         src += totlen - 1;
6902         dst++;
6903       } else {
6904         if (REG_MULTI) {
6905           clnum = rex.reg_mmatch->startpos[no].lnum;
6906           if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0) {
6907             s = NULL;
6908           } else {
6909             s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
6910             if (rex.reg_mmatch->endpos[no].lnum == clnum) {
6911               len = rex.reg_mmatch->endpos[no].col
6912                     - rex.reg_mmatch->startpos[no].col;
6913             } else {
6914               len = (int)STRLEN(s);
6915             }
6916           }
6917         } else {
6918           s = rex.reg_match->startp[no];
6919           if (rex.reg_match->endp[no] == NULL) {
6920             s = NULL;
6921           } else {
6922             len = (int)(rex.reg_match->endp[no] - s);
6923           }
6924         }
6925         if (s != NULL) {
6926           for (;; ) {
6927             if (len == 0) {
6928               if (REG_MULTI) {
6929                 if (rex.reg_mmatch->endpos[no].lnum == clnum) {
6930                   break;
6931                 }
6932                 if (copy) {
6933                   *dst = CAR;
6934                 }
6935                 dst++;
6936                 s = reg_getline(++clnum);
6937                 if (rex.reg_mmatch->endpos[no].lnum == clnum) {
6938                   len = rex.reg_mmatch->endpos[no].col;
6939                 } else {
6940                   len = (int)STRLEN(s);
6941                 }
6942               } else {
6943                 break;
6944               }
6945             } else if (*s == NUL) {  // we hit NUL.
6946               if (copy) {
6947                 iemsg(_(e_re_damg));
6948               }
6949               goto exit;
6950             } else {
6951               if (backslash && (*s == CAR || *s == '\\')) {
6952                 /*
6953                  * Insert a backslash in front of a CR, otherwise
6954                  * it will be replaced by a line break.
6955                  * Number of backslashes will be halved later,
6956                  * double them here.
6957                  */
6958                 if (copy) {
6959                   dst[0] = '\\';
6960                   dst[1] = *s;
6961                 }
6962                 dst += 2;
6963               } else {
6964                 c = utf_ptr2char(s);
6965 
6966                 if (func_one != (fptr_T)NULL)
6967                   /* Turbo C complains without the typecast */
6968                   func_one = (fptr_T)(func_one(&cc, c));
6969                 else if (func_all != (fptr_T)NULL)
6970                   /* Turbo C complains without the typecast */
6971                   func_all = (fptr_T)(func_all(&cc, c));
6972                 else             /* just copy */
6973                   cc = c;
6974 
6975                 {
6976                   int l;
6977 
6978                   // Copy composing characters separately, one
6979                   // at a time.
6980                   l = utf_ptr2len(s) - 1;
6981 
6982                   s += l;
6983                   len -= l;
6984                   if (copy) {
6985                     utf_char2bytes(cc, dst);
6986                   }
6987                   dst += utf_char2len(cc) - 1;
6988                 }
6989                 dst++;
6990               }
6991 
6992               ++s;
6993               --len;
6994             }
6995           }
6996         }
6997         no = -1;
6998       }
6999     }
7000   if (copy)
7001     *dst = NUL;
7002 
7003 exit:
7004   return (int)((dst - dest) + 1 - num_escaped);
7005 }
7006 
7007 
7008 /*
7009  * Call reg_getline() with the line numbers from the submatch.  If a
7010  * substitute() was used the reg_maxline and other values have been
7011  * overwritten.
7012  */
reg_getline_submatch(linenr_T lnum)7013 static char_u *reg_getline_submatch(linenr_T lnum)
7014 {
7015   char_u *s;
7016   linenr_T save_first = rex.reg_firstlnum;
7017   linenr_T save_max = rex.reg_maxline;
7018 
7019   rex.reg_firstlnum = rsm.sm_firstlnum;
7020   rex.reg_maxline = rsm.sm_maxline;
7021 
7022   s = reg_getline(lnum);
7023 
7024   rex.reg_firstlnum = save_first;
7025   rex.reg_maxline = save_max;
7026   return s;
7027 }
7028 
7029 /*
7030  * Used for the submatch() function: get the string from the n'th submatch in
7031  * allocated memory.
7032  * Returns NULL when not in a ":s" command and for a non-existing submatch.
7033  */
reg_submatch(int no)7034 char_u *reg_submatch(int no)
7035 {
7036   char_u      *retval = NULL;
7037   char_u      *s;
7038   int round;
7039   linenr_T lnum;
7040 
7041   if (!can_f_submatch || no < 0)
7042     return NULL;
7043 
7044   if (rsm.sm_match == NULL) {
7045     ssize_t len;
7046 
7047     /*
7048      * First round: compute the length and allocate memory.
7049      * Second round: copy the text.
7050      */
7051     for (round = 1; round <= 2; round++) {
7052       lnum = rsm.sm_mmatch->startpos[no].lnum;
7053       if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0) {
7054         return NULL;
7055       }
7056 
7057       s = reg_getline_submatch(lnum);
7058       if (s == NULL) {  // anti-crash check, cannot happen?
7059         break;
7060       }
7061       s += rsm.sm_mmatch->startpos[no].col;
7062       if (rsm.sm_mmatch->endpos[no].lnum == lnum) {
7063         // Within one line: take form start to end col.
7064         len = rsm.sm_mmatch->endpos[no].col - rsm.sm_mmatch->startpos[no].col;
7065         if (round == 2) {
7066           STRLCPY(retval, s, len + 1);
7067         }
7068         len++;
7069       } else {
7070         // Multiple lines: take start line from start col, middle
7071         // lines completely and end line up to end col.
7072         len = (ssize_t)STRLEN(s);
7073         if (round == 2) {
7074           STRCPY(retval, s);
7075           retval[len] = '\n';
7076         }
7077         len++;
7078         lnum++;
7079         while (lnum < rsm.sm_mmatch->endpos[no].lnum) {
7080           s = reg_getline_submatch(lnum++);
7081           if (round == 2)
7082             STRCPY(retval + len, s);
7083           len += STRLEN(s);
7084           if (round == 2)
7085             retval[len] = '\n';
7086           ++len;
7087         }
7088         if (round == 2) {
7089           STRNCPY(retval + len, reg_getline_submatch(lnum),
7090                   rsm.sm_mmatch->endpos[no].col);
7091         }
7092         len += rsm.sm_mmatch->endpos[no].col;
7093         if (round == 2) {
7094           retval[len] = NUL;  // -V595
7095         }
7096         len++;
7097       }
7098 
7099       if (retval == NULL) {
7100         retval = xmalloc(len);
7101       }
7102     }
7103   } else {
7104     s = rsm.sm_match->startp[no];
7105     if (s == NULL || rsm.sm_match->endp[no] == NULL) {
7106       retval = NULL;
7107     } else {
7108       retval = vim_strnsave(s, rsm.sm_match->endp[no] - s);
7109     }
7110   }
7111 
7112   return retval;
7113 }
7114 
7115 // Used for the submatch() function with the optional non-zero argument: get
7116 // the list of strings from the n'th submatch in allocated memory with NULs
7117 // represented in NLs.
7118 // Returns a list of allocated strings.  Returns NULL when not in a ":s"
7119 // command, for a non-existing submatch and for any error.
reg_submatch_list(int no)7120 list_T *reg_submatch_list(int no)
7121 {
7122   if (!can_f_submatch || no < 0) {
7123     return NULL;
7124   }
7125 
7126   linenr_T slnum;
7127   linenr_T elnum;
7128   list_T *list;
7129   const char *s;
7130 
7131   if (rsm.sm_match == NULL) {
7132     slnum = rsm.sm_mmatch->startpos[no].lnum;
7133     elnum = rsm.sm_mmatch->endpos[no].lnum;
7134     if (slnum < 0 || elnum < 0) {
7135       return NULL;
7136     }
7137 
7138     colnr_T scol = rsm.sm_mmatch->startpos[no].col;
7139     colnr_T ecol = rsm.sm_mmatch->endpos[no].col;
7140 
7141     list = tv_list_alloc(elnum - slnum + 1);
7142 
7143     s = (const char *)reg_getline_submatch(slnum) + scol;
7144     if (slnum == elnum) {
7145       tv_list_append_string(list, s, ecol - scol);
7146     } else {
7147       tv_list_append_string(list, s, -1);
7148       for (int i = 1; i < elnum - slnum; i++) {
7149         s = (const char *)reg_getline_submatch(slnum + i);
7150         tv_list_append_string(list, s, -1);
7151       }
7152       s = (const char *)reg_getline_submatch(elnum);
7153       tv_list_append_string(list, s, ecol);
7154     }
7155   } else {
7156     s = (const char *)rsm.sm_match->startp[no];
7157     if (s == NULL || rsm.sm_match->endp[no] == NULL) {
7158       return NULL;
7159     }
7160     list = tv_list_alloc(1);
7161     tv_list_append_string(list, s, (const char *)rsm.sm_match->endp[no] - s);
7162   }
7163 
7164   tv_list_ref(list);
7165   return list;
7166 }
7167 
7168 static regengine_T bt_regengine =
7169 {
7170   bt_regcomp,
7171   bt_regfree,
7172   bt_regexec_nl,
7173   bt_regexec_multi,
7174   (char_u *)""
7175 };
7176 
7177 
7178 // XXX Do not allow headers generator to catch definitions from regexp_nfa.c
7179 #ifndef DO_NOT_DEFINE_EMPTY_ATTRIBUTES
7180 # include "nvim/regexp_nfa.c"
7181 #endif
7182 
7183 static regengine_T nfa_regengine =
7184 {
7185   nfa_regcomp,
7186   nfa_regfree,
7187   nfa_regexec_nl,
7188   nfa_regexec_multi,
7189   (char_u *)""
7190 };
7191 
7192 // Which regexp engine to use? Needed for vim_regcomp().
7193 // Must match with 'regexpengine'.
7194 static int regexp_engine = 0;
7195 
7196 #ifdef REGEXP_DEBUG
7197 static char_u regname[][30] = {
7198   "AUTOMATIC Regexp Engine",
7199   "BACKTRACKING Regexp Engine",
7200   "NFA Regexp Engine"
7201 };
7202 #endif
7203 
7204 /*
7205  * Compile a regular expression into internal code.
7206  * Returns the program in allocated memory.
7207  * Use vim_regfree() to free the memory.
7208  * Returns NULL for an error.
7209  */
vim_regcomp(char_u * expr_arg,int re_flags)7210 regprog_T *vim_regcomp(char_u *expr_arg, int re_flags)
7211 {
7212   regprog_T   *prog = NULL;
7213   char_u      *expr = expr_arg;
7214   int          save_called_emsg;
7215 
7216   regexp_engine = p_re;
7217 
7218   /* Check for prefix "\%#=", that sets the regexp engine */
7219   if (STRNCMP(expr, "\\%#=", 4) == 0) {
7220     int newengine = expr[4] - '0';
7221 
7222     if (newengine == AUTOMATIC_ENGINE
7223         || newengine == BACKTRACKING_ENGINE
7224         || newengine == NFA_ENGINE) {
7225       regexp_engine = expr[4] - '0';
7226       expr += 5;
7227 #ifdef REGEXP_DEBUG
7228       smsg("New regexp mode selected (%d): %s",
7229            regexp_engine,
7230            regname[newengine]);
7231 #endif
7232     } else {
7233       emsg(_(
7234               "E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used "));
7235       regexp_engine = AUTOMATIC_ENGINE;
7236     }
7237   }
7238 #ifdef REGEXP_DEBUG
7239   bt_regengine.expr = expr;
7240   nfa_regengine.expr = expr;
7241 #endif
7242   // reg_iswordc() uses rex.reg_buf
7243   rex.reg_buf = curbuf;
7244 
7245   //
7246   // First try the NFA engine, unless backtracking was requested.
7247   //
7248   save_called_emsg = called_emsg;
7249   called_emsg = false;
7250   if (regexp_engine != BACKTRACKING_ENGINE) {
7251     prog = nfa_regengine.regcomp(expr,
7252         re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
7253   } else {
7254     prog = bt_regengine.regcomp(expr, re_flags);
7255   }
7256 
7257   // Check for error compiling regexp with initial engine.
7258   if (prog == NULL) {
7259 #ifdef BT_REGEXP_DEBUG_LOG
7260     // Debugging log for BT engine.
7261     if (regexp_engine != BACKTRACKING_ENGINE) {
7262       FILE *f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
7263       if (f) {
7264         fprintf(f, "Syntax error in \"%s\"\n", expr);
7265         fclose(f);
7266       } else
7267         semsg("(NFA) Could not open \"%s\" to write !!!",
7268             BT_REGEXP_DEBUG_LOG_NAME);
7269     }
7270 #endif
7271     // If the NFA engine failed, try the backtracking engine. The NFA engine
7272     // also fails for patterns that it can't handle well but are still valid
7273     // patterns, thus a retry should work.
7274     // But don't try if an error message was given.
7275     if (regexp_engine == AUTOMATIC_ENGINE && !called_emsg) {
7276       regexp_engine = BACKTRACKING_ENGINE;
7277       report_re_switch(expr);
7278       prog = bt_regengine.regcomp(expr, re_flags);
7279     }
7280   }
7281   called_emsg |= save_called_emsg;
7282 
7283   if (prog != NULL) {
7284     // Store the info needed to call regcomp() again when the engine turns out
7285     // to be very slow when executing it.
7286     prog->re_engine = regexp_engine;
7287     prog->re_flags = re_flags;
7288   }
7289 
7290   return prog;
7291 }
7292 
7293 /*
7294  * Free a compiled regexp program, returned by vim_regcomp().
7295  */
vim_regfree(regprog_T * prog)7296 void vim_regfree(regprog_T *prog)
7297 {
7298   if (prog != NULL)
7299     prog->engine->regfree(prog);
7300 }
7301 
report_re_switch(char_u * pat)7302 static void report_re_switch(char_u *pat)
7303 {
7304   if (p_verbose > 0) {
7305     verbose_enter();
7306     msg_puts(_("Switching to backtracking RE engine for pattern: "));
7307     msg_puts((char *)pat);
7308     verbose_leave();
7309   }
7310 }
7311 
7312 /// Matches a regexp against a string.
7313 /// "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
7314 /// Note: "rmp->regprog" may be freed and changed.
7315 /// Uses curbuf for line count and 'iskeyword'.
7316 /// When "nl" is true consider a "\n" in "line" to be a line break.
7317 ///
7318 /// @param rmp
7319 /// @param line the string to match against
7320 /// @param col  the column to start looking for match
7321 /// @param nl
7322 ///
7323 /// @return true if there is a match, false if not.
vim_regexec_string(regmatch_T * rmp,char_u * line,colnr_T col,bool nl)7324 static bool vim_regexec_string(regmatch_T *rmp, char_u *line, colnr_T col,
7325                                bool nl)
7326 {
7327   regexec_T rex_save;
7328   bool rex_in_use_save = rex_in_use;
7329 
7330   // Cannot use the same prog recursively, it contains state.
7331   if (rmp->regprog->re_in_use) {
7332     emsg(_(e_recursive));
7333     return false;
7334   }
7335   rmp->regprog->re_in_use = true;
7336 
7337   if (rex_in_use) {
7338     // Being called recursively, save the state.
7339     rex_save = rex;
7340   }
7341   rex_in_use = true;
7342 
7343   rex.reg_startp = NULL;
7344   rex.reg_endp = NULL;
7345   rex.reg_startpos = NULL;
7346   rex.reg_endpos = NULL;
7347 
7348   int result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
7349   rmp->regprog->re_in_use = false;
7350 
7351   // NFA engine aborted because it's very slow, use backtracking engine instead.
7352   if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
7353       && result == NFA_TOO_EXPENSIVE) {
7354     int save_p_re = p_re;
7355     int re_flags = rmp->regprog->re_flags;
7356     char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
7357 
7358     p_re = BACKTRACKING_ENGINE;
7359     vim_regfree(rmp->regprog);
7360     report_re_switch(pat);
7361     rmp->regprog = vim_regcomp(pat, re_flags);
7362     if (rmp->regprog != NULL) {
7363       rmp->regprog->re_in_use = true;
7364       result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
7365       rmp->regprog->re_in_use = false;
7366     }
7367 
7368     xfree(pat);
7369     p_re = save_p_re;
7370   }
7371 
7372   rex_in_use = rex_in_use_save;
7373   if (rex_in_use) {
7374     rex = rex_save;
7375   }
7376 
7377   return result > 0;
7378 }
7379 
7380 // Note: "*prog" may be freed and changed.
7381 // Return true if there is a match, false if not.
vim_regexec_prog(regprog_T ** prog,bool ignore_case,char_u * line,colnr_T col)7382 bool vim_regexec_prog(regprog_T **prog, bool ignore_case, char_u *line,
7383                       colnr_T col)
7384 {
7385   regmatch_T regmatch = { .regprog = *prog, .rm_ic = ignore_case };
7386   bool r = vim_regexec_string(&regmatch, line, col, false);
7387   *prog = regmatch.regprog;
7388   return r;
7389 }
7390 
7391 // Note: "rmp->regprog" may be freed and changed.
7392 // Return true if there is a match, false if not.
vim_regexec(regmatch_T * rmp,char_u * line,colnr_T col)7393 bool vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
7394 {
7395   return vim_regexec_string(rmp, line, col, false);
7396 }
7397 
7398 // Like vim_regexec(), but consider a "\n" in "line" to be a line break.
7399 // Note: "rmp->regprog" may be freed and changed.
7400 // Return true if there is a match, false if not.
vim_regexec_nl(regmatch_T * rmp,char_u * line,colnr_T col)7401 bool vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
7402 {
7403   return vim_regexec_string(rmp, line, col, true);
7404 }
7405 
7406 /// Match a regexp against multiple lines.
7407 /// "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
7408 /// Note: "rmp->regprog" may be freed and changed, even set to NULL.
7409 /// Uses curbuf for line count and 'iskeyword'.
7410 ///
7411 /// Return zero if there is no match.  Return number of lines contained in the
7412 /// match otherwise.
vim_regexec_multi(regmmatch_T * rmp,win_T * win,buf_T * buf,linenr_T lnum,colnr_T col,proftime_T * tm,int * timed_out)7413 long vim_regexec_multi(
7414     regmmatch_T *rmp,
7415     win_T       *win,               // window in which to search or NULL
7416     buf_T       *buf,               // buffer in which to search
7417     linenr_T lnum,                  // nr of line to start looking for match
7418     colnr_T col,                    // column to start looking for match
7419     proftime_T  *tm,                // timeout limit or NULL
7420     int         *timed_out          // flag is set when timeout limit reached
7421 )
7422   FUNC_ATTR_NONNULL_ARG(1)
7423 {
7424   regexec_T rex_save;
7425   bool rex_in_use_save = rex_in_use;
7426 
7427   // Cannot use the same prog recursively, it contains state.
7428   if (rmp->regprog->re_in_use) {
7429     emsg(_(e_recursive));
7430     return false;
7431   }
7432   rmp->regprog->re_in_use = true;
7433 
7434   if (rex_in_use) {
7435     // Being called recursively, save the state.
7436     rex_save = rex;
7437   }
7438   rex_in_use = true;
7439 
7440   int result = rmp->regprog->engine->regexec_multi(rmp, win, buf, lnum, col,
7441                                                    tm, timed_out);
7442   rmp->regprog->re_in_use = false;
7443 
7444   // NFA engine aborted because it's very slow, use backtracking engine instead.
7445   if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
7446       && result == NFA_TOO_EXPENSIVE) {
7447     int save_p_re = p_re;
7448     int re_flags = rmp->regprog->re_flags;
7449     char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
7450 
7451     p_re = BACKTRACKING_ENGINE;
7452     vim_regfree(rmp->regprog);
7453     report_re_switch(pat);
7454     // checking for \z misuse was already done when compiling for NFA,
7455     // allow all here
7456     reg_do_extmatch = REX_ALL;
7457     rmp->regprog = vim_regcomp(pat, re_flags);
7458     reg_do_extmatch = 0;
7459 
7460     if (rmp->regprog != NULL) {
7461       rmp->regprog->re_in_use = true;
7462       result = rmp->regprog->engine->regexec_multi(rmp, win, buf, lnum, col,
7463                                                    tm, timed_out);
7464       rmp->regprog->re_in_use = false;
7465     }
7466 
7467     xfree(pat);
7468     p_re = save_p_re;
7469   }
7470 
7471   rex_in_use = rex_in_use_save;
7472   if (rex_in_use) {
7473     rex = rex_save;
7474   }
7475 
7476   return result <= 0 ? 0 : result;
7477 }
7478