1 // This is an open source non-commercial project. Dear PVS-Studio, please check
2 // it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
3
4 // uncrustify:off
5
6 /*
7 * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
8 *
9 * NOTICE:
10 *
11 * This is NOT the original regular expression code as written by Henry
12 * Spencer. This code has been modified specifically for use with the VIM
13 * editor, and should not be used separately from Vim. If you want a good
14 * regular expression library, get the original code. The copyright notice
15 * that follows is from the original.
16 *
17 * END NOTICE
18 *
19 * Copyright (c) 1986 by University of Toronto.
20 * Written by Henry Spencer. Not derived from licensed software.
21 *
22 * Permission is granted to anyone to use this software for any
23 * purpose on any computer system, and to redistribute it freely,
24 * subject to the following restrictions:
25 *
26 * 1. The author is not responsible for the consequences of use of
27 * this software, no matter how awful, even if they arise
28 * from defects in it.
29 *
30 * 2. The origin of this software must not be misrepresented, either
31 * by explicit claim or by omission.
32 *
33 * 3. Altered versions must be plainly marked as such, and must not
34 * be misrepresented as being the original software.
35 *
36 * Beware that some of this code is subtly aware of the way operator
37 * precedence is structured in regular expressions. Serious changes in
38 * regular-expression syntax might require a total rethink.
39 *
40 * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
41 * Webb, Ciaran McCreesh and Bram Moolenaar.
42 * Named character class support added by Walter Briscoe (1998 Jul 01)
43 */
44
45 // By default: do not create debugging logs or files related to regular
46 // expressions, even when compiling with -DDEBUG.
47 // Uncomment the second line to get the regexp debugging.
48 // #undef REGEXP_DEBUG
49 // #define REGEXP_DEBUG
50
51 #include <assert.h>
52 #include <inttypes.h>
53 #include <stdbool.h>
54 #include <string.h>
55
56 #include "nvim/vim.h"
57 #include "nvim/ascii.h"
58 #include "nvim/regexp.h"
59 #include "nvim/charset.h"
60 #include "nvim/eval.h"
61 #include "nvim/eval/userfunc.h"
62 #include "nvim/ex_cmds2.h"
63 #include "nvim/mark.h"
64 #include "nvim/memline.h"
65 #include "nvim/memory.h"
66 #include "nvim/message.h"
67 #include "nvim/misc1.h"
68 #include "nvim/plines.h"
69 #include "nvim/garray.h"
70 #include "nvim/strings.h"
71
72 #ifdef REGEXP_DEBUG
73 /* show/save debugging data when BT engine is used */
74 # define BT_REGEXP_DUMP
75 /* save the debugging data to a file instead of displaying it */
76 # define BT_REGEXP_LOG
77 # define BT_REGEXP_DEBUG_LOG
78 # define BT_REGEXP_DEBUG_LOG_NAME "bt_regexp_debug.log"
79 #endif
80
81 /*
82 * The "internal use only" fields in regexp_defs.h are present to pass info from
83 * compile to execute that permits the execute phase to run lots faster on
84 * simple cases. They are:
85 *
86 * regstart char that must begin a match; NUL if none obvious; Can be a
87 * multi-byte character.
88 * reganch is the match anchored (at beginning-of-line only)?
89 * regmust string (pointer into program) that match must include, or NULL
90 * regmlen length of regmust string
91 * regflags RF_ values or'ed together
92 *
93 * Regstart and reganch permit very fast decisions on suitable starting points
94 * for a match, cutting down the work a lot. Regmust permits fast rejection
95 * of lines that cannot possibly match. The regmust tests are costly enough
96 * that vim_regcomp() supplies a regmust only if the r.e. contains something
97 * potentially expensive (at present, the only such thing detected is * or +
98 * at the start of the r.e., which can involve a lot of backup). Regmlen is
99 * supplied because the test in vim_regexec() needs it and vim_regcomp() is
100 * computing it anyway.
101 */
102
103 /*
104 * Structure for regexp "program". This is essentially a linear encoding
105 * of a nondeterministic finite-state machine (aka syntax charts or
106 * "railroad normal form" in parsing technology). Each node is an opcode
107 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
108 * all nodes except BRANCH and BRACES_COMPLEX implement concatenation; a "next"
109 * pointer with a BRANCH on both ends of it is connecting two alternatives.
110 * (Here we have one of the subtle syntax dependencies: an individual BRANCH
111 * (as opposed to a collection of them) is never concatenated with anything
112 * because of operator precedence). The "next" pointer of a BRACES_COMPLEX
113 * node points to the node after the stuff to be repeated.
114 * The operand of some types of node is a literal string; for others, it is a
115 * node leading into a sub-FSM. In particular, the operand of a BRANCH node
116 * is the first node of the branch.
117 * (NB this is *not* a tree structure: the tail of the branch connects to the
118 * thing following the set of BRANCHes.)
119 *
120 * pattern is coded like:
121 *
122 * +-----------------+
123 * | V
124 * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
125 * | ^ | ^
126 * +------+ +----------+
127 *
128 *
129 * +------------------+
130 * V |
131 * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
132 * | | ^ ^
133 * | +---------------+ |
134 * +---------------------------------------------+
135 *
136 *
137 * +----------------------+
138 * V |
139 * <aa>\+ BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END
140 * | | ^ ^
141 * | +-----------+ |
142 * +--------------------------------------------------+
143 *
144 *
145 * +-------------------------+
146 * V |
147 * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
148 * | | ^
149 * | +----------------+
150 * +-----------------------------------------------+
151 *
152 *
153 * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
154 * | | ^ ^
155 * | +----------------+ |
156 * +--------------------------------+
157 *
158 * +---------+
159 * | V
160 * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
161 * | | | | ^ ^
162 * | | | +-----+ |
163 * | | +----------------+ |
164 * | +---------------------------+ |
165 * +------------------------------------------------------+
166 *
167 * They all start with a BRANCH for "\|" alternatives, even when there is only
168 * one alternative.
169 */
170
171 /*
172 * The opcodes are:
173 */
174
175 /* definition number opnd? meaning */
176 #define END 0 /* End of program or NOMATCH operand. */
177 #define BOL 1 /* Match "" at beginning of line. */
178 #define EOL 2 /* Match "" at end of line. */
179 #define BRANCH 3 /* node Match this alternative, or the
180 * next... */
181 #define BACK 4 /* Match "", "next" ptr points backward. */
182 #define EXACTLY 5 /* str Match this string. */
183 #define NOTHING 6 /* Match empty string. */
184 #define STAR 7 /* node Match this (simple) thing 0 or more
185 * times. */
186 #define PLUS 8 /* node Match this (simple) thing 1 or more
187 * times. */
188 #define MATCH 9 /* node match the operand zero-width */
189 #define NOMATCH 10 /* node check for no match with operand */
190 #define BEHIND 11 /* node look behind for a match with operand */
191 #define NOBEHIND 12 /* node look behind for no match with operand */
192 #define SUBPAT 13 /* node match the operand here */
193 #define BRACE_SIMPLE 14 /* node Match this (simple) thing between m and
194 * n times (\{m,n\}). */
195 #define BOW 15 /* Match "" after [^a-zA-Z0-9_] */
196 #define EOW 16 /* Match "" at [^a-zA-Z0-9_] */
197 #define BRACE_LIMITS 17 /* nr nr define the min & max for BRACE_SIMPLE
198 * and BRACE_COMPLEX. */
199 #define NEWL 18 /* Match line-break */
200 #define BHPOS 19 /* End position for BEHIND or NOBEHIND */
201
202
203 /* character classes: 20-48 normal, 50-78 include a line-break */
204 #define ADD_NL 30
205 #define FIRST_NL ANY + ADD_NL
206 #define ANY 20 /* Match any one character. */
207 #define ANYOF 21 /* str Match any character in this string. */
208 #define ANYBUT 22 /* str Match any character not in this
209 * string. */
210 #define IDENT 23 /* Match identifier char */
211 #define SIDENT 24 /* Match identifier char but no digit */
212 #define KWORD 25 /* Match keyword char */
213 #define SKWORD 26 /* Match word char but no digit */
214 #define FNAME 27 /* Match file name char */
215 #define SFNAME 28 /* Match file name char but no digit */
216 #define PRINT 29 /* Match printable char */
217 #define SPRINT 30 /* Match printable char but no digit */
218 #define WHITE 31 /* Match whitespace char */
219 #define NWHITE 32 /* Match non-whitespace char */
220 #define DIGIT 33 /* Match digit char */
221 #define NDIGIT 34 /* Match non-digit char */
222 #define HEX 35 /* Match hex char */
223 #define NHEX 36 /* Match non-hex char */
224 #define OCTAL 37 /* Match octal char */
225 #define NOCTAL 38 /* Match non-octal char */
226 #define WORD 39 /* Match word char */
227 #define NWORD 40 /* Match non-word char */
228 #define HEAD 41 /* Match head char */
229 #define NHEAD 42 /* Match non-head char */
230 #define ALPHA 43 /* Match alpha char */
231 #define NALPHA 44 /* Match non-alpha char */
232 #define LOWER 45 /* Match lowercase char */
233 #define NLOWER 46 /* Match non-lowercase char */
234 #define UPPER 47 /* Match uppercase char */
235 #define NUPPER 48 /* Match non-uppercase char */
236 #define LAST_NL NUPPER + ADD_NL
237 // -V:WITH_NL:560
238 #define WITH_NL(op) ((op) >= FIRST_NL && (op) <= LAST_NL)
239
240 #define MOPEN 80 // -89 Mark this point in input as start of
241 // \( … \) subexpr. MOPEN + 0 marks start of
242 // match.
243 #define MCLOSE 90 // -99 Analogous to MOPEN. MCLOSE + 0 marks
244 // end of match.
245 #define BACKREF 100 // -109 node Match same string again \1-\9.
246
247 # define ZOPEN 110 // -119 Mark this point in input as start of
248 // \z( … \) subexpr.
249 # define ZCLOSE 120 // -129 Analogous to ZOPEN.
250 # define ZREF 130 // -139 node Match external submatch \z1-\z9
251
252 #define BRACE_COMPLEX 140 /* -149 node Match nodes between m & n times */
253
254 #define NOPEN 150 // Mark this point in input as start of
255 // \%( subexpr.
256 #define NCLOSE 151 // Analogous to NOPEN.
257
258 #define MULTIBYTECODE 200 /* mbc Match one multi-byte character */
259 #define RE_BOF 201 /* Match "" at beginning of file. */
260 #define RE_EOF 202 /* Match "" at end of file. */
261 #define CURSOR 203 /* Match location of cursor. */
262
263 #define RE_LNUM 204 /* nr cmp Match line number */
264 #define RE_COL 205 /* nr cmp Match column number */
265 #define RE_VCOL 206 /* nr cmp Match virtual column number */
266
267 #define RE_MARK 207 /* mark cmp Match mark position */
268 #define RE_VISUAL 208 /* Match Visual area */
269 #define RE_COMPOSING 209 // any composing characters
270
271 /*
272 * Magic characters have a special meaning, they don't match literally.
273 * Magic characters are negative. This separates them from literal characters
274 * (possibly multi-byte). Only ASCII characters can be Magic.
275 */
276 #define Magic(x) ((int)(x) - 256)
277 #define un_Magic(x) ((x) + 256)
278 #define is_Magic(x) ((x) < 0)
279
280 /*
281 * We should define ftpr as a pointer to a function returning a pointer to
282 * a function returning a pointer to a function ...
283 * This is impossible, so we declare a pointer to a function returning a
284 * pointer to a function returning void. This should work for all compilers.
285 */
286 typedef void (*(*fptr_T)(int *, int))(void);
287
288 typedef struct {
289 char_u *regparse;
290 int prevchr_len;
291 int curchr;
292 int prevchr;
293 int prevprevchr;
294 int nextchr;
295 int at_start;
296 int prev_at_start;
297 int regnpar;
298 } parse_state_T;
299
300 /*
301 * Structure used to save the current input state, when it needs to be
302 * restored after trying a match. Used by reg_save() and reg_restore().
303 * Also stores the length of "backpos".
304 */
305 typedef struct {
306 union {
307 char_u *ptr; ///< rex.input pointer, for single-line regexp
308 lpos_T pos; ///< rex.input pos, for multi-line regexp
309 } rs_u;
310 int rs_len;
311 } regsave_T;
312
313 /* struct to save start/end pointer/position in for \(\) */
314 typedef struct {
315 union {
316 char_u *ptr;
317 lpos_T pos;
318 } se_u;
319 } save_se_T;
320
321 /* used for BEHIND and NOBEHIND matching */
322 typedef struct regbehind_S {
323 regsave_T save_after;
324 regsave_T save_behind;
325 int save_need_clear_subexpr;
326 save_se_T save_start[NSUBEXP];
327 save_se_T save_end[NSUBEXP];
328 } regbehind_T;
329
330 /* Values for rs_state in regitem_T. */
331 typedef enum regstate_E {
332 RS_NOPEN = 0 /* NOPEN and NCLOSE */
333 , RS_MOPEN /* MOPEN + [0-9] */
334 , RS_MCLOSE /* MCLOSE + [0-9] */
335 , RS_ZOPEN /* ZOPEN + [0-9] */
336 , RS_ZCLOSE /* ZCLOSE + [0-9] */
337 , RS_BRANCH /* BRANCH */
338 , RS_BRCPLX_MORE /* BRACE_COMPLEX and trying one more match */
339 , RS_BRCPLX_LONG /* BRACE_COMPLEX and trying longest match */
340 , RS_BRCPLX_SHORT /* BRACE_COMPLEX and trying shortest match */
341 , RS_NOMATCH /* NOMATCH */
342 , RS_BEHIND1 /* BEHIND / NOBEHIND matching rest */
343 , RS_BEHIND2 /* BEHIND / NOBEHIND matching behind part */
344 , RS_STAR_LONG /* STAR/PLUS/BRACE_SIMPLE longest match */
345 , RS_STAR_SHORT /* STAR/PLUS/BRACE_SIMPLE shortest match */
346 } regstate_T;
347
348 /*
349 * When there are alternatives a regstate_T is put on the regstack to remember
350 * what we are doing.
351 * Before it may be another type of item, depending on rs_state, to remember
352 * more things.
353 */
354 typedef struct regitem_S {
355 regstate_T rs_state; // what we are doing, one of RS_ above
356 uint16_t rs_no; // submatch nr or BEHIND/NOBEHIND
357 char_u *rs_scan; // current node in program
358 union {
359 save_se_T sesave;
360 regsave_T regsave;
361 } rs_un; ///< room for saving rex.input
362 } regitem_T;
363
364
365 /* used for STAR, PLUS and BRACE_SIMPLE matching */
366 typedef struct regstar_S {
367 int nextb; /* next byte */
368 int nextb_ic; /* next byte reverse case */
369 long count;
370 long minval;
371 long maxval;
372 } regstar_T;
373
374 /* used to store input position when a BACK was encountered, so that we now if
375 * we made any progress since the last time. */
376 typedef struct backpos_S {
377 char_u *bp_scan; /* "scan" where BACK was encountered */
378 regsave_T bp_pos; /* last input position */
379 } backpos_T;
380
381 typedef struct {
382 int a, b, c;
383 } decomp_T;
384
385
386 #ifdef INCLUDE_GENERATED_DECLARATIONS
387 # include "regexp.c.generated.h"
388 #endif
no_Magic(int x)389 static int no_Magic(int x)
390 {
391 if (is_Magic(x))
392 return un_Magic(x);
393 return x;
394 }
395
toggle_Magic(int x)396 static int toggle_Magic(int x)
397 {
398 if (is_Magic(x))
399 return un_Magic(x);
400 return Magic(x);
401 }
402
403 /*
404 * The first byte of the regexp internal "program" is actually this magic
405 * number; the start node begins in the second byte. It's used to catch the
406 * most severe mutilation of the program by the caller.
407 */
408
409 #define REGMAGIC 0234
410
411 /*
412 * Opcode notes:
413 *
414 * BRANCH The set of branches constituting a single choice are hooked
415 * together with their "next" pointers, since precedence prevents
416 * anything being concatenated to any individual branch. The
417 * "next" pointer of the last BRANCH in a choice points to the
418 * thing following the whole choice. This is also where the
419 * final "next" pointer of each individual branch points; each
420 * branch starts with the operand node of a BRANCH node.
421 *
422 * BACK Normal "next" pointers all implicitly point forward; BACK
423 * exists to make loop structures possible.
424 *
425 * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
426 * BRANCH structures using BACK. Simple cases (one character
427 * per match) are implemented with STAR and PLUS for speed
428 * and to minimize recursive plunges.
429 *
430 * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
431 * node, and defines the min and max limits to be used for that
432 * node.
433 *
434 * MOPEN,MCLOSE ...are numbered at compile time.
435 * ZOPEN,ZCLOSE ...ditto
436 */
437
438 /*
439 * A node is one char of opcode followed by two chars of "next" pointer.
440 * "Next" pointers are stored as two 8-bit bytes, high order first. The
441 * value is a positive offset from the opcode of the node containing it.
442 * An operand, if any, simply follows the node. (Note that much of the
443 * code generation knows about this implicit relationship.)
444 *
445 * Using two bytes for the "next" pointer is vast overkill for most things,
446 * but allows patterns to get big without disasters.
447 */
448 #define OP(p) ((int)*(p))
449 #define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
450 #define OPERAND(p) ((p) + 3)
451 /* Obtain an operand that was stored as four bytes, MSB first. */
452 #define OPERAND_MIN(p) (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
453 + ((long)(p)[5] << 8) + (long)(p)[6])
454 /* Obtain a second operand stored as four bytes. */
455 #define OPERAND_MAX(p) OPERAND_MIN((p) + 4)
456 /* Obtain a second single-byte operand stored after a four bytes operand. */
457 #define OPERAND_CMP(p) (p)[7]
458
459 /*
460 * Utility definitions.
461 */
462 #define UCHARAT(p) ((int)*(char_u *)(p))
463
464 // Used for an error (down from) vim_regcomp(): give the error message, set
465 // rc_did_emsg and return NULL
466 #define EMSG_RET_NULL(m) return (emsg(m), rc_did_emsg = true, (void *)NULL)
467 #define IEMSG_RET_NULL(m) return (iemsg(m), rc_did_emsg = true, (void *)NULL)
468 #define EMSG_RET_FAIL(m) return (emsg(m), rc_did_emsg = true, FAIL)
469 #define EMSG2_RET_NULL(m, c) \
470 return (semsg((m), (c) ? "" : "\\"), rc_did_emsg = true, (void *)NULL)
471 #define EMSG2_RET_FAIL(m, c) \
472 return (semsg((m), (c) ? "" : "\\"), rc_did_emsg = true, FAIL)
473 #define EMSG_ONE_RET_NULL EMSG2_RET_NULL(_( \
474 "E369: invalid item in %s%%[]"), reg_magic == MAGIC_ALL)
475
476 #define MAX_LIMIT (32767L << 16L)
477
478
479 #ifdef BT_REGEXP_DUMP
480 static void regdump(char_u *, bt_regprog_T *);
481 #endif
482 #ifdef REGEXP_DEBUG
483 static char_u *regprop(char_u *);
484 #endif
485
486 static char_u e_missingbracket[] = N_("E769: Missing ] after %s[");
487 static char_u e_reverse_range[] = N_("E944: Reverse range in character class");
488 static char_u e_large_class[] = N_("E945: Range too large in character class");
489 static char_u e_unmatchedpp[] = N_("E53: Unmatched %s%%(");
490 static char_u e_unmatchedp[] = N_("E54: Unmatched %s(");
491 static char_u e_unmatchedpar[] = N_("E55: Unmatched %s)");
492 static char_u e_z_not_allowed[] = N_("E66: \\z( not allowed here");
493 static char_u e_z1_not_allowed[] = N_("E67: \\z1 - \\z9 not allowed here");
494 static char_u e_missing_sb[] = N_("E69: Missing ] after %s%%[");
495 static char_u e_empty_sb[] = N_("E70: Empty %s%%[]");
496 static char_u e_recursive[] = N_("E956: Cannot use pattern recursively");
497
498 #define NOT_MULTI 0
499 #define MULTI_ONE 1
500 #define MULTI_MULT 2
501 /*
502 * Return NOT_MULTI if c is not a "multi" operator.
503 * Return MULTI_ONE if c is a single "multi" operator.
504 * Return MULTI_MULT if c is a multi "multi" operator.
505 */
re_multi_type(int c)506 static int re_multi_type(int c)
507 {
508 if (c == Magic('@') || c == Magic('=') || c == Magic('?'))
509 return MULTI_ONE;
510 if (c == Magic('*') || c == Magic('+') || c == Magic('{'))
511 return MULTI_MULT;
512 return NOT_MULTI;
513 }
514
515 /*
516 * Flags to be passed up and down.
517 */
518 #define HASWIDTH 0x1 /* Known never to match null string. */
519 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
520 #define SPSTART 0x4 /* Starts with * or +. */
521 #define HASNL 0x8 /* Contains some \n. */
522 #define HASLOOKBH 0x10 /* Contains "\@<=" or "\@<!". */
523 #define WORST 0 /* Worst case. */
524
525 /*
526 * When regcode is set to this value, code is not emitted and size is computed
527 * instead.
528 */
529 #define JUST_CALC_SIZE ((char_u *) -1)
530
531 static char_u *reg_prev_sub = NULL;
532
533 /*
534 * REGEXP_INRANGE contains all characters which are always special in a []
535 * range after '\'.
536 * REGEXP_ABBR contains all characters which act as abbreviations after '\'.
537 * These are:
538 * \n - New line (NL).
539 * \r - Carriage Return (CR).
540 * \t - Tab (TAB).
541 * \e - Escape (ESC).
542 * \b - Backspace (Ctrl_H).
543 * \d - Character code in decimal, eg \d123
544 * \o - Character code in octal, eg \o80
545 * \x - Character code in hex, eg \x4a
546 * \u - Multibyte character code, eg \u20ac
547 * \U - Long multibyte character code, eg \U12345678
548 */
549 static char_u REGEXP_INRANGE[] = "]^-n\\";
550 static char_u REGEXP_ABBR[] = "nrtebdoxuU";
551
552
553 /*
554 * Translate '\x' to its control character, except "\n", which is Magic.
555 */
backslash_trans(int c)556 static int backslash_trans(int c)
557 {
558 switch (c) {
559 case 'r': return CAR;
560 case 't': return TAB;
561 case 'e': return ESC;
562 case 'b': return BS;
563 }
564 return c;
565 }
566
567 /*
568 * Check for a character class name "[:name:]". "pp" points to the '['.
569 * Returns one of the CLASS_ items. CLASS_NONE means that no item was
570 * recognized. Otherwise "pp" is advanced to after the item.
571 */
get_char_class(char_u ** pp)572 static int get_char_class(char_u **pp)
573 {
574 static const char *(class_names[]) =
575 {
576 "alnum:]",
577 #define CLASS_ALNUM 0
578 "alpha:]",
579 #define CLASS_ALPHA 1
580 "blank:]",
581 #define CLASS_BLANK 2
582 "cntrl:]",
583 #define CLASS_CNTRL 3
584 "digit:]",
585 #define CLASS_DIGIT 4
586 "graph:]",
587 #define CLASS_GRAPH 5
588 "lower:]",
589 #define CLASS_LOWER 6
590 "print:]",
591 #define CLASS_PRINT 7
592 "punct:]",
593 #define CLASS_PUNCT 8
594 "space:]",
595 #define CLASS_SPACE 9
596 "upper:]",
597 #define CLASS_UPPER 10
598 "xdigit:]",
599 #define CLASS_XDIGIT 11
600 "tab:]",
601 #define CLASS_TAB 12
602 "return:]",
603 #define CLASS_RETURN 13
604 "backspace:]",
605 #define CLASS_BACKSPACE 14
606 "escape:]",
607 #define CLASS_ESCAPE 15
608 "ident:]",
609 #define CLASS_IDENT 16
610 "keyword:]",
611 #define CLASS_KEYWORD 17
612 "fname:]",
613 #define CLASS_FNAME 18
614 };
615 #define CLASS_NONE 99
616 int i;
617
618 if ((*pp)[1] == ':') {
619 for (i = 0; i < (int)ARRAY_SIZE(class_names); ++i)
620 if (STRNCMP(*pp + 2, class_names[i], STRLEN(class_names[i])) == 0) {
621 *pp += STRLEN(class_names[i]) + 2;
622 return i;
623 }
624 }
625 return CLASS_NONE;
626 }
627
628 /*
629 * Specific version of character class functions.
630 * Using a table to keep this fast.
631 */
632 static short class_tab[256];
633
634 #define RI_DIGIT 0x01
635 #define RI_HEX 0x02
636 #define RI_OCTAL 0x04
637 #define RI_WORD 0x08
638 #define RI_HEAD 0x10
639 #define RI_ALPHA 0x20
640 #define RI_LOWER 0x40
641 #define RI_UPPER 0x80
642 #define RI_WHITE 0x100
643
init_class_tab(void)644 static void init_class_tab(void)
645 {
646 int i;
647 static int done = false;
648
649 if (done)
650 return;
651
652 for (i = 0; i < 256; ++i) {
653 if (i >= '0' && i <= '7')
654 class_tab[i] = RI_DIGIT + RI_HEX + RI_OCTAL + RI_WORD;
655 else if (i >= '8' && i <= '9')
656 class_tab[i] = RI_DIGIT + RI_HEX + RI_WORD;
657 else if (i >= 'a' && i <= 'f')
658 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
659 else if (i >= 'g' && i <= 'z')
660 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_LOWER;
661 else if (i >= 'A' && i <= 'F')
662 class_tab[i] = RI_HEX + RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
663 else if (i >= 'G' && i <= 'Z')
664 class_tab[i] = RI_WORD + RI_HEAD + RI_ALPHA + RI_UPPER;
665 else if (i == '_')
666 class_tab[i] = RI_WORD + RI_HEAD;
667 else
668 class_tab[i] = 0;
669 }
670 class_tab[' '] |= RI_WHITE;
671 class_tab['\t'] |= RI_WHITE;
672 done = true;
673 }
674
675 # define ri_digit(c) (c < 0x100 && (class_tab[c] & RI_DIGIT))
676 # define ri_hex(c) (c < 0x100 && (class_tab[c] & RI_HEX))
677 # define ri_octal(c) (c < 0x100 && (class_tab[c] & RI_OCTAL))
678 # define ri_word(c) (c < 0x100 && (class_tab[c] & RI_WORD))
679 # define ri_head(c) (c < 0x100 && (class_tab[c] & RI_HEAD))
680 # define ri_alpha(c) (c < 0x100 && (class_tab[c] & RI_ALPHA))
681 # define ri_lower(c) (c < 0x100 && (class_tab[c] & RI_LOWER))
682 # define ri_upper(c) (c < 0x100 && (class_tab[c] & RI_UPPER))
683 # define ri_white(c) (c < 0x100 && (class_tab[c] & RI_WHITE))
684
685 /* flags for regflags */
686 #define RF_ICASE 1 /* ignore case */
687 #define RF_NOICASE 2 /* don't ignore case */
688 #define RF_HASNL 4 /* can match a NL */
689 #define RF_ICOMBINE 8 /* ignore combining characters */
690 #define RF_LOOKBH 16 /* uses "\@<=" or "\@<!" */
691
692 // Global work variables for vim_regcomp().
693
694 static char_u *regparse; ///< Input-scan pointer.
695 static int prevchr_len; ///< byte length of previous char
696 static int num_complex_braces; ///< Complex \{...} count
697 static int regnpar; ///< () count.
698 static bool wants_nfa; ///< regex should use NFA engine
699 static int regnzpar; ///< \z() count.
700 static int re_has_z; ///< \z item detected
701 static char_u *regcode; ///< Code-emit pointer, or JUST_CALC_SIZE
702 static long regsize; ///< Code size.
703 static int reg_toolong; ///< true when offset out of range
704 static char_u had_endbrace[NSUBEXP]; ///< flags, true if end of () found
705 static unsigned regflags; ///< RF_ flags for prog
706 static long brace_min[10]; ///< Minimums for complex brace repeats
707 static long brace_max[10]; ///< Maximums for complex brace repeats
708 static int brace_count[10]; ///< Current counts for complex brace repeats
709 static int had_eol; ///< true when EOL found by vim_regcomp()
710 static int one_exactly = false; ///< only do one char for EXACTLY
711
712 static int reg_magic; /* magicness of the pattern: */
713 #define MAGIC_NONE 1 /* "\V" very unmagic */
714 #define MAGIC_OFF 2 /* "\M" or 'magic' off */
715 #define MAGIC_ON 3 /* "\m" or 'magic' */
716 #define MAGIC_ALL 4 /* "\v" very magic */
717
718 static int reg_string; // matching with a string instead of a buffer
719 // line
720 static int reg_strict; // "[abc" is illegal
721
722 /*
723 * META contains all characters that may be magic, except '^' and '$'.
724 */
725
726 /* META[] is used often enough to justify turning it into a table. */
727 static char_u META_flags[] = {
728 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
729 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
730 /* % & ( ) * + . */
731 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
732 /* 1 2 3 4 5 6 7 8 9 < = > ? */
733 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
734 /* @ A C D F H I K L M O */
735 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
736 /* P S U V W X Z [ _ */
737 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
738 /* a c d f h i k l m n o */
739 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
740 /* p s u v w x z { | ~ */
741 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1
742 };
743
744 static int curchr; // currently parsed character
745 // Previous character. Note: prevchr is sometimes -1 when we are not at the
746 // start, eg in /[ ^I]^ the pattern was never found even if it existed,
747 // because ^ was taken to be magic -- webb
748 static int prevchr;
749 static int prevprevchr; /* previous-previous character */
750 static int nextchr; /* used for ungetchr() */
751
752 /* arguments for reg() */
753 #define REG_NOPAREN 0 /* toplevel reg() */
754 #define REG_PAREN 1 /* \(\) */
755 #define REG_ZPAREN 2 /* \z(\) */
756 #define REG_NPAREN 3 /* \%(\) */
757
758 /*
759 * Forward declarations for vim_regcomp()'s friends.
760 */
761 # define REGMBC(x) regmbc(x);
762 # define CASEMBC(x) case x:
763
764 static regengine_T bt_regengine;
765 static regengine_T nfa_regengine;
766
767 // Return true if compiled regular expression "prog" can match a line break.
re_multiline(const regprog_T * prog)768 int re_multiline(const regprog_T *prog)
769 FUNC_ATTR_NONNULL_ALL
770 {
771 return prog->regflags & RF_HASNL;
772 }
773
774 /*
775 * Check for an equivalence class name "[=a=]". "pp" points to the '['.
776 * Returns a character representing the class. Zero means that no item was
777 * recognized. Otherwise "pp" is advanced to after the item.
778 */
get_equi_class(char_u ** pp)779 static int get_equi_class(char_u **pp)
780 {
781 int c;
782 int l = 1;
783 char_u *p = *pp;
784
785 if (p[1] == '=' && p[2] != NUL) {
786 l = utfc_ptr2len(p + 2);
787 if (p[l + 2] == '=' && p[l + 3] == ']') {
788 c = utf_ptr2char(p + 2);
789 *pp += l + 4;
790 return c;
791 }
792 }
793 return 0;
794 }
795
796
797 /*
798 * Produce the bytes for equivalence class "c".
799 * Currently only handles latin1, latin9 and utf-8.
800 * NOTE: When changing this function, also change nfa_emit_equi_class()
801 */
reg_equi_class(int c)802 static void reg_equi_class(int c)
803 {
804 {
805 switch (c) {
806 // Do not use '\300' style, it results in a negative number.
807 case 'A': case 0xc0: case 0xc1: case 0xc2:
808 case 0xc3: case 0xc4: case 0xc5:
809 CASEMBC(0x100) CASEMBC(0x102) CASEMBC(0x104) CASEMBC(0x1cd)
810 CASEMBC(0x1de) CASEMBC(0x1e0) CASEMBC(0x1ea2)
811 regmbc('A'); regmbc(0xc0); regmbc(0xc1);
812 regmbc(0xc2); regmbc(0xc3); regmbc(0xc4);
813 regmbc(0xc5);
814 REGMBC(0x100) REGMBC(0x102) REGMBC(0x104)
815 REGMBC(0x1cd) REGMBC(0x1de) REGMBC(0x1e0)
816 REGMBC(0x1ea2)
817 return;
818 case 'B': CASEMBC(0x1e02) CASEMBC(0x1e06)
819 regmbc('B'); REGMBC(0x1e02) REGMBC(0x1e06)
820 return;
821 case 'C': case 0xc7:
822 CASEMBC(0x106) CASEMBC(0x108) CASEMBC(0x10a) CASEMBC(0x10c)
823 regmbc('C'); regmbc(0xc7);
824 REGMBC(0x106) REGMBC(0x108) REGMBC(0x10a)
825 REGMBC(0x10c)
826 return;
827 case 'D': CASEMBC(0x10e) CASEMBC(0x110) CASEMBC(0x1e0a)
828 CASEMBC(0x1e0e) CASEMBC(0x1e10)
829 regmbc('D'); REGMBC(0x10e) REGMBC(0x110)
830 REGMBC(0x1e0a) REGMBC(0x1e0e) REGMBC(0x1e10)
831 return;
832 case 'E': case 0xc8: case 0xc9: case 0xca: case 0xcb:
833 CASEMBC(0x112) CASEMBC(0x114) CASEMBC(0x116) CASEMBC(0x118)
834 CASEMBC(0x11a) CASEMBC(0x1eba) CASEMBC(0x1ebc)
835 regmbc('E'); regmbc(0xc8); regmbc(0xc9);
836 regmbc(0xca); regmbc(0xcb);
837 REGMBC(0x112) REGMBC(0x114) REGMBC(0x116)
838 REGMBC(0x118) REGMBC(0x11a) REGMBC(0x1eba)
839 REGMBC(0x1ebc)
840 return;
841 case 'F': CASEMBC(0x1e1e)
842 regmbc('F'); REGMBC(0x1e1e)
843 return;
844 case 'G': CASEMBC(0x11c) CASEMBC(0x11e) CASEMBC(0x120)
845 CASEMBC(0x122) CASEMBC(0x1e4) CASEMBC(0x1e6) CASEMBC(0x1f4)
846 CASEMBC(0x1e20)
847 regmbc('G'); REGMBC(0x11c) REGMBC(0x11e)
848 REGMBC(0x120) REGMBC(0x122) REGMBC(0x1e4)
849 REGMBC(0x1e6) REGMBC(0x1f4) REGMBC(0x1e20)
850 return;
851 case 'H': CASEMBC(0x124) CASEMBC(0x126) CASEMBC(0x1e22)
852 CASEMBC(0x1e26) CASEMBC(0x1e28)
853 regmbc('H'); REGMBC(0x124) REGMBC(0x126)
854 REGMBC(0x1e22) REGMBC(0x1e26) REGMBC(0x1e28)
855 return;
856 case 'I': case 0xcc: case 0xcd: case 0xce: case 0xcf:
857 CASEMBC(0x128) CASEMBC(0x12a) CASEMBC(0x12c) CASEMBC(0x12e)
858 CASEMBC(0x130) CASEMBC(0x1cf) CASEMBC(0x1ec8)
859 regmbc('I'); regmbc(0xcc); regmbc(0xcd);
860 regmbc(0xce); regmbc(0xcf);
861 REGMBC(0x128) REGMBC(0x12a) REGMBC(0x12c)
862 REGMBC(0x12e) REGMBC(0x130) REGMBC(0x1cf)
863 REGMBC(0x1ec8)
864 return;
865 case 'J': CASEMBC(0x134)
866 regmbc('J'); REGMBC(0x134)
867 return;
868 case 'K': CASEMBC(0x136) CASEMBC(0x1e8) CASEMBC(0x1e30)
869 CASEMBC(0x1e34)
870 regmbc('K'); REGMBC(0x136) REGMBC(0x1e8)
871 REGMBC(0x1e30) REGMBC(0x1e34)
872 return;
873 case 'L': CASEMBC(0x139) CASEMBC(0x13b) CASEMBC(0x13d)
874 CASEMBC(0x13f) CASEMBC(0x141) CASEMBC(0x1e3a)
875 regmbc('L'); REGMBC(0x139) REGMBC(0x13b)
876 REGMBC(0x13d) REGMBC(0x13f) REGMBC(0x141)
877 REGMBC(0x1e3a)
878 return;
879 case 'M': CASEMBC(0x1e3e) CASEMBC(0x1e40)
880 regmbc('M'); REGMBC(0x1e3e) REGMBC(0x1e40)
881 return;
882 case 'N': case 0xd1:
883 CASEMBC(0x143) CASEMBC(0x145) CASEMBC(0x147) CASEMBC(0x1e44)
884 CASEMBC(0x1e48)
885 regmbc('N'); regmbc(0xd1);
886 REGMBC(0x143) REGMBC(0x145) REGMBC(0x147)
887 REGMBC(0x1e44) REGMBC(0x1e48)
888 return;
889 case 'O': case 0xd2: case 0xd3: case 0xd4: case 0xd5:
890 case 0xd6: case 0xd8:
891 CASEMBC(0x14c) CASEMBC(0x14e) CASEMBC(0x150) CASEMBC(0x1a0)
892 CASEMBC(0x1d1) CASEMBC(0x1ea) CASEMBC(0x1ec) CASEMBC(0x1ece)
893 regmbc('O'); regmbc(0xd2); regmbc(0xd3);
894 regmbc(0xd4); regmbc(0xd5); regmbc(0xd6);
895 regmbc(0xd8);
896 REGMBC(0x14c) REGMBC(0x14e) REGMBC(0x150)
897 REGMBC(0x1a0) REGMBC(0x1d1) REGMBC(0x1ea)
898 REGMBC(0x1ec) REGMBC(0x1ece)
899 return;
900 case 'P': case 0x1e54: case 0x1e56:
901 regmbc('P'); REGMBC(0x1e54) REGMBC(0x1e56)
902 return;
903 case 'R': CASEMBC(0x154) CASEMBC(0x156) CASEMBC(0x158)
904 CASEMBC(0x1e58) CASEMBC(0x1e5e)
905 regmbc('R'); REGMBC(0x154) REGMBC(0x156) REGMBC(0x158)
906 REGMBC(0x1e58) REGMBC(0x1e5e)
907 return;
908 case 'S': CASEMBC(0x15a) CASEMBC(0x15c) CASEMBC(0x15e)
909 CASEMBC(0x160) CASEMBC(0x1e60)
910 regmbc('S'); REGMBC(0x15a) REGMBC(0x15c)
911 REGMBC(0x15e) REGMBC(0x160) REGMBC(0x1e60)
912 return;
913 case 'T': CASEMBC(0x162) CASEMBC(0x164) CASEMBC(0x166)
914 CASEMBC(0x1e6a) CASEMBC(0x1e6e)
915 regmbc('T'); REGMBC(0x162) REGMBC(0x164)
916 REGMBC(0x166) REGMBC(0x1e6a) REGMBC(0x1e6e)
917 return;
918 case 'U': case 0xd9: case 0xda: case 0xdb: case 0xdc:
919 CASEMBC(0x168) CASEMBC(0x16a) CASEMBC(0x16c) CASEMBC(0x16e)
920 CASEMBC(0x170) CASEMBC(0x172) CASEMBC(0x1af) CASEMBC(0x1d3)
921 CASEMBC(0x1ee6)
922 regmbc('U'); regmbc(0xd9); regmbc(0xda);
923 regmbc(0xdb); regmbc(0xdc);
924 REGMBC(0x168) REGMBC(0x16a) REGMBC(0x16c)
925 REGMBC(0x16e) REGMBC(0x170) REGMBC(0x172)
926 REGMBC(0x1af) REGMBC(0x1d3) REGMBC(0x1ee6)
927 return;
928 case 'V': CASEMBC(0x1e7c)
929 regmbc('V'); REGMBC(0x1e7c)
930 return;
931 case 'W': CASEMBC(0x174) CASEMBC(0x1e80) CASEMBC(0x1e82)
932 CASEMBC(0x1e84) CASEMBC(0x1e86)
933 regmbc('W'); REGMBC(0x174) REGMBC(0x1e80)
934 REGMBC(0x1e82) REGMBC(0x1e84) REGMBC(0x1e86)
935 return;
936 case 'X': CASEMBC(0x1e8a) CASEMBC(0x1e8c)
937 regmbc('X'); REGMBC(0x1e8a) REGMBC(0x1e8c)
938 return;
939 case 'Y': case 0xdd:
940 CASEMBC(0x176) CASEMBC(0x178) CASEMBC(0x1e8e) CASEMBC(0x1ef2)
941 CASEMBC(0x1ef6) CASEMBC(0x1ef8)
942 regmbc('Y'); regmbc(0xdd);
943 REGMBC(0x176) REGMBC(0x178) REGMBC(0x1e8e)
944 REGMBC(0x1ef2) REGMBC(0x1ef6) REGMBC(0x1ef8)
945 return;
946 case 'Z': CASEMBC(0x179) CASEMBC(0x17b) CASEMBC(0x17d)
947 CASEMBC(0x1b5) CASEMBC(0x1e90) CASEMBC(0x1e94)
948 regmbc('Z'); REGMBC(0x179) REGMBC(0x17b)
949 REGMBC(0x17d) REGMBC(0x1b5) REGMBC(0x1e90)
950 REGMBC(0x1e94)
951 return;
952 case 'a': case 0xe0: case 0xe1: case 0xe2:
953 case 0xe3: case 0xe4: case 0xe5:
954 CASEMBC(0x101) CASEMBC(0x103) CASEMBC(0x105) CASEMBC(0x1ce)
955 CASEMBC(0x1df) CASEMBC(0x1e1) CASEMBC(0x1ea3)
956 regmbc('a'); regmbc(0xe0); regmbc(0xe1);
957 regmbc(0xe2); regmbc(0xe3); regmbc(0xe4);
958 regmbc(0xe5);
959 REGMBC(0x101) REGMBC(0x103) REGMBC(0x105)
960 REGMBC(0x1ce) REGMBC(0x1df) REGMBC(0x1e1)
961 REGMBC(0x1ea3)
962 return;
963 case 'b': CASEMBC(0x1e03) CASEMBC(0x1e07)
964 regmbc('b'); REGMBC(0x1e03) REGMBC(0x1e07)
965 return;
966 case 'c': case 0xe7:
967 CASEMBC(0x107) CASEMBC(0x109) CASEMBC(0x10b) CASEMBC(0x10d)
968 regmbc('c'); regmbc(0xe7);
969 REGMBC(0x107) REGMBC(0x109) REGMBC(0x10b)
970 REGMBC(0x10d)
971 return;
972 case 'd': CASEMBC(0x10f) CASEMBC(0x111) CASEMBC(0x1e0b)
973 CASEMBC(0x1e0f) CASEMBC(0x1e11)
974 regmbc('d'); REGMBC(0x10f) REGMBC(0x111)
975 REGMBC(0x1e0b) REGMBC(0x1e0f) REGMBC(0x1e11)
976 return;
977 case 'e': case 0xe8: case 0xe9: case 0xea: case 0xeb:
978 CASEMBC(0x113) CASEMBC(0x115) CASEMBC(0x117) CASEMBC(0x119)
979 CASEMBC(0x11b) CASEMBC(0x1ebb) CASEMBC(0x1ebd)
980 regmbc('e'); regmbc(0xe8); regmbc(0xe9);
981 regmbc(0xea); regmbc(0xeb);
982 REGMBC(0x113) REGMBC(0x115) REGMBC(0x117)
983 REGMBC(0x119) REGMBC(0x11b) REGMBC(0x1ebb)
984 REGMBC(0x1ebd)
985 return;
986 case 'f': CASEMBC(0x1e1f)
987 regmbc('f'); REGMBC(0x1e1f)
988 return;
989 case 'g': CASEMBC(0x11d) CASEMBC(0x11f) CASEMBC(0x121)
990 CASEMBC(0x123) CASEMBC(0x1e5) CASEMBC(0x1e7) CASEMBC(0x1f5)
991 CASEMBC(0x1e21)
992 regmbc('g'); REGMBC(0x11d) REGMBC(0x11f)
993 REGMBC(0x121) REGMBC(0x123) REGMBC(0x1e5)
994 REGMBC(0x1e7) REGMBC(0x1f5) REGMBC(0x1e21)
995 return;
996 case 'h': CASEMBC(0x125) CASEMBC(0x127) CASEMBC(0x1e23)
997 CASEMBC(0x1e27) CASEMBC(0x1e29) CASEMBC(0x1e96)
998 regmbc('h'); REGMBC(0x125) REGMBC(0x127)
999 REGMBC(0x1e23) REGMBC(0x1e27) REGMBC(0x1e29)
1000 REGMBC(0x1e96)
1001 return;
1002 case 'i': case 0xec: case 0xed: case 0xee: case 0xef:
1003 CASEMBC(0x129) CASEMBC(0x12b) CASEMBC(0x12d) CASEMBC(0x12f)
1004 CASEMBC(0x1d0) CASEMBC(0x1ec9)
1005 regmbc('i'); regmbc(0xec); regmbc(0xed);
1006 regmbc(0xee); regmbc(0xef);
1007 REGMBC(0x129) REGMBC(0x12b) REGMBC(0x12d)
1008 REGMBC(0x12f) REGMBC(0x1d0) REGMBC(0x1ec9)
1009 return;
1010 case 'j': CASEMBC(0x135) CASEMBC(0x1f0)
1011 regmbc('j'); REGMBC(0x135) REGMBC(0x1f0)
1012 return;
1013 case 'k': CASEMBC(0x137) CASEMBC(0x1e9) CASEMBC(0x1e31)
1014 CASEMBC(0x1e35)
1015 regmbc('k'); REGMBC(0x137) REGMBC(0x1e9)
1016 REGMBC(0x1e31) REGMBC(0x1e35)
1017 return;
1018 case 'l': CASEMBC(0x13a) CASEMBC(0x13c) CASEMBC(0x13e)
1019 CASEMBC(0x140) CASEMBC(0x142) CASEMBC(0x1e3b)
1020 regmbc('l'); REGMBC(0x13a) REGMBC(0x13c)
1021 REGMBC(0x13e) REGMBC(0x140) REGMBC(0x142)
1022 REGMBC(0x1e3b)
1023 return;
1024 case 'm': CASEMBC(0x1e3f) CASEMBC(0x1e41)
1025 regmbc('m'); REGMBC(0x1e3f) REGMBC(0x1e41)
1026 return;
1027 case 'n': case 0xf1:
1028 CASEMBC(0x144) CASEMBC(0x146) CASEMBC(0x148) CASEMBC(0x149)
1029 CASEMBC(0x1e45) CASEMBC(0x1e49)
1030 regmbc('n'); regmbc(0xf1);
1031 REGMBC(0x144) REGMBC(0x146) REGMBC(0x148)
1032 REGMBC(0x149) REGMBC(0x1e45) REGMBC(0x1e49)
1033 return;
1034 case 'o': case 0xf2: case 0xf3: case 0xf4: case 0xf5:
1035 case 0xf6: case 0xf8:
1036 CASEMBC(0x14d) CASEMBC(0x14f) CASEMBC(0x151) CASEMBC(0x1a1)
1037 CASEMBC(0x1d2) CASEMBC(0x1eb) CASEMBC(0x1ed) CASEMBC(0x1ecf)
1038 regmbc('o'); regmbc(0xf2); regmbc(0xf3);
1039 regmbc(0xf4); regmbc(0xf5); regmbc(0xf6);
1040 regmbc(0xf8);
1041 REGMBC(0x14d) REGMBC(0x14f) REGMBC(0x151)
1042 REGMBC(0x1a1) REGMBC(0x1d2) REGMBC(0x1eb)
1043 REGMBC(0x1ed) REGMBC(0x1ecf)
1044 return;
1045 case 'p': CASEMBC(0x1e55) CASEMBC(0x1e57)
1046 regmbc('p'); REGMBC(0x1e55) REGMBC(0x1e57)
1047 return;
1048 case 'r': CASEMBC(0x155) CASEMBC(0x157) CASEMBC(0x159)
1049 CASEMBC(0x1e59) CASEMBC(0x1e5f)
1050 regmbc('r'); REGMBC(0x155) REGMBC(0x157) REGMBC(0x159)
1051 REGMBC(0x1e59) REGMBC(0x1e5f)
1052 return;
1053 case 's': CASEMBC(0x15b) CASEMBC(0x15d) CASEMBC(0x15f)
1054 CASEMBC(0x161) CASEMBC(0x1e61)
1055 regmbc('s'); REGMBC(0x15b) REGMBC(0x15d)
1056 REGMBC(0x15f) REGMBC(0x161) REGMBC(0x1e61)
1057 return;
1058 case 't': CASEMBC(0x163) CASEMBC(0x165) CASEMBC(0x167)
1059 CASEMBC(0x1e6b) CASEMBC(0x1e6f) CASEMBC(0x1e97)
1060 regmbc('t'); REGMBC(0x163) REGMBC(0x165) REGMBC(0x167)
1061 REGMBC(0x1e6b) REGMBC(0x1e6f) REGMBC(0x1e97)
1062 return;
1063 case 'u': case 0xf9: case 0xfa: case 0xfb: case 0xfc:
1064 CASEMBC(0x169) CASEMBC(0x16b) CASEMBC(0x16d) CASEMBC(0x16f)
1065 CASEMBC(0x171) CASEMBC(0x173) CASEMBC(0x1b0) CASEMBC(0x1d4)
1066 CASEMBC(0x1ee7)
1067 regmbc('u'); regmbc(0xf9); regmbc(0xfa);
1068 regmbc(0xfb); regmbc(0xfc);
1069 REGMBC(0x169) REGMBC(0x16b) REGMBC(0x16d)
1070 REGMBC(0x16f) REGMBC(0x171) REGMBC(0x173)
1071 REGMBC(0x1b0) REGMBC(0x1d4) REGMBC(0x1ee7)
1072 return;
1073 case 'v': CASEMBC(0x1e7d)
1074 regmbc('v'); REGMBC(0x1e7d)
1075 return;
1076 case 'w': CASEMBC(0x175) CASEMBC(0x1e81) CASEMBC(0x1e83)
1077 CASEMBC(0x1e85) CASEMBC(0x1e87) CASEMBC(0x1e98)
1078 regmbc('w'); REGMBC(0x175) REGMBC(0x1e81)
1079 REGMBC(0x1e83) REGMBC(0x1e85) REGMBC(0x1e87)
1080 REGMBC(0x1e98)
1081 return;
1082 case 'x': CASEMBC(0x1e8b) CASEMBC(0x1e8d)
1083 regmbc('x'); REGMBC(0x1e8b) REGMBC(0x1e8d)
1084 return;
1085 case 'y': case 0xfd: case 0xff:
1086 CASEMBC(0x177) CASEMBC(0x1e8f) CASEMBC(0x1e99)
1087 CASEMBC(0x1ef3) CASEMBC(0x1ef7) CASEMBC(0x1ef9)
1088 regmbc('y'); regmbc(0xfd); regmbc(0xff);
1089 REGMBC(0x177) REGMBC(0x1e8f) REGMBC(0x1e99)
1090 REGMBC(0x1ef3) REGMBC(0x1ef7) REGMBC(0x1ef9)
1091 return;
1092 case 'z': CASEMBC(0x17a) CASEMBC(0x17c) CASEMBC(0x17e)
1093 CASEMBC(0x1b6) CASEMBC(0x1e91) CASEMBC(0x1e95)
1094 regmbc('z'); REGMBC(0x17a) REGMBC(0x17c)
1095 REGMBC(0x17e) REGMBC(0x1b6) REGMBC(0x1e91)
1096 REGMBC(0x1e95)
1097 return;
1098 }
1099 }
1100 regmbc(c);
1101 }
1102
1103 /*
1104 * Check for a collating element "[.a.]". "pp" points to the '['.
1105 * Returns a character. Zero means that no item was recognized. Otherwise
1106 * "pp" is advanced to after the item.
1107 * Currently only single characters are recognized!
1108 */
get_coll_element(char_u ** pp)1109 static int get_coll_element(char_u **pp)
1110 {
1111 int c;
1112 int l = 1;
1113 char_u *p = *pp;
1114
1115 if (p[0] != NUL && p[1] == '.' && p[2] != NUL) {
1116 l = utfc_ptr2len(p + 2);
1117 if (p[l + 2] == '.' && p[l + 3] == ']') {
1118 c = utf_ptr2char(p + 2);
1119 *pp += l + 4;
1120 return c;
1121 }
1122 }
1123 return 0;
1124 }
1125
1126 static int reg_cpo_lit; /* 'cpoptions' contains 'l' flag */
1127
get_cpo_flags(void)1128 static void get_cpo_flags(void)
1129 {
1130 reg_cpo_lit = vim_strchr(p_cpo, CPO_LITERAL) != NULL;
1131 }
1132
1133 /*
1134 * Skip over a "[]" range.
1135 * "p" must point to the character after the '['.
1136 * The returned pointer is on the matching ']', or the terminating NUL.
1137 */
skip_anyof(char_u * p)1138 static char_u *skip_anyof(char_u *p)
1139 {
1140 int l;
1141
1142 if (*p == '^') /* Complement of range. */
1143 ++p;
1144 if (*p == ']' || *p == '-')
1145 ++p;
1146 while (*p != NUL && *p != ']') {
1147 if ((l = utfc_ptr2len(p)) > 1) {
1148 p += l;
1149 } else if (*p == '-') {
1150 p++;
1151 if (*p != ']' && *p != NUL) {
1152 MB_PTR_ADV(p);
1153 }
1154 } else if (*p == '\\'
1155 && (vim_strchr(REGEXP_INRANGE, p[1]) != NULL
1156 || (!reg_cpo_lit
1157 && vim_strchr(REGEXP_ABBR, p[1]) != NULL))) {
1158 p += 2;
1159 } else if (*p == '[') {
1160 if (get_char_class(&p) == CLASS_NONE
1161 && get_equi_class(&p) == 0
1162 && get_coll_element(&p) == 0
1163 && *p != NUL) {
1164 p++; // It is not a class name and not NUL
1165 }
1166 } else {
1167 p++;
1168 }
1169 }
1170
1171 return p;
1172 }
1173
1174 /*
1175 * Skip past regular expression.
1176 * Stop at end of "startp" or where "dirc" is found ('/', '?', etc).
1177 * Take care of characters with a backslash in front of it.
1178 * Skip strings inside [ and ].
1179 * When "newp" is not NULL and "dirc" is '?', make an allocated copy of the
1180 * expression and change "\?" to "?". If "*newp" is not NULL the expression
1181 * is changed in-place.
1182 */
skip_regexp(char_u * startp,int dirc,int magic,char_u ** newp)1183 char_u *skip_regexp(char_u *startp, int dirc, int magic, char_u **newp)
1184 {
1185 int mymagic;
1186 char_u *p = startp;
1187
1188 if (magic)
1189 mymagic = MAGIC_ON;
1190 else
1191 mymagic = MAGIC_OFF;
1192 get_cpo_flags();
1193
1194 for (; p[0] != NUL; MB_PTR_ADV(p)) {
1195 if (p[0] == dirc) { // found end of regexp
1196 break;
1197 }
1198 if ((p[0] == '[' && mymagic >= MAGIC_ON)
1199 || (p[0] == '\\' && p[1] == '[' && mymagic <= MAGIC_OFF)) {
1200 p = skip_anyof(p + 1);
1201 if (p[0] == NUL)
1202 break;
1203 } else if (p[0] == '\\' && p[1] != NUL) {
1204 if (dirc == '?' && newp != NULL && p[1] == '?') {
1205 /* change "\?" to "?", make a copy first. */
1206 if (*newp == NULL) {
1207 *newp = vim_strsave(startp);
1208 p = *newp + (p - startp);
1209 }
1210 STRMOVE(p, p + 1);
1211 } else
1212 ++p; /* skip next character */
1213 if (*p == 'v')
1214 mymagic = MAGIC_ALL;
1215 else if (*p == 'V')
1216 mymagic = MAGIC_NONE;
1217 }
1218 }
1219 return p;
1220 }
1221
1222 /// Return true if the back reference is legal. We must have seen the close
1223 /// brace.
1224 /// TODO(vim): Should also check that we don't refer to something repeated
1225 /// (+*=): what instance of the repetition should we match?
seen_endbrace(int refnum)1226 static int seen_endbrace(int refnum)
1227 {
1228 if (!had_endbrace[refnum]) {
1229 char_u *p;
1230
1231 // Trick: check if "@<=" or "@<!" follows, in which case
1232 // the \1 can appear before the referenced match.
1233 for (p = regparse; *p != NUL; p++) {
1234 if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '=')) {
1235 break;
1236 }
1237 }
1238
1239 if (*p == NUL) {
1240 emsg(_("E65: Illegal back reference"));
1241 rc_did_emsg = true;
1242 return false;
1243 }
1244 }
1245 return true;
1246 }
1247
1248 /*
1249 * bt_regcomp() - compile a regular expression into internal code for the
1250 * traditional back track matcher.
1251 * Returns the program in allocated space. Returns NULL for an error.
1252 *
1253 * We can't allocate space until we know how big the compiled form will be,
1254 * but we can't compile it (and thus know how big it is) until we've got a
1255 * place to put the code. So we cheat: we compile it twice, once with code
1256 * generation turned off and size counting turned on, and once "for real".
1257 * This also means that we don't allocate space until we are sure that the
1258 * thing really will compile successfully, and we never have to move the
1259 * code and thus invalidate pointers into it. (Note that it has to be in
1260 * one piece because free() must be able to free it all.)
1261 *
1262 * Whether upper/lower case is to be ignored is decided when executing the
1263 * program, it does not matter here.
1264 *
1265 * Beware that the optimization-preparation code in here knows about some
1266 * of the structure of the compiled regexp.
1267 * "re_flags": RE_MAGIC and/or RE_STRING.
1268 */
bt_regcomp(char_u * expr,int re_flags)1269 static regprog_T *bt_regcomp(char_u *expr, int re_flags)
1270 {
1271 char_u *scan;
1272 char_u *longest;
1273 int len;
1274 int flags;
1275
1276 if (expr == NULL) {
1277 IEMSG_RET_NULL(_(e_null));
1278 }
1279
1280 init_class_tab();
1281
1282 /*
1283 * First pass: determine size, legality.
1284 */
1285 regcomp_start(expr, re_flags);
1286 regcode = JUST_CALC_SIZE;
1287 regc(REGMAGIC);
1288 if (reg(REG_NOPAREN, &flags) == NULL)
1289 return NULL;
1290
1291 /* Allocate space. */
1292 bt_regprog_T *r = xmalloc(sizeof(bt_regprog_T) + regsize);
1293 r->re_in_use = false;
1294
1295 /*
1296 * Second pass: emit code.
1297 */
1298 regcomp_start(expr, re_flags);
1299 regcode = r->program;
1300 regc(REGMAGIC);
1301 if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong) {
1302 xfree(r);
1303 if (reg_toolong)
1304 EMSG_RET_NULL(_("E339: Pattern too long"));
1305 return NULL;
1306 }
1307
1308 /* Dig out information for optimizations. */
1309 r->regstart = NUL; /* Worst-case defaults. */
1310 r->reganch = 0;
1311 r->regmust = NULL;
1312 r->regmlen = 0;
1313 r->regflags = regflags;
1314 if (flags & HASNL)
1315 r->regflags |= RF_HASNL;
1316 if (flags & HASLOOKBH)
1317 r->regflags |= RF_LOOKBH;
1318 /* Remember whether this pattern has any \z specials in it. */
1319 r->reghasz = re_has_z;
1320 scan = r->program + 1; /* First BRANCH. */
1321 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
1322 scan = OPERAND(scan);
1323
1324 /* Starting-point info. */
1325 if (OP(scan) == BOL || OP(scan) == RE_BOF) {
1326 r->reganch++;
1327 scan = regnext(scan);
1328 }
1329
1330 if (OP(scan) == EXACTLY) {
1331 r->regstart = utf_ptr2char(OPERAND(scan));
1332 } else if (OP(scan) == BOW
1333 || OP(scan) == EOW
1334 || OP(scan) == NOTHING
1335 || OP(scan) == MOPEN + 0 || OP(scan) == NOPEN
1336 || OP(scan) == MCLOSE + 0 || OP(scan) == NCLOSE) {
1337 char_u *regnext_scan = regnext(scan);
1338 if (OP(regnext_scan) == EXACTLY) {
1339 r->regstart = utf_ptr2char(OPERAND(regnext_scan));
1340 }
1341 }
1342
1343 /*
1344 * If there's something expensive in the r.e., find the longest
1345 * literal string that must appear and make it the regmust. Resolve
1346 * ties in favor of later strings, since the regstart check works
1347 * with the beginning of the r.e. and avoiding duplication
1348 * strengthens checking. Not a strong reason, but sufficient in the
1349 * absence of others.
1350 */
1351 /*
1352 * When the r.e. starts with BOW, it is faster to look for a regmust
1353 * first. Used a lot for "#" and "*" commands. (Added by mool).
1354 */
1355 if ((flags & SPSTART || OP(scan) == BOW || OP(scan) == EOW)
1356 && !(flags & HASNL)) {
1357 longest = NULL;
1358 len = 0;
1359 for (; scan != NULL; scan = regnext(scan))
1360 if (OP(scan) == EXACTLY && STRLEN(OPERAND(scan)) >= (size_t)len) {
1361 longest = OPERAND(scan);
1362 len = (int)STRLEN(OPERAND(scan));
1363 }
1364 r->regmust = longest;
1365 r->regmlen = len;
1366 }
1367 }
1368 #ifdef BT_REGEXP_DUMP
1369 regdump(expr, r);
1370 #endif
1371 r->engine = &bt_regengine;
1372 return (regprog_T *)r;
1373 }
1374
1375 /*
1376 * Free a compiled regexp program, returned by bt_regcomp().
1377 */
bt_regfree(regprog_T * prog)1378 static void bt_regfree(regprog_T *prog)
1379 {
1380 xfree(prog);
1381 }
1382
1383 /*
1384 * Setup to parse the regexp. Used once to get the length and once to do it.
1385 */
1386 static void
regcomp_start(char_u * expr,int re_flags)1387 regcomp_start (
1388 char_u *expr,
1389 int re_flags /* see vim_regcomp() */
1390 )
1391 {
1392 initchr(expr);
1393 if (re_flags & RE_MAGIC)
1394 reg_magic = MAGIC_ON;
1395 else
1396 reg_magic = MAGIC_OFF;
1397 reg_string = (re_flags & RE_STRING);
1398 reg_strict = (re_flags & RE_STRICT);
1399 get_cpo_flags();
1400
1401 num_complex_braces = 0;
1402 regnpar = 1;
1403 memset(had_endbrace, 0, sizeof(had_endbrace));
1404 regnzpar = 1;
1405 re_has_z = 0;
1406 regsize = 0L;
1407 reg_toolong = false;
1408 regflags = 0;
1409 had_eol = false;
1410 }
1411
1412 /*
1413 * Check if during the previous call to vim_regcomp the EOL item "$" has been
1414 * found. This is messy, but it works fine.
1415 */
vim_regcomp_had_eol(void)1416 int vim_regcomp_had_eol(void)
1417 {
1418 return had_eol;
1419 }
1420
1421 // variables used for parsing
1422 static int at_start; // True when on the first character
1423 static int prev_at_start; // True when on the second character
1424
1425 /*
1426 * Parse regular expression, i.e. main body or parenthesized thing.
1427 *
1428 * Caller must absorb opening parenthesis.
1429 *
1430 * Combining parenthesis handling with the base level of regular expression
1431 * is a trifle forced, but the need to tie the tails of the branches to what
1432 * follows makes it hard to avoid.
1433 */
1434 static char_u *
reg(int paren,int * flagp)1435 reg (
1436 int paren, /* REG_NOPAREN, REG_PAREN, REG_NPAREN or REG_ZPAREN */
1437 int *flagp
1438 )
1439 {
1440 char_u *ret;
1441 char_u *br;
1442 char_u *ender;
1443 int parno = 0;
1444 int flags;
1445
1446 *flagp = HASWIDTH; /* Tentatively. */
1447
1448 if (paren == REG_ZPAREN) {
1449 /* Make a ZOPEN node. */
1450 if (regnzpar >= NSUBEXP)
1451 EMSG_RET_NULL(_("E50: Too many \\z("));
1452 parno = regnzpar;
1453 regnzpar++;
1454 ret = regnode(ZOPEN + parno);
1455 } else if (paren == REG_PAREN) {
1456 /* Make a MOPEN node. */
1457 if (regnpar >= NSUBEXP)
1458 EMSG2_RET_NULL(_("E51: Too many %s("), reg_magic == MAGIC_ALL);
1459 parno = regnpar;
1460 ++regnpar;
1461 ret = regnode(MOPEN + parno);
1462 } else if (paren == REG_NPAREN) {
1463 /* Make a NOPEN node. */
1464 ret = regnode(NOPEN);
1465 } else
1466 ret = NULL;
1467
1468 /* Pick up the branches, linking them together. */
1469 br = regbranch(&flags);
1470 if (br == NULL)
1471 return NULL;
1472 if (ret != NULL)
1473 regtail(ret, br); /* [MZ]OPEN -> first. */
1474 else
1475 ret = br;
1476 /* If one of the branches can be zero-width, the whole thing can.
1477 * If one of the branches has * at start or matches a line-break, the
1478 * whole thing can. */
1479 if (!(flags & HASWIDTH))
1480 *flagp &= ~HASWIDTH;
1481 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1482 while (peekchr() == Magic('|')) {
1483 skipchr();
1484 br = regbranch(&flags);
1485 if (br == NULL || reg_toolong)
1486 return NULL;
1487 regtail(ret, br); /* BRANCH -> BRANCH. */
1488 if (!(flags & HASWIDTH))
1489 *flagp &= ~HASWIDTH;
1490 *flagp |= flags & (SPSTART | HASNL | HASLOOKBH);
1491 }
1492
1493 /* Make a closing node, and hook it on the end. */
1494 ender = regnode(
1495 paren == REG_ZPAREN ? ZCLOSE + parno :
1496 paren == REG_PAREN ? MCLOSE + parno :
1497 paren == REG_NPAREN ? NCLOSE : END);
1498 regtail(ret, ender);
1499
1500 /* Hook the tails of the branches to the closing node. */
1501 for (br = ret; br != NULL; br = regnext(br))
1502 regoptail(br, ender);
1503
1504 /* Check for proper termination. */
1505 if (paren != REG_NOPAREN && getchr() != Magic(')')) {
1506 if (paren == REG_ZPAREN)
1507 EMSG_RET_NULL(_("E52: Unmatched \\z("));
1508 else if (paren == REG_NPAREN)
1509 EMSG2_RET_NULL(_(e_unmatchedpp), reg_magic == MAGIC_ALL);
1510 else
1511 EMSG2_RET_NULL(_(e_unmatchedp), reg_magic == MAGIC_ALL);
1512 } else if (paren == REG_NOPAREN && peekchr() != NUL) {
1513 if (curchr == Magic(')'))
1514 EMSG2_RET_NULL(_(e_unmatchedpar), reg_magic == MAGIC_ALL);
1515 else
1516 EMSG_RET_NULL(_(e_trailing)); /* "Can't happen". */
1517 /* NOTREACHED */
1518 }
1519 // Here we set the flag allowing back references to this set of
1520 // parentheses.
1521 if (paren == REG_PAREN) {
1522 had_endbrace[parno] = true; // have seen the close paren
1523 }
1524 return ret;
1525 }
1526
1527 /*
1528 * Parse one alternative of an | operator.
1529 * Implements the & operator.
1530 */
regbranch(int * flagp)1531 static char_u *regbranch(int *flagp)
1532 {
1533 char_u *ret;
1534 char_u *chain = NULL;
1535 char_u *latest;
1536 int flags;
1537
1538 *flagp = WORST | HASNL; /* Tentatively. */
1539
1540 ret = regnode(BRANCH);
1541 for (;; ) {
1542 latest = regconcat(&flags);
1543 if (latest == NULL)
1544 return NULL;
1545 /* If one of the branches has width, the whole thing has. If one of
1546 * the branches anchors at start-of-line, the whole thing does.
1547 * If one of the branches uses look-behind, the whole thing does. */
1548 *flagp |= flags & (HASWIDTH | SPSTART | HASLOOKBH);
1549 /* If one of the branches doesn't match a line-break, the whole thing
1550 * doesn't. */
1551 *flagp &= ~HASNL | (flags & HASNL);
1552 if (chain != NULL)
1553 regtail(chain, latest);
1554 if (peekchr() != Magic('&'))
1555 break;
1556 skipchr();
1557 regtail(latest, regnode(END)); /* operand ends */
1558 if (reg_toolong)
1559 break;
1560 reginsert(MATCH, latest);
1561 chain = latest;
1562 }
1563
1564 return ret;
1565 }
1566
1567 /*
1568 * Parse one alternative of an | or & operator.
1569 * Implements the concatenation operator.
1570 */
regconcat(int * flagp)1571 static char_u *regconcat(int *flagp)
1572 {
1573 char_u *first = NULL;
1574 char_u *chain = NULL;
1575 char_u *latest;
1576 int flags;
1577 int cont = true;
1578
1579 *flagp = WORST; /* Tentatively. */
1580
1581 while (cont) {
1582 switch (peekchr()) {
1583 case NUL:
1584 case Magic('|'):
1585 case Magic('&'):
1586 case Magic(')'):
1587 cont = false;
1588 break;
1589 case Magic('Z'):
1590 regflags |= RF_ICOMBINE;
1591 skipchr_keepstart();
1592 break;
1593 case Magic('c'):
1594 regflags |= RF_ICASE;
1595 skipchr_keepstart();
1596 break;
1597 case Magic('C'):
1598 regflags |= RF_NOICASE;
1599 skipchr_keepstart();
1600 break;
1601 case Magic('v'):
1602 reg_magic = MAGIC_ALL;
1603 skipchr_keepstart();
1604 curchr = -1;
1605 break;
1606 case Magic('m'):
1607 reg_magic = MAGIC_ON;
1608 skipchr_keepstart();
1609 curchr = -1;
1610 break;
1611 case Magic('M'):
1612 reg_magic = MAGIC_OFF;
1613 skipchr_keepstart();
1614 curchr = -1;
1615 break;
1616 case Magic('V'):
1617 reg_magic = MAGIC_NONE;
1618 skipchr_keepstart();
1619 curchr = -1;
1620 break;
1621 default:
1622 latest = regpiece(&flags);
1623 if (latest == NULL || reg_toolong)
1624 return NULL;
1625 *flagp |= flags & (HASWIDTH | HASNL | HASLOOKBH);
1626 if (chain == NULL) /* First piece. */
1627 *flagp |= flags & SPSTART;
1628 else
1629 regtail(chain, latest);
1630 chain = latest;
1631 if (first == NULL)
1632 first = latest;
1633 break;
1634 }
1635 }
1636 if (first == NULL) /* Loop ran zero times. */
1637 first = regnode(NOTHING);
1638 return first;
1639 }
1640
1641 /*
1642 * Parse something followed by possible [*+=].
1643 *
1644 * Note that the branching code sequences used for = and the general cases
1645 * of * and + are somewhat optimized: they use the same NOTHING node as
1646 * both the endmarker for their branch list and the body of the last branch.
1647 * It might seem that this node could be dispensed with entirely, but the
1648 * endmarker role is not redundant.
1649 */
regpiece(int * flagp)1650 static char_u *regpiece(int *flagp)
1651 {
1652 char_u *ret;
1653 int op;
1654 char_u *next;
1655 int flags;
1656 long minval;
1657 long maxval;
1658
1659 ret = regatom(&flags);
1660 if (ret == NULL)
1661 return NULL;
1662
1663 op = peekchr();
1664 if (re_multi_type(op) == NOT_MULTI) {
1665 *flagp = flags;
1666 return ret;
1667 }
1668 /* default flags */
1669 *flagp = (WORST | SPSTART | (flags & (HASNL | HASLOOKBH)));
1670
1671 skipchr();
1672 switch (op) {
1673 case Magic('*'):
1674 if (flags & SIMPLE)
1675 reginsert(STAR, ret);
1676 else {
1677 /* Emit x* as (x&|), where & means "self". */
1678 reginsert(BRANCH, ret); /* Either x */
1679 regoptail(ret, regnode(BACK)); /* and loop */
1680 regoptail(ret, ret); /* back */
1681 regtail(ret, regnode(BRANCH)); /* or */
1682 regtail(ret, regnode(NOTHING)); /* null. */
1683 }
1684 break;
1685
1686 case Magic('+'):
1687 if (flags & SIMPLE)
1688 reginsert(PLUS, ret);
1689 else {
1690 /* Emit x+ as x(&|), where & means "self". */
1691 next = regnode(BRANCH); /* Either */
1692 regtail(ret, next);
1693 regtail(regnode(BACK), ret); /* loop back */
1694 regtail(next, regnode(BRANCH)); /* or */
1695 regtail(ret, regnode(NOTHING)); /* null. */
1696 }
1697 *flagp = (WORST | HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1698 break;
1699
1700 case Magic('@'):
1701 {
1702 int lop = END;
1703 int64_t nr = getdecchrs();
1704
1705 switch (no_Magic(getchr())) {
1706 case '=': lop = MATCH; break; /* \@= */
1707 case '!': lop = NOMATCH; break; /* \@! */
1708 case '>': lop = SUBPAT; break; /* \@> */
1709 case '<': switch (no_Magic(getchr())) {
1710 case '=': lop = BEHIND; break; /* \@<= */
1711 case '!': lop = NOBEHIND; break; /* \@<! */
1712 }
1713 }
1714 if (lop == END)
1715 EMSG2_RET_NULL(_("E59: invalid character after %s@"),
1716 reg_magic == MAGIC_ALL);
1717 /* Look behind must match with behind_pos. */
1718 if (lop == BEHIND || lop == NOBEHIND) {
1719 regtail(ret, regnode(BHPOS));
1720 *flagp |= HASLOOKBH;
1721 }
1722 regtail(ret, regnode(END)); /* operand ends */
1723 if (lop == BEHIND || lop == NOBEHIND) {
1724 if (nr < 0)
1725 nr = 0; /* no limit is same as zero limit */
1726 reginsert_nr(lop, (uint32_t)nr, ret);
1727 } else
1728 reginsert(lop, ret);
1729 break;
1730 }
1731
1732 case Magic('?'):
1733 case Magic('='):
1734 /* Emit x= as (x|) */
1735 reginsert(BRANCH, ret); /* Either x */
1736 regtail(ret, regnode(BRANCH)); /* or */
1737 next = regnode(NOTHING); /* null. */
1738 regtail(ret, next);
1739 regoptail(ret, next);
1740 break;
1741
1742 case Magic('{'):
1743 if (!read_limits(&minval, &maxval))
1744 return NULL;
1745 if (flags & SIMPLE) {
1746 reginsert(BRACE_SIMPLE, ret);
1747 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1748 } else {
1749 if (num_complex_braces >= 10)
1750 EMSG2_RET_NULL(_("E60: Too many complex %s{...}s"),
1751 reg_magic == MAGIC_ALL);
1752 reginsert(BRACE_COMPLEX + num_complex_braces, ret);
1753 regoptail(ret, regnode(BACK));
1754 regoptail(ret, ret);
1755 reginsert_limits(BRACE_LIMITS, minval, maxval, ret);
1756 ++num_complex_braces;
1757 }
1758 if (minval > 0 && maxval > 0)
1759 *flagp = (HASWIDTH | (flags & (HASNL | HASLOOKBH)));
1760 break;
1761 }
1762 if (re_multi_type(peekchr()) != NOT_MULTI) {
1763 /* Can't have a multi follow a multi. */
1764 if (peekchr() == Magic('*'))
1765 sprintf((char *)IObuff, _("E61: Nested %s*"),
1766 reg_magic >= MAGIC_ON ? "" : "\\");
1767 else
1768 sprintf((char *)IObuff, _("E62: Nested %s%c"),
1769 reg_magic == MAGIC_ALL ? "" : "\\", no_Magic(peekchr()));
1770 EMSG_RET_NULL((char *)IObuff);
1771 }
1772
1773 return ret;
1774 }
1775
1776 /* When making changes to classchars also change nfa_classcodes. */
1777 static char_u *classchars = (char_u *)".iIkKfFpPsSdDxXoOwWhHaAlLuU";
1778 static int classcodes[] = {
1779 ANY, IDENT, SIDENT, KWORD, SKWORD,
1780 FNAME, SFNAME, PRINT, SPRINT,
1781 WHITE, NWHITE, DIGIT, NDIGIT,
1782 HEX, NHEX, OCTAL, NOCTAL,
1783 WORD, NWORD, HEAD, NHEAD,
1784 ALPHA, NALPHA, LOWER, NLOWER,
1785 UPPER, NUPPER
1786 };
1787
1788 /*
1789 * Parse the lowest level.
1790 *
1791 * Optimization: gobbles an entire sequence of ordinary characters so that
1792 * it can turn them into a single node, which is smaller to store and
1793 * faster to run. Don't do this when one_exactly is set.
1794 */
regatom(int * flagp)1795 static char_u *regatom(int *flagp)
1796 {
1797 char_u *ret;
1798 int flags;
1799 int c;
1800 char_u *p;
1801 int extra = 0;
1802 int save_prev_at_start = prev_at_start;
1803
1804 *flagp = WORST; /* Tentatively. */
1805
1806 c = getchr();
1807 switch (c) {
1808 case Magic('^'):
1809 ret = regnode(BOL);
1810 break;
1811
1812 case Magic('$'):
1813 ret = regnode(EOL);
1814 had_eol = true;
1815 break;
1816
1817 case Magic('<'):
1818 ret = regnode(BOW);
1819 break;
1820
1821 case Magic('>'):
1822 ret = regnode(EOW);
1823 break;
1824
1825 case Magic('_'):
1826 c = no_Magic(getchr());
1827 if (c == '^') { /* "\_^" is start-of-line */
1828 ret = regnode(BOL);
1829 break;
1830 }
1831 if (c == '$') { /* "\_$" is end-of-line */
1832 ret = regnode(EOL);
1833 had_eol = true;
1834 break;
1835 }
1836
1837 extra = ADD_NL;
1838 *flagp |= HASNL;
1839
1840 /* "\_[" is character range plus newline */
1841 if (c == '[')
1842 goto collection;
1843
1844 // "\_x" is character class plus newline
1845 FALLTHROUGH;
1846
1847 /*
1848 * Character classes.
1849 */
1850 case Magic('.'):
1851 case Magic('i'):
1852 case Magic('I'):
1853 case Magic('k'):
1854 case Magic('K'):
1855 case Magic('f'):
1856 case Magic('F'):
1857 case Magic('p'):
1858 case Magic('P'):
1859 case Magic('s'):
1860 case Magic('S'):
1861 case Magic('d'):
1862 case Magic('D'):
1863 case Magic('x'):
1864 case Magic('X'):
1865 case Magic('o'):
1866 case Magic('O'):
1867 case Magic('w'):
1868 case Magic('W'):
1869 case Magic('h'):
1870 case Magic('H'):
1871 case Magic('a'):
1872 case Magic('A'):
1873 case Magic('l'):
1874 case Magic('L'):
1875 case Magic('u'):
1876 case Magic('U'):
1877 p = vim_strchr(classchars, no_Magic(c));
1878 if (p == NULL)
1879 EMSG_RET_NULL(_("E63: invalid use of \\_"));
1880 /* When '.' is followed by a composing char ignore the dot, so that
1881 * the composing char is matched here. */
1882 if (c == Magic('.') && utf_iscomposing(peekchr())) {
1883 c = getchr();
1884 goto do_multibyte;
1885 }
1886 ret = regnode(classcodes[p - classchars] + extra);
1887 *flagp |= HASWIDTH | SIMPLE;
1888 break;
1889
1890 case Magic('n'):
1891 if (reg_string) {
1892 /* In a string "\n" matches a newline character. */
1893 ret = regnode(EXACTLY);
1894 regc(NL);
1895 regc(NUL);
1896 *flagp |= HASWIDTH | SIMPLE;
1897 } else {
1898 /* In buffer text "\n" matches the end of a line. */
1899 ret = regnode(NEWL);
1900 *flagp |= HASWIDTH | HASNL;
1901 }
1902 break;
1903
1904 case Magic('('):
1905 if (one_exactly)
1906 EMSG_ONE_RET_NULL;
1907 ret = reg(REG_PAREN, &flags);
1908 if (ret == NULL)
1909 return NULL;
1910 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
1911 break;
1912
1913 case NUL:
1914 case Magic('|'):
1915 case Magic('&'):
1916 case Magic(')'):
1917 if (one_exactly)
1918 EMSG_ONE_RET_NULL;
1919 IEMSG_RET_NULL(_(e_internal)); // Supposed to be caught earlier.
1920 // NOTREACHED
1921
1922 case Magic('='):
1923 case Magic('?'):
1924 case Magic('+'):
1925 case Magic('@'):
1926 case Magic('{'):
1927 case Magic('*'):
1928 c = no_Magic(c);
1929 sprintf((char *)IObuff, _("E64: %s%c follows nothing"),
1930 (c == '*' ? reg_magic >= MAGIC_ON : reg_magic == MAGIC_ALL)
1931 ? "" : "\\", c);
1932 EMSG_RET_NULL((char *)IObuff);
1933 /* NOTREACHED */
1934
1935 case Magic('~'): /* previous substitute pattern */
1936 if (reg_prev_sub != NULL) {
1937 char_u *lp;
1938
1939 ret = regnode(EXACTLY);
1940 lp = reg_prev_sub;
1941 while (*lp != NUL)
1942 regc(*lp++);
1943 regc(NUL);
1944 if (*reg_prev_sub != NUL) {
1945 *flagp |= HASWIDTH;
1946 if ((lp - reg_prev_sub) == 1)
1947 *flagp |= SIMPLE;
1948 }
1949 } else
1950 EMSG_RET_NULL(_(e_nopresub));
1951 break;
1952
1953 case Magic('1'):
1954 case Magic('2'):
1955 case Magic('3'):
1956 case Magic('4'):
1957 case Magic('5'):
1958 case Magic('6'):
1959 case Magic('7'):
1960 case Magic('8'):
1961 case Magic('9'):
1962 {
1963 int refnum;
1964
1965 refnum = c - Magic('0');
1966 if (!seen_endbrace(refnum)) {
1967 return NULL;
1968 }
1969 ret = regnode(BACKREF + refnum);
1970 }
1971 break;
1972
1973 case Magic('z'):
1974 {
1975 c = no_Magic(getchr());
1976 switch (c) {
1977 case '(': if ((reg_do_extmatch & REX_SET) == 0)
1978 EMSG_RET_NULL(_(e_z_not_allowed));
1979 if (one_exactly)
1980 EMSG_ONE_RET_NULL;
1981 ret = reg(REG_ZPAREN, &flags);
1982 if (ret == NULL)
1983 return NULL;
1984 *flagp |= flags & (HASWIDTH|SPSTART|HASNL|HASLOOKBH);
1985 re_has_z = REX_SET;
1986 break;
1987
1988 case '1':
1989 case '2':
1990 case '3':
1991 case '4':
1992 case '5':
1993 case '6':
1994 case '7':
1995 case '8':
1996 case '9': if ((reg_do_extmatch & REX_USE) == 0)
1997 EMSG_RET_NULL(_(e_z1_not_allowed));
1998 ret = regnode(ZREF + c - '0');
1999 re_has_z = REX_USE;
2000 break;
2001
2002 case 's': ret = regnode(MOPEN + 0);
2003 if (!re_mult_next("\\zs")) {
2004 return NULL;
2005 }
2006 break;
2007
2008 case 'e': ret = regnode(MCLOSE + 0);
2009 if (!re_mult_next("\\ze")) {
2010 return NULL;
2011 }
2012 break;
2013
2014 default: EMSG_RET_NULL(_("E68: Invalid character after \\z"));
2015 }
2016 }
2017 break;
2018
2019 case Magic('%'):
2020 {
2021 c = no_Magic(getchr());
2022 switch (c) {
2023 /* () without a back reference */
2024 case '(':
2025 if (one_exactly)
2026 EMSG_ONE_RET_NULL;
2027 ret = reg(REG_NPAREN, &flags);
2028 if (ret == NULL)
2029 return NULL;
2030 *flagp |= flags & (HASWIDTH | SPSTART | HASNL | HASLOOKBH);
2031 break;
2032
2033 /* Catch \%^ and \%$ regardless of where they appear in the
2034 * pattern -- regardless of whether or not it makes sense. */
2035 case '^':
2036 ret = regnode(RE_BOF);
2037 break;
2038
2039 case '$':
2040 ret = regnode(RE_EOF);
2041 break;
2042
2043 case '#':
2044 ret = regnode(CURSOR);
2045 break;
2046
2047 case 'V':
2048 ret = regnode(RE_VISUAL);
2049 break;
2050
2051 case 'C':
2052 ret = regnode(RE_COMPOSING);
2053 break;
2054
2055 /* \%[abc]: Emit as a list of branches, all ending at the last
2056 * branch which matches nothing. */
2057 case '[':
2058 if (one_exactly) /* doesn't nest */
2059 EMSG_ONE_RET_NULL;
2060 {
2061 char_u *lastbranch;
2062 char_u *lastnode = NULL;
2063 char_u *br;
2064
2065 ret = NULL;
2066 while ((c = getchr()) != ']') {
2067 if (c == NUL)
2068 EMSG2_RET_NULL(_(e_missing_sb),
2069 reg_magic == MAGIC_ALL);
2070 br = regnode(BRANCH);
2071 if (ret == NULL) {
2072 ret = br;
2073 } else {
2074 regtail(lastnode, br);
2075 if (reg_toolong) {
2076 return NULL;
2077 }
2078 }
2079
2080 ungetchr();
2081 one_exactly = true;
2082 lastnode = regatom(flagp);
2083 one_exactly = false;
2084 if (lastnode == NULL) {
2085 return NULL;
2086 }
2087 }
2088 if (ret == NULL)
2089 EMSG2_RET_NULL(_(e_empty_sb),
2090 reg_magic == MAGIC_ALL);
2091 lastbranch = regnode(BRANCH);
2092 br = regnode(NOTHING);
2093 if (ret != JUST_CALC_SIZE) {
2094 regtail(lastnode, br);
2095 regtail(lastbranch, br);
2096 /* connect all branches to the NOTHING
2097 * branch at the end */
2098 for (br = ret; br != lastnode; ) {
2099 if (OP(br) == BRANCH) {
2100 regtail(br, lastbranch);
2101 if (reg_toolong) {
2102 return NULL;
2103 }
2104 br = OPERAND(br);
2105 } else
2106 br = regnext(br);
2107 }
2108 }
2109 *flagp &= ~(HASWIDTH | SIMPLE);
2110 break;
2111 }
2112
2113 case 'd': /* %d123 decimal */
2114 case 'o': /* %o123 octal */
2115 case 'x': /* %xab hex 2 */
2116 case 'u': /* %uabcd hex 4 */
2117 case 'U': /* %U1234abcd hex 8 */
2118 {
2119 int64_t i;
2120
2121 switch (c) {
2122 case 'd': i = getdecchrs(); break;
2123 case 'o': i = getoctchrs(); break;
2124 case 'x': i = gethexchrs(2); break;
2125 case 'u': i = gethexchrs(4); break;
2126 case 'U': i = gethexchrs(8); break;
2127 default: i = -1; break;
2128 }
2129
2130 if (i < 0 || i > INT_MAX) {
2131 EMSG2_RET_NULL(_("E678: Invalid character after %s%%[dxouU]"),
2132 reg_magic == MAGIC_ALL);
2133 }
2134 if (use_multibytecode(i)) {
2135 ret = regnode(MULTIBYTECODE);
2136 } else {
2137 ret = regnode(EXACTLY);
2138 }
2139 if (i == 0) {
2140 regc(0x0a);
2141 } else {
2142 regmbc(i);
2143 }
2144 regc(NUL);
2145 *flagp |= HASWIDTH;
2146 break;
2147 }
2148
2149 default:
2150 if (ascii_isdigit(c) || c == '<' || c == '>'
2151 || c == '\'') {
2152 uint32_t n = 0;
2153 int cmp;
2154
2155 cmp = c;
2156 if (cmp == '<' || cmp == '>')
2157 c = getchr();
2158 while (ascii_isdigit(c)) {
2159 n = n * 10 + (uint32_t)(c - '0');
2160 c = getchr();
2161 }
2162 if (c == '\'' && n == 0) {
2163 /* "\%'m", "\%<'m" and "\%>'m": Mark */
2164 c = getchr();
2165 ret = regnode(RE_MARK);
2166 if (ret == JUST_CALC_SIZE)
2167 regsize += 2;
2168 else {
2169 *regcode++ = c;
2170 *regcode++ = cmp;
2171 }
2172 break;
2173 } else if (c == 'l' || c == 'c' || c == 'v') {
2174 if (c == 'l') {
2175 ret = regnode(RE_LNUM);
2176 if (save_prev_at_start) {
2177 at_start = true;
2178 }
2179 } else if (c == 'c') {
2180 ret = regnode(RE_COL);
2181 } else {
2182 ret = regnode(RE_VCOL);
2183 }
2184 if (ret == JUST_CALC_SIZE) {
2185 regsize += 5;
2186 } else {
2187 // put the number and the optional
2188 // comparator after the opcode
2189 regcode = re_put_uint32(regcode, n);
2190 *regcode++ = cmp;
2191 }
2192 break;
2193 }
2194 }
2195
2196 EMSG2_RET_NULL(_("E71: Invalid character after %s%%"),
2197 reg_magic == MAGIC_ALL);
2198 }
2199 }
2200 break;
2201
2202 case Magic('['):
2203 collection:
2204 {
2205 char_u *lp;
2206
2207 /*
2208 * If there is no matching ']', we assume the '[' is a normal
2209 * character. This makes 'incsearch' and ":help [" work.
2210 */
2211 lp = skip_anyof(regparse);
2212 if (*lp == ']') { /* there is a matching ']' */
2213 int startc = -1; /* > 0 when next '-' is a range */
2214 int endc;
2215
2216 /*
2217 * In a character class, different parsing rules apply.
2218 * Not even \ is special anymore, nothing is.
2219 */
2220 if (*regparse == '^') { /* Complement of range. */
2221 ret = regnode(ANYBUT + extra);
2222 regparse++;
2223 } else
2224 ret = regnode(ANYOF + extra);
2225
2226 /* At the start ']' and '-' mean the literal character. */
2227 if (*regparse == ']' || *regparse == '-') {
2228 startc = *regparse;
2229 regc(*regparse++);
2230 }
2231
2232 while (*regparse != NUL && *regparse != ']') {
2233 if (*regparse == '-') {
2234 ++regparse;
2235 /* The '-' is not used for a range at the end and
2236 * after or before a '\n'. */
2237 if (*regparse == ']' || *regparse == NUL
2238 || startc == -1
2239 || (regparse[0] == '\\' && regparse[1] == 'n')) {
2240 regc('-');
2241 startc = '-'; /* [--x] is a range */
2242 } else {
2243 /* Also accept "a-[.z.]" */
2244 endc = 0;
2245 if (*regparse == '[')
2246 endc = get_coll_element(®parse);
2247 if (endc == 0) {
2248 endc = mb_ptr2char_adv((const char_u **)®parse);
2249 }
2250
2251 /* Handle \o40, \x20 and \u20AC style sequences */
2252 if (endc == '\\' && !reg_cpo_lit)
2253 endc = coll_get_char();
2254
2255 if (startc > endc) {
2256 EMSG_RET_NULL(_(e_reverse_range));
2257 }
2258 if (utf_char2len(startc) > 1
2259 || utf_char2len(endc) > 1) {
2260 // Limit to a range of 256 chars
2261 if (endc > startc + 256) {
2262 EMSG_RET_NULL(_(e_large_class));
2263 }
2264 while (++startc <= endc) {
2265 regmbc(startc);
2266 }
2267 } else {
2268 while (++startc <= endc)
2269 regc(startc);
2270 }
2271 startc = -1;
2272 }
2273 }
2274 /*
2275 * Only "\]", "\^", "\]" and "\\" are special in Vi. Vim
2276 * accepts "\t", "\e", etc., but only when the 'l' flag in
2277 * 'cpoptions' is not included.
2278 */
2279 else if (*regparse == '\\'
2280 && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL
2281 || (!reg_cpo_lit
2282 && vim_strchr(REGEXP_ABBR,
2283 regparse[1]) != NULL))) {
2284 regparse++;
2285 if (*regparse == 'n') {
2286 /* '\n' in range: also match NL */
2287 if (ret != JUST_CALC_SIZE) {
2288 /* Using \n inside [^] does not change what
2289 * matches. "[^\n]" is the same as ".". */
2290 if (*ret == ANYOF) {
2291 *ret = ANYOF + ADD_NL;
2292 *flagp |= HASNL;
2293 }
2294 /* else: must have had a \n already */
2295 }
2296 regparse++;
2297 startc = -1;
2298 } else if (*regparse == 'd'
2299 || *regparse == 'o'
2300 || *regparse == 'x'
2301 || *regparse == 'u'
2302 || *regparse == 'U') {
2303 startc = coll_get_char();
2304 if (startc == 0)
2305 regc(0x0a);
2306 else
2307 regmbc(startc);
2308 } else {
2309 startc = backslash_trans(*regparse++);
2310 regc(startc);
2311 }
2312 } else if (*regparse == '[') {
2313 int c_class;
2314 int cu;
2315
2316 c_class = get_char_class(®parse);
2317 startc = -1;
2318 /* Characters assumed to be 8 bits! */
2319 switch (c_class) {
2320 case CLASS_NONE:
2321 c_class = get_equi_class(®parse);
2322 if (c_class != 0) {
2323 /* produce equivalence class */
2324 reg_equi_class(c_class);
2325 } else if ((c_class =
2326 get_coll_element(®parse)) != 0) {
2327 /* produce a collating element */
2328 regmbc(c_class);
2329 } else {
2330 /* literal '[', allow [[-x] as a range */
2331 startc = *regparse++;
2332 regc(startc);
2333 }
2334 break;
2335 case CLASS_ALNUM:
2336 for (cu = 1; cu < 128; cu++) {
2337 if (isalnum(cu)) {
2338 regmbc(cu);
2339 }
2340 }
2341 break;
2342 case CLASS_ALPHA:
2343 for (cu = 1; cu < 128; cu++) {
2344 if (isalpha(cu)) {
2345 regmbc(cu);
2346 }
2347 }
2348 break;
2349 case CLASS_BLANK:
2350 regc(' ');
2351 regc('\t');
2352 break;
2353 case CLASS_CNTRL:
2354 for (cu = 1; cu <= 127; cu++) {
2355 if (iscntrl(cu)) {
2356 regmbc(cu);
2357 }
2358 }
2359 break;
2360 case CLASS_DIGIT:
2361 for (cu = 1; cu <= 127; cu++) {
2362 if (ascii_isdigit(cu)) {
2363 regmbc(cu);
2364 }
2365 }
2366 break;
2367 case CLASS_GRAPH:
2368 for (cu = 1; cu <= 127; cu++) {
2369 if (isgraph(cu)) {
2370 regmbc(cu);
2371 }
2372 }
2373 break;
2374 case CLASS_LOWER:
2375 for (cu = 1; cu <= 255; cu++) {
2376 if (mb_islower(cu) && cu != 170 && cu != 186) {
2377 regmbc(cu);
2378 }
2379 }
2380 break;
2381 case CLASS_PRINT:
2382 for (cu = 1; cu <= 255; cu++) {
2383 if (vim_isprintc(cu)) {
2384 regmbc(cu);
2385 }
2386 }
2387 break;
2388 case CLASS_PUNCT:
2389 for (cu = 1; cu < 128; cu++) {
2390 if (ispunct(cu)) {
2391 regmbc(cu);
2392 }
2393 }
2394 break;
2395 case CLASS_SPACE:
2396 for (cu = 9; cu <= 13; cu++)
2397 regc(cu);
2398 regc(' ');
2399 break;
2400 case CLASS_UPPER:
2401 for (cu = 1; cu <= 255; cu++) {
2402 if (mb_isupper(cu)) {
2403 regmbc(cu);
2404 }
2405 }
2406 break;
2407 case CLASS_XDIGIT:
2408 for (cu = 1; cu <= 255; cu++) {
2409 if (ascii_isxdigit(cu)) {
2410 regmbc(cu);
2411 }
2412 }
2413 break;
2414 case CLASS_TAB:
2415 regc('\t');
2416 break;
2417 case CLASS_RETURN:
2418 regc('\r');
2419 break;
2420 case CLASS_BACKSPACE:
2421 regc('\b');
2422 break;
2423 case CLASS_ESCAPE:
2424 regc(ESC);
2425 break;
2426 case CLASS_IDENT:
2427 for (cu = 1; cu <= 255; cu++) {
2428 if (vim_isIDc(cu)) {
2429 regmbc(cu);
2430 }
2431 }
2432 break;
2433 case CLASS_KEYWORD:
2434 for (cu = 1; cu <= 255; cu++) {
2435 if (reg_iswordc(cu)) {
2436 regmbc(cu);
2437 }
2438 }
2439 break;
2440 case CLASS_FNAME:
2441 for (cu = 1; cu <= 255; cu++) {
2442 if (vim_isfilec(cu)) {
2443 regmbc(cu);
2444 }
2445 }
2446 break;
2447 }
2448 } else {
2449 // produce a multibyte character, including any
2450 // following composing characters.
2451 startc = utf_ptr2char(regparse);
2452 int len = utfc_ptr2len(regparse);
2453 if (utf_char2len(startc) != len) {
2454 // composing chars
2455 startc = -1;
2456 }
2457 while (--len >= 0) {
2458 regc(*regparse++);
2459 }
2460 }
2461 }
2462 regc(NUL);
2463 prevchr_len = 1; /* last char was the ']' */
2464 if (*regparse != ']')
2465 EMSG_RET_NULL(_(e_toomsbra)); /* Cannot happen? */
2466 skipchr(); /* let's be friends with the lexer again */
2467 *flagp |= HASWIDTH | SIMPLE;
2468 break;
2469 } else if (reg_strict)
2470 EMSG2_RET_NULL(_(e_missingbracket), reg_magic > MAGIC_OFF);
2471 }
2472 FALLTHROUGH;
2473
2474 default:
2475 {
2476 int len;
2477
2478 /* A multi-byte character is handled as a separate atom if it's
2479 * before a multi and when it's a composing char. */
2480 if (use_multibytecode(c)) {
2481 do_multibyte:
2482 ret = regnode(MULTIBYTECODE);
2483 regmbc(c);
2484 *flagp |= HASWIDTH | SIMPLE;
2485 break;
2486 }
2487
2488 ret = regnode(EXACTLY);
2489
2490 /*
2491 * Append characters as long as:
2492 * - there is no following multi, we then need the character in
2493 * front of it as a single character operand
2494 * - not running into a Magic character
2495 * - "one_exactly" is not set
2496 * But always emit at least one character. Might be a Multi,
2497 * e.g., a "[" without matching "]".
2498 */
2499 for (len = 0; c != NUL && (len == 0
2500 || (re_multi_type(peekchr()) == NOT_MULTI
2501 && !one_exactly
2502 && !is_Magic(c))); ++len) {
2503 c = no_Magic(c);
2504 {
2505 regmbc(c);
2506 {
2507 int l;
2508
2509 /* Need to get composing character too. */
2510 for (;; ) {
2511 l = utf_ptr2len(regparse);
2512 if (!utf_composinglike(regparse, regparse + l)) {
2513 break;
2514 }
2515 regmbc(utf_ptr2char(regparse));
2516 skipchr();
2517 }
2518 }
2519 }
2520 c = getchr();
2521 }
2522 ungetchr();
2523
2524 regc(NUL);
2525 *flagp |= HASWIDTH;
2526 if (len == 1)
2527 *flagp |= SIMPLE;
2528 }
2529 break;
2530 }
2531
2532 return ret;
2533 }
2534
2535 /// Used in a place where no * or \+ can follow.
re_mult_next(char * what)2536 static bool re_mult_next(char *what)
2537 {
2538 if (re_multi_type(peekchr()) == MULTI_MULT) {
2539 EMSG2_RET_FAIL(_("E888: (NFA regexp) cannot repeat %s"), what);
2540 }
2541 return true;
2542 }
2543
2544 // Return true if MULTIBYTECODE should be used instead of EXACTLY for
2545 // character "c".
use_multibytecode(int c)2546 static bool use_multibytecode(int c)
2547 {
2548 return utf_char2len(c) > 1
2549 && (re_multi_type(peekchr()) != NOT_MULTI
2550 || utf_iscomposing(c));
2551 }
2552
2553 /*
2554 * Emit a node.
2555 * Return pointer to generated code.
2556 */
regnode(int op)2557 static char_u *regnode(int op)
2558 {
2559 char_u *ret;
2560
2561 ret = regcode;
2562 if (ret == JUST_CALC_SIZE)
2563 regsize += 3;
2564 else {
2565 *regcode++ = op;
2566 *regcode++ = NUL; /* Null "next" pointer. */
2567 *regcode++ = NUL;
2568 }
2569 return ret;
2570 }
2571
2572 /*
2573 * Emit (if appropriate) a byte of code
2574 */
regc(int b)2575 static void regc(int b)
2576 {
2577 if (regcode == JUST_CALC_SIZE)
2578 regsize++;
2579 else
2580 *regcode++ = b;
2581 }
2582
2583 /*
2584 * Emit (if appropriate) a multi-byte character of code
2585 */
regmbc(int c)2586 static void regmbc(int c)
2587 {
2588 if (regcode == JUST_CALC_SIZE) {
2589 regsize += utf_char2len(c);
2590 } else {
2591 regcode += utf_char2bytes(c, regcode);
2592 }
2593 }
2594
2595 /*
2596 * Insert an operator in front of already-emitted operand
2597 *
2598 * Means relocating the operand.
2599 */
reginsert(int op,char_u * opnd)2600 static void reginsert(int op, char_u *opnd)
2601 {
2602 char_u *src;
2603 char_u *dst;
2604 char_u *place;
2605
2606 if (regcode == JUST_CALC_SIZE) {
2607 regsize += 3;
2608 return;
2609 }
2610 src = regcode;
2611 regcode += 3;
2612 dst = regcode;
2613 while (src > opnd)
2614 *--dst = *--src;
2615
2616 place = opnd; /* Op node, where operand used to be. */
2617 *place++ = op;
2618 *place++ = NUL;
2619 *place = NUL;
2620 }
2621
2622 /*
2623 * Insert an operator in front of already-emitted operand.
2624 * Add a number to the operator.
2625 */
reginsert_nr(int op,long val,char_u * opnd)2626 static void reginsert_nr(int op, long val, char_u *opnd)
2627 {
2628 char_u *src;
2629 char_u *dst;
2630 char_u *place;
2631
2632 if (regcode == JUST_CALC_SIZE) {
2633 regsize += 7;
2634 return;
2635 }
2636 src = regcode;
2637 regcode += 7;
2638 dst = regcode;
2639 while (src > opnd)
2640 *--dst = *--src;
2641
2642 place = opnd; /* Op node, where operand used to be. */
2643 *place++ = op;
2644 *place++ = NUL;
2645 *place++ = NUL;
2646 assert(val >= 0 && (uintmax_t)val <= UINT32_MAX);
2647 re_put_uint32(place, (uint32_t)val);
2648 }
2649
2650 /*
2651 * Insert an operator in front of already-emitted operand.
2652 * The operator has the given limit values as operands. Also set next pointer.
2653 *
2654 * Means relocating the operand.
2655 */
reginsert_limits(int op,long minval,long maxval,char_u * opnd)2656 static void reginsert_limits(int op, long minval, long maxval, char_u *opnd)
2657 {
2658 char_u *src;
2659 char_u *dst;
2660 char_u *place;
2661
2662 if (regcode == JUST_CALC_SIZE) {
2663 regsize += 11;
2664 return;
2665 }
2666 src = regcode;
2667 regcode += 11;
2668 dst = regcode;
2669 while (src > opnd)
2670 *--dst = *--src;
2671
2672 place = opnd; /* Op node, where operand used to be. */
2673 *place++ = op;
2674 *place++ = NUL;
2675 *place++ = NUL;
2676 assert(minval >= 0 && (uintmax_t)minval <= UINT32_MAX);
2677 place = re_put_uint32(place, (uint32_t)minval);
2678 assert(maxval >= 0 && (uintmax_t)maxval <= UINT32_MAX);
2679 place = re_put_uint32(place, (uint32_t)maxval);
2680 regtail(opnd, place);
2681 }
2682
2683 /*
2684 * Write a four bytes number at "p" and return pointer to the next char.
2685 */
re_put_uint32(char_u * p,uint32_t val)2686 static char_u *re_put_uint32(char_u *p, uint32_t val)
2687 {
2688 *p++ = (char_u) ((val >> 24) & 0377);
2689 *p++ = (char_u) ((val >> 16) & 0377);
2690 *p++ = (char_u) ((val >> 8) & 0377);
2691 *p++ = (char_u) (val & 0377);
2692 return p;
2693 }
2694
2695 // Set the next-pointer at the end of a node chain.
regtail(char_u * p,char_u * val)2696 static void regtail(char_u *p, char_u *val)
2697 {
2698 int offset;
2699
2700 if (p == JUST_CALC_SIZE) {
2701 return;
2702 }
2703
2704 // Find last node.
2705 char_u *scan = p;
2706 for (;; ) {
2707 char_u *temp = regnext(scan);
2708 if (temp == NULL) {
2709 break;
2710 }
2711 scan = temp;
2712 }
2713
2714 if (OP(scan) == BACK) {
2715 offset = (int)(scan - val);
2716 } else {
2717 offset = (int)(val - scan);
2718 }
2719 // When the offset uses more than 16 bits it can no longer fit in the two
2720 // bytes available. Use a global flag to avoid having to check return
2721 // values in too many places.
2722 if (offset > 0xffff) {
2723 reg_toolong = true;
2724 } else {
2725 *(scan + 1) = (char_u)(((unsigned)offset >> 8) & 0377);
2726 *(scan + 2) = (char_u)(offset & 0377);
2727 }
2728 }
2729
2730 /*
2731 * Like regtail, on item after a BRANCH; nop if none.
2732 */
regoptail(char_u * p,char_u * val)2733 static void regoptail(char_u *p, char_u *val)
2734 {
2735 /* When op is neither BRANCH nor BRACE_COMPLEX0-9, it is "operandless" */
2736 if (p == NULL || p == JUST_CALC_SIZE
2737 || (OP(p) != BRANCH
2738 && (OP(p) < BRACE_COMPLEX || OP(p) > BRACE_COMPLEX + 9)))
2739 return;
2740 regtail(OPERAND(p), val);
2741 }
2742
2743 /*
2744 * Functions for getting characters from the regexp input.
2745 */
2746
2747 /*
2748 * Start parsing at "str".
2749 */
initchr(char_u * str)2750 static void initchr(char_u *str)
2751 {
2752 regparse = str;
2753 prevchr_len = 0;
2754 curchr = prevprevchr = prevchr = nextchr = -1;
2755 at_start = true;
2756 prev_at_start = false;
2757 }
2758
2759 /*
2760 * Save the current parse state, so that it can be restored and parsing
2761 * starts in the same state again.
2762 */
save_parse_state(parse_state_T * ps)2763 static void save_parse_state(parse_state_T *ps)
2764 {
2765 ps->regparse = regparse;
2766 ps->prevchr_len = prevchr_len;
2767 ps->curchr = curchr;
2768 ps->prevchr = prevchr;
2769 ps->prevprevchr = prevprevchr;
2770 ps->nextchr = nextchr;
2771 ps->at_start = at_start;
2772 ps->prev_at_start = prev_at_start;
2773 ps->regnpar = regnpar;
2774 }
2775
2776 /*
2777 * Restore a previously saved parse state.
2778 */
restore_parse_state(parse_state_T * ps)2779 static void restore_parse_state(parse_state_T *ps)
2780 {
2781 regparse = ps->regparse;
2782 prevchr_len = ps->prevchr_len;
2783 curchr = ps->curchr;
2784 prevchr = ps->prevchr;
2785 prevprevchr = ps->prevprevchr;
2786 nextchr = ps->nextchr;
2787 at_start = ps->at_start;
2788 prev_at_start = ps->prev_at_start;
2789 regnpar = ps->regnpar;
2790 }
2791
2792
2793 /*
2794 * Get the next character without advancing.
2795 */
peekchr(void)2796 static int peekchr(void)
2797 {
2798 static int after_slash = false;
2799
2800 if (curchr != -1) {
2801 return curchr;
2802 }
2803
2804 switch (curchr = regparse[0]) {
2805 case '.':
2806 case '[':
2807 case '~':
2808 /* magic when 'magic' is on */
2809 if (reg_magic >= MAGIC_ON)
2810 curchr = Magic(curchr);
2811 break;
2812 case '(':
2813 case ')':
2814 case '{':
2815 case '%':
2816 case '+':
2817 case '=':
2818 case '?':
2819 case '@':
2820 case '!':
2821 case '&':
2822 case '|':
2823 case '<':
2824 case '>':
2825 case '#': /* future ext. */
2826 case '"': /* future ext. */
2827 case '\'': /* future ext. */
2828 case ',': /* future ext. */
2829 case '-': /* future ext. */
2830 case ':': /* future ext. */
2831 case ';': /* future ext. */
2832 case '`': /* future ext. */
2833 case '/': /* Can't be used in / command */
2834 /* magic only after "\v" */
2835 if (reg_magic == MAGIC_ALL)
2836 curchr = Magic(curchr);
2837 break;
2838 case '*':
2839 // * is not magic as the very first character, eg "?*ptr", when
2840 // after '^', eg "/^*ptr" and when after "\(", "\|", "\&". But
2841 // "\(\*" is not magic, thus must be magic if "after_slash"
2842 if (reg_magic >= MAGIC_ON
2843 && !at_start
2844 && !(prev_at_start && prevchr == Magic('^'))
2845 && (after_slash
2846 || (prevchr != Magic('(')
2847 && prevchr != Magic('&')
2848 && prevchr != Magic('|')))) {
2849 curchr = Magic('*');
2850 }
2851 break;
2852 case '^':
2853 // '^' is only magic as the very first character and if it's after
2854 // "\(", "\|", "\&' or "\n"
2855 if (reg_magic >= MAGIC_OFF
2856 && (at_start
2857 || reg_magic == MAGIC_ALL
2858 || prevchr == Magic('(')
2859 || prevchr == Magic('|')
2860 || prevchr == Magic('&')
2861 || prevchr == Magic('n')
2862 || (no_Magic(prevchr) == '('
2863 && prevprevchr == Magic('%')))) {
2864 curchr = Magic('^');
2865 at_start = true;
2866 prev_at_start = false;
2867 }
2868 break;
2869 case '$':
2870 // '$' is only magic as the very last char and if it's in front of
2871 // either "\|", "\)", "\&", or "\n"
2872 if (reg_magic >= MAGIC_OFF) {
2873 char_u *p = regparse + 1;
2874 bool is_magic_all = (reg_magic == MAGIC_ALL);
2875
2876 // ignore \c \C \m \M \v \V and \Z after '$'
2877 while (p[0] == '\\' && (p[1] == 'c' || p[1] == 'C'
2878 || p[1] == 'm' || p[1] == 'M'
2879 || p[1] == 'v' || p[1] == 'V'
2880 || p[1] == 'Z')) {
2881 if (p[1] == 'v') {
2882 is_magic_all = true;
2883 } else if (p[1] == 'm' || p[1] == 'M' || p[1] == 'V') {
2884 is_magic_all = false;
2885 }
2886 p += 2;
2887 }
2888 if (p[0] == NUL
2889 || (p[0] == '\\'
2890 && (p[1] == '|' || p[1] == '&' || p[1] == ')'
2891 || p[1] == 'n'))
2892 || (is_magic_all
2893 && (p[0] == '|' || p[0] == '&' || p[0] == ')'))
2894 || reg_magic == MAGIC_ALL) {
2895 curchr = Magic('$');
2896 }
2897 }
2898 break;
2899 case '\\':
2900 {
2901 int c = regparse[1];
2902
2903 if (c == NUL)
2904 curchr = '\\'; /* trailing '\' */
2905 else if (
2906 c <= '~' && META_flags[c]
2907 ) {
2908 /*
2909 * META contains everything that may be magic sometimes,
2910 * except ^ and $ ("\^" and "\$" are only magic after
2911 * "\V"). We now fetch the next character and toggle its
2912 * magicness. Therefore, \ is so meta-magic that it is
2913 * not in META.
2914 */
2915 curchr = -1;
2916 prev_at_start = at_start;
2917 at_start = false; // be able to say "/\*ptr"
2918 regparse++;
2919 after_slash++;
2920 (void)peekchr();
2921 regparse--;
2922 after_slash--;
2923 curchr = toggle_Magic(curchr);
2924 } else if (vim_strchr(REGEXP_ABBR, c)) {
2925 /*
2926 * Handle abbreviations, like "\t" for TAB -- webb
2927 */
2928 curchr = backslash_trans(c);
2929 } else if (reg_magic == MAGIC_NONE && (c == '$' || c == '^'))
2930 curchr = toggle_Magic(c);
2931 else {
2932 /*
2933 * Next character can never be (made) magic?
2934 * Then backslashing it won't do anything.
2935 */
2936 curchr = utf_ptr2char(regparse + 1);
2937 }
2938 break;
2939 }
2940
2941 default:
2942 curchr = utf_ptr2char(regparse);
2943 }
2944
2945 return curchr;
2946 }
2947
2948 /*
2949 * Eat one lexed character. Do this in a way that we can undo it.
2950 */
skipchr(void)2951 static void skipchr(void)
2952 {
2953 /* peekchr() eats a backslash, do the same here */
2954 if (*regparse == '\\')
2955 prevchr_len = 1;
2956 else
2957 prevchr_len = 0;
2958 if (regparse[prevchr_len] != NUL) {
2959 // Exclude composing chars that utfc_ptr2len does include.
2960 prevchr_len += utf_ptr2len(regparse + prevchr_len);
2961 }
2962 regparse += prevchr_len;
2963 prev_at_start = at_start;
2964 at_start = false;
2965 prevprevchr = prevchr;
2966 prevchr = curchr;
2967 curchr = nextchr; /* use previously unget char, or -1 */
2968 nextchr = -1;
2969 }
2970
2971 /*
2972 * Skip a character while keeping the value of prev_at_start for at_start.
2973 * prevchr and prevprevchr are also kept.
2974 */
skipchr_keepstart(void)2975 static void skipchr_keepstart(void)
2976 {
2977 int as = prev_at_start;
2978 int pr = prevchr;
2979 int prpr = prevprevchr;
2980
2981 skipchr();
2982 at_start = as;
2983 prevchr = pr;
2984 prevprevchr = prpr;
2985 }
2986
2987 /*
2988 * Get the next character from the pattern. We know about magic and such, so
2989 * therefore we need a lexical analyzer.
2990 */
getchr(void)2991 static int getchr(void)
2992 {
2993 int chr = peekchr();
2994
2995 skipchr();
2996 return chr;
2997 }
2998
2999 /*
3000 * put character back. Works only once!
3001 */
ungetchr(void)3002 static void ungetchr(void)
3003 {
3004 nextchr = curchr;
3005 curchr = prevchr;
3006 prevchr = prevprevchr;
3007 at_start = prev_at_start;
3008 prev_at_start = false;
3009
3010 // Backup regparse, so that it's at the same position as before the
3011 // getchr().
3012 regparse -= prevchr_len;
3013 }
3014
3015 /*
3016 * Get and return the value of the hex string at the current position.
3017 * Return -1 if there is no valid hex number.
3018 * The position is updated:
3019 * blahblah\%x20asdf
3020 * before-^ ^-after
3021 * The parameter controls the maximum number of input characters. This will be
3022 * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
3023 */
gethexchrs(int maxinputlen)3024 static int64_t gethexchrs(int maxinputlen)
3025 {
3026 int64_t nr = 0;
3027 int c;
3028 int i;
3029
3030 for (i = 0; i < maxinputlen; ++i) {
3031 c = regparse[0];
3032 if (!ascii_isxdigit(c))
3033 break;
3034 nr <<= 4;
3035 nr |= hex2nr(c);
3036 ++regparse;
3037 }
3038
3039 if (i == 0)
3040 return -1;
3041 return nr;
3042 }
3043
3044 /*
3045 * Get and return the value of the decimal string immediately after the
3046 * current position. Return -1 for invalid. Consumes all digits.
3047 */
getdecchrs(void)3048 static int64_t getdecchrs(void)
3049 {
3050 int64_t nr = 0;
3051 int c;
3052 int i;
3053
3054 for (i = 0;; ++i) {
3055 c = regparse[0];
3056 if (c < '0' || c > '9')
3057 break;
3058 nr *= 10;
3059 nr += c - '0';
3060 ++regparse;
3061 curchr = -1; /* no longer valid */
3062 }
3063
3064 if (i == 0)
3065 return -1;
3066 return nr;
3067 }
3068
3069 /*
3070 * get and return the value of the octal string immediately after the current
3071 * position. Return -1 for invalid, or 0-255 for valid. Smart enough to handle
3072 * numbers > 377 correctly (for example, 400 is treated as 40) and doesn't
3073 * treat 8 or 9 as recognised characters. Position is updated:
3074 * blahblah\%o210asdf
3075 * before-^ ^-after
3076 */
getoctchrs(void)3077 static int64_t getoctchrs(void)
3078 {
3079 int64_t nr = 0;
3080 int c;
3081 int i;
3082
3083 for (i = 0; i < 3 && nr < 040; i++) { // -V536
3084 c = regparse[0];
3085 if (c < '0' || c > '7')
3086 break;
3087 nr <<= 3;
3088 nr |= hex2nr(c);
3089 ++regparse;
3090 }
3091
3092 if (i == 0)
3093 return -1;
3094 return nr;
3095 }
3096
3097 /*
3098 * Get a number after a backslash that is inside [].
3099 * When nothing is recognized return a backslash.
3100 */
coll_get_char(void)3101 static int coll_get_char(void)
3102 {
3103 int64_t nr = -1;
3104
3105 switch (*regparse++) {
3106 case 'd': nr = getdecchrs(); break;
3107 case 'o': nr = getoctchrs(); break;
3108 case 'x': nr = gethexchrs(2); break;
3109 case 'u': nr = gethexchrs(4); break;
3110 case 'U': nr = gethexchrs(8); break;
3111 }
3112 if (nr < 0 || nr > INT_MAX) {
3113 // If getting the number fails be backwards compatible: the character
3114 // is a backslash.
3115 regparse--;
3116 nr = '\\';
3117 }
3118 return nr;
3119 }
3120
3121 /*
3122 * read_limits - Read two integers to be taken as a minimum and maximum.
3123 * If the first character is '-', then the range is reversed.
3124 * Should end with 'end'. If minval is missing, zero is default, if maxval is
3125 * missing, a very big number is the default.
3126 */
read_limits(long * minval,long * maxval)3127 static int read_limits(long *minval, long *maxval)
3128 {
3129 int reverse = false;
3130 char_u *first_char;
3131 long tmp;
3132
3133 if (*regparse == '-') {
3134 // Starts with '-', so reverse the range later.
3135 regparse++;
3136 reverse = true;
3137 }
3138 first_char = regparse;
3139 *minval = getdigits_long(®parse, false, 0);
3140 if (*regparse == ',') { // There is a comma.
3141 if (ascii_isdigit(*++regparse)) {
3142 *maxval = getdigits_long(®parse, false, MAX_LIMIT);
3143 } else {
3144 *maxval = MAX_LIMIT;
3145 }
3146 } else if (ascii_isdigit(*first_char)) {
3147 *maxval = *minval; // It was \{n} or \{-n}
3148 } else {
3149 *maxval = MAX_LIMIT; // It was \{} or \{-}
3150 }
3151 if (*regparse == '\\') {
3152 regparse++; // Allow either \{...} or \{...\}
3153 }
3154 if (*regparse != '}') {
3155 sprintf((char *)IObuff, _("E554: Syntax error in %s{...}"),
3156 reg_magic == MAGIC_ALL ? "" : "\\");
3157 EMSG_RET_FAIL((char *)IObuff);
3158 }
3159
3160 /*
3161 * Reverse the range if there was a '-', or make sure it is in the right
3162 * order otherwise.
3163 */
3164 if ((!reverse && *minval > *maxval) || (reverse && *minval < *maxval)) {
3165 tmp = *minval;
3166 *minval = *maxval;
3167 *maxval = tmp;
3168 }
3169 skipchr(); /* let's be friends with the lexer again */
3170 return OK;
3171 }
3172
3173 /*
3174 * vim_regexec and friends
3175 */
3176
3177 /*
3178 * Global work variables for vim_regexec().
3179 */
3180
3181 /* Save the sub-expressions before attempting a match. */
3182 #define save_se(savep, posp, pp) \
3183 REG_MULTI ? save_se_multi((savep), (posp)) : save_se_one((savep), (pp))
3184
3185 /* After a failed match restore the sub-expressions. */
3186 #define restore_se(savep, posp, pp) { \
3187 if (REG_MULTI) \
3188 *(posp) = (savep)->se_u.pos; \
3189 else \
3190 *(pp) = (savep)->se_u.ptr; }
3191
3192
3193 #ifdef REGEXP_DEBUG
3194 int regnarrate = 0;
3195 #endif
3196
3197 // Sometimes need to save a copy of a line. Since alloc()/free() is very
3198 // slow, we keep one allocated piece of memory and only re-allocate it when
3199 // it's too small. It's freed in bt_regexec_both() when finished.
3200 static char_u *reg_tofree = NULL;
3201 static unsigned reg_tofreelen;
3202
3203 // Structure used to store the execution state of the regex engine.
3204 // Which ones are set depends on whether a single-line or multi-line match is
3205 // done:
3206 // single-line multi-line
3207 // reg_match ®match_T NULL
3208 // reg_mmatch NULL ®mmatch_T
3209 // reg_startp reg_match->startp <invalid>
3210 // reg_endp reg_match->endp <invalid>
3211 // reg_startpos <invalid> reg_mmatch->startpos
3212 // reg_endpos <invalid> reg_mmatch->endpos
3213 // reg_win NULL window in which to search
3214 // reg_buf curbuf buffer in which to search
3215 // reg_firstlnum <invalid> first line in which to search
3216 // reg_maxline 0 last line nr
3217 // reg_line_lbr false or true false
3218 typedef struct {
3219 regmatch_T *reg_match;
3220 regmmatch_T *reg_mmatch;
3221 char_u **reg_startp;
3222 char_u **reg_endp;
3223 lpos_T *reg_startpos;
3224 lpos_T *reg_endpos;
3225 win_T *reg_win;
3226 buf_T *reg_buf;
3227 linenr_T reg_firstlnum;
3228 linenr_T reg_maxline;
3229 bool reg_line_lbr; // "\n" in string is line break
3230
3231 // The current match-position is remembered with these variables:
3232 linenr_T lnum; ///< line number, relative to first line
3233 char_u *line; ///< start of current line
3234 char_u *input; ///< current input, points into "regline"
3235
3236 int need_clear_subexpr; ///< subexpressions still need to be cleared
3237 int need_clear_zsubexpr; ///< extmatch subexpressions still need to be
3238 ///< cleared
3239
3240
3241 // Internal copy of 'ignorecase'. It is set at each call to vim_regexec().
3242 // Normally it gets the value of "rm_ic" or "rmm_ic", but when the pattern
3243 // contains '\c' or '\C' the value is overruled.
3244 bool reg_ic;
3245
3246 // Similar to "reg_ic", but only for 'combining' characters. Set with \Z
3247 // flag in the regexp. Defaults to false, always.
3248 bool reg_icombine;
3249
3250 // Copy of "rmm_maxcol": maximum column to search for a match. Zero when
3251 // there is no maximum.
3252 colnr_T reg_maxcol;
3253
3254 // State for the NFA engine regexec.
3255 int nfa_has_zend; ///< NFA regexp \ze operator encountered.
3256 int nfa_has_backref; ///< NFA regexp \1 .. \9 encountered.
3257 int nfa_nsubexpr; ///< Number of sub expressions actually being used
3258 ///< during execution. 1 if only the whole match
3259 ///< (subexpr 0) is used.
3260 // listid is global, so that it increases on recursive calls to
3261 // nfa_regmatch(), which means we don't have to clear the lastlist field of
3262 // all the states.
3263 int nfa_listid;
3264 int nfa_alt_listid;
3265
3266 int nfa_has_zsubexpr; ///< NFA regexp has \z( ), set zsubexpr.
3267 } regexec_T;
3268
3269 static regexec_T rex;
3270 static bool rex_in_use = false;
3271
3272 /*
3273 * "regstack" and "backpos" are used by regmatch(). They are kept over calls
3274 * to avoid invoking malloc() and free() often.
3275 * "regstack" is a stack with regitem_T items, sometimes preceded by regstar_T
3276 * or regbehind_T.
3277 * "backpos_T" is a table with backpos_T for BACK
3278 */
3279 static garray_T regstack = GA_EMPTY_INIT_VALUE;
3280 static garray_T backpos = GA_EMPTY_INIT_VALUE;
3281
3282 /*
3283 * Both for regstack and backpos tables we use the following strategy of
3284 * allocation (to reduce malloc/free calls):
3285 * - Initial size is fairly small.
3286 * - When needed, the tables are grown bigger (8 times at first, double after
3287 * that).
3288 * - After executing the match we free the memory only if the array has grown.
3289 * Thus the memory is kept allocated when it's at the initial size.
3290 * This makes it fast while not keeping a lot of memory allocated.
3291 * A three times speed increase was observed when using many simple patterns.
3292 */
3293 #define REGSTACK_INITIAL 2048
3294 #define BACKPOS_INITIAL 64
3295
3296 #if defined(EXITFREE)
free_regexp_stuff(void)3297 void free_regexp_stuff(void)
3298 {
3299 ga_clear(®stack);
3300 ga_clear(&backpos);
3301 xfree(reg_tofree);
3302 xfree(reg_prev_sub);
3303 }
3304
3305 #endif
3306
3307 // Return true if character 'c' is included in 'iskeyword' option for
3308 // "reg_buf" buffer.
reg_iswordc(int c)3309 static bool reg_iswordc(int c)
3310 {
3311 return vim_iswordc_buf(c, rex.reg_buf);
3312 }
3313
3314 /*
3315 * Get pointer to the line "lnum", which is relative to "reg_firstlnum".
3316 */
reg_getline(linenr_T lnum)3317 static char_u *reg_getline(linenr_T lnum)
3318 {
3319 // when looking behind for a match/no-match lnum is negative. But we
3320 // can't go before line 1
3321 if (rex.reg_firstlnum + lnum < 1) {
3322 return NULL;
3323 }
3324 if (lnum > rex.reg_maxline) {
3325 // Must have matched the "\n" in the last line.
3326 return (char_u *)"";
3327 }
3328 return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum, false);
3329 }
3330
3331 static regsave_T behind_pos;
3332
3333 static char_u *reg_startzp[NSUBEXP]; /* Workspace to mark beginning */
3334 static char_u *reg_endzp[NSUBEXP]; /* and end of \z(...\) matches */
3335 static lpos_T reg_startzpos[NSUBEXP]; /* idem, beginning pos */
3336 static lpos_T reg_endzpos[NSUBEXP]; /* idem, end pos */
3337
3338 // true if using multi-line regexp.
3339 #define REG_MULTI (rex.reg_match == NULL)
3340
3341 /*
3342 * Match a regexp against a string.
3343 * "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
3344 * Uses curbuf for line count and 'iskeyword'.
3345 * If "line_lbr" is true, consider a "\n" in "line" to be a line break.
3346 *
3347 * Returns 0 for failure, number of lines contained in the match otherwise.
3348 */
3349 static int
bt_regexec_nl(regmatch_T * rmp,char_u * line,colnr_T col,bool line_lbr)3350 bt_regexec_nl (
3351 regmatch_T *rmp,
3352 char_u *line, /* string to match against */
3353 colnr_T col, /* column to start looking for match */
3354 bool line_lbr
3355 )
3356 {
3357 rex.reg_match = rmp;
3358 rex.reg_mmatch = NULL;
3359 rex.reg_maxline = 0;
3360 rex.reg_line_lbr = line_lbr;
3361 rex.reg_buf = curbuf;
3362 rex.reg_win = NULL;
3363 rex.reg_ic = rmp->rm_ic;
3364 rex.reg_icombine = false;
3365 rex.reg_maxcol = 0;
3366
3367 long r = bt_regexec_both(line, col, NULL, NULL);
3368 assert(r <= INT_MAX);
3369 return (int)r;
3370 }
3371
3372 /// Wrapper around strchr which accounts for case-insensitive searches and
3373 /// non-ASCII characters.
3374 ///
3375 /// This function is used a lot for simple searches, keep it fast!
3376 ///
3377 /// @param s string to search
3378 /// @param c character to find in @a s
3379 ///
3380 /// @return NULL if no match, otherwise pointer to the position in @a s
cstrchr(const char_u * const s,const int c)3381 static inline char_u *cstrchr(const char_u *const s, const int c)
3382 FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
3383 FUNC_ATTR_ALWAYS_INLINE
3384 {
3385 if (!rex.reg_ic) {
3386 return vim_strchr(s, c);
3387 }
3388
3389 // Use folded case for UTF-8, slow! For ASCII use libc strpbrk which is
3390 // expected to be highly optimized.
3391 if (c > 0x80) {
3392 const int folded_c = utf_fold(c);
3393 for (const char_u *p = s; *p != NUL; p += utfc_ptr2len(p)) {
3394 if (utf_fold(utf_ptr2char(p)) == folded_c) {
3395 return (char_u *)p;
3396 }
3397 }
3398 return NULL;
3399 }
3400
3401 int cc;
3402 if (ASCII_ISUPPER(c)) {
3403 cc = TOLOWER_ASC(c);
3404 } else if (ASCII_ISLOWER(c)) {
3405 cc = TOUPPER_ASC(c);
3406 } else {
3407 return vim_strchr(s, c);
3408 }
3409
3410 char tofind[] = { (char)c, (char)cc, NUL };
3411 return (char_u *)strpbrk((const char *)s, tofind);
3412 }
3413
3414 /// Matches a regexp against multiple lines.
3415 /// "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
3416 /// Uses curbuf for line count and 'iskeyword'.
3417 ///
3418 /// @param win Window in which to search or NULL
3419 /// @param buf Buffer in which to search
3420 /// @param lnum Number of line to start looking for match
3421 /// @param col Column to start looking for match
3422 /// @param tm Timeout limit or NULL
3423 ///
3424 /// @return zero if there is no match and number of lines contained in the match
3425 /// otherwise.
bt_regexec_multi(regmmatch_T * rmp,win_T * win,buf_T * buf,linenr_T lnum,colnr_T col,proftime_T * tm,int * timed_out)3426 static long bt_regexec_multi(regmmatch_T *rmp, win_T *win, buf_T *buf,
3427 linenr_T lnum, colnr_T col,
3428 proftime_T *tm, int *timed_out)
3429 {
3430 rex.reg_match = NULL;
3431 rex.reg_mmatch = rmp;
3432 rex.reg_buf = buf;
3433 rex.reg_win = win;
3434 rex.reg_firstlnum = lnum;
3435 rex.reg_maxline = rex.reg_buf->b_ml.ml_line_count - lnum;
3436 rex.reg_line_lbr = false;
3437 rex.reg_ic = rmp->rmm_ic;
3438 rex.reg_icombine = false;
3439 rex.reg_maxcol = rmp->rmm_maxcol;
3440
3441 return bt_regexec_both(NULL, col, tm, timed_out);
3442 }
3443
3444 /// Match a regexp against a string ("line" points to the string) or multiple
3445 /// lines (if "line" is NULL, use reg_getline()).
3446 /// @return 0 for failure, or number of lines contained in the match.
bt_regexec_both(char_u * line,colnr_T col,proftime_T * tm,int * timed_out)3447 static long bt_regexec_both(char_u *line,
3448 colnr_T col, // column to start search
3449 proftime_T *tm, // timeout limit or NULL
3450 int *timed_out) // flag set on timeout or NULL
3451 {
3452 bt_regprog_T *prog;
3453 char_u *s;
3454 long retval = 0L;
3455
3456 /* Create "regstack" and "backpos" if they are not allocated yet.
3457 * We allocate *_INITIAL amount of bytes first and then set the grow size
3458 * to much bigger value to avoid many malloc calls in case of deep regular
3459 * expressions. */
3460 if (regstack.ga_data == NULL) {
3461 /* Use an item size of 1 byte, since we push different things
3462 * onto the regstack. */
3463 ga_init(®stack, 1, REGSTACK_INITIAL);
3464 ga_grow(®stack, REGSTACK_INITIAL);
3465 ga_set_growsize(®stack, REGSTACK_INITIAL * 8);
3466 }
3467
3468 if (backpos.ga_data == NULL) {
3469 ga_init(&backpos, sizeof(backpos_T), BACKPOS_INITIAL);
3470 ga_grow(&backpos, BACKPOS_INITIAL);
3471 ga_set_growsize(&backpos, BACKPOS_INITIAL * 8);
3472 }
3473
3474 if (REG_MULTI) {
3475 prog = (bt_regprog_T *)rex.reg_mmatch->regprog;
3476 line = reg_getline((linenr_T)0);
3477 rex.reg_startpos = rex.reg_mmatch->startpos;
3478 rex.reg_endpos = rex.reg_mmatch->endpos;
3479 } else {
3480 prog = (bt_regprog_T *)rex.reg_match->regprog;
3481 rex.reg_startp = rex.reg_match->startp;
3482 rex.reg_endp = rex.reg_match->endp;
3483 }
3484
3485 /* Be paranoid... */
3486 if (prog == NULL || line == NULL) {
3487 iemsg(_(e_null));
3488 goto theend;
3489 }
3490
3491 /* Check validity of program. */
3492 if (prog_magic_wrong())
3493 goto theend;
3494
3495 // If the start column is past the maximum column: no need to try.
3496 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol) {
3497 goto theend;
3498 }
3499
3500 // If pattern contains "\c" or "\C": overrule value of rex.reg_ic
3501 if (prog->regflags & RF_ICASE) {
3502 rex.reg_ic = true;
3503 } else if (prog->regflags & RF_NOICASE) {
3504 rex.reg_ic = false;
3505 }
3506
3507 // If pattern contains "\Z" overrule value of rex.reg_icombine
3508 if (prog->regflags & RF_ICOMBINE) {
3509 rex.reg_icombine = true;
3510 }
3511
3512 /* If there is a "must appear" string, look for it. */
3513 if (prog->regmust != NULL) {
3514 int c = utf_ptr2char(prog->regmust);
3515 s = line + col;
3516
3517 // This is used very often, esp. for ":global". Use two versions of
3518 // the loop to avoid overhead of conditions.
3519 if (!rex.reg_ic) {
3520 while ((s = vim_strchr(s, c)) != NULL) {
3521 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0) {
3522 break; // Found it.
3523 }
3524 MB_PTR_ADV(s);
3525 }
3526 } else {
3527 while ((s = cstrchr(s, c)) != NULL) {
3528 if (cstrncmp(s, prog->regmust, &prog->regmlen) == 0) {
3529 break; // Found it.
3530 }
3531 MB_PTR_ADV(s);
3532 }
3533 }
3534 if (s == NULL) { // Not present.
3535 goto theend;
3536 }
3537 }
3538
3539 rex.line = line;
3540 rex.lnum = 0;
3541 reg_toolong = false;
3542
3543 /* Simplest case: Anchored match need be tried only once. */
3544 if (prog->reganch) {
3545 int c = utf_ptr2char(rex.line + col);
3546 if (prog->regstart == NUL
3547 || prog->regstart == c
3548 || (rex.reg_ic
3549 && (utf_fold(prog->regstart) == utf_fold(c)
3550 || (c < 255 && prog->regstart < 255
3551 && mb_tolower(prog->regstart) == mb_tolower(c))))) {
3552 retval = regtry(prog, col, tm, timed_out);
3553 } else {
3554 retval = 0;
3555 }
3556 } else {
3557 int tm_count = 0;
3558 /* Messy cases: unanchored match. */
3559 while (!got_int) {
3560 if (prog->regstart != NUL) {
3561 // Skip until the char we know it must start with.
3562 s = cstrchr(rex.line + col, prog->regstart);
3563 if (s == NULL) {
3564 retval = 0;
3565 break;
3566 }
3567 col = (int)(s - rex.line);
3568 }
3569
3570 // Check for maximum column to try.
3571 if (rex.reg_maxcol > 0 && col >= rex.reg_maxcol) {
3572 retval = 0;
3573 break;
3574 }
3575
3576 retval = regtry(prog, col, tm, timed_out);
3577 if (retval > 0) {
3578 break;
3579 }
3580
3581 // if not currently on the first line, get it again
3582 if (rex.lnum != 0) {
3583 rex.lnum = 0;
3584 rex.line = reg_getline((linenr_T)0);
3585 }
3586 if (rex.line[col] == NUL) {
3587 break;
3588 }
3589 col += utfc_ptr2len(rex.line + col);
3590 // Check for timeout once in a twenty times to avoid overhead.
3591 if (tm != NULL && ++tm_count == 20) {
3592 tm_count = 0;
3593 if (profile_passed_limit(*tm)) {
3594 if (timed_out != NULL) {
3595 *timed_out = true;
3596 }
3597 break;
3598 }
3599 }
3600 }
3601 }
3602
3603 theend:
3604 /* Free "reg_tofree" when it's a bit big.
3605 * Free regstack and backpos if they are bigger than their initial size. */
3606 if (reg_tofreelen > 400) {
3607 XFREE_CLEAR(reg_tofree);
3608 }
3609 if (regstack.ga_maxlen > REGSTACK_INITIAL)
3610 ga_clear(®stack);
3611 if (backpos.ga_maxlen > BACKPOS_INITIAL)
3612 ga_clear(&backpos);
3613
3614 if (retval > 0) {
3615 // Make sure the end is never before the start. Can happen when \zs
3616 // and \ze are used.
3617 if (REG_MULTI) {
3618 const lpos_T *const start = &rex.reg_mmatch->startpos[0];
3619 const lpos_T *const end = &rex.reg_mmatch->endpos[0];
3620
3621 if (end->lnum < start->lnum
3622 || (end->lnum == start->lnum && end->col < start->col)) {
3623 rex.reg_mmatch->endpos[0] = rex.reg_mmatch->startpos[0];
3624 }
3625 } else {
3626 if (rex.reg_match->endp[0] < rex.reg_match->startp[0]) {
3627 rex.reg_match->endp[0] = rex.reg_match->startp[0];
3628 }
3629 }
3630 }
3631
3632 return retval;
3633 }
3634
3635
3636 /*
3637 * Create a new extmatch and mark it as referenced once.
3638 */
make_extmatch(void)3639 static reg_extmatch_T *make_extmatch(void)
3640 FUNC_ATTR_NONNULL_RET
3641 {
3642 reg_extmatch_T *em = xcalloc(1, sizeof(reg_extmatch_T));
3643 em->refcnt = 1;
3644 return em;
3645 }
3646
3647 /*
3648 * Add a reference to an extmatch.
3649 */
ref_extmatch(reg_extmatch_T * em)3650 reg_extmatch_T *ref_extmatch(reg_extmatch_T *em)
3651 {
3652 if (em != NULL)
3653 em->refcnt++;
3654 return em;
3655 }
3656
3657 /*
3658 * Remove a reference to an extmatch. If there are no references left, free
3659 * the info.
3660 */
unref_extmatch(reg_extmatch_T * em)3661 void unref_extmatch(reg_extmatch_T *em)
3662 {
3663 int i;
3664
3665 if (em != NULL && --em->refcnt <= 0) {
3666 for (i = 0; i < NSUBEXP; ++i)
3667 xfree(em->matches[i]);
3668 xfree(em);
3669 }
3670 }
3671
3672 /// Try match of "prog" with at rex.line["col"].
3673 /// @returns 0 for failure, or number of lines contained in the match.
regtry(bt_regprog_T * prog,colnr_T col,proftime_T * tm,int * timed_out)3674 static long regtry(bt_regprog_T *prog,
3675 colnr_T col,
3676 proftime_T *tm, // timeout limit or NULL
3677 int *timed_out) // flag set on timeout or NULL
3678 {
3679 rex.input = rex.line + col;
3680 rex.need_clear_subexpr = true;
3681 // Clear the external match subpointers if necessaey.
3682 rex.need_clear_zsubexpr = (prog->reghasz == REX_SET);
3683
3684 if (regmatch(prog->program + 1, tm, timed_out) == 0) {
3685 return 0;
3686 }
3687
3688 cleanup_subexpr();
3689 if (REG_MULTI) {
3690 if (rex.reg_startpos[0].lnum < 0) {
3691 rex.reg_startpos[0].lnum = 0;
3692 rex.reg_startpos[0].col = col;
3693 }
3694 if (rex.reg_endpos[0].lnum < 0) {
3695 rex.reg_endpos[0].lnum = rex.lnum;
3696 rex.reg_endpos[0].col = (int)(rex.input - rex.line);
3697 } else {
3698 // Use line number of "\ze".
3699 rex.lnum = rex.reg_endpos[0].lnum;
3700 }
3701 } else {
3702 if (rex.reg_startp[0] == NULL) {
3703 rex.reg_startp[0] = rex.line + col;
3704 }
3705 if (rex.reg_endp[0] == NULL) {
3706 rex.reg_endp[0] = rex.input;
3707 }
3708 }
3709 /* Package any found \z(...\) matches for export. Default is none. */
3710 unref_extmatch(re_extmatch_out);
3711 re_extmatch_out = NULL;
3712
3713 if (prog->reghasz == REX_SET) {
3714 int i;
3715
3716 cleanup_zsubexpr();
3717 re_extmatch_out = make_extmatch();
3718 for (i = 0; i < NSUBEXP; i++) {
3719 if (REG_MULTI) {
3720 /* Only accept single line matches. */
3721 if (reg_startzpos[i].lnum >= 0
3722 && reg_endzpos[i].lnum == reg_startzpos[i].lnum
3723 && reg_endzpos[i].col >= reg_startzpos[i].col) {
3724 re_extmatch_out->matches[i] =
3725 vim_strnsave(reg_getline(reg_startzpos[i].lnum)
3726 + reg_startzpos[i].col,
3727 reg_endzpos[i].col
3728 - reg_startzpos[i].col);
3729 }
3730 } else {
3731 if (reg_startzp[i] != NULL && reg_endzp[i] != NULL)
3732 re_extmatch_out->matches[i] =
3733 vim_strnsave(reg_startzp[i], reg_endzp[i] - reg_startzp[i]);
3734 }
3735 }
3736 }
3737 return 1 + rex.lnum;
3738 }
3739
3740
3741 // Get class of previous character.
reg_prev_class(void)3742 static int reg_prev_class(void)
3743 {
3744 if (rex.input > rex.line) {
3745 return mb_get_class_tab(
3746 rex.input - 1 - utf_head_off(rex.line, rex.input - 1),
3747 rex.reg_buf->b_chartab);
3748 }
3749 return -1;
3750 }
3751
3752
3753 // Return true if the current rex.input position matches the Visual area.
reg_match_visual(void)3754 static bool reg_match_visual(void)
3755 {
3756 pos_T top, bot;
3757 linenr_T lnum;
3758 colnr_T col;
3759 win_T *wp = rex.reg_win == NULL ? curwin : rex.reg_win;
3760 int mode;
3761 colnr_T start, end;
3762 colnr_T start2, end2;
3763 colnr_T curswant;
3764
3765 // Check if the buffer is the current buffer.
3766 if (rex.reg_buf != curbuf || VIsual.lnum == 0) {
3767 return false;
3768 }
3769
3770 if (VIsual_active) {
3771 if (lt(VIsual, wp->w_cursor)) {
3772 top = VIsual;
3773 bot = wp->w_cursor;
3774 } else {
3775 top = wp->w_cursor;
3776 bot = VIsual;
3777 }
3778 mode = VIsual_mode;
3779 curswant = wp->w_curswant;
3780 } else {
3781 if (lt(curbuf->b_visual.vi_start, curbuf->b_visual.vi_end)) {
3782 top = curbuf->b_visual.vi_start;
3783 bot = curbuf->b_visual.vi_end;
3784 } else {
3785 top = curbuf->b_visual.vi_end;
3786 bot = curbuf->b_visual.vi_start;
3787 }
3788 mode = curbuf->b_visual.vi_mode;
3789 curswant = curbuf->b_visual.vi_curswant;
3790 }
3791 lnum = rex.lnum + rex.reg_firstlnum;
3792 if (lnum < top.lnum || lnum > bot.lnum) {
3793 return false;
3794 }
3795
3796 if (mode == 'v') {
3797 col = (colnr_T)(rex.input - rex.line);
3798 if ((lnum == top.lnum && col < top.col)
3799 || (lnum == bot.lnum && col >= bot.col + (*p_sel != 'e'))) {
3800 return false;
3801 }
3802 } else if (mode == Ctrl_V) {
3803 getvvcol(wp, &top, &start, NULL, &end);
3804 getvvcol(wp, &bot, &start2, NULL, &end2);
3805 if (start2 < start)
3806 start = start2;
3807 if (end2 > end)
3808 end = end2;
3809 if (top.col == MAXCOL || bot.col == MAXCOL || curswant == MAXCOL) {
3810 end = MAXCOL;
3811 }
3812 unsigned int cols_u = win_linetabsize(wp, rex.line,
3813 (colnr_T)(rex.input - rex.line));
3814 assert(cols_u <= MAXCOL);
3815 colnr_T cols = (colnr_T)cols_u;
3816 if (cols < start || cols > end - (*p_sel == 'e')) {
3817 return false;
3818 }
3819 }
3820 return true;
3821 }
3822
3823 #define ADVANCE_REGINPUT() MB_PTR_ADV(rex.input)
3824
3825 /*
3826 * The arguments from BRACE_LIMITS are stored here. They are actually local
3827 * to regmatch(), but they are here to reduce the amount of stack space used
3828 * (it can be called recursively many times).
3829 */
3830 static long bl_minval;
3831 static long bl_maxval;
3832
3833 /// Main matching routine
3834 ///
3835 /// Conceptually the strategy is simple: Check to see whether the current node
3836 /// matches, push an item onto the regstack and loop to see whether the rest
3837 /// matches, and then act accordingly. In practice we make some effort to
3838 /// avoid using the regstack, in particular by going through "ordinary" nodes
3839 /// (that don't need to know whether the rest of the match failed) by a nested
3840 /// loop.
3841 ///
3842 /// Returns true when there is a match. Leaves rex.input and rex.lnum
3843 /// just after the last matched character.
3844 /// Returns false when there is no match. Leaves rex.input and rex.lnum in an
3845 /// undefined state!
regmatch(char_u * scan,proftime_T * tm,int * timed_out)3846 static bool regmatch(
3847 char_u *scan, // Current node.
3848 proftime_T *tm, // timeout limit or NULL
3849 int *timed_out // flag set on timeout or NULL
3850 )
3851 {
3852 char_u *next; /* Next node. */
3853 int op;
3854 int c;
3855 regitem_T *rp;
3856 int no;
3857 int status; // one of the RA_ values:
3858 int tm_count = 0;
3859 #define RA_FAIL 1 // something failed, abort
3860 #define RA_CONT 2 // continue in inner loop
3861 #define RA_BREAK 3 // break inner loop
3862 #define RA_MATCH 4 // successful match
3863 #define RA_NOMATCH 5 // didn't match
3864
3865 // Make "regstack" and "backpos" empty. They are allocated and freed in
3866 // bt_regexec_both() to reduce malloc()/free() calls.
3867 regstack.ga_len = 0;
3868 backpos.ga_len = 0;
3869
3870 /*
3871 * Repeat until "regstack" is empty.
3872 */
3873 for (;; ) {
3874 /* Some patterns may take a long time to match, e.g., "\([a-z]\+\)\+Q".
3875 * Allow interrupting them with CTRL-C. */
3876 fast_breakcheck();
3877
3878 #ifdef REGEXP_DEBUG
3879 if (scan != NULL && regnarrate) {
3880 mch_errmsg((char *)regprop(scan));
3881 mch_errmsg("(\n");
3882 }
3883 #endif
3884
3885 /*
3886 * Repeat for items that can be matched sequentially, without using the
3887 * regstack.
3888 */
3889 for (;; ) {
3890 if (got_int || scan == NULL) {
3891 status = RA_FAIL;
3892 break;
3893 }
3894 // Check for timeout once in a 100 times to avoid overhead.
3895 if (tm != NULL && ++tm_count == 100) {
3896 tm_count = 0;
3897 if (profile_passed_limit(*tm)) {
3898 if (timed_out != NULL) {
3899 *timed_out = true;
3900 }
3901 status = RA_FAIL;
3902 break;
3903 }
3904 }
3905 status = RA_CONT;
3906
3907 #ifdef REGEXP_DEBUG
3908 if (regnarrate) {
3909 mch_errmsg((char *)regprop(scan));
3910 mch_errmsg("...\n");
3911 if (re_extmatch_in != NULL) {
3912 int i;
3913
3914 mch_errmsg(_("External submatches:\n"));
3915 for (i = 0; i < NSUBEXP; i++) {
3916 mch_errmsg(" \"");
3917 if (re_extmatch_in->matches[i] != NULL)
3918 mch_errmsg((char *)re_extmatch_in->matches[i]);
3919 mch_errmsg("\"\n");
3920 }
3921 }
3922 }
3923 #endif
3924 next = regnext(scan);
3925
3926 op = OP(scan);
3927 // Check for character class with NL added.
3928 if (!rex.reg_line_lbr && WITH_NL(op) && REG_MULTI
3929 && *rex.input == NUL && rex.lnum <= rex.reg_maxline) {
3930 reg_nextline();
3931 } else if (rex.reg_line_lbr && WITH_NL(op) && *rex.input == '\n') {
3932 ADVANCE_REGINPUT();
3933 } else {
3934 if (WITH_NL(op)) {
3935 op -= ADD_NL;
3936 }
3937 c = utf_ptr2char(rex.input);
3938 switch (op) {
3939 case BOL:
3940 if (rex.input != rex.line) {
3941 status = RA_NOMATCH;
3942 }
3943 break;
3944
3945 case EOL:
3946 if (c != NUL) {
3947 status = RA_NOMATCH;
3948 }
3949 break;
3950
3951 case RE_BOF:
3952 // We're not at the beginning of the file when below the first
3953 // line where we started, not at the start of the line or we
3954 // didn't start at the first line of the buffer.
3955 if (rex.lnum != 0 || rex.input != rex.line
3956 || (REG_MULTI && rex.reg_firstlnum > 1)) {
3957 status = RA_NOMATCH;
3958 }
3959 break;
3960
3961 case RE_EOF:
3962 if (rex.lnum != rex.reg_maxline || c != NUL) {
3963 status = RA_NOMATCH;
3964 }
3965 break;
3966
3967 case CURSOR:
3968 // Check if the buffer is in a window and compare the
3969 // rex.reg_win->w_cursor position to the match position.
3970 if (rex.reg_win == NULL
3971 || (rex.lnum + rex.reg_firstlnum != rex.reg_win->w_cursor.lnum)
3972 || ((colnr_T)(rex.input - rex.line) !=
3973 rex.reg_win->w_cursor.col)) {
3974 status = RA_NOMATCH;
3975 }
3976 break;
3977
3978 case RE_MARK:
3979 /* Compare the mark position to the match position. */
3980 {
3981 int mark = OPERAND(scan)[0];
3982 int cmp = OPERAND(scan)[1];
3983 pos_T *pos;
3984
3985 pos = getmark_buf(rex.reg_buf, mark, false);
3986 if (pos == NULL // mark doesn't exist
3987 || pos->lnum <= 0) { // mark isn't set in reg_buf
3988 status = RA_NOMATCH;
3989 } else {
3990 const colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum
3991 && pos->col == MAXCOL
3992 ? (colnr_T)STRLEN(reg_getline(pos->lnum - rex.reg_firstlnum))
3993 : pos->col;
3994
3995 if (pos->lnum == rex.lnum + rex.reg_firstlnum
3996 ? (pos_col == (colnr_T)(rex.input - rex.line)
3997 ? (cmp == '<' || cmp == '>')
3998 : (pos_col < (colnr_T)(rex.input - rex.line)
3999 ? cmp != '>'
4000 : cmp != '<'))
4001 : (pos->lnum < rex.lnum + rex.reg_firstlnum
4002 ? cmp != '>'
4003 : cmp != '<')) {
4004 status = RA_NOMATCH;
4005 }
4006 }
4007 }
4008 break;
4009
4010 case RE_VISUAL:
4011 if (!reg_match_visual())
4012 status = RA_NOMATCH;
4013 break;
4014
4015 case RE_LNUM:
4016 assert(rex.lnum + rex.reg_firstlnum >= 0
4017 && (uintmax_t)(rex.lnum + rex.reg_firstlnum) <= UINT32_MAX);
4018 if (!REG_MULTI
4019 || !re_num_cmp((uint32_t)(rex.lnum + rex.reg_firstlnum), scan)) {
4020 status = RA_NOMATCH;
4021 }
4022 break;
4023
4024 case RE_COL:
4025 assert(rex.input - rex.line + 1 >= 0
4026 && (uintmax_t)(rex.input - rex.line + 1) <= UINT32_MAX);
4027 if (!re_num_cmp((uint32_t)(rex.input - rex.line + 1), scan)) {
4028 status = RA_NOMATCH;
4029 }
4030 break;
4031
4032 case RE_VCOL:
4033 if (!re_num_cmp(win_linetabsize(rex.reg_win == NULL
4034 ? curwin : rex.reg_win,
4035 rex.line,
4036 (colnr_T)(rex.input - rex.line)) + 1,
4037 scan)) {
4038 status = RA_NOMATCH;
4039 }
4040 break;
4041
4042 case BOW: // \<word; rex.input points to w
4043 if (c == NUL) { // Can't match at end of line
4044 status = RA_NOMATCH;
4045 } else {
4046 // Get class of current and previous char (if it exists).
4047 const int this_class =
4048 mb_get_class_tab(rex.input, rex.reg_buf->b_chartab);
4049 if (this_class <= 1) {
4050 status = RA_NOMATCH; // Not on a word at all.
4051 } else if (reg_prev_class() == this_class) {
4052 status = RA_NOMATCH; // Previous char is in same word.
4053 }
4054 }
4055 break;
4056
4057 case EOW: // word\>; rex.input points after d
4058 if (rex.input == rex.line) { // Can't match at start of line
4059 status = RA_NOMATCH;
4060 } else {
4061 int this_class, prev_class;
4062
4063 // Get class of current and previous char (if it exists).
4064 this_class = mb_get_class_tab(rex.input, rex.reg_buf->b_chartab);
4065 prev_class = reg_prev_class();
4066 if (this_class == prev_class
4067 || prev_class == 0 || prev_class == 1) {
4068 status = RA_NOMATCH;
4069 }
4070 }
4071 break; // Matched with EOW
4072
4073 case ANY:
4074 // ANY does not match new lines.
4075 if (c == NUL) {
4076 status = RA_NOMATCH;
4077 } else {
4078 ADVANCE_REGINPUT();
4079 }
4080 break;
4081
4082 case IDENT:
4083 if (!vim_isIDc(c))
4084 status = RA_NOMATCH;
4085 else
4086 ADVANCE_REGINPUT();
4087 break;
4088
4089 case SIDENT:
4090 if (ascii_isdigit(*rex.input) || !vim_isIDc(c)) {
4091 status = RA_NOMATCH;
4092 } else {
4093 ADVANCE_REGINPUT();
4094 }
4095 break;
4096
4097 case KWORD:
4098 if (!vim_iswordp_buf(rex.input, rex.reg_buf)) {
4099 status = RA_NOMATCH;
4100 } else {
4101 ADVANCE_REGINPUT();
4102 }
4103 break;
4104
4105 case SKWORD:
4106 if (ascii_isdigit(*rex.input)
4107 || !vim_iswordp_buf(rex.input, rex.reg_buf)) {
4108 status = RA_NOMATCH;
4109 } else {
4110 ADVANCE_REGINPUT();
4111 }
4112 break;
4113
4114 case FNAME:
4115 if (!vim_isfilec(c)) {
4116 status = RA_NOMATCH;
4117 } else {
4118 ADVANCE_REGINPUT();
4119 }
4120 break;
4121
4122 case SFNAME:
4123 if (ascii_isdigit(*rex.input) || !vim_isfilec(c)) {
4124 status = RA_NOMATCH;
4125 } else {
4126 ADVANCE_REGINPUT();
4127 }
4128 break;
4129
4130 case PRINT:
4131 if (!vim_isprintc(utf_ptr2char(rex.input))) {
4132 status = RA_NOMATCH;
4133 } else {
4134 ADVANCE_REGINPUT();
4135 }
4136 break;
4137
4138 case SPRINT:
4139 if (ascii_isdigit(*rex.input) || !vim_isprintc(utf_ptr2char(rex.input))) {
4140 status = RA_NOMATCH;
4141 } else {
4142 ADVANCE_REGINPUT();
4143 }
4144 break;
4145
4146 case WHITE:
4147 if (!ascii_iswhite(c))
4148 status = RA_NOMATCH;
4149 else
4150 ADVANCE_REGINPUT();
4151 break;
4152
4153 case NWHITE:
4154 if (c == NUL || ascii_iswhite(c))
4155 status = RA_NOMATCH;
4156 else
4157 ADVANCE_REGINPUT();
4158 break;
4159
4160 case DIGIT:
4161 if (!ri_digit(c))
4162 status = RA_NOMATCH;
4163 else
4164 ADVANCE_REGINPUT();
4165 break;
4166
4167 case NDIGIT:
4168 if (c == NUL || ri_digit(c))
4169 status = RA_NOMATCH;
4170 else
4171 ADVANCE_REGINPUT();
4172 break;
4173
4174 case HEX:
4175 if (!ri_hex(c))
4176 status = RA_NOMATCH;
4177 else
4178 ADVANCE_REGINPUT();
4179 break;
4180
4181 case NHEX:
4182 if (c == NUL || ri_hex(c))
4183 status = RA_NOMATCH;
4184 else
4185 ADVANCE_REGINPUT();
4186 break;
4187
4188 case OCTAL:
4189 if (!ri_octal(c))
4190 status = RA_NOMATCH;
4191 else
4192 ADVANCE_REGINPUT();
4193 break;
4194
4195 case NOCTAL:
4196 if (c == NUL || ri_octal(c))
4197 status = RA_NOMATCH;
4198 else
4199 ADVANCE_REGINPUT();
4200 break;
4201
4202 case WORD:
4203 if (!ri_word(c))
4204 status = RA_NOMATCH;
4205 else
4206 ADVANCE_REGINPUT();
4207 break;
4208
4209 case NWORD:
4210 if (c == NUL || ri_word(c))
4211 status = RA_NOMATCH;
4212 else
4213 ADVANCE_REGINPUT();
4214 break;
4215
4216 case HEAD:
4217 if (!ri_head(c))
4218 status = RA_NOMATCH;
4219 else
4220 ADVANCE_REGINPUT();
4221 break;
4222
4223 case NHEAD:
4224 if (c == NUL || ri_head(c))
4225 status = RA_NOMATCH;
4226 else
4227 ADVANCE_REGINPUT();
4228 break;
4229
4230 case ALPHA:
4231 if (!ri_alpha(c))
4232 status = RA_NOMATCH;
4233 else
4234 ADVANCE_REGINPUT();
4235 break;
4236
4237 case NALPHA:
4238 if (c == NUL || ri_alpha(c))
4239 status = RA_NOMATCH;
4240 else
4241 ADVANCE_REGINPUT();
4242 break;
4243
4244 case LOWER:
4245 if (!ri_lower(c))
4246 status = RA_NOMATCH;
4247 else
4248 ADVANCE_REGINPUT();
4249 break;
4250
4251 case NLOWER:
4252 if (c == NUL || ri_lower(c))
4253 status = RA_NOMATCH;
4254 else
4255 ADVANCE_REGINPUT();
4256 break;
4257
4258 case UPPER:
4259 if (!ri_upper(c))
4260 status = RA_NOMATCH;
4261 else
4262 ADVANCE_REGINPUT();
4263 break;
4264
4265 case NUPPER:
4266 if (c == NUL || ri_upper(c))
4267 status = RA_NOMATCH;
4268 else
4269 ADVANCE_REGINPUT();
4270 break;
4271
4272 case EXACTLY:
4273 {
4274 int len;
4275 char_u *opnd;
4276
4277 opnd = OPERAND(scan);
4278 // Inline the first byte, for speed.
4279 if (*opnd != *rex.input
4280 && (!rex.reg_ic)) {
4281 status = RA_NOMATCH;
4282 } else if (*opnd == NUL) {
4283 // match empty string always works; happens when "~" is
4284 // empty.
4285 } else {
4286 if (opnd[1] == NUL && !rex.reg_ic) {
4287 len = 1; // matched a single byte above
4288 } else {
4289 // Need to match first byte again for multi-byte.
4290 len = (int)STRLEN(opnd);
4291 if (cstrncmp(opnd, rex.input, &len) != 0) {
4292 status = RA_NOMATCH;
4293 }
4294 }
4295 // Check for following composing character, unless %C
4296 // follows (skips over all composing chars).
4297 if (status != RA_NOMATCH
4298 && utf_composinglike(rex.input, rex.input + len)
4299 && !rex.reg_icombine
4300 && OP(next) != RE_COMPOSING) {
4301 // raaron: This code makes a composing character get
4302 // ignored, which is the correct behavior (sometimes)
4303 // for voweled Hebrew texts.
4304 status = RA_NOMATCH;
4305 }
4306 if (status != RA_NOMATCH) {
4307 rex.input += len;
4308 }
4309 }
4310 }
4311 break;
4312
4313 case ANYOF:
4314 case ANYBUT:
4315 if (c == NUL)
4316 status = RA_NOMATCH;
4317 else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
4318 status = RA_NOMATCH;
4319 else
4320 ADVANCE_REGINPUT();
4321 break;
4322
4323 case MULTIBYTECODE:
4324 {
4325 int i, len;
4326
4327 const char_u *opnd = OPERAND(scan);
4328 // Safety check (just in case 'encoding' was changed since
4329 // compiling the program).
4330 if ((len = utfc_ptr2len(opnd)) < 2) {
4331 status = RA_NOMATCH;
4332 break;
4333 }
4334 const int opndc = utf_ptr2char(opnd);
4335 if (utf_iscomposing(opndc)) {
4336 // When only a composing char is given match at any
4337 // position where that composing char appears.
4338 status = RA_NOMATCH;
4339 for (i = 0; rex.input[i] != NUL;
4340 i += utf_ptr2len(rex.input + i)) {
4341 const int inpc = utf_ptr2char(rex.input + i);
4342 if (!utf_iscomposing(inpc)) {
4343 if (i > 0) {
4344 break;
4345 }
4346 } else if (opndc == inpc) {
4347 // Include all following composing chars.
4348 len = i + utfc_ptr2len(rex.input + i);
4349 status = RA_MATCH;
4350 break;
4351 }
4352 }
4353 } else {
4354 for (i = 0; i < len; i++) {
4355 if (opnd[i] != rex.input[i]) {
4356 status = RA_NOMATCH;
4357 break;
4358 }
4359 }
4360 }
4361 rex.input += len;
4362 }
4363 break;
4364
4365 case RE_COMPOSING:
4366 {
4367 // Skip composing characters.
4368 while (utf_iscomposing(utf_ptr2char(rex.input))) {
4369 MB_CPTR_ADV(rex.input);
4370 }
4371 }
4372 break;
4373
4374 case NOTHING:
4375 break;
4376
4377 case BACK:
4378 {
4379 int i;
4380
4381 /*
4382 * When we run into BACK we need to check if we don't keep
4383 * looping without matching any input. The second and later
4384 * times a BACK is encountered it fails if the input is still
4385 * at the same position as the previous time.
4386 * The positions are stored in "backpos" and found by the
4387 * current value of "scan", the position in the RE program.
4388 */
4389 backpos_T *bp = (backpos_T *)backpos.ga_data;
4390 for (i = 0; i < backpos.ga_len; ++i)
4391 if (bp[i].bp_scan == scan)
4392 break;
4393 if (i == backpos.ga_len) {
4394 backpos_T *p = GA_APPEND_VIA_PTR(backpos_T, &backpos);
4395 p->bp_scan = scan;
4396 } else if (reg_save_equal(&bp[i].bp_pos))
4397 /* Still at same position as last time, fail. */
4398 status = RA_NOMATCH;
4399
4400 assert(status != RA_FAIL);
4401 if (status != RA_NOMATCH) {
4402 reg_save(&bp[i].bp_pos, &backpos);
4403 }
4404 }
4405 break;
4406
4407 case MOPEN + 0: /* Match start: \zs */
4408 case MOPEN + 1: /* \( */
4409 case MOPEN + 2:
4410 case MOPEN + 3:
4411 case MOPEN + 4:
4412 case MOPEN + 5:
4413 case MOPEN + 6:
4414 case MOPEN + 7:
4415 case MOPEN + 8:
4416 case MOPEN + 9:
4417 {
4418 no = op - MOPEN;
4419 cleanup_subexpr();
4420 rp = regstack_push(RS_MOPEN, scan);
4421 if (rp == NULL)
4422 status = RA_FAIL;
4423 else {
4424 rp->rs_no = no;
4425 save_se(&rp->rs_un.sesave, &rex.reg_startpos[no],
4426 &rex.reg_startp[no]);
4427 // We simply continue and handle the result when done.
4428 }
4429 }
4430 break;
4431
4432 case NOPEN: /* \%( */
4433 case NCLOSE: /* \) after \%( */
4434 if (regstack_push(RS_NOPEN, scan) == NULL)
4435 status = RA_FAIL;
4436 /* We simply continue and handle the result when done. */
4437 break;
4438
4439 case ZOPEN + 1:
4440 case ZOPEN + 2:
4441 case ZOPEN + 3:
4442 case ZOPEN + 4:
4443 case ZOPEN + 5:
4444 case ZOPEN + 6:
4445 case ZOPEN + 7:
4446 case ZOPEN + 8:
4447 case ZOPEN + 9:
4448 {
4449 no = op - ZOPEN;
4450 cleanup_zsubexpr();
4451 rp = regstack_push(RS_ZOPEN, scan);
4452 if (rp == NULL)
4453 status = RA_FAIL;
4454 else {
4455 rp->rs_no = no;
4456 save_se(&rp->rs_un.sesave, ®_startzpos[no],
4457 ®_startzp[no]);
4458 /* We simply continue and handle the result when done. */
4459 }
4460 }
4461 break;
4462
4463 case MCLOSE + 0: /* Match end: \ze */
4464 case MCLOSE + 1: /* \) */
4465 case MCLOSE + 2:
4466 case MCLOSE + 3:
4467 case MCLOSE + 4:
4468 case MCLOSE + 5:
4469 case MCLOSE + 6:
4470 case MCLOSE + 7:
4471 case MCLOSE + 8:
4472 case MCLOSE + 9:
4473 {
4474 no = op - MCLOSE;
4475 cleanup_subexpr();
4476 rp = regstack_push(RS_MCLOSE, scan);
4477 if (rp == NULL) {
4478 status = RA_FAIL;
4479 } else {
4480 rp->rs_no = no;
4481 save_se(&rp->rs_un.sesave, &rex.reg_endpos[no], &rex.reg_endp[no]);
4482 // We simply continue and handle the result when done.
4483 }
4484 }
4485 break;
4486
4487 case ZCLOSE + 1: /* \) after \z( */
4488 case ZCLOSE + 2:
4489 case ZCLOSE + 3:
4490 case ZCLOSE + 4:
4491 case ZCLOSE + 5:
4492 case ZCLOSE + 6:
4493 case ZCLOSE + 7:
4494 case ZCLOSE + 8:
4495 case ZCLOSE + 9:
4496 {
4497 no = op - ZCLOSE;
4498 cleanup_zsubexpr();
4499 rp = regstack_push(RS_ZCLOSE, scan);
4500 if (rp == NULL)
4501 status = RA_FAIL;
4502 else {
4503 rp->rs_no = no;
4504 save_se(&rp->rs_un.sesave, ®_endzpos[no],
4505 ®_endzp[no]);
4506 /* We simply continue and handle the result when done. */
4507 }
4508 }
4509 break;
4510
4511 case BACKREF + 1:
4512 case BACKREF + 2:
4513 case BACKREF + 3:
4514 case BACKREF + 4:
4515 case BACKREF + 5:
4516 case BACKREF + 6:
4517 case BACKREF + 7:
4518 case BACKREF + 8:
4519 case BACKREF + 9:
4520 {
4521 int len;
4522
4523 no = op - BACKREF;
4524 cleanup_subexpr();
4525 if (!REG_MULTI) { // Single-line regexp
4526 if (rex.reg_startp[no] == NULL || rex.reg_endp[no] == NULL) {
4527 // Backref was not set: Match an empty string.
4528 len = 0;
4529 } else {
4530 // Compare current input with back-ref in the same line.
4531 len = (int)(rex.reg_endp[no] - rex.reg_startp[no]);
4532 if (cstrncmp(rex.reg_startp[no], rex.input, &len) != 0) {
4533 status = RA_NOMATCH;
4534 }
4535 }
4536 } else { // Multi-line regexp
4537 if (rex.reg_startpos[no].lnum < 0 || rex.reg_endpos[no].lnum < 0) {
4538 // Backref was not set: Match an empty string.
4539 len = 0;
4540 } else {
4541 if (rex.reg_startpos[no].lnum == rex.lnum
4542 && rex.reg_endpos[no].lnum == rex.lnum) {
4543 // Compare back-ref within the current line.
4544 len = rex.reg_endpos[no].col - rex.reg_startpos[no].col;
4545 if (cstrncmp(rex.line + rex.reg_startpos[no].col,
4546 rex.input, &len) != 0) {
4547 status = RA_NOMATCH;
4548 }
4549 } else {
4550 // Messy situation: Need to compare between two lines.
4551 int r = match_with_backref(rex.reg_startpos[no].lnum,
4552 rex.reg_startpos[no].col,
4553 rex.reg_endpos[no].lnum,
4554 rex.reg_endpos[no].col,
4555 &len);
4556 if (r != RA_MATCH) {
4557 status = r;
4558 }
4559 }
4560 }
4561 }
4562
4563 // Matched the backref, skip over it.
4564 rex.input += len;
4565 }
4566 break;
4567
4568 case ZREF + 1:
4569 case ZREF + 2:
4570 case ZREF + 3:
4571 case ZREF + 4:
4572 case ZREF + 5:
4573 case ZREF + 6:
4574 case ZREF + 7:
4575 case ZREF + 8:
4576 case ZREF + 9:
4577 {
4578 cleanup_zsubexpr();
4579 no = op - ZREF;
4580 if (re_extmatch_in != NULL
4581 && re_extmatch_in->matches[no] != NULL) {
4582 int len = (int)STRLEN(re_extmatch_in->matches[no]);
4583 if (cstrncmp(re_extmatch_in->matches[no], rex.input, &len) != 0) {
4584 status = RA_NOMATCH;
4585 } else {
4586 rex.input += len;
4587 }
4588 } else {
4589 // Backref was not set: Match an empty string.
4590 }
4591 }
4592 break;
4593
4594 case BRANCH:
4595 {
4596 if (OP(next) != BRANCH) /* No choice. */
4597 next = OPERAND(scan); /* Avoid recursion. */
4598 else {
4599 rp = regstack_push(RS_BRANCH, scan);
4600 if (rp == NULL)
4601 status = RA_FAIL;
4602 else
4603 status = RA_BREAK; /* rest is below */
4604 }
4605 }
4606 break;
4607
4608 case BRACE_LIMITS:
4609 {
4610 if (OP(next) == BRACE_SIMPLE) {
4611 bl_minval = OPERAND_MIN(scan);
4612 bl_maxval = OPERAND_MAX(scan);
4613 } else if (OP(next) >= BRACE_COMPLEX
4614 && OP(next) < BRACE_COMPLEX + 10) {
4615 no = OP(next) - BRACE_COMPLEX;
4616 brace_min[no] = OPERAND_MIN(scan);
4617 brace_max[no] = OPERAND_MAX(scan);
4618 brace_count[no] = 0;
4619 } else {
4620 internal_error("BRACE_LIMITS");
4621 status = RA_FAIL;
4622 }
4623 }
4624 break;
4625
4626 case BRACE_COMPLEX + 0:
4627 case BRACE_COMPLEX + 1:
4628 case BRACE_COMPLEX + 2:
4629 case BRACE_COMPLEX + 3:
4630 case BRACE_COMPLEX + 4:
4631 case BRACE_COMPLEX + 5:
4632 case BRACE_COMPLEX + 6:
4633 case BRACE_COMPLEX + 7:
4634 case BRACE_COMPLEX + 8:
4635 case BRACE_COMPLEX + 9:
4636 {
4637 no = op - BRACE_COMPLEX;
4638 ++brace_count[no];
4639
4640 /* If not matched enough times yet, try one more */
4641 if (brace_count[no] <= (brace_min[no] <= brace_max[no]
4642 ? brace_min[no] : brace_max[no])) {
4643 rp = regstack_push(RS_BRCPLX_MORE, scan);
4644 if (rp == NULL)
4645 status = RA_FAIL;
4646 else {
4647 rp->rs_no = no;
4648 reg_save(&rp->rs_un.regsave, &backpos);
4649 next = OPERAND(scan);
4650 /* We continue and handle the result when done. */
4651 }
4652 break;
4653 }
4654
4655 /* If matched enough times, may try matching some more */
4656 if (brace_min[no] <= brace_max[no]) {
4657 /* Range is the normal way around, use longest match */
4658 if (brace_count[no] <= brace_max[no]) {
4659 rp = regstack_push(RS_BRCPLX_LONG, scan);
4660 if (rp == NULL)
4661 status = RA_FAIL;
4662 else {
4663 rp->rs_no = no;
4664 reg_save(&rp->rs_un.regsave, &backpos);
4665 next = OPERAND(scan);
4666 /* We continue and handle the result when done. */
4667 }
4668 }
4669 } else {
4670 /* Range is backwards, use shortest match first */
4671 if (brace_count[no] <= brace_min[no]) {
4672 rp = regstack_push(RS_BRCPLX_SHORT, scan);
4673 if (rp == NULL)
4674 status = RA_FAIL;
4675 else {
4676 reg_save(&rp->rs_un.regsave, &backpos);
4677 /* We continue and handle the result when done. */
4678 }
4679 }
4680 }
4681 }
4682 break;
4683
4684 case BRACE_SIMPLE:
4685 case STAR:
4686 case PLUS:
4687 {
4688 regstar_T rst;
4689
4690 /*
4691 * Lookahead to avoid useless match attempts when we know
4692 * what character comes next.
4693 */
4694 if (OP(next) == EXACTLY) {
4695 rst.nextb = *OPERAND(next);
4696 if (rex.reg_ic) {
4697 if (mb_isupper(rst.nextb)) {
4698 rst.nextb_ic = mb_tolower(rst.nextb);
4699 } else {
4700 rst.nextb_ic = mb_toupper(rst.nextb);
4701 }
4702 } else {
4703 rst.nextb_ic = rst.nextb;
4704 }
4705 } else {
4706 rst.nextb = NUL;
4707 rst.nextb_ic = NUL;
4708 }
4709 if (op != BRACE_SIMPLE) {
4710 rst.minval = (op == STAR) ? 0 : 1;
4711 rst.maxval = MAX_LIMIT;
4712 } else {
4713 rst.minval = bl_minval;
4714 rst.maxval = bl_maxval;
4715 }
4716
4717 /*
4718 * When maxval > minval, try matching as much as possible, up
4719 * to maxval. When maxval < minval, try matching at least the
4720 * minimal number (since the range is backwards, that's also
4721 * maxval!).
4722 */
4723 rst.count = regrepeat(OPERAND(scan), rst.maxval);
4724 if (got_int) {
4725 status = RA_FAIL;
4726 break;
4727 }
4728 if (rst.minval <= rst.maxval
4729 ? rst.count >= rst.minval : rst.count >= rst.maxval) {
4730 /* It could match. Prepare for trying to match what
4731 * follows. The code is below. Parameters are stored in
4732 * a regstar_T on the regstack. */
4733 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp) {
4734 emsg(_(e_maxmempat));
4735 status = RA_FAIL;
4736 } else {
4737 ga_grow(®stack, sizeof(regstar_T));
4738 regstack.ga_len += sizeof(regstar_T);
4739 rp = regstack_push(rst.minval <= rst.maxval
4740 ? RS_STAR_LONG : RS_STAR_SHORT, scan);
4741 if (rp == NULL)
4742 status = RA_FAIL;
4743 else {
4744 *(((regstar_T *)rp) - 1) = rst;
4745 status = RA_BREAK; /* skip the restore bits */
4746 }
4747 }
4748 } else
4749 status = RA_NOMATCH;
4750
4751 }
4752 break;
4753
4754 case NOMATCH:
4755 case MATCH:
4756 case SUBPAT:
4757 rp = regstack_push(RS_NOMATCH, scan);
4758 if (rp == NULL)
4759 status = RA_FAIL;
4760 else {
4761 rp->rs_no = op;
4762 reg_save(&rp->rs_un.regsave, &backpos);
4763 next = OPERAND(scan);
4764 /* We continue and handle the result when done. */
4765 }
4766 break;
4767
4768 case BEHIND:
4769 case NOBEHIND:
4770 /* Need a bit of room to store extra positions. */
4771 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp) {
4772 emsg(_(e_maxmempat));
4773 status = RA_FAIL;
4774 } else {
4775 ga_grow(®stack, sizeof(regbehind_T));
4776 regstack.ga_len += sizeof(regbehind_T);
4777 rp = regstack_push(RS_BEHIND1, scan);
4778 if (rp == NULL)
4779 status = RA_FAIL;
4780 else {
4781 /* Need to save the subexpr to be able to restore them
4782 * when there is a match but we don't use it. */
4783 save_subexpr(((regbehind_T *)rp) - 1);
4784
4785 rp->rs_no = op;
4786 reg_save(&rp->rs_un.regsave, &backpos);
4787 /* First try if what follows matches. If it does then we
4788 * check the behind match by looping. */
4789 }
4790 }
4791 break;
4792
4793 case BHPOS:
4794 if (REG_MULTI) {
4795 if (behind_pos.rs_u.pos.col != (colnr_T)(rex.input - rex.line)
4796 || behind_pos.rs_u.pos.lnum != rex.lnum) {
4797 status = RA_NOMATCH;
4798 }
4799 } else if (behind_pos.rs_u.ptr != rex.input) {
4800 status = RA_NOMATCH;
4801 }
4802 break;
4803
4804 case NEWL:
4805 if ((c != NUL || !REG_MULTI || rex.lnum > rex.reg_maxline
4806 || rex.reg_line_lbr) && (c != '\n' || !rex.reg_line_lbr)) {
4807 status = RA_NOMATCH;
4808 } else if (rex.reg_line_lbr) {
4809 ADVANCE_REGINPUT();
4810 } else {
4811 reg_nextline();
4812 }
4813 break;
4814
4815 case END:
4816 status = RA_MATCH; /* Success! */
4817 break;
4818
4819 default:
4820 iemsg(_(e_re_corr));
4821 #ifdef REGEXP_DEBUG
4822 printf("Illegal op code %d\n", op);
4823 #endif
4824 status = RA_FAIL;
4825 break;
4826 }
4827 }
4828
4829 /* If we can't continue sequentially, break the inner loop. */
4830 if (status != RA_CONT)
4831 break;
4832
4833 /* Continue in inner loop, advance to next item. */
4834 scan = next;
4835
4836 } /* end of inner loop */
4837
4838 /*
4839 * If there is something on the regstack execute the code for the state.
4840 * If the state is popped then loop and use the older state.
4841 */
4842 while (!GA_EMPTY(®stack) && status != RA_FAIL) {
4843 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
4844 switch (rp->rs_state) {
4845 case RS_NOPEN:
4846 /* Result is passed on as-is, simply pop the state. */
4847 regstack_pop(&scan);
4848 break;
4849
4850 case RS_MOPEN:
4851 // Pop the state. Restore pointers when there is no match.
4852 if (status == RA_NOMATCH) {
4853 restore_se(&rp->rs_un.sesave, &rex.reg_startpos[rp->rs_no],
4854 &rex.reg_startp[rp->rs_no]);
4855 }
4856 regstack_pop(&scan);
4857 break;
4858
4859 case RS_ZOPEN:
4860 /* Pop the state. Restore pointers when there is no match. */
4861 if (status == RA_NOMATCH)
4862 restore_se(&rp->rs_un.sesave, ®_startzpos[rp->rs_no],
4863 ®_startzp[rp->rs_no]);
4864 regstack_pop(&scan);
4865 break;
4866
4867 case RS_MCLOSE:
4868 // Pop the state. Restore pointers when there is no match.
4869 if (status == RA_NOMATCH) {
4870 restore_se(&rp->rs_un.sesave, &rex.reg_endpos[rp->rs_no],
4871 &rex.reg_endp[rp->rs_no]);
4872 }
4873 regstack_pop(&scan);
4874 break;
4875
4876 case RS_ZCLOSE:
4877 /* Pop the state. Restore pointers when there is no match. */
4878 if (status == RA_NOMATCH)
4879 restore_se(&rp->rs_un.sesave, ®_endzpos[rp->rs_no],
4880 ®_endzp[rp->rs_no]);
4881 regstack_pop(&scan);
4882 break;
4883
4884 case RS_BRANCH:
4885 if (status == RA_MATCH)
4886 /* this branch matched, use it */
4887 regstack_pop(&scan);
4888 else {
4889 if (status != RA_BREAK) {
4890 /* After a non-matching branch: try next one. */
4891 reg_restore(&rp->rs_un.regsave, &backpos);
4892 scan = rp->rs_scan;
4893 }
4894 if (scan == NULL || OP(scan) != BRANCH) {
4895 /* no more branches, didn't find a match */
4896 status = RA_NOMATCH;
4897 regstack_pop(&scan);
4898 } else {
4899 /* Prepare to try a branch. */
4900 rp->rs_scan = regnext(scan);
4901 reg_save(&rp->rs_un.regsave, &backpos);
4902 scan = OPERAND(scan);
4903 }
4904 }
4905 break;
4906
4907 case RS_BRCPLX_MORE:
4908 /* Pop the state. Restore pointers when there is no match. */
4909 if (status == RA_NOMATCH) {
4910 reg_restore(&rp->rs_un.regsave, &backpos);
4911 --brace_count[rp->rs_no]; /* decrement match count */
4912 }
4913 regstack_pop(&scan);
4914 break;
4915
4916 case RS_BRCPLX_LONG:
4917 /* Pop the state. Restore pointers when there is no match. */
4918 if (status == RA_NOMATCH) {
4919 /* There was no match, but we did find enough matches. */
4920 reg_restore(&rp->rs_un.regsave, &backpos);
4921 --brace_count[rp->rs_no];
4922 /* continue with the items after "\{}" */
4923 status = RA_CONT;
4924 }
4925 regstack_pop(&scan);
4926 if (status == RA_CONT)
4927 scan = regnext(scan);
4928 break;
4929
4930 case RS_BRCPLX_SHORT:
4931 /* Pop the state. Restore pointers when there is no match. */
4932 if (status == RA_NOMATCH)
4933 /* There was no match, try to match one more item. */
4934 reg_restore(&rp->rs_un.regsave, &backpos);
4935 regstack_pop(&scan);
4936 if (status == RA_NOMATCH) {
4937 scan = OPERAND(scan);
4938 status = RA_CONT;
4939 }
4940 break;
4941
4942 case RS_NOMATCH:
4943 /* Pop the state. If the operand matches for NOMATCH or
4944 * doesn't match for MATCH/SUBPAT, we fail. Otherwise backup,
4945 * except for SUBPAT, and continue with the next item. */
4946 if (status == (rp->rs_no == NOMATCH ? RA_MATCH : RA_NOMATCH))
4947 status = RA_NOMATCH;
4948 else {
4949 status = RA_CONT;
4950 if (rp->rs_no != SUBPAT) /* zero-width */
4951 reg_restore(&rp->rs_un.regsave, &backpos);
4952 }
4953 regstack_pop(&scan);
4954 if (status == RA_CONT)
4955 scan = regnext(scan);
4956 break;
4957
4958 case RS_BEHIND1:
4959 if (status == RA_NOMATCH) {
4960 regstack_pop(&scan);
4961 regstack.ga_len -= sizeof(regbehind_T);
4962 } else {
4963 /* The stuff after BEHIND/NOBEHIND matches. Now try if
4964 * the behind part does (not) match before the current
4965 * position in the input. This must be done at every
4966 * position in the input and checking if the match ends at
4967 * the current position. */
4968
4969 /* save the position after the found match for next */
4970 reg_save(&(((regbehind_T *)rp) - 1)->save_after, &backpos);
4971
4972 /* Start looking for a match with operand at the current
4973 * position. Go back one character until we find the
4974 * result, hitting the start of the line or the previous
4975 * line (for multi-line matching).
4976 * Set behind_pos to where the match should end, BHPOS
4977 * will match it. Save the current value. */
4978 (((regbehind_T *)rp) - 1)->save_behind = behind_pos;
4979 behind_pos = rp->rs_un.regsave;
4980
4981 rp->rs_state = RS_BEHIND2;
4982
4983 reg_restore(&rp->rs_un.regsave, &backpos);
4984 scan = OPERAND(rp->rs_scan) + 4;
4985 }
4986 break;
4987
4988 case RS_BEHIND2:
4989 /*
4990 * Looping for BEHIND / NOBEHIND match.
4991 */
4992 if (status == RA_MATCH && reg_save_equal(&behind_pos)) {
4993 /* found a match that ends where "next" started */
4994 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
4995 if (rp->rs_no == BEHIND)
4996 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
4997 &backpos);
4998 else {
4999 /* But we didn't want a match. Need to restore the
5000 * subexpr, because what follows matched, so they have
5001 * been set. */
5002 status = RA_NOMATCH;
5003 restore_subexpr(((regbehind_T *)rp) - 1);
5004 }
5005 regstack_pop(&scan);
5006 regstack.ga_len -= sizeof(regbehind_T);
5007 } else {
5008 long limit;
5009
5010 /* No match or a match that doesn't end where we want it: Go
5011 * back one character. May go to previous line once. */
5012 no = OK;
5013 limit = OPERAND_MIN(rp->rs_scan);
5014 if (REG_MULTI) {
5015 if (limit > 0
5016 && ((rp->rs_un.regsave.rs_u.pos.lnum
5017 < behind_pos.rs_u.pos.lnum
5018 ? (colnr_T)STRLEN(rex.line)
5019 : behind_pos.rs_u.pos.col)
5020 - rp->rs_un.regsave.rs_u.pos.col >= limit))
5021 no = FAIL;
5022 else if (rp->rs_un.regsave.rs_u.pos.col == 0) {
5023 if (rp->rs_un.regsave.rs_u.pos.lnum
5024 < behind_pos.rs_u.pos.lnum
5025 || reg_getline(
5026 --rp->rs_un.regsave.rs_u.pos.lnum)
5027 == NULL)
5028 no = FAIL;
5029 else {
5030 reg_restore(&rp->rs_un.regsave, &backpos);
5031 rp->rs_un.regsave.rs_u.pos.col =
5032 (colnr_T)STRLEN(rex.line);
5033 }
5034 } else {
5035 const char_u *const line =
5036 reg_getline(rp->rs_un.regsave.rs_u.pos.lnum);
5037
5038 rp->rs_un.regsave.rs_u.pos.col -=
5039 utf_head_off(line,
5040 line + rp->rs_un.regsave.rs_u.pos.col - 1)
5041 + 1;
5042 }
5043 } else {
5044 if (rp->rs_un.regsave.rs_u.ptr == rex.line) {
5045 no = FAIL;
5046 } else {
5047 MB_PTR_BACK(rex.line, rp->rs_un.regsave.rs_u.ptr);
5048 if (limit > 0
5049 && (long)(behind_pos.rs_u.ptr
5050 - rp->rs_un.regsave.rs_u.ptr) > limit) {
5051 no = FAIL;
5052 }
5053 }
5054 }
5055 if (no == OK) {
5056 /* Advanced, prepare for finding match again. */
5057 reg_restore(&rp->rs_un.regsave, &backpos);
5058 scan = OPERAND(rp->rs_scan) + 4;
5059 if (status == RA_MATCH) {
5060 /* We did match, so subexpr may have been changed,
5061 * need to restore them for the next try. */
5062 status = RA_NOMATCH;
5063 restore_subexpr(((regbehind_T *)rp) - 1);
5064 }
5065 } else {
5066 /* Can't advance. For NOBEHIND that's a match. */
5067 behind_pos = (((regbehind_T *)rp) - 1)->save_behind;
5068 if (rp->rs_no == NOBEHIND) {
5069 reg_restore(&(((regbehind_T *)rp) - 1)->save_after,
5070 &backpos);
5071 status = RA_MATCH;
5072 } else {
5073 /* We do want a proper match. Need to restore the
5074 * subexpr if we had a match, because they may have
5075 * been set. */
5076 if (status == RA_MATCH) {
5077 status = RA_NOMATCH;
5078 restore_subexpr(((regbehind_T *)rp) - 1);
5079 }
5080 }
5081 regstack_pop(&scan);
5082 regstack.ga_len -= sizeof(regbehind_T);
5083 }
5084 }
5085 break;
5086
5087 case RS_STAR_LONG:
5088 case RS_STAR_SHORT:
5089 {
5090 regstar_T *rst = ((regstar_T *)rp) - 1;
5091
5092 if (status == RA_MATCH) {
5093 regstack_pop(&scan);
5094 regstack.ga_len -= sizeof(regstar_T);
5095 break;
5096 }
5097
5098 /* Tried once already, restore input pointers. */
5099 if (status != RA_BREAK)
5100 reg_restore(&rp->rs_un.regsave, &backpos);
5101
5102 /* Repeat until we found a position where it could match. */
5103 for (;; ) {
5104 if (status != RA_BREAK) {
5105 /* Tried first position already, advance. */
5106 if (rp->rs_state == RS_STAR_LONG) {
5107 /* Trying for longest match, but couldn't or
5108 * didn't match -- back up one char. */
5109 if (--rst->count < rst->minval)
5110 break;
5111 if (rex.input == rex.line) {
5112 // backup to last char of previous line
5113 rex.lnum--;
5114 rex.line = reg_getline(rex.lnum);
5115 // Just in case regrepeat() didn't count right.
5116 if (rex.line == NULL) {
5117 break;
5118 }
5119 rex.input = rex.line + STRLEN(rex.line);
5120 fast_breakcheck();
5121 } else {
5122 MB_PTR_BACK(rex.line, rex.input);
5123 }
5124 } else {
5125 /* Range is backwards, use shortest match first.
5126 * Careful: maxval and minval are exchanged!
5127 * Couldn't or didn't match: try advancing one
5128 * char. */
5129 if (rst->count == rst->minval
5130 || regrepeat(OPERAND(rp->rs_scan), 1L) == 0)
5131 break;
5132 ++rst->count;
5133 }
5134 if (got_int)
5135 break;
5136 } else
5137 status = RA_NOMATCH;
5138
5139 // If it could match, try it.
5140 if (rst->nextb == NUL || *rex.input == rst->nextb
5141 || *rex.input == rst->nextb_ic) {
5142 reg_save(&rp->rs_un.regsave, &backpos);
5143 scan = regnext(rp->rs_scan);
5144 status = RA_CONT;
5145 break;
5146 }
5147 }
5148 if (status != RA_CONT) {
5149 /* Failed. */
5150 regstack_pop(&scan);
5151 regstack.ga_len -= sizeof(regstar_T);
5152 status = RA_NOMATCH;
5153 }
5154 }
5155 break;
5156 }
5157
5158 /* If we want to continue the inner loop or didn't pop a state
5159 * continue matching loop */
5160 if (status == RA_CONT || rp == (regitem_T *)
5161 ((char *)regstack.ga_data + regstack.ga_len) - 1)
5162 break;
5163 }
5164
5165 /* May need to continue with the inner loop, starting at "scan". */
5166 if (status == RA_CONT)
5167 continue;
5168
5169 /*
5170 * If the regstack is empty or something failed we are done.
5171 */
5172 if (GA_EMPTY(®stack) || status == RA_FAIL) {
5173 if (scan == NULL) {
5174 /*
5175 * We get here only if there's trouble -- normally "case END" is
5176 * the terminating point.
5177 */
5178 iemsg(_(e_re_corr));
5179 #ifdef REGEXP_DEBUG
5180 printf("Premature EOL\n");
5181 #endif
5182 }
5183 return status == RA_MATCH;
5184 }
5185
5186 } /* End of loop until the regstack is empty. */
5187
5188 /* NOTREACHED */
5189 }
5190
5191 /*
5192 * Push an item onto the regstack.
5193 * Returns pointer to new item. Returns NULL when out of memory.
5194 */
regstack_push(regstate_T state,char_u * scan)5195 static regitem_T *regstack_push(regstate_T state, char_u *scan)
5196 {
5197 regitem_T *rp;
5198
5199 if ((long)((unsigned)regstack.ga_len >> 10) >= p_mmp) {
5200 emsg(_(e_maxmempat));
5201 return NULL;
5202 }
5203 ga_grow(®stack, sizeof(regitem_T));
5204
5205 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len);
5206 rp->rs_state = state;
5207 rp->rs_scan = scan;
5208
5209 regstack.ga_len += sizeof(regitem_T);
5210 return rp;
5211 }
5212
5213 /*
5214 * Pop an item from the regstack.
5215 */
regstack_pop(char_u ** scan)5216 static void regstack_pop(char_u **scan)
5217 {
5218 regitem_T *rp;
5219
5220 rp = (regitem_T *)((char *)regstack.ga_data + regstack.ga_len) - 1;
5221 *scan = rp->rs_scan;
5222
5223 regstack.ga_len -= sizeof(regitem_T);
5224 }
5225
5226 /*
5227 * regrepeat - repeatedly match something simple, return how many.
5228 * Advances rex.input (and rex.lnum) to just after the matched chars.
5229 */
5230 static int
regrepeat(char_u * p,long maxcount)5231 regrepeat (
5232 char_u *p,
5233 long maxcount /* maximum number of matches allowed */
5234 )
5235 {
5236 long count = 0;
5237 char_u *opnd;
5238 int mask;
5239 int testval = 0;
5240
5241 char_u *scan = rex.input; // Make local copy of rex.input for speed.
5242 opnd = OPERAND(p);
5243 switch (OP(p)) {
5244 case ANY:
5245 case ANY + ADD_NL:
5246 while (count < maxcount) {
5247 /* Matching anything means we continue until end-of-line (or
5248 * end-of-file for ANY + ADD_NL), only limited by maxcount. */
5249 while (*scan != NUL && count < maxcount) {
5250 count++;
5251 MB_PTR_ADV(scan);
5252 }
5253 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
5254 || rex.reg_line_lbr || count == maxcount) {
5255 break;
5256 }
5257 count++; // count the line-break
5258 reg_nextline();
5259 scan = rex.input;
5260 if (got_int) {
5261 break;
5262 }
5263 }
5264 break;
5265
5266 case IDENT:
5267 case IDENT + ADD_NL:
5268 testval = 1;
5269 FALLTHROUGH;
5270 case SIDENT:
5271 case SIDENT + ADD_NL:
5272 while (count < maxcount) {
5273 if (vim_isIDc(utf_ptr2char(scan)) && (testval || !ascii_isdigit(*scan))) {
5274 MB_PTR_ADV(scan);
5275 } else if (*scan == NUL) {
5276 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
5277 || rex.reg_line_lbr) {
5278 break;
5279 }
5280 reg_nextline();
5281 scan = rex.input;
5282 if (got_int) {
5283 break;
5284 }
5285 } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
5286 scan++;
5287 } else {
5288 break;
5289 }
5290 ++count;
5291 }
5292 break;
5293
5294 case KWORD:
5295 case KWORD + ADD_NL:
5296 testval = 1;
5297 FALLTHROUGH;
5298 case SKWORD:
5299 case SKWORD + ADD_NL:
5300 while (count < maxcount) {
5301 if (vim_iswordp_buf(scan, rex.reg_buf)
5302 && (testval || !ascii_isdigit(*scan))) {
5303 MB_PTR_ADV(scan);
5304 } else if (*scan == NUL) {
5305 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
5306 || rex.reg_line_lbr) {
5307 break;
5308 }
5309 reg_nextline();
5310 scan = rex.input;
5311 if (got_int) {
5312 break;
5313 }
5314 } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
5315 scan++;
5316 } else {
5317 break;
5318 }
5319 count++;
5320 }
5321 break;
5322
5323 case FNAME:
5324 case FNAME + ADD_NL:
5325 testval = 1;
5326 FALLTHROUGH;
5327 case SFNAME:
5328 case SFNAME + ADD_NL:
5329 while (count < maxcount) {
5330 if (vim_isfilec(utf_ptr2char(scan)) && (testval || !ascii_isdigit(*scan))) {
5331 MB_PTR_ADV(scan);
5332 } else if (*scan == NUL) {
5333 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
5334 || rex.reg_line_lbr) {
5335 break;
5336 }
5337 reg_nextline();
5338 scan = rex.input;
5339 if (got_int) {
5340 break;
5341 }
5342 } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
5343 scan++;
5344 } else {
5345 break;
5346 }
5347 count++;
5348 }
5349 break;
5350
5351 case PRINT:
5352 case PRINT + ADD_NL:
5353 testval = 1;
5354 FALLTHROUGH;
5355 case SPRINT:
5356 case SPRINT + ADD_NL:
5357 while (count < maxcount) {
5358 if (*scan == NUL) {
5359 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
5360 || rex.reg_line_lbr) {
5361 break;
5362 }
5363 reg_nextline();
5364 scan = rex.input;
5365 if (got_int) {
5366 break;
5367 }
5368 } else if (vim_isprintc(utf_ptr2char(scan)) == 1
5369 && (testval || !ascii_isdigit(*scan))) {
5370 MB_PTR_ADV(scan);
5371 } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
5372 scan++;
5373 } else {
5374 break;
5375 }
5376 count++;
5377 }
5378 break;
5379
5380 case WHITE:
5381 case WHITE + ADD_NL:
5382 testval = mask = RI_WHITE;
5383 do_class:
5384 while (count < maxcount) {
5385 int l;
5386 if (*scan == NUL) {
5387 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
5388 || rex.reg_line_lbr) {
5389 break;
5390 }
5391 reg_nextline();
5392 scan = rex.input;
5393 if (got_int) {
5394 break;
5395 }
5396 } else if ((l = utfc_ptr2len(scan)) > 1) {
5397 if (testval != 0) {
5398 break;
5399 }
5400 scan += l;
5401 } else if ((class_tab[*scan] & mask) == testval) {
5402 scan++;
5403 } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
5404 scan++;
5405 } else {
5406 break;
5407 }
5408 ++count;
5409 }
5410 break;
5411
5412 case NWHITE:
5413 case NWHITE + ADD_NL:
5414 mask = RI_WHITE;
5415 goto do_class;
5416 case DIGIT:
5417 case DIGIT + ADD_NL:
5418 testval = mask = RI_DIGIT;
5419 goto do_class;
5420 case NDIGIT:
5421 case NDIGIT + ADD_NL:
5422 mask = RI_DIGIT;
5423 goto do_class;
5424 case HEX:
5425 case HEX + ADD_NL:
5426 testval = mask = RI_HEX;
5427 goto do_class;
5428 case NHEX:
5429 case NHEX + ADD_NL:
5430 mask = RI_HEX;
5431 goto do_class;
5432 case OCTAL:
5433 case OCTAL + ADD_NL:
5434 testval = mask = RI_OCTAL;
5435 goto do_class;
5436 case NOCTAL:
5437 case NOCTAL + ADD_NL:
5438 mask = RI_OCTAL;
5439 goto do_class;
5440 case WORD:
5441 case WORD + ADD_NL:
5442 testval = mask = RI_WORD;
5443 goto do_class;
5444 case NWORD:
5445 case NWORD + ADD_NL:
5446 mask = RI_WORD;
5447 goto do_class;
5448 case HEAD:
5449 case HEAD + ADD_NL:
5450 testval = mask = RI_HEAD;
5451 goto do_class;
5452 case NHEAD:
5453 case NHEAD + ADD_NL:
5454 mask = RI_HEAD;
5455 goto do_class;
5456 case ALPHA:
5457 case ALPHA + ADD_NL:
5458 testval = mask = RI_ALPHA;
5459 goto do_class;
5460 case NALPHA:
5461 case NALPHA + ADD_NL:
5462 mask = RI_ALPHA;
5463 goto do_class;
5464 case LOWER:
5465 case LOWER + ADD_NL:
5466 testval = mask = RI_LOWER;
5467 goto do_class;
5468 case NLOWER:
5469 case NLOWER + ADD_NL:
5470 mask = RI_LOWER;
5471 goto do_class;
5472 case UPPER:
5473 case UPPER + ADD_NL:
5474 testval = mask = RI_UPPER;
5475 goto do_class;
5476 case NUPPER:
5477 case NUPPER + ADD_NL:
5478 mask = RI_UPPER;
5479 goto do_class;
5480
5481 case EXACTLY:
5482 {
5483 int cu, cl;
5484
5485 // This doesn't do a multi-byte character, because a MULTIBYTECODE
5486 // would have been used for it. It does handle single-byte
5487 // characters, such as latin1.
5488 if (rex.reg_ic) {
5489 cu = mb_toupper(*opnd);
5490 cl = mb_tolower(*opnd);
5491 while (count < maxcount && (*scan == cu || *scan == cl)) {
5492 count++;
5493 scan++;
5494 }
5495 } else {
5496 cu = *opnd;
5497 while (count < maxcount && *scan == cu) {
5498 count++;
5499 scan++;
5500 }
5501 }
5502 break;
5503 }
5504
5505 case MULTIBYTECODE:
5506 {
5507 int i, len, cf = 0;
5508
5509 /* Safety check (just in case 'encoding' was changed since
5510 * compiling the program). */
5511 if ((len = utfc_ptr2len(opnd)) > 1) {
5512 if (rex.reg_ic) {
5513 cf = utf_fold(utf_ptr2char(opnd));
5514 }
5515 while (count < maxcount && utfc_ptr2len(scan) >= len) {
5516 for (i = 0; i < len; i++) {
5517 if (opnd[i] != scan[i]) {
5518 break;
5519 }
5520 }
5521 if (i < len && (!rex.reg_ic
5522 || utf_fold(utf_ptr2char(scan)) != cf)) {
5523 break;
5524 }
5525 scan += len;
5526 ++count;
5527 }
5528 }
5529 }
5530 break;
5531
5532 case ANYOF:
5533 case ANYOF + ADD_NL:
5534 testval = 1;
5535 FALLTHROUGH;
5536
5537 case ANYBUT:
5538 case ANYBUT + ADD_NL:
5539 while (count < maxcount) {
5540 int len;
5541 if (*scan == NUL) {
5542 if (!REG_MULTI || !WITH_NL(OP(p)) || rex.lnum > rex.reg_maxline
5543 || rex.reg_line_lbr) {
5544 break;
5545 }
5546 reg_nextline();
5547 scan = rex.input;
5548 if (got_int) {
5549 break;
5550 }
5551 } else if (rex.reg_line_lbr && *scan == '\n' && WITH_NL(OP(p))) {
5552 scan++;
5553 } else if ((len = utfc_ptr2len(scan)) > 1) {
5554 if ((cstrchr(opnd, utf_ptr2char(scan)) == NULL) == testval) {
5555 break;
5556 }
5557 scan += len;
5558 } else {
5559 if ((cstrchr(opnd, *scan) == NULL) == testval)
5560 break;
5561 ++scan;
5562 }
5563 ++count;
5564 }
5565 break;
5566
5567 case NEWL:
5568 while (count < maxcount
5569 && ((*scan == NUL && rex.lnum <= rex.reg_maxline && !rex.reg_line_lbr
5570 && REG_MULTI) || (*scan == '\n' && rex.reg_line_lbr))) {
5571 count++;
5572 if (rex.reg_line_lbr) {
5573 ADVANCE_REGINPUT();
5574 } else {
5575 reg_nextline();
5576 }
5577 scan = rex.input;
5578 if (got_int) {
5579 break;
5580 }
5581 }
5582 break;
5583
5584 default: // Oh dear. Called inappropriately.
5585 iemsg(_(e_re_corr));
5586 #ifdef REGEXP_DEBUG
5587 printf("Called regrepeat with op code %d\n", OP(p));
5588 #endif
5589 break;
5590 }
5591
5592 rex.input = scan;
5593
5594 return (int)count;
5595 }
5596
5597 /*
5598 * regnext - dig the "next" pointer out of a node
5599 * Returns NULL when calculating size, when there is no next item and when
5600 * there is an error.
5601 */
regnext(char_u * p)5602 static char_u *regnext(char_u *p)
5603 FUNC_ATTR_NONNULL_ALL
5604 {
5605 int offset;
5606
5607 if (p == JUST_CALC_SIZE || reg_toolong)
5608 return NULL;
5609
5610 offset = NEXT(p);
5611 if (offset == 0)
5612 return NULL;
5613
5614 if (OP(p) == BACK)
5615 return p - offset;
5616 else
5617 return p + offset;
5618 }
5619
5620 /*
5621 * Check the regexp program for its magic number.
5622 * Return true if it's wrong.
5623 */
prog_magic_wrong(void)5624 static int prog_magic_wrong(void)
5625 {
5626 regprog_T *prog;
5627
5628 prog = REG_MULTI ? rex.reg_mmatch->regprog : rex.reg_match->regprog;
5629 if (prog->engine == &nfa_regengine) {
5630 // For NFA matcher we don't check the magic
5631 return false;
5632 }
5633
5634 if (UCHARAT(((bt_regprog_T *)prog)->program) != REGMAGIC) {
5635 emsg(_(e_re_corr));
5636 return true;
5637 }
5638 return false;
5639 }
5640
5641 /*
5642 * Cleanup the subexpressions, if this wasn't done yet.
5643 * This construction is used to clear the subexpressions only when they are
5644 * used (to increase speed).
5645 */
cleanup_subexpr(void)5646 static void cleanup_subexpr(void)
5647 {
5648 if (rex.need_clear_subexpr) {
5649 if (REG_MULTI) {
5650 // Use 0xff to set lnum to -1
5651 memset(rex.reg_startpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5652 memset(rex.reg_endpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5653 } else {
5654 memset(rex.reg_startp, 0, sizeof(char_u *) * NSUBEXP);
5655 memset(rex.reg_endp, 0, sizeof(char_u *) * NSUBEXP);
5656 }
5657 rex.need_clear_subexpr = false;
5658 }
5659 }
5660
cleanup_zsubexpr(void)5661 static void cleanup_zsubexpr(void)
5662 {
5663 if (rex.need_clear_zsubexpr) {
5664 if (REG_MULTI) {
5665 /* Use 0xff to set lnum to -1 */
5666 memset(reg_startzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5667 memset(reg_endzpos, 0xff, sizeof(lpos_T) * NSUBEXP);
5668 } else {
5669 memset(reg_startzp, 0, sizeof(char_u *) * NSUBEXP);
5670 memset(reg_endzp, 0, sizeof(char_u *) * NSUBEXP);
5671 }
5672 rex.need_clear_zsubexpr = false;
5673 }
5674 }
5675
5676 // Save the current subexpr to "bp", so that they can be restored
5677 // later by restore_subexpr().
save_subexpr(regbehind_T * bp)5678 static void save_subexpr(regbehind_T *bp)
5679 FUNC_ATTR_NONNULL_ALL
5680 {
5681 // When "rex.need_clear_subexpr" is set we don't need to save the values, only
5682 // remember that this flag needs to be set again when restoring.
5683 bp->save_need_clear_subexpr = rex.need_clear_subexpr;
5684 if (!rex.need_clear_subexpr) {
5685 for (int i = 0; i < NSUBEXP; i++) {
5686 if (REG_MULTI) {
5687 bp->save_start[i].se_u.pos = rex.reg_startpos[i];
5688 bp->save_end[i].se_u.pos = rex.reg_endpos[i];
5689 } else {
5690 bp->save_start[i].se_u.ptr = rex.reg_startp[i];
5691 bp->save_end[i].se_u.ptr = rex.reg_endp[i];
5692 }
5693 }
5694 }
5695 }
5696
5697 // Restore the subexpr from "bp".
restore_subexpr(regbehind_T * bp)5698 static void restore_subexpr(regbehind_T *bp)
5699 FUNC_ATTR_NONNULL_ALL
5700 {
5701 // Only need to restore saved values when they are not to be cleared.
5702 rex.need_clear_subexpr = bp->save_need_clear_subexpr;
5703 if (!rex.need_clear_subexpr) {
5704 for (int i = 0; i < NSUBEXP; i++) {
5705 if (REG_MULTI) {
5706 rex.reg_startpos[i] = bp->save_start[i].se_u.pos;
5707 rex.reg_endpos[i] = bp->save_end[i].se_u.pos;
5708 } else {
5709 rex.reg_startp[i] = bp->save_start[i].se_u.ptr;
5710 rex.reg_endp[i] = bp->save_end[i].se_u.ptr;
5711 }
5712 }
5713 }
5714 }
5715
5716 // Advance rex.lnum, rex.line and rex.input to the next line.
reg_nextline(void)5717 static void reg_nextline(void)
5718 {
5719 rex.line = reg_getline(++rex.lnum);
5720 rex.input = rex.line;
5721 fast_breakcheck();
5722 }
5723
5724 // Save the input line and position in a regsave_T.
reg_save(regsave_T * save,garray_T * gap)5725 static void reg_save(regsave_T *save, garray_T *gap)
5726 FUNC_ATTR_NONNULL_ALL
5727 {
5728 if (REG_MULTI) {
5729 save->rs_u.pos.col = (colnr_T)(rex.input - rex.line);
5730 save->rs_u.pos.lnum = rex.lnum;
5731 } else {
5732 save->rs_u.ptr = rex.input;
5733 }
5734 save->rs_len = gap->ga_len;
5735 }
5736
5737 // Restore the input line and position from a regsave_T.
reg_restore(regsave_T * save,garray_T * gap)5738 static void reg_restore(regsave_T *save, garray_T *gap)
5739 FUNC_ATTR_NONNULL_ALL
5740 {
5741 if (REG_MULTI) {
5742 if (rex.lnum != save->rs_u.pos.lnum) {
5743 // only call reg_getline() when the line number changed to save
5744 // a bit of time
5745 rex.lnum = save->rs_u.pos.lnum;
5746 rex.line = reg_getline(rex.lnum);
5747 }
5748 rex.input = rex.line + save->rs_u.pos.col;
5749 } else {
5750 rex.input = save->rs_u.ptr;
5751 }
5752 gap->ga_len = save->rs_len;
5753 }
5754
5755 // Return true if current position is equal to saved position.
reg_save_equal(const regsave_T * save)5756 static bool reg_save_equal(const regsave_T *save)
5757 FUNC_ATTR_NONNULL_ALL
5758 {
5759 if (REG_MULTI) {
5760 return rex.lnum == save->rs_u.pos.lnum
5761 && rex.input == rex.line + save->rs_u.pos.col;
5762 }
5763 return rex.input == save->rs_u.ptr;
5764 }
5765
5766 /*
5767 * Tentatively set the sub-expression start to the current position (after
5768 * calling regmatch() they will have changed). Need to save the existing
5769 * values for when there is no match.
5770 * Use se_save() to use pointer (save_se_multi()) or position (save_se_one()),
5771 * depending on REG_MULTI.
5772 */
save_se_multi(save_se_T * savep,lpos_T * posp)5773 static void save_se_multi(save_se_T *savep, lpos_T *posp)
5774 {
5775 savep->se_u.pos = *posp;
5776 posp->lnum = rex.lnum;
5777 posp->col = (colnr_T)(rex.input - rex.line);
5778 }
5779
save_se_one(save_se_T * savep,char_u ** pp)5780 static void save_se_one(save_se_T *savep, char_u **pp)
5781 {
5782 savep->se_u.ptr = *pp;
5783 *pp = rex.input;
5784 }
5785
5786 /*
5787 * Compare a number with the operand of RE_LNUM, RE_COL or RE_VCOL.
5788 */
re_num_cmp(uint32_t val,char_u * scan)5789 static int re_num_cmp(uint32_t val, char_u *scan)
5790 {
5791 uint32_t n = (uint32_t)OPERAND_MIN(scan);
5792
5793 if (OPERAND_CMP(scan) == '>')
5794 return val > n;
5795 if (OPERAND_CMP(scan) == '<')
5796 return val < n;
5797 return val == n;
5798 }
5799
5800 /*
5801 * Check whether a backreference matches.
5802 * Returns RA_FAIL, RA_NOMATCH or RA_MATCH.
5803 * If "bytelen" is not NULL, it is set to the byte length of the match in the
5804 * last line.
5805 */
match_with_backref(linenr_T start_lnum,colnr_T start_col,linenr_T end_lnum,colnr_T end_col,int * bytelen)5806 static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T end_lnum, colnr_T end_col, int *bytelen)
5807 {
5808 linenr_T clnum = start_lnum;
5809 colnr_T ccol = start_col;
5810 int len;
5811 char_u *p;
5812
5813 if (bytelen != NULL)
5814 *bytelen = 0;
5815 for (;; ) {
5816 // Since getting one line may invalidate the other, need to make copy.
5817 // Slow!
5818 if (rex.line != reg_tofree) {
5819 len = (int)STRLEN(rex.line);
5820 if (reg_tofree == NULL || len >= (int)reg_tofreelen) {
5821 len += 50; /* get some extra */
5822 xfree(reg_tofree);
5823 reg_tofree = xmalloc(len);
5824 reg_tofreelen = len;
5825 }
5826 STRCPY(reg_tofree, rex.line);
5827 rex.input = reg_tofree + (rex.input - rex.line);
5828 rex.line = reg_tofree;
5829 }
5830
5831 /* Get the line to compare with. */
5832 p = reg_getline(clnum);
5833 assert(p);
5834
5835 if (clnum == end_lnum)
5836 len = end_col - ccol;
5837 else
5838 len = (int)STRLEN(p + ccol);
5839
5840 if (cstrncmp(p + ccol, rex.input, &len) != 0) {
5841 return RA_NOMATCH; // doesn't match
5842 }
5843 if (bytelen != NULL) {
5844 *bytelen += len;
5845 }
5846 if (clnum == end_lnum) {
5847 break; // match and at end!
5848 }
5849 if (rex.lnum >= rex.reg_maxline) {
5850 return RA_NOMATCH; // text too short
5851 }
5852
5853 /* Advance to next line. */
5854 reg_nextline();
5855 if (bytelen != NULL)
5856 *bytelen = 0;
5857 ++clnum;
5858 ccol = 0;
5859 if (got_int)
5860 return RA_FAIL;
5861 }
5862
5863 // found a match! Note that rex.line may now point to a copy of the line,
5864 // that should not matter.
5865 return RA_MATCH;
5866 }
5867
5868 #ifdef BT_REGEXP_DUMP
5869
5870 /*
5871 * regdump - dump a regexp onto stdout in vaguely comprehensible form
5872 */
regdump(char_u * pattern,bt_regprog_T * r)5873 static void regdump(char_u *pattern, bt_regprog_T *r)
5874 {
5875 char_u *s;
5876 int op = EXACTLY; /* Arbitrary non-END op. */
5877 char_u *next;
5878 char_u *end = NULL;
5879 FILE *f;
5880
5881 #ifdef BT_REGEXP_LOG
5882 f = fopen("bt_regexp_log.log", "a");
5883 #else
5884 f = stdout;
5885 #endif
5886 if (f == NULL)
5887 return;
5888 fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n",
5889 pattern);
5890
5891 s = r->program + 1;
5892 /*
5893 * Loop until we find the END that isn't before a referred next (an END
5894 * can also appear in a NOMATCH operand).
5895 */
5896 while (op != END || s <= end) {
5897 op = OP(s);
5898 fprintf(f, "%2d%s", (int)(s - r->program), regprop(s)); /* Where, what. */
5899 next = regnext(s);
5900 if (next == NULL) /* Next ptr. */
5901 fprintf(f, "(0)");
5902 else
5903 fprintf(f, "(%d)", (int)((s - r->program) + (next - s)));
5904 if (end < next)
5905 end = next;
5906 if (op == BRACE_LIMITS) {
5907 /* Two ints */
5908 fprintf(f, " minval %" PRId64 ", maxval %" PRId64,
5909 (int64_t)OPERAND_MIN(s), (int64_t)OPERAND_MAX(s));
5910 s += 8;
5911 } else if (op == BEHIND || op == NOBEHIND) {
5912 /* one int */
5913 fprintf(f, " count %" PRId64, (int64_t)OPERAND_MIN(s));
5914 s += 4;
5915 } else if (op == RE_LNUM || op == RE_COL || op == RE_VCOL) {
5916 // one int plus comparator
5917 fprintf(f, " count %" PRId64, (int64_t)OPERAND_MIN(s));
5918 s += 5;
5919 }
5920 s += 3;
5921 if (op == ANYOF || op == ANYOF + ADD_NL
5922 || op == ANYBUT || op == ANYBUT + ADD_NL
5923 || op == EXACTLY) {
5924 /* Literal string, where present. */
5925 fprintf(f, "\nxxxxxxxxx\n");
5926 while (*s != NUL)
5927 fprintf(f, "%c", *s++);
5928 fprintf(f, "\nxxxxxxxxx\n");
5929 s++;
5930 }
5931 fprintf(f, "\r\n");
5932 }
5933
5934 /* Header fields of interest. */
5935 if (r->regstart != NUL)
5936 fprintf(f, "start `%s' 0x%x; ", r->regstart < 256
5937 ? (char *)transchar(r->regstart)
5938 : "multibyte", r->regstart);
5939 if (r->reganch)
5940 fprintf(f, "anchored; ");
5941 if (r->regmust != NULL)
5942 fprintf(f, "must have \"%s\"", r->regmust);
5943 fprintf(f, "\r\n");
5944
5945 #ifdef BT_REGEXP_LOG
5946 fclose(f);
5947 #endif
5948 }
5949 #endif /* BT_REGEXP_DUMP */
5950
5951 #ifdef REGEXP_DEBUG
5952 /*
5953 * regprop - printable representation of opcode
5954 */
regprop(char_u * op)5955 static char_u *regprop(char_u *op)
5956 {
5957 char *p;
5958 static char buf[50];
5959
5960 STRCPY(buf, ":");
5961
5962 switch ((int) OP(op)) {
5963 case BOL:
5964 p = "BOL";
5965 break;
5966 case EOL:
5967 p = "EOL";
5968 break;
5969 case RE_BOF:
5970 p = "BOF";
5971 break;
5972 case RE_EOF:
5973 p = "EOF";
5974 break;
5975 case CURSOR:
5976 p = "CURSOR";
5977 break;
5978 case RE_VISUAL:
5979 p = "RE_VISUAL";
5980 break;
5981 case RE_LNUM:
5982 p = "RE_LNUM";
5983 break;
5984 case RE_MARK:
5985 p = "RE_MARK";
5986 break;
5987 case RE_COL:
5988 p = "RE_COL";
5989 break;
5990 case RE_VCOL:
5991 p = "RE_VCOL";
5992 break;
5993 case BOW:
5994 p = "BOW";
5995 break;
5996 case EOW:
5997 p = "EOW";
5998 break;
5999 case ANY:
6000 p = "ANY";
6001 break;
6002 case ANY + ADD_NL:
6003 p = "ANY+NL";
6004 break;
6005 case ANYOF:
6006 p = "ANYOF";
6007 break;
6008 case ANYOF + ADD_NL:
6009 p = "ANYOF+NL";
6010 break;
6011 case ANYBUT:
6012 p = "ANYBUT";
6013 break;
6014 case ANYBUT + ADD_NL:
6015 p = "ANYBUT+NL";
6016 break;
6017 case IDENT:
6018 p = "IDENT";
6019 break;
6020 case IDENT + ADD_NL:
6021 p = "IDENT+NL";
6022 break;
6023 case SIDENT:
6024 p = "SIDENT";
6025 break;
6026 case SIDENT + ADD_NL:
6027 p = "SIDENT+NL";
6028 break;
6029 case KWORD:
6030 p = "KWORD";
6031 break;
6032 case KWORD + ADD_NL:
6033 p = "KWORD+NL";
6034 break;
6035 case SKWORD:
6036 p = "SKWORD";
6037 break;
6038 case SKWORD + ADD_NL:
6039 p = "SKWORD+NL";
6040 break;
6041 case FNAME:
6042 p = "FNAME";
6043 break;
6044 case FNAME + ADD_NL:
6045 p = "FNAME+NL";
6046 break;
6047 case SFNAME:
6048 p = "SFNAME";
6049 break;
6050 case SFNAME + ADD_NL:
6051 p = "SFNAME+NL";
6052 break;
6053 case PRINT:
6054 p = "PRINT";
6055 break;
6056 case PRINT + ADD_NL:
6057 p = "PRINT+NL";
6058 break;
6059 case SPRINT:
6060 p = "SPRINT";
6061 break;
6062 case SPRINT + ADD_NL:
6063 p = "SPRINT+NL";
6064 break;
6065 case WHITE:
6066 p = "WHITE";
6067 break;
6068 case WHITE + ADD_NL:
6069 p = "WHITE+NL";
6070 break;
6071 case NWHITE:
6072 p = "NWHITE";
6073 break;
6074 case NWHITE + ADD_NL:
6075 p = "NWHITE+NL";
6076 break;
6077 case DIGIT:
6078 p = "DIGIT";
6079 break;
6080 case DIGIT + ADD_NL:
6081 p = "DIGIT+NL";
6082 break;
6083 case NDIGIT:
6084 p = "NDIGIT";
6085 break;
6086 case NDIGIT + ADD_NL:
6087 p = "NDIGIT+NL";
6088 break;
6089 case HEX:
6090 p = "HEX";
6091 break;
6092 case HEX + ADD_NL:
6093 p = "HEX+NL";
6094 break;
6095 case NHEX:
6096 p = "NHEX";
6097 break;
6098 case NHEX + ADD_NL:
6099 p = "NHEX+NL";
6100 break;
6101 case OCTAL:
6102 p = "OCTAL";
6103 break;
6104 case OCTAL + ADD_NL:
6105 p = "OCTAL+NL";
6106 break;
6107 case NOCTAL:
6108 p = "NOCTAL";
6109 break;
6110 case NOCTAL + ADD_NL:
6111 p = "NOCTAL+NL";
6112 break;
6113 case WORD:
6114 p = "WORD";
6115 break;
6116 case WORD + ADD_NL:
6117 p = "WORD+NL";
6118 break;
6119 case NWORD:
6120 p = "NWORD";
6121 break;
6122 case NWORD + ADD_NL:
6123 p = "NWORD+NL";
6124 break;
6125 case HEAD:
6126 p = "HEAD";
6127 break;
6128 case HEAD + ADD_NL:
6129 p = "HEAD+NL";
6130 break;
6131 case NHEAD:
6132 p = "NHEAD";
6133 break;
6134 case NHEAD + ADD_NL:
6135 p = "NHEAD+NL";
6136 break;
6137 case ALPHA:
6138 p = "ALPHA";
6139 break;
6140 case ALPHA + ADD_NL:
6141 p = "ALPHA+NL";
6142 break;
6143 case NALPHA:
6144 p = "NALPHA";
6145 break;
6146 case NALPHA + ADD_NL:
6147 p = "NALPHA+NL";
6148 break;
6149 case LOWER:
6150 p = "LOWER";
6151 break;
6152 case LOWER + ADD_NL:
6153 p = "LOWER+NL";
6154 break;
6155 case NLOWER:
6156 p = "NLOWER";
6157 break;
6158 case NLOWER + ADD_NL:
6159 p = "NLOWER+NL";
6160 break;
6161 case UPPER:
6162 p = "UPPER";
6163 break;
6164 case UPPER + ADD_NL:
6165 p = "UPPER+NL";
6166 break;
6167 case NUPPER:
6168 p = "NUPPER";
6169 break;
6170 case NUPPER + ADD_NL:
6171 p = "NUPPER+NL";
6172 break;
6173 case BRANCH:
6174 p = "BRANCH";
6175 break;
6176 case EXACTLY:
6177 p = "EXACTLY";
6178 break;
6179 case NOTHING:
6180 p = "NOTHING";
6181 break;
6182 case BACK:
6183 p = "BACK";
6184 break;
6185 case END:
6186 p = "END";
6187 break;
6188 case MOPEN + 0:
6189 p = "MATCH START";
6190 break;
6191 case MOPEN + 1:
6192 case MOPEN + 2:
6193 case MOPEN + 3:
6194 case MOPEN + 4:
6195 case MOPEN + 5:
6196 case MOPEN + 6:
6197 case MOPEN + 7:
6198 case MOPEN + 8:
6199 case MOPEN + 9:
6200 sprintf(buf + STRLEN(buf), "MOPEN%d", OP(op) - MOPEN);
6201 p = NULL;
6202 break;
6203 case MCLOSE + 0:
6204 p = "MATCH END";
6205 break;
6206 case MCLOSE + 1:
6207 case MCLOSE + 2:
6208 case MCLOSE + 3:
6209 case MCLOSE + 4:
6210 case MCLOSE + 5:
6211 case MCLOSE + 6:
6212 case MCLOSE + 7:
6213 case MCLOSE + 8:
6214 case MCLOSE + 9:
6215 sprintf(buf + STRLEN(buf), "MCLOSE%d", OP(op) - MCLOSE);
6216 p = NULL;
6217 break;
6218 case BACKREF + 1:
6219 case BACKREF + 2:
6220 case BACKREF + 3:
6221 case BACKREF + 4:
6222 case BACKREF + 5:
6223 case BACKREF + 6:
6224 case BACKREF + 7:
6225 case BACKREF + 8:
6226 case BACKREF + 9:
6227 sprintf(buf + STRLEN(buf), "BACKREF%d", OP(op) - BACKREF);
6228 p = NULL;
6229 break;
6230 case NOPEN:
6231 p = "NOPEN";
6232 break;
6233 case NCLOSE:
6234 p = "NCLOSE";
6235 break;
6236 case ZOPEN + 1:
6237 case ZOPEN + 2:
6238 case ZOPEN + 3:
6239 case ZOPEN + 4:
6240 case ZOPEN + 5:
6241 case ZOPEN + 6:
6242 case ZOPEN + 7:
6243 case ZOPEN + 8:
6244 case ZOPEN + 9:
6245 sprintf(buf + STRLEN(buf), "ZOPEN%d", OP(op) - ZOPEN);
6246 p = NULL;
6247 break;
6248 case ZCLOSE + 1:
6249 case ZCLOSE + 2:
6250 case ZCLOSE + 3:
6251 case ZCLOSE + 4:
6252 case ZCLOSE + 5:
6253 case ZCLOSE + 6:
6254 case ZCLOSE + 7:
6255 case ZCLOSE + 8:
6256 case ZCLOSE + 9:
6257 sprintf(buf + STRLEN(buf), "ZCLOSE%d", OP(op) - ZCLOSE);
6258 p = NULL;
6259 break;
6260 case ZREF + 1:
6261 case ZREF + 2:
6262 case ZREF + 3:
6263 case ZREF + 4:
6264 case ZREF + 5:
6265 case ZREF + 6:
6266 case ZREF + 7:
6267 case ZREF + 8:
6268 case ZREF + 9:
6269 sprintf(buf + STRLEN(buf), "ZREF%d", OP(op) - ZREF);
6270 p = NULL;
6271 break;
6272 case STAR:
6273 p = "STAR";
6274 break;
6275 case PLUS:
6276 p = "PLUS";
6277 break;
6278 case NOMATCH:
6279 p = "NOMATCH";
6280 break;
6281 case MATCH:
6282 p = "MATCH";
6283 break;
6284 case BEHIND:
6285 p = "BEHIND";
6286 break;
6287 case NOBEHIND:
6288 p = "NOBEHIND";
6289 break;
6290 case SUBPAT:
6291 p = "SUBPAT";
6292 break;
6293 case BRACE_LIMITS:
6294 p = "BRACE_LIMITS";
6295 break;
6296 case BRACE_SIMPLE:
6297 p = "BRACE_SIMPLE";
6298 break;
6299 case BRACE_COMPLEX + 0:
6300 case BRACE_COMPLEX + 1:
6301 case BRACE_COMPLEX + 2:
6302 case BRACE_COMPLEX + 3:
6303 case BRACE_COMPLEX + 4:
6304 case BRACE_COMPLEX + 5:
6305 case BRACE_COMPLEX + 6:
6306 case BRACE_COMPLEX + 7:
6307 case BRACE_COMPLEX + 8:
6308 case BRACE_COMPLEX + 9:
6309 sprintf(buf + STRLEN(buf), "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX);
6310 p = NULL;
6311 break;
6312 case MULTIBYTECODE:
6313 p = "MULTIBYTECODE";
6314 break;
6315 case NEWL:
6316 p = "NEWL";
6317 break;
6318 default:
6319 sprintf(buf + STRLEN(buf), "corrupt %d", OP(op));
6320 p = NULL;
6321 break;
6322 }
6323 if (p != NULL)
6324 STRCAT(buf, p);
6325 return (char_u *)buf;
6326 }
6327 #endif /* REGEXP_DEBUG */
6328
6329
6330
6331 /* 0xfb20 - 0xfb4f */
6332 static decomp_T decomp_table[0xfb4f-0xfb20+1] =
6333 {
6334 {0x5e2,0,0}, /* 0xfb20 alt ayin */
6335 {0x5d0,0,0}, /* 0xfb21 alt alef */
6336 {0x5d3,0,0}, /* 0xfb22 alt dalet */
6337 {0x5d4,0,0}, /* 0xfb23 alt he */
6338 {0x5db,0,0}, /* 0xfb24 alt kaf */
6339 {0x5dc,0,0}, /* 0xfb25 alt lamed */
6340 {0x5dd,0,0}, /* 0xfb26 alt mem-sofit */
6341 {0x5e8,0,0}, /* 0xfb27 alt resh */
6342 {0x5ea,0,0}, /* 0xfb28 alt tav */
6343 {'+', 0, 0}, /* 0xfb29 alt plus */
6344 {0x5e9, 0x5c1, 0}, /* 0xfb2a shin+shin-dot */
6345 {0x5e9, 0x5c2, 0}, /* 0xfb2b shin+sin-dot */
6346 {0x5e9, 0x5c1, 0x5bc}, /* 0xfb2c shin+shin-dot+dagesh */
6347 {0x5e9, 0x5c2, 0x5bc}, /* 0xfb2d shin+sin-dot+dagesh */
6348 {0x5d0, 0x5b7, 0}, /* 0xfb2e alef+patah */
6349 {0x5d0, 0x5b8, 0}, /* 0xfb2f alef+qamats */
6350 {0x5d0, 0x5b4, 0}, /* 0xfb30 alef+hiriq */
6351 {0x5d1, 0x5bc, 0}, /* 0xfb31 bet+dagesh */
6352 {0x5d2, 0x5bc, 0}, /* 0xfb32 gimel+dagesh */
6353 {0x5d3, 0x5bc, 0}, /* 0xfb33 dalet+dagesh */
6354 {0x5d4, 0x5bc, 0}, /* 0xfb34 he+dagesh */
6355 {0x5d5, 0x5bc, 0}, /* 0xfb35 vav+dagesh */
6356 {0x5d6, 0x5bc, 0}, /* 0xfb36 zayin+dagesh */
6357 {0xfb37, 0, 0}, /* 0xfb37 -- */
6358 {0x5d8, 0x5bc, 0}, /* 0xfb38 tet+dagesh */
6359 {0x5d9, 0x5bc, 0}, /* 0xfb39 yud+dagesh */
6360 {0x5da, 0x5bc, 0}, /* 0xfb3a kaf sofit+dagesh */
6361 {0x5db, 0x5bc, 0}, /* 0xfb3b kaf+dagesh */
6362 {0x5dc, 0x5bc, 0}, /* 0xfb3c lamed+dagesh */
6363 {0xfb3d, 0, 0}, /* 0xfb3d -- */
6364 {0x5de, 0x5bc, 0}, /* 0xfb3e mem+dagesh */
6365 {0xfb3f, 0, 0}, /* 0xfb3f -- */
6366 {0x5e0, 0x5bc, 0}, /* 0xfb40 nun+dagesh */
6367 {0x5e1, 0x5bc, 0}, /* 0xfb41 samech+dagesh */
6368 {0xfb42, 0, 0}, /* 0xfb42 -- */
6369 {0x5e3, 0x5bc, 0}, /* 0xfb43 pe sofit+dagesh */
6370 {0x5e4, 0x5bc,0}, /* 0xfb44 pe+dagesh */
6371 {0xfb45, 0, 0}, /* 0xfb45 -- */
6372 {0x5e6, 0x5bc, 0}, /* 0xfb46 tsadi+dagesh */
6373 {0x5e7, 0x5bc, 0}, /* 0xfb47 qof+dagesh */
6374 {0x5e8, 0x5bc, 0}, /* 0xfb48 resh+dagesh */
6375 {0x5e9, 0x5bc, 0}, /* 0xfb49 shin+dagesh */
6376 {0x5ea, 0x5bc, 0}, /* 0xfb4a tav+dagesh */
6377 {0x5d5, 0x5b9, 0}, /* 0xfb4b vav+holam */
6378 {0x5d1, 0x5bf, 0}, /* 0xfb4c bet+rafe */
6379 {0x5db, 0x5bf, 0}, /* 0xfb4d kaf+rafe */
6380 {0x5e4, 0x5bf, 0}, /* 0xfb4e pe+rafe */
6381 {0x5d0, 0x5dc, 0} /* 0xfb4f alef-lamed */
6382 };
6383
mb_decompose(int c,int * c1,int * c2,int * c3)6384 static void mb_decompose(int c, int *c1, int *c2, int *c3)
6385 {
6386 decomp_T d;
6387
6388 if (c >= 0xfb20 && c <= 0xfb4f) {
6389 d = decomp_table[c - 0xfb20];
6390 *c1 = d.a;
6391 *c2 = d.b;
6392 *c3 = d.c;
6393 } else {
6394 *c1 = c;
6395 *c2 = *c3 = 0;
6396 }
6397 }
6398
6399 // Compare two strings, ignore case if rex.reg_ic set.
6400 // Return 0 if strings match, non-zero otherwise.
6401 // Correct the length "*n" when composing characters are ignored.
cstrncmp(char_u * s1,char_u * s2,int * n)6402 static int cstrncmp(char_u *s1, char_u *s2, int *n)
6403 {
6404 int result;
6405
6406 if (!rex.reg_ic) {
6407 result = STRNCMP(s1, s2, *n);
6408 } else {
6409 assert(*n >= 0);
6410 result = mb_strnicmp(s1, s2, (size_t)*n);
6411 }
6412
6413 // if it failed and it's utf8 and we want to combineignore:
6414 if (result != 0 && rex.reg_icombine) {
6415 char_u *str1, *str2;
6416 int c1, c2, c11, c12;
6417 int junk;
6418
6419 // we have to handle the strcmp ourselves, since it is necessary to
6420 // deal with the composing characters by ignoring them:
6421 str1 = s1;
6422 str2 = s2;
6423 c1 = c2 = 0;
6424 while ((int)(str1 - s1) < *n) {
6425 c1 = mb_ptr2char_adv((const char_u **)&str1);
6426 c2 = mb_ptr2char_adv((const char_u **)&str2);
6427
6428 /* decompose the character if necessary, into 'base' characters
6429 * because I don't care about Arabic, I will hard-code the Hebrew
6430 * which I *do* care about! So sue me... */
6431 if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) {
6432 // decomposition necessary?
6433 mb_decompose(c1, &c11, &junk, &junk);
6434 mb_decompose(c2, &c12, &junk, &junk);
6435 c1 = c11;
6436 c2 = c12;
6437 if (c11 != c12 && (!rex.reg_ic || utf_fold(c11) != utf_fold(c12))) {
6438 break;
6439 }
6440 }
6441 }
6442 result = c2 - c1;
6443 if (result == 0)
6444 *n = (int)(str2 - s2);
6445 }
6446
6447 return result;
6448 }
6449
6450 ////////////////////////////////////////////////////////////////
6451 // regsub stuff //
6452 ////////////////////////////////////////////////////////////////
6453
6454 /* This stuff below really confuses cc on an SGI -- webb */
6455
6456
6457
do_upper(int * d,int c)6458 static fptr_T do_upper(int *d, int c)
6459 {
6460 *d = mb_toupper(c);
6461
6462 return (fptr_T)NULL;
6463 }
6464
do_Upper(int * d,int c)6465 static fptr_T do_Upper(int *d, int c)
6466 {
6467 *d = mb_toupper(c);
6468
6469 return (fptr_T)do_Upper;
6470 }
6471
do_lower(int * d,int c)6472 static fptr_T do_lower(int *d, int c)
6473 {
6474 *d = mb_tolower(c);
6475
6476 return (fptr_T)NULL;
6477 }
6478
do_Lower(int * d,int c)6479 static fptr_T do_Lower(int *d, int c)
6480 {
6481 *d = mb_tolower(c);
6482
6483 return (fptr_T)do_Lower;
6484 }
6485
6486 /*
6487 * regtilde(): Replace tildes in the pattern by the old pattern.
6488 *
6489 * Short explanation of the tilde: It stands for the previous replacement
6490 * pattern. If that previous pattern also contains a ~ we should go back a
6491 * step further... But we insert the previous pattern into the current one
6492 * and remember that.
6493 * This still does not handle the case where "magic" changes. So require the
6494 * user to keep his hands off of "magic".
6495 *
6496 * The tildes are parsed once before the first call to vim_regsub().
6497 */
regtilde(char_u * source,int magic)6498 char_u *regtilde(char_u *source, int magic)
6499 {
6500 char_u *newsub = source;
6501 char_u *tmpsub;
6502 char_u *p;
6503 int len;
6504 int prevlen;
6505
6506 for (p = newsub; *p; ++p) {
6507 if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic)) {
6508 if (reg_prev_sub != NULL) {
6509 /* length = len(newsub) - 1 + len(prev_sub) + 1 */
6510 prevlen = (int)STRLEN(reg_prev_sub);
6511 tmpsub = xmalloc(STRLEN(newsub) + prevlen);
6512 /* copy prefix */
6513 len = (int)(p - newsub); /* not including ~ */
6514 memmove(tmpsub, newsub, (size_t)len);
6515 /* interpret tilde */
6516 memmove(tmpsub + len, reg_prev_sub, (size_t)prevlen);
6517 // copy postfix
6518 if (!magic) {
6519 p++; // back off backslash
6520 }
6521 STRCPY(tmpsub + len + prevlen, p + 1);
6522
6523 if (newsub != source) /* already allocated newsub */
6524 xfree(newsub);
6525 newsub = tmpsub;
6526 p = newsub + len + prevlen;
6527 } else if (magic)
6528 STRMOVE(p, p + 1); /* remove '~' */
6529 else
6530 STRMOVE(p, p + 2); /* remove '\~' */
6531 --p;
6532 } else {
6533 if (*p == '\\' && p[1]) { // skip escaped characters
6534 p++;
6535 }
6536 p += utfc_ptr2len(p) - 1;
6537 }
6538 }
6539
6540 xfree(reg_prev_sub);
6541 if (newsub != source) /* newsub was allocated, just keep it */
6542 reg_prev_sub = newsub;
6543 else /* no ~ found, need to save newsub */
6544 reg_prev_sub = vim_strsave(newsub);
6545 return newsub;
6546 }
6547
6548 static bool can_f_submatch = false; // true when submatch() can be used
6549
6550 // These pointers are used for reg_submatch(). Needed for when the
6551 // substitution string is an expression that contains a call to substitute()
6552 // and submatch().
6553 typedef struct {
6554 regmatch_T *sm_match;
6555 regmmatch_T *sm_mmatch;
6556 linenr_T sm_firstlnum;
6557 linenr_T sm_maxline;
6558 int sm_line_lbr;
6559 } regsubmatch_T;
6560
6561 static regsubmatch_T rsm; // can only be used when can_f_submatch is true
6562
6563 /// Put the submatches in "argv[argskip]" which is a list passed into
6564 /// call_func() by vim_regsub_both().
fill_submatch_list(int argc FUNC_ATTR_UNUSED,typval_T * argv,int argskip,int argcount)6565 static int fill_submatch_list(int argc FUNC_ATTR_UNUSED, typval_T *argv,
6566 int argskip, int argcount)
6567 FUNC_ATTR_NONNULL_ALL
6568 {
6569 typval_T *listarg = argv + argskip;
6570
6571 if (argcount == argskip) {
6572 // called function doesn't take a submatches argument
6573 return argskip;
6574 }
6575
6576 // Relies on sl_list to be the first item in staticList10_T.
6577 tv_list_init_static10((staticList10_T *)listarg->vval.v_list);
6578
6579 // There are always 10 list items in staticList10_T.
6580 listitem_T *li = tv_list_first(listarg->vval.v_list);
6581 for (int i = 0; i < 10; i++) {
6582 char_u *s = rsm.sm_match->startp[i];
6583 if (s == NULL || rsm.sm_match->endp[i] == NULL) {
6584 s = NULL;
6585 } else {
6586 s = vim_strnsave(s, rsm.sm_match->endp[i] - s);
6587 }
6588 TV_LIST_ITEM_TV(li)->v_type = VAR_STRING;
6589 TV_LIST_ITEM_TV(li)->vval.v_string = s;
6590 li = TV_LIST_ITEM_NEXT(argv->vval.v_list, li);
6591 }
6592 return argskip + 1;
6593 }
6594
clear_submatch_list(staticList10_T * sl)6595 static void clear_submatch_list(staticList10_T *sl)
6596 {
6597 TV_LIST_ITER(&sl->sl_list, li, {
6598 xfree(TV_LIST_ITEM_TV(li)->vval.v_string);
6599 });
6600 }
6601
6602 /// vim_regsub() - perform substitutions after a vim_regexec() or
6603 /// vim_regexec_multi() match.
6604 ///
6605 /// If "copy" is true really copy into "dest".
6606 /// If "copy" is false nothing is copied, this is just to find out the length
6607 /// of the result.
6608 ///
6609 /// If "backslash" is true, a backslash will be removed later, need to double
6610 /// them to keep them, and insert a backslash before a CR to avoid it being
6611 /// replaced with a line break later.
6612 ///
6613 /// Note: The matched text must not change between the call of
6614 /// vim_regexec()/vim_regexec_multi() and vim_regsub()! It would make the back
6615 /// references invalid!
6616 ///
6617 /// Returns the size of the replacement, including terminating NUL.
vim_regsub(regmatch_T * rmp,char_u * source,typval_T * expr,char_u * dest,int copy,int magic,int backslash)6618 int vim_regsub(regmatch_T *rmp, char_u *source, typval_T *expr, char_u *dest,
6619 int copy, int magic, int backslash)
6620 {
6621 regexec_T rex_save;
6622 bool rex_in_use_save = rex_in_use;
6623
6624 if (rex_in_use) {
6625 // Being called recursively, save the state.
6626 rex_save = rex;
6627 }
6628 rex_in_use = true;
6629
6630 rex.reg_match = rmp;
6631 rex.reg_mmatch = NULL;
6632 rex.reg_maxline = 0;
6633 rex.reg_buf = curbuf;
6634 rex.reg_line_lbr = true;
6635 int result = vim_regsub_both(source, expr, dest, copy, magic, backslash);
6636
6637 rex_in_use = rex_in_use_save;
6638 if (rex_in_use) {
6639 rex = rex_save;
6640 }
6641
6642 return result;
6643 }
6644
vim_regsub_multi(regmmatch_T * rmp,linenr_T lnum,char_u * source,char_u * dest,int copy,int magic,int backslash)6645 int vim_regsub_multi(regmmatch_T *rmp, linenr_T lnum, char_u *source, char_u *dest, int copy, int magic, int backslash)
6646 {
6647 regexec_T rex_save;
6648 bool rex_in_use_save = rex_in_use;
6649
6650 if (rex_in_use) {
6651 // Being called recursively, save the state.
6652 rex_save = rex;
6653 }
6654 rex_in_use = true;
6655
6656 rex.reg_match = NULL;
6657 rex.reg_mmatch = rmp;
6658 rex.reg_buf = curbuf; // always works on the current buffer!
6659 rex.reg_firstlnum = lnum;
6660 rex.reg_maxline = curbuf->b_ml.ml_line_count - lnum;
6661 rex.reg_line_lbr = false;
6662 int result = vim_regsub_both(source, NULL, dest, copy, magic, backslash);
6663
6664 rex_in_use = rex_in_use_save;
6665 if (rex_in_use) {
6666 rex = rex_save;
6667 }
6668
6669 return result;
6670 }
6671
vim_regsub_both(char_u * source,typval_T * expr,char_u * dest,int copy,int magic,int backslash)6672 static int vim_regsub_both(char_u *source, typval_T *expr, char_u *dest,
6673 int copy, int magic, int backslash)
6674 {
6675 char_u *src;
6676 char_u *dst;
6677 char_u *s;
6678 int c;
6679 int cc;
6680 int no = -1;
6681 fptr_T func_all = (fptr_T)NULL;
6682 fptr_T func_one = (fptr_T)NULL;
6683 linenr_T clnum = 0; /* init for GCC */
6684 int len = 0; /* init for GCC */
6685 static char_u *eval_result = NULL;
6686
6687 // We need to keep track of how many backslashes we escape, so that the byte
6688 // counts for `extmark_splice` are correct.
6689 int num_escaped = 0;
6690
6691 // Be paranoid...
6692 if ((source == NULL && expr == NULL) || dest == NULL) {
6693 emsg(_(e_null));
6694 return 0;
6695 }
6696 if (prog_magic_wrong())
6697 return 0;
6698 src = source;
6699 dst = dest;
6700
6701 // When the substitute part starts with "\=" evaluate it as an expression.
6702 if (expr != NULL || (source[0] == '\\' && source[1] == '=')) {
6703 // To make sure that the length doesn't change between checking the
6704 // length and copying the string, and to speed up things, the
6705 // resulting string is saved from the call with "copy" == false to the
6706 // call with "copy" == true.
6707 if (copy) {
6708 if (eval_result != NULL) {
6709 STRCPY(dest, eval_result);
6710 dst += STRLEN(eval_result);
6711 XFREE_CLEAR(eval_result);
6712 }
6713 } else {
6714 const bool prev_can_f_submatch = can_f_submatch;
6715 regsubmatch_T rsm_save;
6716
6717 xfree(eval_result);
6718
6719 // The expression may contain substitute(), which calls us
6720 // recursively. Make sure submatch() gets the text from the first
6721 // level.
6722 if (can_f_submatch) {
6723 rsm_save = rsm;
6724 }
6725 can_f_submatch = true;
6726 rsm.sm_match = rex.reg_match;
6727 rsm.sm_mmatch = rex.reg_mmatch;
6728 rsm.sm_firstlnum = rex.reg_firstlnum;
6729 rsm.sm_maxline = rex.reg_maxline;
6730 rsm.sm_line_lbr = rex.reg_line_lbr;
6731
6732 if (expr != NULL) {
6733 typval_T argv[2];
6734 typval_T rettv;
6735 staticList10_T matchList = TV_LIST_STATIC10_INIT;
6736 rettv.v_type = VAR_STRING;
6737 rettv.vval.v_string = NULL;
6738 argv[0].v_type = VAR_LIST;
6739 argv[0].vval.v_list = &matchList.sl_list;
6740 funcexe_T funcexe = FUNCEXE_INIT;
6741 funcexe.argv_func = fill_submatch_list;
6742 funcexe.evaluate = true;
6743 if (expr->v_type == VAR_FUNC) {
6744 s = expr->vval.v_string;
6745 call_func(s, -1, &rettv, 1, argv, &funcexe);
6746 } else if (expr->v_type == VAR_PARTIAL) {
6747 partial_T *partial = expr->vval.v_partial;
6748
6749 s = partial_name(partial);
6750 funcexe.partial = partial;
6751 call_func(s, -1, &rettv, 1, argv, &funcexe);
6752 }
6753 if (tv_list_len(&matchList.sl_list) > 0) {
6754 // fill_submatch_list() was called.
6755 clear_submatch_list(&matchList);
6756 }
6757 if (rettv.v_type == VAR_UNKNOWN) {
6758 // something failed, no need to report another error
6759 eval_result = NULL;
6760 } else {
6761 char buf[NUMBUFLEN];
6762 eval_result = (char_u *)tv_get_string_buf_chk(&rettv, buf);
6763 if (eval_result != NULL) {
6764 eval_result = vim_strsave(eval_result);
6765 }
6766 }
6767 tv_clear(&rettv);
6768 } else {
6769 eval_result = eval_to_string(source + 2, NULL, true);
6770 }
6771
6772 if (eval_result != NULL) {
6773 int had_backslash = false;
6774
6775 for (s = eval_result; *s != NUL; MB_PTR_ADV(s)) {
6776 // Change NL to CR, so that it becomes a line break,
6777 // unless called from vim_regexec_nl().
6778 // Skip over a backslashed character.
6779 if (*s == NL && !rsm.sm_line_lbr) {
6780 *s = CAR;
6781 } else if (*s == '\\' && s[1] != NUL) {
6782 s++;
6783 /* Change NL to CR here too, so that this works:
6784 * :s/abc\\\ndef/\="aaa\\\nbbb"/ on text:
6785 * abc\
6786 * def
6787 * Not when called from vim_regexec_nl().
6788 */
6789 if (*s == NL && !rsm.sm_line_lbr) {
6790 *s = CAR;
6791 }
6792 had_backslash = true;
6793 }
6794 }
6795 if (had_backslash && backslash) {
6796 /* Backslashes will be consumed, need to double them. */
6797 s = vim_strsave_escaped(eval_result, (char_u *)"\\");
6798 xfree(eval_result);
6799 eval_result = s;
6800 }
6801
6802 dst += STRLEN(eval_result);
6803 }
6804
6805 can_f_submatch = prev_can_f_submatch;
6806 if (can_f_submatch) {
6807 rsm = rsm_save;
6808 }
6809 }
6810 } else
6811 while ((c = *src++) != NUL) {
6812 if (c == '&' && magic)
6813 no = 0;
6814 else if (c == '\\' && *src != NUL) {
6815 if (*src == '&' && !magic) {
6816 ++src;
6817 no = 0;
6818 } else if ('0' <= *src && *src <= '9') {
6819 no = *src++ - '0';
6820 } else if (vim_strchr((char_u *)"uUlLeE", *src)) {
6821 switch (*src++) {
6822 case 'u': func_one = (fptr_T)do_upper;
6823 continue;
6824 case 'U': func_all = (fptr_T)do_Upper;
6825 continue;
6826 case 'l': func_one = (fptr_T)do_lower;
6827 continue;
6828 case 'L': func_all = (fptr_T)do_Lower;
6829 continue;
6830 case 'e':
6831 case 'E': func_one = func_all = (fptr_T)NULL;
6832 continue;
6833 }
6834 }
6835 }
6836 if (no < 0) { /* Ordinary character. */
6837 if (c == K_SPECIAL && src[0] != NUL && src[1] != NUL) {
6838 /* Copy a special key as-is. */
6839 if (copy) {
6840 *dst++ = c;
6841 *dst++ = *src++;
6842 *dst++ = *src++;
6843 } else {
6844 dst += 3;
6845 src += 2;
6846 }
6847 continue;
6848 }
6849
6850 if (c == '\\' && *src != NUL) {
6851 // Check for abbreviations -- webb
6852 switch (*src) {
6853 case 'r': c = CAR; ++src; break;
6854 case 'n': c = NL; ++src; break;
6855 case 't': c = TAB; ++src; break;
6856 // Oh no! \e already has meaning in subst pat :-(
6857 // case 'e': c = ESC; ++src; break;
6858 case 'b': c = Ctrl_H; ++src; break;
6859
6860 // If "backslash" is true the backslash will be removed
6861 // later. Used to insert a literal CR.
6862 default:
6863 if (backslash) {
6864 num_escaped += 1;
6865 if (copy) {
6866 *dst = '\\';
6867 }
6868 dst++;
6869 }
6870 c = *src++;
6871 }
6872 } else {
6873 c = utf_ptr2char(src - 1);
6874 }
6875 // Write to buffer, if copy is set.
6876 if (func_one != NULL) {
6877 func_one = (fptr_T)(func_one(&cc, c));
6878 } else if (func_all != NULL) {
6879 func_all = (fptr_T)(func_all(&cc, c));
6880 } else {
6881 // just copy
6882 cc = c;
6883 }
6884
6885 int totlen = utfc_ptr2len(src - 1);
6886
6887 if (copy) {
6888 utf_char2bytes(cc, dst);
6889 }
6890 dst += utf_char2len(cc) - 1;
6891 int clen = utf_ptr2len(src - 1);
6892
6893 // If the character length is shorter than "totlen", there
6894 // are composing characters; copy them as-is.
6895 if (clen < totlen) {
6896 if (copy) {
6897 memmove(dst + 1, src - 1 + clen, (size_t)(totlen - clen));
6898 }
6899 dst += totlen - clen;
6900 }
6901 src += totlen - 1;
6902 dst++;
6903 } else {
6904 if (REG_MULTI) {
6905 clnum = rex.reg_mmatch->startpos[no].lnum;
6906 if (clnum < 0 || rex.reg_mmatch->endpos[no].lnum < 0) {
6907 s = NULL;
6908 } else {
6909 s = reg_getline(clnum) + rex.reg_mmatch->startpos[no].col;
6910 if (rex.reg_mmatch->endpos[no].lnum == clnum) {
6911 len = rex.reg_mmatch->endpos[no].col
6912 - rex.reg_mmatch->startpos[no].col;
6913 } else {
6914 len = (int)STRLEN(s);
6915 }
6916 }
6917 } else {
6918 s = rex.reg_match->startp[no];
6919 if (rex.reg_match->endp[no] == NULL) {
6920 s = NULL;
6921 } else {
6922 len = (int)(rex.reg_match->endp[no] - s);
6923 }
6924 }
6925 if (s != NULL) {
6926 for (;; ) {
6927 if (len == 0) {
6928 if (REG_MULTI) {
6929 if (rex.reg_mmatch->endpos[no].lnum == clnum) {
6930 break;
6931 }
6932 if (copy) {
6933 *dst = CAR;
6934 }
6935 dst++;
6936 s = reg_getline(++clnum);
6937 if (rex.reg_mmatch->endpos[no].lnum == clnum) {
6938 len = rex.reg_mmatch->endpos[no].col;
6939 } else {
6940 len = (int)STRLEN(s);
6941 }
6942 } else {
6943 break;
6944 }
6945 } else if (*s == NUL) { // we hit NUL.
6946 if (copy) {
6947 iemsg(_(e_re_damg));
6948 }
6949 goto exit;
6950 } else {
6951 if (backslash && (*s == CAR || *s == '\\')) {
6952 /*
6953 * Insert a backslash in front of a CR, otherwise
6954 * it will be replaced by a line break.
6955 * Number of backslashes will be halved later,
6956 * double them here.
6957 */
6958 if (copy) {
6959 dst[0] = '\\';
6960 dst[1] = *s;
6961 }
6962 dst += 2;
6963 } else {
6964 c = utf_ptr2char(s);
6965
6966 if (func_one != (fptr_T)NULL)
6967 /* Turbo C complains without the typecast */
6968 func_one = (fptr_T)(func_one(&cc, c));
6969 else if (func_all != (fptr_T)NULL)
6970 /* Turbo C complains without the typecast */
6971 func_all = (fptr_T)(func_all(&cc, c));
6972 else /* just copy */
6973 cc = c;
6974
6975 {
6976 int l;
6977
6978 // Copy composing characters separately, one
6979 // at a time.
6980 l = utf_ptr2len(s) - 1;
6981
6982 s += l;
6983 len -= l;
6984 if (copy) {
6985 utf_char2bytes(cc, dst);
6986 }
6987 dst += utf_char2len(cc) - 1;
6988 }
6989 dst++;
6990 }
6991
6992 ++s;
6993 --len;
6994 }
6995 }
6996 }
6997 no = -1;
6998 }
6999 }
7000 if (copy)
7001 *dst = NUL;
7002
7003 exit:
7004 return (int)((dst - dest) + 1 - num_escaped);
7005 }
7006
7007
7008 /*
7009 * Call reg_getline() with the line numbers from the submatch. If a
7010 * substitute() was used the reg_maxline and other values have been
7011 * overwritten.
7012 */
reg_getline_submatch(linenr_T lnum)7013 static char_u *reg_getline_submatch(linenr_T lnum)
7014 {
7015 char_u *s;
7016 linenr_T save_first = rex.reg_firstlnum;
7017 linenr_T save_max = rex.reg_maxline;
7018
7019 rex.reg_firstlnum = rsm.sm_firstlnum;
7020 rex.reg_maxline = rsm.sm_maxline;
7021
7022 s = reg_getline(lnum);
7023
7024 rex.reg_firstlnum = save_first;
7025 rex.reg_maxline = save_max;
7026 return s;
7027 }
7028
7029 /*
7030 * Used for the submatch() function: get the string from the n'th submatch in
7031 * allocated memory.
7032 * Returns NULL when not in a ":s" command and for a non-existing submatch.
7033 */
reg_submatch(int no)7034 char_u *reg_submatch(int no)
7035 {
7036 char_u *retval = NULL;
7037 char_u *s;
7038 int round;
7039 linenr_T lnum;
7040
7041 if (!can_f_submatch || no < 0)
7042 return NULL;
7043
7044 if (rsm.sm_match == NULL) {
7045 ssize_t len;
7046
7047 /*
7048 * First round: compute the length and allocate memory.
7049 * Second round: copy the text.
7050 */
7051 for (round = 1; round <= 2; round++) {
7052 lnum = rsm.sm_mmatch->startpos[no].lnum;
7053 if (lnum < 0 || rsm.sm_mmatch->endpos[no].lnum < 0) {
7054 return NULL;
7055 }
7056
7057 s = reg_getline_submatch(lnum);
7058 if (s == NULL) { // anti-crash check, cannot happen?
7059 break;
7060 }
7061 s += rsm.sm_mmatch->startpos[no].col;
7062 if (rsm.sm_mmatch->endpos[no].lnum == lnum) {
7063 // Within one line: take form start to end col.
7064 len = rsm.sm_mmatch->endpos[no].col - rsm.sm_mmatch->startpos[no].col;
7065 if (round == 2) {
7066 STRLCPY(retval, s, len + 1);
7067 }
7068 len++;
7069 } else {
7070 // Multiple lines: take start line from start col, middle
7071 // lines completely and end line up to end col.
7072 len = (ssize_t)STRLEN(s);
7073 if (round == 2) {
7074 STRCPY(retval, s);
7075 retval[len] = '\n';
7076 }
7077 len++;
7078 lnum++;
7079 while (lnum < rsm.sm_mmatch->endpos[no].lnum) {
7080 s = reg_getline_submatch(lnum++);
7081 if (round == 2)
7082 STRCPY(retval + len, s);
7083 len += STRLEN(s);
7084 if (round == 2)
7085 retval[len] = '\n';
7086 ++len;
7087 }
7088 if (round == 2) {
7089 STRNCPY(retval + len, reg_getline_submatch(lnum),
7090 rsm.sm_mmatch->endpos[no].col);
7091 }
7092 len += rsm.sm_mmatch->endpos[no].col;
7093 if (round == 2) {
7094 retval[len] = NUL; // -V595
7095 }
7096 len++;
7097 }
7098
7099 if (retval == NULL) {
7100 retval = xmalloc(len);
7101 }
7102 }
7103 } else {
7104 s = rsm.sm_match->startp[no];
7105 if (s == NULL || rsm.sm_match->endp[no] == NULL) {
7106 retval = NULL;
7107 } else {
7108 retval = vim_strnsave(s, rsm.sm_match->endp[no] - s);
7109 }
7110 }
7111
7112 return retval;
7113 }
7114
7115 // Used for the submatch() function with the optional non-zero argument: get
7116 // the list of strings from the n'th submatch in allocated memory with NULs
7117 // represented in NLs.
7118 // Returns a list of allocated strings. Returns NULL when not in a ":s"
7119 // command, for a non-existing submatch and for any error.
reg_submatch_list(int no)7120 list_T *reg_submatch_list(int no)
7121 {
7122 if (!can_f_submatch || no < 0) {
7123 return NULL;
7124 }
7125
7126 linenr_T slnum;
7127 linenr_T elnum;
7128 list_T *list;
7129 const char *s;
7130
7131 if (rsm.sm_match == NULL) {
7132 slnum = rsm.sm_mmatch->startpos[no].lnum;
7133 elnum = rsm.sm_mmatch->endpos[no].lnum;
7134 if (slnum < 0 || elnum < 0) {
7135 return NULL;
7136 }
7137
7138 colnr_T scol = rsm.sm_mmatch->startpos[no].col;
7139 colnr_T ecol = rsm.sm_mmatch->endpos[no].col;
7140
7141 list = tv_list_alloc(elnum - slnum + 1);
7142
7143 s = (const char *)reg_getline_submatch(slnum) + scol;
7144 if (slnum == elnum) {
7145 tv_list_append_string(list, s, ecol - scol);
7146 } else {
7147 tv_list_append_string(list, s, -1);
7148 for (int i = 1; i < elnum - slnum; i++) {
7149 s = (const char *)reg_getline_submatch(slnum + i);
7150 tv_list_append_string(list, s, -1);
7151 }
7152 s = (const char *)reg_getline_submatch(elnum);
7153 tv_list_append_string(list, s, ecol);
7154 }
7155 } else {
7156 s = (const char *)rsm.sm_match->startp[no];
7157 if (s == NULL || rsm.sm_match->endp[no] == NULL) {
7158 return NULL;
7159 }
7160 list = tv_list_alloc(1);
7161 tv_list_append_string(list, s, (const char *)rsm.sm_match->endp[no] - s);
7162 }
7163
7164 tv_list_ref(list);
7165 return list;
7166 }
7167
7168 static regengine_T bt_regengine =
7169 {
7170 bt_regcomp,
7171 bt_regfree,
7172 bt_regexec_nl,
7173 bt_regexec_multi,
7174 (char_u *)""
7175 };
7176
7177
7178 // XXX Do not allow headers generator to catch definitions from regexp_nfa.c
7179 #ifndef DO_NOT_DEFINE_EMPTY_ATTRIBUTES
7180 # include "nvim/regexp_nfa.c"
7181 #endif
7182
7183 static regengine_T nfa_regengine =
7184 {
7185 nfa_regcomp,
7186 nfa_regfree,
7187 nfa_regexec_nl,
7188 nfa_regexec_multi,
7189 (char_u *)""
7190 };
7191
7192 // Which regexp engine to use? Needed for vim_regcomp().
7193 // Must match with 'regexpengine'.
7194 static int regexp_engine = 0;
7195
7196 #ifdef REGEXP_DEBUG
7197 static char_u regname[][30] = {
7198 "AUTOMATIC Regexp Engine",
7199 "BACKTRACKING Regexp Engine",
7200 "NFA Regexp Engine"
7201 };
7202 #endif
7203
7204 /*
7205 * Compile a regular expression into internal code.
7206 * Returns the program in allocated memory.
7207 * Use vim_regfree() to free the memory.
7208 * Returns NULL for an error.
7209 */
vim_regcomp(char_u * expr_arg,int re_flags)7210 regprog_T *vim_regcomp(char_u *expr_arg, int re_flags)
7211 {
7212 regprog_T *prog = NULL;
7213 char_u *expr = expr_arg;
7214 int save_called_emsg;
7215
7216 regexp_engine = p_re;
7217
7218 /* Check for prefix "\%#=", that sets the regexp engine */
7219 if (STRNCMP(expr, "\\%#=", 4) == 0) {
7220 int newengine = expr[4] - '0';
7221
7222 if (newengine == AUTOMATIC_ENGINE
7223 || newengine == BACKTRACKING_ENGINE
7224 || newengine == NFA_ENGINE) {
7225 regexp_engine = expr[4] - '0';
7226 expr += 5;
7227 #ifdef REGEXP_DEBUG
7228 smsg("New regexp mode selected (%d): %s",
7229 regexp_engine,
7230 regname[newengine]);
7231 #endif
7232 } else {
7233 emsg(_(
7234 "E864: \\%#= can only be followed by 0, 1, or 2. The automatic engine will be used "));
7235 regexp_engine = AUTOMATIC_ENGINE;
7236 }
7237 }
7238 #ifdef REGEXP_DEBUG
7239 bt_regengine.expr = expr;
7240 nfa_regengine.expr = expr;
7241 #endif
7242 // reg_iswordc() uses rex.reg_buf
7243 rex.reg_buf = curbuf;
7244
7245 //
7246 // First try the NFA engine, unless backtracking was requested.
7247 //
7248 save_called_emsg = called_emsg;
7249 called_emsg = false;
7250 if (regexp_engine != BACKTRACKING_ENGINE) {
7251 prog = nfa_regengine.regcomp(expr,
7252 re_flags + (regexp_engine == AUTOMATIC_ENGINE ? RE_AUTO : 0));
7253 } else {
7254 prog = bt_regengine.regcomp(expr, re_flags);
7255 }
7256
7257 // Check for error compiling regexp with initial engine.
7258 if (prog == NULL) {
7259 #ifdef BT_REGEXP_DEBUG_LOG
7260 // Debugging log for BT engine.
7261 if (regexp_engine != BACKTRACKING_ENGINE) {
7262 FILE *f = fopen(BT_REGEXP_DEBUG_LOG_NAME, "a");
7263 if (f) {
7264 fprintf(f, "Syntax error in \"%s\"\n", expr);
7265 fclose(f);
7266 } else
7267 semsg("(NFA) Could not open \"%s\" to write !!!",
7268 BT_REGEXP_DEBUG_LOG_NAME);
7269 }
7270 #endif
7271 // If the NFA engine failed, try the backtracking engine. The NFA engine
7272 // also fails for patterns that it can't handle well but are still valid
7273 // patterns, thus a retry should work.
7274 // But don't try if an error message was given.
7275 if (regexp_engine == AUTOMATIC_ENGINE && !called_emsg) {
7276 regexp_engine = BACKTRACKING_ENGINE;
7277 report_re_switch(expr);
7278 prog = bt_regengine.regcomp(expr, re_flags);
7279 }
7280 }
7281 called_emsg |= save_called_emsg;
7282
7283 if (prog != NULL) {
7284 // Store the info needed to call regcomp() again when the engine turns out
7285 // to be very slow when executing it.
7286 prog->re_engine = regexp_engine;
7287 prog->re_flags = re_flags;
7288 }
7289
7290 return prog;
7291 }
7292
7293 /*
7294 * Free a compiled regexp program, returned by vim_regcomp().
7295 */
vim_regfree(regprog_T * prog)7296 void vim_regfree(regprog_T *prog)
7297 {
7298 if (prog != NULL)
7299 prog->engine->regfree(prog);
7300 }
7301
report_re_switch(char_u * pat)7302 static void report_re_switch(char_u *pat)
7303 {
7304 if (p_verbose > 0) {
7305 verbose_enter();
7306 msg_puts(_("Switching to backtracking RE engine for pattern: "));
7307 msg_puts((char *)pat);
7308 verbose_leave();
7309 }
7310 }
7311
7312 /// Matches a regexp against a string.
7313 /// "rmp->regprog" is a compiled regexp as returned by vim_regcomp().
7314 /// Note: "rmp->regprog" may be freed and changed.
7315 /// Uses curbuf for line count and 'iskeyword'.
7316 /// When "nl" is true consider a "\n" in "line" to be a line break.
7317 ///
7318 /// @param rmp
7319 /// @param line the string to match against
7320 /// @param col the column to start looking for match
7321 /// @param nl
7322 ///
7323 /// @return true if there is a match, false if not.
vim_regexec_string(regmatch_T * rmp,char_u * line,colnr_T col,bool nl)7324 static bool vim_regexec_string(regmatch_T *rmp, char_u *line, colnr_T col,
7325 bool nl)
7326 {
7327 regexec_T rex_save;
7328 bool rex_in_use_save = rex_in_use;
7329
7330 // Cannot use the same prog recursively, it contains state.
7331 if (rmp->regprog->re_in_use) {
7332 emsg(_(e_recursive));
7333 return false;
7334 }
7335 rmp->regprog->re_in_use = true;
7336
7337 if (rex_in_use) {
7338 // Being called recursively, save the state.
7339 rex_save = rex;
7340 }
7341 rex_in_use = true;
7342
7343 rex.reg_startp = NULL;
7344 rex.reg_endp = NULL;
7345 rex.reg_startpos = NULL;
7346 rex.reg_endpos = NULL;
7347
7348 int result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
7349 rmp->regprog->re_in_use = false;
7350
7351 // NFA engine aborted because it's very slow, use backtracking engine instead.
7352 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
7353 && result == NFA_TOO_EXPENSIVE) {
7354 int save_p_re = p_re;
7355 int re_flags = rmp->regprog->re_flags;
7356 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
7357
7358 p_re = BACKTRACKING_ENGINE;
7359 vim_regfree(rmp->regprog);
7360 report_re_switch(pat);
7361 rmp->regprog = vim_regcomp(pat, re_flags);
7362 if (rmp->regprog != NULL) {
7363 rmp->regprog->re_in_use = true;
7364 result = rmp->regprog->engine->regexec_nl(rmp, line, col, nl);
7365 rmp->regprog->re_in_use = false;
7366 }
7367
7368 xfree(pat);
7369 p_re = save_p_re;
7370 }
7371
7372 rex_in_use = rex_in_use_save;
7373 if (rex_in_use) {
7374 rex = rex_save;
7375 }
7376
7377 return result > 0;
7378 }
7379
7380 // Note: "*prog" may be freed and changed.
7381 // Return true if there is a match, false if not.
vim_regexec_prog(regprog_T ** prog,bool ignore_case,char_u * line,colnr_T col)7382 bool vim_regexec_prog(regprog_T **prog, bool ignore_case, char_u *line,
7383 colnr_T col)
7384 {
7385 regmatch_T regmatch = { .regprog = *prog, .rm_ic = ignore_case };
7386 bool r = vim_regexec_string(®match, line, col, false);
7387 *prog = regmatch.regprog;
7388 return r;
7389 }
7390
7391 // Note: "rmp->regprog" may be freed and changed.
7392 // Return true if there is a match, false if not.
vim_regexec(regmatch_T * rmp,char_u * line,colnr_T col)7393 bool vim_regexec(regmatch_T *rmp, char_u *line, colnr_T col)
7394 {
7395 return vim_regexec_string(rmp, line, col, false);
7396 }
7397
7398 // Like vim_regexec(), but consider a "\n" in "line" to be a line break.
7399 // Note: "rmp->regprog" may be freed and changed.
7400 // Return true if there is a match, false if not.
vim_regexec_nl(regmatch_T * rmp,char_u * line,colnr_T col)7401 bool vim_regexec_nl(regmatch_T *rmp, char_u *line, colnr_T col)
7402 {
7403 return vim_regexec_string(rmp, line, col, true);
7404 }
7405
7406 /// Match a regexp against multiple lines.
7407 /// "rmp->regprog" must be a compiled regexp as returned by vim_regcomp().
7408 /// Note: "rmp->regprog" may be freed and changed, even set to NULL.
7409 /// Uses curbuf for line count and 'iskeyword'.
7410 ///
7411 /// Return zero if there is no match. Return number of lines contained in the
7412 /// match otherwise.
vim_regexec_multi(regmmatch_T * rmp,win_T * win,buf_T * buf,linenr_T lnum,colnr_T col,proftime_T * tm,int * timed_out)7413 long vim_regexec_multi(
7414 regmmatch_T *rmp,
7415 win_T *win, // window in which to search or NULL
7416 buf_T *buf, // buffer in which to search
7417 linenr_T lnum, // nr of line to start looking for match
7418 colnr_T col, // column to start looking for match
7419 proftime_T *tm, // timeout limit or NULL
7420 int *timed_out // flag is set when timeout limit reached
7421 )
7422 FUNC_ATTR_NONNULL_ARG(1)
7423 {
7424 regexec_T rex_save;
7425 bool rex_in_use_save = rex_in_use;
7426
7427 // Cannot use the same prog recursively, it contains state.
7428 if (rmp->regprog->re_in_use) {
7429 emsg(_(e_recursive));
7430 return false;
7431 }
7432 rmp->regprog->re_in_use = true;
7433
7434 if (rex_in_use) {
7435 // Being called recursively, save the state.
7436 rex_save = rex;
7437 }
7438 rex_in_use = true;
7439
7440 int result = rmp->regprog->engine->regexec_multi(rmp, win, buf, lnum, col,
7441 tm, timed_out);
7442 rmp->regprog->re_in_use = false;
7443
7444 // NFA engine aborted because it's very slow, use backtracking engine instead.
7445 if (rmp->regprog->re_engine == AUTOMATIC_ENGINE
7446 && result == NFA_TOO_EXPENSIVE) {
7447 int save_p_re = p_re;
7448 int re_flags = rmp->regprog->re_flags;
7449 char_u *pat = vim_strsave(((nfa_regprog_T *)rmp->regprog)->pattern);
7450
7451 p_re = BACKTRACKING_ENGINE;
7452 vim_regfree(rmp->regprog);
7453 report_re_switch(pat);
7454 // checking for \z misuse was already done when compiling for NFA,
7455 // allow all here
7456 reg_do_extmatch = REX_ALL;
7457 rmp->regprog = vim_regcomp(pat, re_flags);
7458 reg_do_extmatch = 0;
7459
7460 if (rmp->regprog != NULL) {
7461 rmp->regprog->re_in_use = true;
7462 result = rmp->regprog->engine->regexec_multi(rmp, win, buf, lnum, col,
7463 tm, timed_out);
7464 rmp->regprog->re_in_use = false;
7465 }
7466
7467 xfree(pat);
7468 p_re = save_p_re;
7469 }
7470
7471 rex_in_use = rex_in_use_save;
7472 if (rex_in_use) {
7473 rex = rex_save;
7474 }
7475
7476 return result <= 0 ? 0 : result;
7477 }
7478