1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2014 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78
79 #define NLBLOCK mb /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
82
83 #include "pcre2_internal.h"
84
/* Bit mask formed by OR-ing together a fixed set of PCRE2 match-time option
bits. NOTE(review): presumably this is the set of options that a caller is
allowed to pass to pcre2_dfa_match(); the code that tests against this mask is
not in this part of the file - confirm at the point of use. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART)
89
90
91 /*************************************************
92 * Code parameters and static tables *
93 *************************************************/
94
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes.

In internal_dfa_match(), when an opcode whose value is at least OP_TYPESTAR has
an inline argument that is one of the items listed below, the corresponding
offset is added to the local copy of the opcode value before the main switch,
so that these combinations get their own case labels. */

#define OP_PROP_EXTRA       300   /* Argument is OP_PROP or OP_NOTPROP */
#define OP_EXTUNI_EXTRA     320   /* Argument is OP_EXTUNI */
#define OP_ANYNL_EXTRA      340   /* Argument is OP_ANYNL */
#define OP_HSPACE_EXTRA     360   /* Argument is OP_HSPACE or OP_NOT_HSPACE */
#define OP_VSPACE_EXTRA     380   /* Argument is OP_VSPACE or OP_NOT_VSPACE */
105
106
/* This table identifies those opcodes that are followed immediately by a
character that is to be tested in some way. This makes it possible to
centralize the loading of these characters: a non-zero entry is the offset from
the opcode at which the character is to be found, and a zero entry means that
there is no such inline character. In the case of Type * etc, the "character"
is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a small
value.

The table is indexed by opcode value, so the entries must stay in opcode order,
one per opcode. A compile-time check inside internal_dfa_match() requires
sizeof(coptable) to equal OP_TABLE_LENGTH. ***NOTE*** If the start of this
table is modified, the three tables that follow must also be modified. */

static const uint8_t coptable[] = {
  0,                             /* End */
  0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b */
  0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w */
  0, 0, 0,                       /* Any, AllAny, Anybyte */
  0, 0,                          /* \P, \p */
  0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v */
  0,                             /* \X */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M */
  1,                             /* Char */
  1,                             /* Chari */
  1,                             /* not */
  1,                             /* noti */
  /* Positive single-char repeats */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ?? */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto */
  1+IMM2_SIZE,                   /* exact */
  1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+ */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I */
  1+IMM2_SIZE,                   /* exact I */
  1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I */
  /* Negative single-char repeats - only for chars < 256 */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ?? */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto */
  1+IMM2_SIZE,                   /* NOT exact */
  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+ */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I */
  1+IMM2_SIZE,                   /* NOT exact I */
  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I */
  /* Positive type repeats */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ?? */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto */
  1+IMM2_SIZE,                   /* Type exact */
  1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+ */
  /* Character class & ref repeats */
  0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ?? */
  0, 0,                          /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE */
  0,                             /* CLASS */
  0,                             /* NCLASS */
  0,                             /* XCLASS - variable length */
  0,                             /* REF */
  0,                             /* REFI */
  0,                             /* DNREF */
  0,                             /* DNREFI */
  0,                             /* RECURSE */
  0,                             /* CALLOUT */
  0,                             /* Alt */
  0,                             /* Ket */
  0,                             /* KetRmax */
  0,                             /* KetRmin */
  0,                             /* KetRpos */
  0,                             /* Reverse */
  0,                             /* Assert */
  0,                             /* Assert not */
  0,                             /* Assert behind */
  0,                             /* Assert behind not */
  0, 0,                          /* ONCE, ONCE_NC */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
  0, 0,                          /* CREF, DNCREF */
  0, 0,                          /* RREF, DNRREF */
  0, 0,                          /* FALSE, TRUE */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG */
  0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE */
};
186
/* This table identifies those opcodes that inspect a character: a non-zero
entry means that the opcode looks at a subject character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached; internal_dfa_match() sets its could_continue flag when
it meets such an opcode with no subject character left.

The table is indexed by opcode value, so the entries must stay in opcode order,
one per opcode. A compile-time check inside internal_dfa_match() requires
sizeof(poptable) to equal OP_TABLE_LENGTH. ***NOTE*** If the start of this
table is modified, the two tables that follow must also be modified. */

static const uint8_t poptable[] = {
  0,                             /* End */
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w */
  1, 1, 1,                       /* Any, AllAny, Anybyte */
  1, 1,                          /* \P, \p */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v */
  1,                             /* \X */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M */
  1,                             /* Char */
  1,                             /* Chari */
  1,                             /* not */
  1,                             /* noti */
  /* Positive single-char repeats */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ?? */
  1, 1, 1,                       /* upto, minupto, exact */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+ */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I */
  1, 1, 1,                       /* upto I, minupto I, exact I */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I */
  /* Negative single-char repeats - only for chars < 256 */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ?? */
  1, 1, 1,                       /* NOT upto, minupto, exact */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+ */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I */
  /* Positive type repeats */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ?? */
  1, 1, 1,                       /* Type upto, minupto, exact */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+ */
  /* Character class & ref repeats */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ?? */
  1, 1,                          /* CRRANGE, CRMINRANGE */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE */
  1,                             /* CLASS */
  1,                             /* NCLASS */
  1,                             /* XCLASS - variable length */
  0,                             /* REF */
  0,                             /* REFI */
  0,                             /* DNREF */
  0,                             /* DNREFI */
  0,                             /* RECURSE */
  0,                             /* CALLOUT */
  0,                             /* Alt */
  0,                             /* Ket */
  0,                             /* KetRmax */
  0,                             /* KetRmin */
  0,                             /* KetRpos */
  0,                             /* Reverse */
  0,                             /* Assert */
  0,                             /* Assert not */
  0,                             /* Assert behind */
  0,                             /* Assert behind not */
  0, 0,                          /* ONCE, ONCE_NC */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
  0, 0,                          /* CREF, DNCREF */
  0, 0,                          /* RREF, DNRREF */
  0, 0,                          /* FALSE, TRUE */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG */
  0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE */
};
258
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
and \w. Both are indexed by opcode value, and for a character c that is less
than 256 the test applied by the matcher is

  ((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0

toptable1 holds the ctype bit that is extracted from the character's entry in
the ctypes table. The slots are in the same order as the start of the opcode
tables above: six unused leading entries, then \D, \d, \S, \s, \W, \w, then
OP_ANY and OP_ALLANY. */

static const uint8_t toptable1[] = {
  0, 0, 0, 0, 0, 0,               /* Opcodes before \D: no ctype bit */
  ctype_digit, ctype_digit,       /* \D, \d */
  ctype_space, ctype_space,       /* \S, \s */
  ctype_word,  ctype_word,        /* \W, \w */
  0, 0                            /* OP_ANY, OP_ALLANY */
};
269
/* XOR values that go with the ctype masks in toptable1, indexed in the same
way by opcode value. For the negated types (\D, \S, \W) the value is the same
bit as the mask, which inverts the sense of the test; for the positive types
(\d, \s, \w) it is zero, which leaves the masked bit unchanged. In the OP_ANY
and OP_ALLANY slots the mask is zero and the XOR value is 1, so the combined
expression is always non-zero. */

static const uint8_t toptable2[] = {
  0, 0, 0, 0, 0, 0,               /* Opcodes before \D */
  ctype_digit, 0,                 /* \D (inverted), \d */
  ctype_space, 0,                 /* \S (inverted), \s */
  ctype_word,  0,                 /* \W (inverted), \w */
  1, 1                            /* OP_ANY, OP_ALLANY */
};
277
278
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints.

A negative offset is a special case: it means "hold off going to the (negated)
state until the number of characters in the data field have been skipped". The
matcher re-queues such a state with data reduced by one for each character, and
switches to the positive offset once data is no longer greater than zero. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode (-ve has meaning) */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data, e.g. the number of
                                     characters still to skip when offset is
                                     negative */
} stateblock;
289
/* The number of ints occupied by one stateblock. internal_dfa_match() uses it
to turn the size of the workspace vector, given in ints, into the number of
state slots available in each of the two state lists. The replacement text is
wrapped in an outer pair of parentheses so that the expansion is a single
primary expression wherever the macro is used. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
291
292
293
294 /*************************************************
295 * Match a Regular Expression - DFA engine *
296 *************************************************/
297
298 /* This internal function applies a compiled pattern to a subject string,
299 starting at a given point, using a DFA engine. This function is called from the
300 external one, possibly multiple times if the pattern is not anchored. The
301 function calls itself recursively for some kinds of subpattern.
302
303 Arguments:
304 mb the match_data block with fixed information
305 this_start_code the opening bracket of this subexpression's code
306 current_subject where we currently are in the subject string
307 start_offset start offset in the subject string
308 offsets vector to contain the matching string offsets
309 offsetcount size of same
310 workspace vector of workspace
311 wscount size of same
312 rlevel function call recursion level
313
314 Returns: > 0 => number of match offset pairs placed in offsets
315 = 0 => offsets overflowed; longest matches are present
316 -1 => failed to match
317 < -1 => some kind of unexpected problem
318
319 The following macros are used for adding states to the two state vectors (one
320 for the current character, one for the following character). */
321
/* ADD_ACTIVE(x,y): append a state with opcode offset x and repeat count y to
the list of states for the current character. active_count is incremented on
every use; if the list already held wscount states, the enclosing function
returns PCRE2_ERROR_DFA_WSSIZE instead of storing anything. The data field of
the new slot is not written. Use as a statement, followed by a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    } \
  while (0)
330
/* ADD_ACTIVE_DATA(x,y,z): as for adding a plain active state, but the data
field of the new slot is also set, to z. The state (opcode offset x, repeat
count y) goes on the list for the current character. active_count is
incremented on every use; if the list already held wscount states, the
enclosing function returns PCRE2_ERROR_DFA_WSSIZE instead of storing anything.
Use as a statement, followed by a semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    } \
  while (0)
340
/* ADD_NEW(x,y): append a state with opcode offset x and repeat count y to the
list of states for the following character. new_count is incremented on every
use; if the list already held wscount states, the enclosing function returns
PCRE2_ERROR_DFA_WSSIZE instead of storing anything. The data field of the new
slot is not written. Use as a statement, followed by a semicolon. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    } \
  while (0)
349
/* ADD_NEW_DATA(x,y,z): append a state with opcode offset x, repeat count y,
and data value z to the list of states for the following character. new_count
is incremented on every use; if the list already held wscount states, the
enclosing function returns PCRE2_ERROR_DFA_WSSIZE instead of storing anything.
Use as a statement, followed by a semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    } \
  while (0)
359
360 /* And now, here is the code */
361
362 static int
internal_dfa_match(dfa_match_block * mb,PCRE2_SPTR this_start_code,PCRE2_SPTR current_subject,PCRE2_SIZE start_offset,PCRE2_SIZE * offsets,uint32_t offsetcount,int * workspace,int wscount,int rlevel)363 internal_dfa_match(
364 dfa_match_block *mb,
365 PCRE2_SPTR this_start_code,
366 PCRE2_SPTR current_subject,
367 PCRE2_SIZE start_offset,
368 PCRE2_SIZE *offsets,
369 uint32_t offsetcount,
370 int *workspace,
371 int wscount,
372 int rlevel)
373 {
374 stateblock *active_states, *new_states, *temp_states;
375 stateblock *next_active_state, *next_new_state;
376
377 const uint8_t *ctypes, *lcc, *fcc;
378 PCRE2_SPTR ptr;
379 PCRE2_SPTR end_code;
380 PCRE2_SPTR first_op;
381
382 dfa_recursion_info new_recursive;
383
384 int active_count, new_count, match_count;
385
386 /* Some fields in the mb block are frequently referenced, so we load them into
387 independent variables in the hope that this will perform better. */
388
389 PCRE2_SPTR start_subject = mb->start_subject;
390 PCRE2_SPTR end_subject = mb->end_subject;
391 PCRE2_SPTR start_code = mb->start_code;
392
393 #ifdef SUPPORT_UNICODE
394 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
395 #else
396 BOOL utf = FALSE;
397 #endif
398
399 BOOL reset_could_continue = FALSE;
400
401 rlevel++;
402 offsetcount &= (-2);
403
404 wscount -= 2;
405 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
406 (2 * INTS_PER_STATEBLOCK);
407
408 ctypes = mb->tables + ctypes_offset;
409 lcc = mb->tables + lcc_offset;
410 fcc = mb->tables + fcc_offset;
411
412 match_count = PCRE2_ERROR_NOMATCH; /* A negative number */
413
414 active_states = (stateblock *)(workspace + 2);
415 next_new_state = new_states = active_states + wscount;
416 new_count = 0;
417
418 first_op = this_start_code + 1 + LINK_SIZE +
419 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
420 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
421 ? IMM2_SIZE:0);
422
423 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
424 the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
426 matching internal ket rather than at the end.
427
428 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
429 a backward assertion. In that case, we have to find out the maximum amount to
430 move back, and set up each alternative appropriately. */
431
432 if (*first_op == OP_REVERSE)
433 {
434 int max_back = 0;
435 int gone_back;
436
437 end_code = this_start_code;
438 do
439 {
440 int back = GET(end_code, 2+LINK_SIZE);
441 if (back > max_back) max_back = back;
442 end_code += GET(end_code, 1);
443 }
444 while (*end_code == OP_ALT);
445
446 /* If we can't go back the amount required for the longest lookbehind
447 pattern, go back as far as we can; some alternatives may still be viable. */
448
449 #ifdef SUPPORT_UNICODE
450 /* In character mode we have to step back character by character */
451
452 if (utf)
453 {
454 for (gone_back = 0; gone_back < max_back; gone_back++)
455 {
456 if (current_subject <= start_subject) break;
457 current_subject--;
458 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
459 }
460 }
461 else
462 #endif
463
464 /* In byte-mode we can do this quickly. */
465
466 {
467 gone_back = (current_subject - max_back < start_subject)?
468 (int)(current_subject - start_subject) : max_back;
469 current_subject -= gone_back;
470 }
471
472 /* Save the earliest consulted character */
473
474 if (current_subject < mb->start_used_ptr)
475 mb->start_used_ptr = current_subject;
476
477 /* Now we can process the individual branches. */
478
479 end_code = this_start_code;
480 do
481 {
482 int back = GET(end_code, 2+LINK_SIZE);
483 if (back <= gone_back)
484 {
485 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
486 ADD_NEW_DATA(-bstate, 0, gone_back - back);
487 }
488 end_code += GET(end_code, 1);
489 }
490 while (*end_code == OP_ALT);
491 }
492
493 /* This is the code for a "normal" subpattern (not a backward assertion). The
494 start of a whole pattern is always one of these. If we are at the top level,
495 we may be asked to restart matching from the same point that we reached for a
496 previous partial match. We still have to scan through the top-level branches to
497 find the end state. */
498
499 else
500 {
501 end_code = this_start_code;
502
503 /* Restarting */
504
505 if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
506 {
507 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
508 new_count = workspace[1];
509 if (!workspace[0])
510 memcpy(new_states, active_states, new_count * sizeof(stateblock));
511 }
512
513 /* Not restarting */
514
515 else
516 {
517 int length = 1 + LINK_SIZE +
518 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
519 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
520 ? IMM2_SIZE:0);
521 do
522 {
523 ADD_NEW((int)(end_code - start_code + length), 0);
524 end_code += GET(end_code, 1);
525 length = 1 + LINK_SIZE;
526 }
527 while (*end_code == OP_ALT);
528 }
529 }
530
531 workspace[0] = 0; /* Bit indicating which vector is current */
532
533 /* Loop for scanning the subject */
534
535 ptr = current_subject;
536 for (;;)
537 {
538 int i, j;
539 int clen, dlen;
540 uint32_t c, d;
541 int forced_fail = 0;
542 BOOL partial_newline = FALSE;
543 BOOL could_continue = reset_could_continue;
544 reset_could_continue = FALSE;
545
546 if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
547
548 /* Make the new state list into the active state list and empty the
549 new state list. */
550
551 temp_states = active_states;
552 active_states = new_states;
553 new_states = temp_states;
554 active_count = new_count;
555 new_count = 0;
556
557 workspace[0] ^= 1; /* Remember for the restarting feature */
558 workspace[1] = active_count;
559
560 /* Set the pointers for adding new states */
561
562 next_active_state = active_states + active_count;
563 next_new_state = new_states;
564
565 /* Load the current character from the subject outside the loop, as many
566 different states may want to look at it, and we assume that at least one
567 will. */
568
569 if (ptr < end_subject)
570 {
571 clen = 1; /* Number of data items in the character */
572 #ifdef SUPPORT_UNICODE
573 GETCHARLENTEST(c, ptr, clen);
574 #else
575 c = *ptr;
576 #endif /* SUPPORT_UNICODE */
577 }
578 else
579 {
580 clen = 0; /* This indicates the end of the subject */
581 c = NOTACHAR; /* This value should never actually be used */
582 }
583
584 /* Scan up the active states and act on each one. The result of an action
585 may be to add more states to the currently active list (e.g. on hitting a
586 parenthesis) or it may be to put states on the new list, for considering
587 when we move the character pointer on. */
588
589 for (i = 0; i < active_count; i++)
590 {
591 stateblock *current_state = active_states + i;
592 BOOL caseless = FALSE;
593 PCRE2_SPTR code;
594 int state_offset = current_state->offset;
595 int codevalue, rrc;
596 int count;
597
598 /* A negative offset is a special case meaning "hold off going to this
599 (negated) state until the number of characters in the data field have
600 been skipped". If the could_continue flag was passed over from a previous
state, arrange for it to be passed on. */
602
603 if (state_offset < 0)
604 {
605 if (current_state->data > 0)
606 {
607 ADD_NEW_DATA(state_offset, current_state->count,
608 current_state->data - 1);
609 if (could_continue) reset_could_continue = TRUE;
610 continue;
611 }
612 else
613 {
614 current_state->offset = state_offset = -state_offset;
615 }
616 }
617
618 /* Check for a duplicate state with the same count, and skip if found.
619 See the note at the head of this module about the possibility of improving
620 performance here. */
621
622 for (j = 0; j < i; j++)
623 {
624 if (active_states[j].offset == state_offset &&
625 active_states[j].count == current_state->count)
626 goto NEXT_ACTIVE_STATE;
627 }
628
629 /* The state offset is the offset to the opcode */
630
631 code = start_code + state_offset;
632 codevalue = *code;
633
634 /* If this opcode inspects a character, but we are at the end of the
635 subject, remember the fact for use when testing for a partial match. */
636
637 if (clen == 0 && poptable[codevalue] != 0)
638 could_continue = TRUE;
639
640 /* If this opcode is followed by an inline character, load it. It is
641 tempting to test for the presence of a subject character here, but that
642 is wrong, because sometimes zero repetitions of the subject are
643 permitted.
644
645 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
646 argument that is not a data character - but is always one byte long because
647 the values are small. We have to take special action to deal with \P, \p,
648 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
649 these ones to new opcodes. */
650
651 if (coptable[codevalue] > 0)
652 {
653 dlen = 1;
654 #ifdef SUPPORT_UNICODE
655 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
656 #endif /* SUPPORT_UNICODE */
657 d = code[coptable[codevalue]];
658 if (codevalue >= OP_TYPESTAR)
659 {
660 switch(d)
661 {
662 case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
663 case OP_NOTPROP:
664 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
665 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
666 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
667 case OP_NOT_HSPACE:
668 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
669 case OP_NOT_VSPACE:
670 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
671 default: break;
672 }
673 }
674 }
675 else
676 {
677 dlen = 0; /* Not strictly necessary, but compilers moan */
678 d = NOTACHAR; /* if these variables are not set. */
679 }
680
681
682 /* Now process the individual opcodes */
683
684 switch (codevalue)
685 {
686 /* ========================================================================== */
687 /* These cases are never obeyed. This is a fudge that causes a compile-
688 time error if the vectors coptable or poptable, which are indexed by
689 opcode, are not the correct length. It seems to be the only way to do
690 such a check at compile time, as the sizeof() operator does not work
691 in the C preprocessor. */
692
693 case OP_TABLE_LENGTH:
694 case OP_TABLE_LENGTH +
695 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
696 (sizeof(poptable) == OP_TABLE_LENGTH)):
697 break;
698
699 /* ========================================================================== */
700 /* Reached a closing bracket. If not at the end of the pattern, carry
701 on with the next opcode. For repeating opcodes, also add the repeat
702 state. Note that KETRPOS will always be encountered at the end of the
703 subpattern, because the possessive subpattern repeats are always handled
704 using recursive calls. Thus, it never adds any new states.
705
706 At the end of the (sub)pattern, unless we have an empty string and
707 PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
708 start of the subject, save the match data, shifting up all previous
709 matches so we always have the longest first. */
710
711 case OP_KET:
712 case OP_KETRMIN:
713 case OP_KETRMAX:
714 case OP_KETRPOS:
715 if (code != end_code)
716 {
717 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
718 if (codevalue != OP_KET)
719 {
720 ADD_ACTIVE(state_offset - GET(code, 1), 0);
721 }
722 }
723 else
724 {
725 if (ptr > current_subject ||
726 ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
727 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
728 current_subject > start_subject + mb->start_offset)))
729 {
730 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
731 else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
732 match_count = 0;
733 count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
734 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(PCRE2_SIZE));
735 if (offsetcount >= 2)
736 {
737 offsets[0] = (int)(current_subject - start_subject);
738 offsets[1] = (int)(ptr - start_subject);
739 }
740 if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
741 }
742 }
743 break;
744
745 /* ========================================================================== */
746 /* These opcodes add to the current list of states without looking
747 at the current character. */
748
749 /*-----------------------------------------------------------------*/
750 case OP_ALT:
751 do { code += GET(code, 1); } while (*code == OP_ALT);
752 ADD_ACTIVE((int)(code - start_code), 0);
753 break;
754
755 /*-----------------------------------------------------------------*/
756 case OP_BRA:
757 case OP_SBRA:
758 do
759 {
760 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
761 code += GET(code, 1);
762 }
763 while (*code == OP_ALT);
764 break;
765
766 /*-----------------------------------------------------------------*/
767 case OP_CBRA:
768 case OP_SCBRA:
769 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
770 code += GET(code, 1);
771 while (*code == OP_ALT)
772 {
773 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
774 code += GET(code, 1);
775 }
776 break;
777
778 /*-----------------------------------------------------------------*/
779 case OP_BRAZERO:
780 case OP_BRAMINZERO:
781 ADD_ACTIVE(state_offset + 1, 0);
782 code += 1 + GET(code, 2);
783 while (*code == OP_ALT) code += GET(code, 1);
784 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
785 break;
786
787 /*-----------------------------------------------------------------*/
788 case OP_SKIPZERO:
789 code += 1 + GET(code, 2);
790 while (*code == OP_ALT) code += GET(code, 1);
791 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
792 break;
793
794 /*-----------------------------------------------------------------*/
795 case OP_CIRC:
796 if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
797 { ADD_ACTIVE(state_offset + 1, 0); }
798 break;
799
800 /*-----------------------------------------------------------------*/
801 case OP_CIRCM:
802 if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
803 (ptr != end_subject && WAS_NEWLINE(ptr)))
804 { ADD_ACTIVE(state_offset + 1, 0); }
805 break;
806
807 /*-----------------------------------------------------------------*/
808 case OP_EOD:
809 if (ptr >= end_subject)
810 {
811 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
812 could_continue = TRUE;
813 else { ADD_ACTIVE(state_offset + 1, 0); }
814 }
815 break;
816
817 /*-----------------------------------------------------------------*/
818 case OP_SOD:
819 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
820 break;
821
822 /*-----------------------------------------------------------------*/
823 case OP_SOM:
824 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
825 break;
826
827
828 /* ========================================================================== */
829 /* These opcodes inspect the next subject character, and sometimes
830 the previous one as well, but do not have an argument. The variable
831 clen contains the length of the current character and is zero if we are
832 at the end of the subject. */
833
834 /*-----------------------------------------------------------------*/
835 case OP_ANY:
836 if (clen > 0 && !IS_NEWLINE(ptr))
837 {
838 if (ptr + 1 >= mb->end_subject &&
839 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
840 NLBLOCK->nltype == NLTYPE_FIXED &&
841 NLBLOCK->nllen == 2 &&
842 c == NLBLOCK->nl[0])
843 {
844 could_continue = partial_newline = TRUE;
845 }
846 else
847 {
848 ADD_NEW(state_offset + 1, 0);
849 }
850 }
851 break;
852
853 /*-----------------------------------------------------------------*/
854 case OP_ALLANY:
855 if (clen > 0)
856 { ADD_NEW(state_offset + 1, 0); }
857 break;
858
859 /*-----------------------------------------------------------------*/
860 case OP_EODN:
861 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
862 could_continue = TRUE;
863 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
864 { ADD_ACTIVE(state_offset + 1, 0); }
865 break;
866
867 /*-----------------------------------------------------------------*/
868 case OP_DOLL:
869 if ((mb->moptions & PCRE2_NOTEOL) == 0)
870 {
871 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
872 could_continue = TRUE;
873 else if (clen == 0 ||
874 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
875 (ptr == end_subject - mb->nllen)
876 ))
877 { ADD_ACTIVE(state_offset + 1, 0); }
878 else if (ptr + 1 >= mb->end_subject &&
879 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
880 NLBLOCK->nltype == NLTYPE_FIXED &&
881 NLBLOCK->nllen == 2 &&
882 c == NLBLOCK->nl[0])
883 {
884 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
885 {
886 reset_could_continue = TRUE;
887 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
888 }
889 else could_continue = partial_newline = TRUE;
890 }
891 }
892 break;
893
894 /*-----------------------------------------------------------------*/
/* OP_DOLLM: same shape as OP_DOLL, but the newline alternative does not
require the newline to end at end_subject, and when PCRE2_NOTEOL is set the
state is still added if ptr is at a newline (final else-if below). */
895 case OP_DOLLM:
896 if ((mb->moptions & PCRE2_NOTEOL) == 0)
897 {
898 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
899 could_continue = TRUE;
900 else if (clen == 0 ||
901 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
902 { ADD_ACTIVE(state_offset + 1, 0); }
/* Partial matching: last code unit equals the first unit of a fixed
two-unit newline. */
903 else if (ptr + 1 >= mb->end_subject &&
904 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
905 NLBLOCK->nltype == NLTYPE_FIXED &&
906 NLBLOCK->nllen == 2 &&
907 c == NLBLOCK->nl[0])
908 {
909 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
910 {
911 reset_could_continue = TRUE;
912 ADD_NEW_DATA(-(state_offset + 1), 0, 1); /* negated offset, data = 1 */
913 }
914 else could_continue = partial_newline = TRUE; /* soft partial only */
915 }
916 }
917 else if (IS_NEWLINE(ptr)) /* PCRE2_NOTEOL set: only a newline qualifies */
918 { ADD_ACTIVE(state_offset + 1, 0); }
919 break;
920
921 /*-----------------------------------------------------------------*/
922
/* Positive character types. Only characters below 256 can match; the test
masks ctypes[c] with toptable1[codevalue] and XORs with toptable2[codevalue],
accepting when the result is non-zero. */
923 case OP_DIGIT:
924 case OP_WHITESPACE:
925 case OP_WORDCHAR:
926 if (clen > 0 && c < 256 &&
927 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
928 { ADD_NEW(state_offset + 1, 0); }
929 break;
930
931 /*-----------------------------------------------------------------*/
/* Negated character types. Any character >= 256 is accepted outright; lower
values use the same ctypes/toptable1/toptable2 test as the positive types. */
932 case OP_NOT_DIGIT:
933 case OP_NOT_WHITESPACE:
934 case OP_NOT_WORDCHAR:
935 if (clen > 0 && (c >= 256 ||
936 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
937 { ADD_NEW(state_offset + 1, 0); }
938 break;
939
940 /*-----------------------------------------------------------------*/
/* Word-boundary assertions. left_word describes the character before ptr
(FALSE at start_subject) and right_word the current character (FALSE when
there is none). With PCRE2_UCP a word character is '_' or Unicode category
L or N; otherwise it is a character below 256 with ctype_word set. The state
is added when "both sides equal" agrees with the opcode being the NOT form. */
941 case OP_WORD_BOUNDARY:
942 case OP_NOT_WORD_BOUNDARY:
943 {
944 int left_word, right_word;
945
946 if (ptr > start_subject)
947 {
948 PCRE2_SPTR temp = ptr - 1;
/* Record the lowest address inspected. */
949 if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
950 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
951 if (utf) { BACKCHAR(temp); }
952 #endif
953 GETCHARTEST(d, temp);
954 #ifdef SUPPORT_UNICODE
955 if ((mb->poptions & PCRE2_UCP) != 0)
956 {
957 if (d == '_') left_word = TRUE; else
958 {
959 int cat = UCD_CATEGORY(d);
960 left_word = (cat == ucp_L || cat == ucp_N);
961 }
962 }
963 else
964 #endif
965 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
966 }
967 else left_word = FALSE;
968
969 if (clen > 0)
970 {
/* Record the highest address inspected: one character past ptr. */
971 if (ptr >= mb->last_used_ptr)
972 {
973 PCRE2_SPTR temp = ptr + 1;
974 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
975 if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
976 #endif
977 mb->last_used_ptr = temp;
978 }
979 #ifdef SUPPORT_UNICODE
980 if ((mb->poptions & PCRE2_UCP) != 0)
981 {
982 if (c == '_') right_word = TRUE; else
983 {
984 int cat = UCD_CATEGORY(c);
985 right_word = (cat == ucp_L || cat == ucp_N);
986 }
987 }
988 else
989 #endif
990 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
991 }
992 else right_word = FALSE;
993
/* Equal sides means "not a boundary"; that must match the opcode's sense. */
994 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
995 { ADD_ACTIVE(state_offset + 1, 0); }
996 }
997 break;
998
999
1000 /*-----------------------------------------------------------------*/
1001 /* Check the next character by Unicode property. We will get here only
1002 if the support is in the binary; otherwise a compile-time error occurs.
1003 */
1004
1005 #ifdef SUPPORT_UNICODE
/* OP_PROP / OP_NOTPROP: code[1] selects the kind of property test and code[2]
is its operand where one is needed. OK records whether the current character
has the property; the state three units on is added when OK agrees with the
opcode being the positive form. */
1006 case OP_PROP:
1007 case OP_NOTPROP:
1008 if (clen > 0)
1009 {
1010 BOOL OK;
1011 const uint32_t *cp;
1012 const ucd_record * prop = GET_UCD(c);
1013 switch(code[1])
1014 {
1015 case PT_ANY:
1016 OK = TRUE;
1017 break;
1018
1019 case PT_LAMP:
1020 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1021 prop->chartype == ucp_Lt;
1022 break;
1023
1024 case PT_GC:
1025 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1026 break;
1027
1028 case PT_PC:
1029 OK = prop->chartype == code[2];
1030 break;
1031
1032 case PT_SC:
1033 OK = prop->script == code[2];
1034 break;
1035
1036 /* These are specials for combination cases. */
1037
1038 case PT_ALNUM:
1039 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1040 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1041 break;
1042
1043 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1044 which means that Perl space and POSIX space are now identical. PCRE
1045 was changed at release 8.34. */
1046
1047 case PT_SPACE: /* Perl space */
1048 case PT_PXSPACE: /* POSIX space */
1049 switch(c)
1050 {
1051 HSPACE_CASES:
1052 VSPACE_CASES:
1053 OK = TRUE;
1054 break;
1055
1056 default:
1057 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1058 break;
1059 }
1060 break;
1061
1062 case PT_WORD:
1063 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1064 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1065 c == CHAR_UNDERSCORE;
1066 break;
1067
/* Scan the list starting at ucd_caseless_sets + code[2]; the loop stops
at the first entry greater than c, so the list is presumably ascending
and terminated by a value larger than any character. */
1068 case PT_CLIST:
1069 cp = PRIV(ucd_caseless_sets) + code[2];
1070 for (;;)
1071 {
1072 if (c < *cp) { OK = FALSE; break; }
1073 if (c == *cp++) { OK = TRUE; break; }
1074 }
1075 break;
1076
1077 case PT_UCNC:
1078 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1079 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1080 c >= 0xe000;
1081 break;
1082
1083 /* Should never occur, but keep compilers from grumbling. */
1084
/* The value chosen makes the final comparison below false for both
opcodes, so an unknown property type never matches. */
1085 default:
1086 OK = codevalue != OP_PROP;
1087 break;
1088 }
1089
1090 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1091 }
1092 break;
1093 #endif
1094
1095
1096
1097 /* ========================================================================== */
1098 /* These opcodes likewise inspect the subject character, but have an
1099 argument that is not a data character. It is one of these opcodes:
1100 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1101 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1102
/* One-or-more repeat of a character type held in d. current_state->count is
the number of repeats already matched; once it is non-zero the state after
the item (state_offset + 2) is added via ADD_ACTIVE. A matching character
re-queues this same state with count + 1. */
1103 case OP_TYPEPLUS:
1104 case OP_TYPEMINPLUS:
1105 case OP_TYPEPOSPLUS:
1106 count = current_state->count; /* Already matched */
1107 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1108 if (clen > 0)
1109 {
/* OP_ANY under hard partial matching: the last code unit equals the
first unit of a fixed two-unit newline. */
1110 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1111 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1112 NLBLOCK->nltype == NLTYPE_FIXED &&
1113 NLBLOCK->nllen == 2 &&
1114 c == NLBLOCK->nl[0])
1115 {
1116 could_continue = partial_newline = TRUE;
1117 }
/* c >= 256 matches every type except the three positive ones; c < 256
uses the ctypes tables, and OP_ANY is refused at a newline. */
1118 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1119 (c < 256 &&
1120 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1121 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1122 {
/* Possessive form: drop the entry added by ADD_ACTIVE above (it is only
added when count > 0, hence the same guard). */
1123 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1124 {
1125 active_count--; /* Remove non-match possibility */
1126 next_active_state--;
1127 }
1128 count++;
1129 ADD_NEW(state_offset, count);
1130 }
1131 }
1132 break;
1133
1134 /*-----------------------------------------------------------------*/
/* Zero-or-one repeat of a character type held in d. The state after the item
(state_offset + 2) is always added via ADD_ACTIVE for the zero-match path; a
matching character queues that same offset via ADD_NEW. */
1135 case OP_TYPEQUERY:
1136 case OP_TYPEMINQUERY:
1137 case OP_TYPEPOSQUERY:
1138 ADD_ACTIVE(state_offset + 2, 0);
1139 if (clen > 0)
1140 {
/* OP_ANY under hard partial matching: possible incomplete two-unit
newline at the end of the subject. */
1141 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1142 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1143 NLBLOCK->nltype == NLTYPE_FIXED &&
1144 NLBLOCK->nllen == 2 &&
1145 c == NLBLOCK->nl[0])
1146 {
1147 could_continue = partial_newline = TRUE;
1148 }
1149 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1150 (c < 256 &&
1151 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1152 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1153 {
/* Possessive form: drop the entry added by ADD_ACTIVE above. */
1154 if (codevalue == OP_TYPEPOSQUERY)
1155 {
1156 active_count--; /* Remove non-match possibility */
1157 next_active_state--;
1158 }
1159 ADD_NEW(state_offset + 2, 0);
1160 }
1161 }
1162 break;
1163
1164 /*-----------------------------------------------------------------*/
/* Zero-or-more repeat of a character type held in d. The state after the item
(state_offset + 2) is always added via ADD_ACTIVE; a matching character
re-queues this same state (state_offset) with a zero count. */
1165 case OP_TYPESTAR:
1166 case OP_TYPEMINSTAR:
1167 case OP_TYPEPOSSTAR:
1168 ADD_ACTIVE(state_offset + 2, 0);
1169 if (clen > 0)
1170 {
/* OP_ANY under hard partial matching: possible incomplete two-unit
newline at the end of the subject. */
1171 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1172 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1173 NLBLOCK->nltype == NLTYPE_FIXED &&
1174 NLBLOCK->nllen == 2 &&
1175 c == NLBLOCK->nl[0])
1176 {
1177 could_continue = partial_newline = TRUE;
1178 }
1179 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1180 (c < 256 &&
1181 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1182 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1183 {
/* Possessive form: drop the entry added by ADD_ACTIVE above. */
1184 if (codevalue == OP_TYPEPOSSTAR)
1185 {
1186 active_count--; /* Remove non-match possibility */
1187 next_active_state--;
1188 }
1189 ADD_NEW(state_offset, 0);
1190 }
1191 }
1192 break;
1193
1194 /*-----------------------------------------------------------------*/
/* Exact-count repeat of a character type held in d. The required count is
read with GET2(code, 1). Each matching character increments count; on reaching
the required value the state past the item is queued, otherwise this state is
re-queued with the new count. No zero-match path exists, so nothing is added
via ADD_ACTIVE. */
1195 case OP_TYPEEXACT:
1196 count = current_state->count; /* Number already matched */
1197 if (clen > 0)
1198 {
/* OP_ANY under hard partial matching: possible incomplete two-unit
newline at the end of the subject. */
1199 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1200 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1201 NLBLOCK->nltype == NLTYPE_FIXED &&
1202 NLBLOCK->nllen == 2 &&
1203 c == NLBLOCK->nl[0])
1204 {
1205 could_continue = partial_newline = TRUE;
1206 }
1207 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1208 (c < 256 &&
1209 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1210 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1211 {
1212 if (++count >= (int)GET2(code, 1))
1213 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1214 else
1215 { ADD_NEW(state_offset, count); }
1216 }
1217 }
1218 break;
1219
1220 /*-----------------------------------------------------------------*/
/* Bounded repeat (zero up to GET2(code, 1)) of a character type held in d.
The state past the item (state_offset + 2 + IMM2_SIZE) is always added via
ADD_ACTIVE; a matching character either queues that same offset once the
limit is reached or re-queues this state with the incremented count. */
1221 case OP_TYPEUPTO:
1222 case OP_TYPEMINUPTO:
1223 case OP_TYPEPOSUPTO:
1224 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1225 count = current_state->count; /* Number already matched */
1226 if (clen > 0)
1227 {
/* OP_ANY under hard partial matching: possible incomplete two-unit
newline at the end of the subject. */
1228 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1229 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1230 NLBLOCK->nltype == NLTYPE_FIXED &&
1231 NLBLOCK->nllen == 2 &&
1232 c == NLBLOCK->nl[0])
1233 {
1234 could_continue = partial_newline = TRUE;
1235 }
1236 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1237 (c < 256 &&
1238 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1239 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1240 {
/* Possessive form: drop the entry added by ADD_ACTIVE above. */
1241 if (codevalue == OP_TYPEPOSUPTO)
1242 {
1243 active_count--; /* Remove non-match possibility */
1244 next_active_state--;
1245 }
1246 if (++count >= (int)GET2(code, 1))
1247 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1248 else
1249 { ADD_NEW(state_offset, count); }
1250 }
1251 }
1252 break;
1253
1254 /* ========================================================================== */
1255 /* These are virtual opcodes that are used when something like
1256 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1257 argument. It keeps the code above fast for the other cases. The argument
1258 is in the d variable. */
1259
1260 #ifdef SUPPORT_UNICODE
/* One-or-more repeat of a Unicode property item. d holds OP_PROP or
OP_NOTPROP, code[2] the kind of property test and code[3] its operand.
current_state->count is the number of repeats already matched; once it is
non-zero the state past the item (state_offset + 4) is added via ADD_ACTIVE.
A character that satisfies the test re-queues this state with count + 1. */
1261 case OP_PROP_EXTRA + OP_TYPEPLUS:
1262 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1263 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1264 count = current_state->count; /* Already matched */
1265 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1266 if (clen > 0)
1267 {
1268 BOOL OK;
1269 const uint32_t *cp;
1270 const ucd_record * prop = GET_UCD(c);
1271 switch(code[2])
1272 {
1273 case PT_ANY:
1274 OK = TRUE;
1275 break;
1276
1277 case PT_LAMP:
1278 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1279 prop->chartype == ucp_Lt;
1280 break;
1281
1282 case PT_GC:
1283 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1284 break;
1285
1286 case PT_PC:
1287 OK = prop->chartype == code[3];
1288 break;
1289
1290 case PT_SC:
1291 OK = prop->script == code[3];
1292 break;
1293
1294 /* These are specials for combination cases. */
1295
1296 case PT_ALNUM:
1297 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1298 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1299 break;
1300
1301 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1302 which means that Perl space and POSIX space are now identical. PCRE
1303 was changed at release 8.34. */
1304
1305 case PT_SPACE: /* Perl space */
1306 case PT_PXSPACE: /* POSIX space */
1307 switch(c)
1308 {
1309 HSPACE_CASES:
1310 VSPACE_CASES:
1311 OK = TRUE;
1312 break;
1313
1314 default:
1315 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1316 break;
1317 }
1318 break;
1319
1320 case PT_WORD:
1321 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1322 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1323 c == CHAR_UNDERSCORE;
1324 break;
1325
/* Scan the list starting at ucd_caseless_sets + code[3]; the loop stops
at the first entry greater than c. */
1326 case PT_CLIST:
1327 cp = PRIV(ucd_caseless_sets) + code[3];
1328 for (;;)
1329 {
1330 if (c < *cp) { OK = FALSE; break; }
1331 if (c == *cp++) { OK = TRUE; break; }
1332 }
1333 break;
1334
1335 case PT_UCNC:
1336 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1337 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1338 c >= 0xe000;
1339 break;
1340
1341 /* Should never occur, but keep compilers from grumbling. */
1342
/* The result is compared with (d == OP_PROP) below, so the "never match"
value must be derived from d. codevalue is OP_PROP_EXTRA + ... in this
case and can never equal OP_PROP, which would make OK always TRUE. */
1343 default:
1344 OK = d != OP_PROP;
1345 break;
1346 }
1347
1348 if (OK == (d == OP_PROP))
1349 {
/* Possessive form: drop the entry added by ADD_ACTIVE above (it is only
added when count > 0, hence the same guard). */
1350 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1351 {
1352 active_count--; /* Remove non-match possibility */
1353 next_active_state--;
1354 }
1355 count++;
1356 ADD_NEW(state_offset, count);
1357 }
1358 }
1359 break;
1360
1361 /*-----------------------------------------------------------------*/
/* One-or-more repeat of an extended grapheme cluster. Starting after the
current character, following characters are consumed while the
ucp_gbtable entry for the left character's break property has the bit for
the right character's break property set. ncount is the number of extra
characters absorbed; it is passed as the data of a negated-offset state so
that those characters are skipped before matching continues. */
1362 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1363 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1364 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1365 count = current_state->count; /* Already matched */
1366 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1367 if (clen > 0)
1368 {
1369 int lgb, rgb;
1370 PCRE2_SPTR nptr = ptr + clen;
1371 int ncount = 0;
/* Any character starts a cluster, so the possessive undo is done
unconditionally once a character exists. */
1372 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1373 {
1374 active_count--; /* Remove non-match possibility */
1375 next_active_state--;
1376 }
1377 lgb = UCD_GRAPHBREAK(c);
1378 while (nptr < end_subject)
1379 {
1380 dlen = 1;
/* d is reused here as the next subject character. */
1381 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1382 rgb = UCD_GRAPHBREAK(d);
1383 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1384 ncount++;
1385 lgb = rgb;
1386 nptr += dlen;
1387 }
1388 count++;
1389 ADD_NEW_DATA(-state_offset, count, ncount);
1390 }
1391 break;
1392 #endif
1393
1394 /*-----------------------------------------------------------------*/
/* One-or-more repeat of a generic newline. VT, FF, NEL and (non-EBCDIC)
U+2028/U+2029 are rejected when mb->bsr_convention is PCRE2_BSR_ANYCRLF. CR
immediately followed by LF sets ncount = 1 so the pair is consumed as one
newline via the data field of a negated-offset state. */
1395 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1396 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1397 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1398 count = current_state->count; /* Already matched */
1399 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1400 if (clen > 0)
1401 {
1402 int ncount = 0;
1403 switch (c)
1404 {
1405 case CHAR_VT:
1406 case CHAR_FF:
1407 case CHAR_NEL:
1408 #ifndef EBCDIC
1409 case 0x2028:
1410 case 0x2029:
1411 #endif /* Not EBCDIC */
1412 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1413 goto ANYNL01;
1414
1415 case CHAR_CR:
1416 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1417 /* Fall through */
1418
1419 ANYNL01:
1420 case CHAR_LF:
/* Possessive form: drop the entry added by ADD_ACTIVE above. */
1421 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1422 {
1423 active_count--; /* Remove non-match possibility */
1424 next_active_state--;
1425 }
1426 count++;
1427 ADD_NEW_DATA(-state_offset, count, ncount);
1428 break;
1429
1430 default:
1431 break;
1432 }
1433 }
1434 break;
1435
1436 /*-----------------------------------------------------------------*/
/* One-or-more repeat of a vertical-space item. OK is TRUE when c is one of
VSPACE_CASES; the item matches when OK agrees with d being OP_VSPACE (so the
other opcode routed here matches when c is not vertical space). */
1437 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1438 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1439 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1440 count = current_state->count; /* Already matched */
1441 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1442 if (clen > 0)
1443 {
1444 BOOL OK;
1445 switch (c)
1446 {
1447 VSPACE_CASES:
1448 OK = TRUE;
1449 break;
1450
1451 default:
1452 OK = FALSE;
1453 break;
1454 }
1455
1456 if (OK == (d == OP_VSPACE))
1457 {
/* Possessive form: drop the entry added by ADD_ACTIVE above. */
1458 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1459 {
1460 active_count--; /* Remove non-match possibility */
1461 next_active_state--;
1462 }
1463 count++;
1464 ADD_NEW_DATA(-state_offset, count, 0);
1465 }
1466 }
1467 break;
1468
1469 /*-----------------------------------------------------------------*/
/* One-or-more repeat of a horizontal-space item. OK is TRUE when c is one of
HSPACE_CASES; the item matches when OK agrees with d being OP_HSPACE (so the
other opcode routed here matches when c is not horizontal space). */
1470 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1471 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1472 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1473 count = current_state->count; /* Already matched */
1474 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1475 if (clen > 0)
1476 {
1477 BOOL OK;
1478 switch (c)
1479 {
1480 HSPACE_CASES:
1481 OK = TRUE;
1482 break;
1483
1484 default:
1485 OK = FALSE;
1486 break;
1487 }
1488
1489 if (OK == (d == OP_HSPACE))
1490 {
/* Possessive form: drop the entry added by ADD_ACTIVE above. */
1491 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1492 {
1493 active_count--; /* Remove non-match possibility */
1494 next_active_state--;
1495 }
1496 count++;
1497 ADD_NEW_DATA(-state_offset, count, 0);
1498 }
1499 }
1500 break;
1501
1502 /*-----------------------------------------------------------------*/
1503 #ifdef SUPPORT_UNICODE
/* Zero-or-one and zero-or-more repeats of a Unicode property item share one
body. Here count is not a repeat counter: it is the offset added to
state_offset when a character matches - 4 for the QUERY forms (move past the
item) and 0 for the STAR forms (stay on this state). d holds OP_PROP or
OP_NOTPROP, code[2] the kind of property test and code[3] its operand. */
1504 case OP_PROP_EXTRA + OP_TYPEQUERY:
1505 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1506 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1507 count = 4;
1508 goto QS1;
1509
1510 case OP_PROP_EXTRA + OP_TYPESTAR:
1511 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1512 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1513 count = 0;
1514
1515 QS1:
1516
1517 ADD_ACTIVE(state_offset + 4, 0);
1518 if (clen > 0)
1519 {
1520 BOOL OK;
1521 const uint32_t *cp;
1522 const ucd_record * prop = GET_UCD(c);
1523 switch(code[2])
1524 {
1525 case PT_ANY:
1526 OK = TRUE;
1527 break;
1528
1529 case PT_LAMP:
1530 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1531 prop->chartype == ucp_Lt;
1532 break;
1533
1534 case PT_GC:
1535 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1536 break;
1537
1538 case PT_PC:
1539 OK = prop->chartype == code[3];
1540 break;
1541
1542 case PT_SC:
1543 OK = prop->script == code[3];
1544 break;
1545
1546 /* These are specials for combination cases. */
1547
1548 case PT_ALNUM:
1549 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1550 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1551 break;
1552
1553 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1554 which means that Perl space and POSIX space are now identical. PCRE
1555 was changed at release 8.34. */
1556
1557 case PT_SPACE: /* Perl space */
1558 case PT_PXSPACE: /* POSIX space */
1559 switch(c)
1560 {
1561 HSPACE_CASES:
1562 VSPACE_CASES:
1563 OK = TRUE;
1564 break;
1565
1566 default:
1567 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1568 break;
1569 }
1570 break;
1571
1572 case PT_WORD:
1573 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1574 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1575 c == CHAR_UNDERSCORE;
1576 break;
1577
/* Scan the list starting at ucd_caseless_sets + code[3]; the loop stops
at the first entry greater than c. */
1578 case PT_CLIST:
1579 cp = PRIV(ucd_caseless_sets) + code[3];
1580 for (;;)
1581 {
1582 if (c < *cp) { OK = FALSE; break; }
1583 if (c == *cp++) { OK = TRUE; break; }
1584 }
1585 break;
1586
1587 case PT_UCNC:
1588 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1589 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1590 c >= 0xe000;
1591 break;
1592
1593 /* Should never occur, but keep compilers from grumbling. */
1594
/* The result is compared with (d == OP_PROP) below, so the "never match"
value must be derived from d. codevalue is OP_PROP_EXTRA + ... in this
case and can never equal OP_PROP, which would make OK always TRUE. */
1595 default:
1596 OK = d != OP_PROP;
1597 break;
1598 }
1599
1600 if (OK == (d == OP_PROP))
1601 {
/* Possessive forms: drop the entry added by ADD_ACTIVE above. */
1602 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1603 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1604 {
1605 active_count--; /* Remove non-match possibility */
1606 next_active_state--;
1607 }
1608 ADD_NEW(state_offset + count, 0);
1609 }
1610 }
1611 break;
1612
1613 /*-----------------------------------------------------------------*/
/* Zero-or-one and zero-or-more repeats of an extended grapheme cluster share
one body. count is the offset added to state_offset on a match: 2 for the
QUERY forms, 0 for the STAR forms. ncount is the number of extra characters
absorbed into the cluster and is passed as the data of a negated-offset
state. */
1614 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1615 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1616 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1617 count = 2;
1618 goto QS2;
1619
1620 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1621 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1622 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1623 count = 0;
1624
1625 QS2:
1626
1627 ADD_ACTIVE(state_offset + 2, 0);
1628 if (clen > 0)
1629 {
1630 int lgb, rgb;
1631 PCRE2_SPTR nptr = ptr + clen;
1632 int ncount = 0;
/* Any character starts a cluster, so the possessive undo needs no
further test. */
1633 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1634 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1635 {
1636 active_count--; /* Remove non-match possibility */
1637 next_active_state--;
1638 }
1639 lgb = UCD_GRAPHBREAK(c);
1640 while (nptr < end_subject)
1641 {
1642 dlen = 1;
/* d is reused here as the next subject character. */
1643 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1644 rgb = UCD_GRAPHBREAK(d);
1645 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1646 ncount++;
1647 lgb = rgb;
1648 nptr += dlen;
1649 }
1650 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1651 }
1652 break;
1653 #endif
1654
1655 /*-----------------------------------------------------------------*/
/* Zero-or-one and zero-or-more repeats of a generic newline share one body.
count is the offset added to state_offset on a match: 2 for the QUERY forms,
0 for the STAR forms. CR immediately followed by LF sets ncount = 1 so the
pair is consumed as a single newline. */
1656 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1657 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1658 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1659 count = 2;
1660 goto QS3;
1661
1662 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1663 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1664 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1665 count = 0;
1666
1667 QS3:
1668 ADD_ACTIVE(state_offset + 2, 0);
1669 if (clen > 0)
1670 {
1671 int ncount = 0;
1672 switch (c)
1673 {
1674 case CHAR_VT:
1675 case CHAR_FF:
1676 case CHAR_NEL:
1677 #ifndef EBCDIC
1678 case 0x2028:
1679 case 0x2029:
1680 #endif /* Not EBCDIC */
/* These are not newlines under the ANYCRLF convention. */
1681 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1682 goto ANYNL02;
1683
1684 case CHAR_CR:
1685 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1686 /* Fall through */
1687
1688 ANYNL02:
1689 case CHAR_LF:
/* Possessive forms: drop the entry added by ADD_ACTIVE above. */
1690 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1691 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1692 {
1693 active_count--; /* Remove non-match possibility */
1694 next_active_state--;
1695 }
1696 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1697 break;
1698
1699 default:
1700 break;
1701 }
1702 }
1703 break;
1704
1705 /*-----------------------------------------------------------------*/
/* Zero-or-one and zero-or-more repeats of a vertical-space item share one
body. count is the offset added to state_offset on a match: 2 for the QUERY
forms, 0 for the STAR forms. The item matches when "c is in VSPACE_CASES"
agrees with d being OP_VSPACE. */
1706 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1707 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1708 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1709 count = 2;
1710 goto QS4;
1711
1712 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1713 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1714 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1715 count = 0;
1716
1717 QS4:
1718 ADD_ACTIVE(state_offset + 2, 0);
1719 if (clen > 0)
1720 {
1721 BOOL OK;
1722 switch (c)
1723 {
1724 VSPACE_CASES:
1725 OK = TRUE;
1726 break;
1727
1728 default:
1729 OK = FALSE;
1730 break;
1731 }
1732 if (OK == (d == OP_VSPACE))
1733 {
/* Possessive forms: drop the entry added by ADD_ACTIVE above. */
1734 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1735 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1736 {
1737 active_count--; /* Remove non-match possibility */
1738 next_active_state--;
1739 }
1740 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1741 }
1742 }
1743 break;
1744
1745 /*-----------------------------------------------------------------*/
/* Zero-or-one and zero-or-more repeats of a horizontal-space item share one
body. count is the offset added to state_offset on a match: 2 for the QUERY
forms, 0 for the STAR forms. The item matches when "c is in HSPACE_CASES"
agrees with d being OP_HSPACE. */
1746 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1747 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1748 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1749 count = 2;
1750 goto QS5;
1751
1752 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1753 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1754 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1755 count = 0;
1756
1757 QS5:
1758 ADD_ACTIVE(state_offset + 2, 0);
1759 if (clen > 0)
1760 {
1761 BOOL OK;
1762 switch (c)
1763 {
1764 HSPACE_CASES:
1765 OK = TRUE;
1766 break;
1767
1768 default:
1769 OK = FALSE;
1770 break;
1771 }
1772
1773 if (OK == (d == OP_HSPACE))
1774 {
/* Possessive forms: drop the entry added by ADD_ACTIVE above. */
1775 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1776 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1777 {
1778 active_count--; /* Remove non-match possibility */
1779 next_active_state--;
1780 }
1781 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1782 }
1783 }
1784 break;
1785
1786 /*-----------------------------------------------------------------*/
1787 #ifdef SUPPORT_UNICODE
/* Counted repeats (exact and up-to) of a Unicode property item. The limit is
read with GET2(code, 1); the kind of property test is at
code[1 + IMM2_SIZE + 1] and its operand at code[1 + IMM2_SIZE + 2]. All forms
except EXACT first add the state past the item via ADD_ACTIVE (zero further
repeats). A matching character queues the state past the item once the limit
is reached, otherwise re-queues this state with the incremented count. */
1788 case OP_PROP_EXTRA + OP_TYPEEXACT:
1789 case OP_PROP_EXTRA + OP_TYPEUPTO:
1790 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1791 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1792 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1793 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1794 count = current_state->count; /* Number already matched */
1795 if (clen > 0)
1796 {
1797 BOOL OK;
1798 const uint32_t *cp;
1799 const ucd_record * prop = GET_UCD(c);
1800 switch(code[1 + IMM2_SIZE + 1])
1801 {
1802 case PT_ANY:
1803 OK = TRUE;
1804 break;
1805
1806 case PT_LAMP:
1807 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1808 prop->chartype == ucp_Lt;
1809 break;
1810
1811 case PT_GC:
1812 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1813 break;
1814
1815 case PT_PC:
1816 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1817 break;
1818
1819 case PT_SC:
1820 OK = prop->script == code[1 + IMM2_SIZE + 2];
1821 break;
1822
1823 /* These are specials for combination cases. */
1824
1825 case PT_ALNUM:
1826 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1827 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1828 break;
1829
1830 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1831 which means that Perl space and POSIX space are now identical. PCRE
1832 was changed at release 8.34. */
1833
1834 case PT_SPACE: /* Perl space */
1835 case PT_PXSPACE: /* POSIX space */
1836 switch(c)
1837 {
1838 HSPACE_CASES:
1839 VSPACE_CASES:
1840 OK = TRUE;
1841 break;
1842
1843 default:
1844 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1845 break;
1846 }
1847 break;
1848
1849 case PT_WORD:
1850 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1851 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1852 c == CHAR_UNDERSCORE;
1853 break;
1854
/* Scan the list starting at ucd_caseless_sets + operand; the loop stops
at the first entry greater than c. */
1855 case PT_CLIST:
1856 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1857 for (;;)
1858 {
1859 if (c < *cp) { OK = FALSE; break; }
1860 if (c == *cp++) { OK = TRUE; break; }
1861 }
1862 break;
1863
1864 case PT_UCNC:
1865 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1866 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1867 c >= 0xe000;
1868 break;
1869
1870 /* Should never occur, but keep compilers from grumbling. */
1871
/* The result is compared with (d == OP_PROP) below, so the "never match"
value must be derived from d. codevalue is OP_PROP_EXTRA + ... in this
case and can never equal OP_PROP, which would make OK always TRUE. */
1872 default:
1873 OK = d != OP_PROP;
1874 break;
1875 }
1876
1877 if (OK == (d == OP_PROP))
1878 {
/* Possessive form: drop the entry added by ADD_ACTIVE above. */
1879 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1880 {
1881 active_count--; /* Remove non-match possibility */
1882 next_active_state--;
1883 }
1884 if (++count >= (int)GET2(code, 1))
1885 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1886 else
1887 { ADD_NEW(state_offset, count); }
1888 }
1889 }
1890 break;
1891
1892 /*-----------------------------------------------------------------*/
/* Counted repeats (exact and up-to) of an extended grapheme cluster. All
forms except EXACT first add the state past the item via ADD_ACTIVE. ncount
is the number of extra characters absorbed into the cluster; it is passed as
the data of a negated-offset state. If the scan runs to end_subject under
PCRE2_PARTIAL_HARD, reset_could_continue is set. */
1893 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1894 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1895 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1896 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1897 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1898 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1899 count = current_state->count; /* Number already matched */
1900 if (clen > 0)
1901 {
1902 int lgb, rgb;
1903 PCRE2_SPTR nptr = ptr + clen;
1904 int ncount = 0;
1905 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1906 {
1907 active_count--; /* Remove non-match possibility */
1908 next_active_state--;
1909 }
1910 lgb = UCD_GRAPHBREAK(c);
1911 while (nptr < end_subject)
1912 {
1913 dlen = 1;
/* d is reused here as the next subject character. */
1914 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1915 rgb = UCD_GRAPHBREAK(d);
1916 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1917 ncount++;
1918 lgb = rgb;
1919 nptr += dlen;
1920 }
1921 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1922 reset_could_continue = TRUE;
/* Limit reached: continue past the item; otherwise repeat this state. */
1923 if (++count >= (int)GET2(code, 1))
1924 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1925 else
1926 { ADD_NEW_DATA(-state_offset, count, ncount); }
1927 }
1928 break;
1929 #endif
1930
1931 /*-----------------------------------------------------------------*/
/* Counted repeats (exact and up-to) of a generic newline. All forms except
EXACT first add the state past the item via ADD_ACTIVE. CR immediately
followed by LF sets ncount = 1 so the pair is consumed as a single newline.
The limit is read with GET2(code, 1). */
1932 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1933 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1934 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1935 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1936 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1937 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1938 count = current_state->count; /* Number already matched */
1939 if (clen > 0)
1940 {
1941 int ncount = 0;
1942 switch (c)
1943 {
1944 case CHAR_VT:
1945 case CHAR_FF:
1946 case CHAR_NEL:
1947 #ifndef EBCDIC
1948 case 0x2028:
1949 case 0x2029:
1950 #endif /* Not EBCDIC */
/* These are not newlines under the ANYCRLF convention. */
1951 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1952 goto ANYNL03;
1953
1954 case CHAR_CR:
1955 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1956 /* Fall through */
1957
1958 ANYNL03:
1959 case CHAR_LF:
/* Possessive form: drop the entry added by ADD_ACTIVE above. */
1960 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1961 {
1962 active_count--; /* Remove non-match possibility */
1963 next_active_state--;
1964 }
1965 if (++count >= (int)GET2(code, 1))
1966 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1967 else
1968 { ADD_NEW_DATA(-state_offset, count, ncount); }
1969 break;
1970
1971 default:
1972 break;
1973 }
1974 }
1975 break;
1976
1977 /*-----------------------------------------------------------------*/
/* Counted repeats (exact and up-to) of a vertical-space item. All forms
except EXACT first add the state past the item via ADD_ACTIVE. The item
matches when "c is in VSPACE_CASES" agrees with d being OP_VSPACE. The limit
is read with GET2(code, 1). */
1978 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1979 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1980 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1981 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1982 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1983 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1984 count = current_state->count; /* Number already matched */
1985 if (clen > 0)
1986 {
1987 BOOL OK;
1988 switch (c)
1989 {
1990 VSPACE_CASES:
1991 OK = TRUE;
1992 break;
1993
/* Last label in the switch, so the absent break is harmless (the sibling
HSPACE block writes it explicitly). */
1994 default:
1995 OK = FALSE;
1996 }
1997
1998 if (OK == (d == OP_VSPACE))
1999 {
/* Possessive form: drop the entry added by ADD_ACTIVE above. */
2000 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2001 {
2002 active_count--; /* Remove non-match possibility */
2003 next_active_state--;
2004 }
2005 if (++count >= (int)GET2(code, 1))
2006 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2007 else
2008 { ADD_NEW_DATA(-state_offset, count, 0); }
2009 }
2010 }
2011 break;
2012
2013 /*-----------------------------------------------------------------*/
/* Counted repeats (exact and up-to) of a horizontal-space item. All forms
except EXACT first add the state past the item via ADD_ACTIVE. The item
matches when "c is in HSPACE_CASES" agrees with d being OP_HSPACE. The limit
is read with GET2(code, 1). */
2014 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2015 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2016 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2017 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2018 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2019 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2020 count = current_state->count; /* Number already matched */
2021 if (clen > 0)
2022 {
2023 BOOL OK;
2024 switch (c)
2025 {
2026 HSPACE_CASES:
2027 OK = TRUE;
2028 break;
2029
2030 default:
2031 OK = FALSE;
2032 break;
2033 }
2034
2035 if (OK == (d == OP_HSPACE))
2036 {
/* Possessive form: drop the entry added by ADD_ACTIVE above. */
2037 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2038 {
2039 active_count--; /* Remove non-match possibility */
2040 next_active_state--;
2041 }
2042 if (++count >= (int)GET2(code, 1))
2043 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2044 else
2045 { ADD_NEW_DATA(-state_offset, count, 0); }
2046 }
2047 }
2048 break;
2049
2050 /* ========================================================================== */
2051 /* These opcodes are followed by a character that is usually compared
2052 to the current subject character; it is loaded into d. We still get
2053 here even if there is no subject character, because in some cases zero
2054 repetitions are permitted. */
2055
2056 /*-----------------------------------------------------------------*/
/* OP_CHAR: literal character. d is the pattern character and dlen its length
in code units; on an exact match the state just past the opcode and the
character (state_offset + dlen + 1) is queued. */
2057 case OP_CHAR:
2058 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2059 break;
2060
2061 /*-----------------------------------------------------------------*/
/* OP_CHARI: caseless literal character. In UTF mode the subject character c
matches if it equals d or if d equals the other case of c (fcc[] for c < 128,
UCD_OTHERCASE otherwise). Outside UTF mode both characters are mapped through
the lcc table with TABLE_GET and compared; the pattern character then
occupies a single code unit, hence the fixed state_offset + 2. */
2062 case OP_CHARI:
2063 if (clen == 0) break;
2064
2065 #ifdef SUPPORT_UNICODE
2066 if (utf)
2067 {
2068 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2069 {
2070 unsigned int othercase;
2071 if (c < 128)
2072 othercase = fcc[c];
2073 else
2074 othercase = UCD_OTHERCASE(c);
2075 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2076 }
2077 }
2078 else
2079 #endif /* SUPPORT_UNICODE */
2080 /* Not UTF mode */
2081 {
2082 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2083 { ADD_NEW(state_offset + 2, 0); }
2084 }
2085 break;
2086
2087
2088 #ifdef SUPPORT_UNICODE
2089 /*-----------------------------------------------------------------*/
2090 /* This is a tricky one because it can match more than one character.
2091 Find out how many characters to skip, and then set up a negative state
2092 to wait for them to pass before continuing. */
2093
/* OP_EXTUNI: a single extended grapheme cluster. Following characters are
absorbed while the ucp_gbtable entry for the left character's break property
has the bit for the right character's break property set. ncount (extra
characters absorbed) becomes the data of a negated-offset state. If the scan
reaches end_subject under PCRE2_PARTIAL_HARD, reset_could_continue is set. */
2094 case OP_EXTUNI:
2095 if (clen > 0)
2096 {
2097 int lgb, rgb;
2098 PCRE2_SPTR nptr = ptr + clen;
2099 int ncount = 0;
2100 lgb = UCD_GRAPHBREAK(c);
2101 while (nptr < end_subject)
2102 {
2103 dlen = 1;
/* d is reused here as the next subject character. */
2104 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2105 rgb = UCD_GRAPHBREAK(d);
2106 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2107 ncount++;
2108 lgb = rgb;
2109 nptr += dlen;
2110 }
2111 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2112 reset_could_continue = TRUE;
2113 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2114 }
2115 break;
2116 #endif
2117
2118 /*-----------------------------------------------------------------*/
2119 /* This is tricky like EXTUNI because it too can match more than one
2120 character (when CR is followed by LF). In this case, set up a negative
2121 state to wait for one character to pass before continuing. */
2122
2123 case OP_ANYNL:
2124 if (clen > 0) switch(c)
2125 {
2126 case CHAR_VT:
2127 case CHAR_FF:
2128 case CHAR_NEL:
2129 #ifndef EBCDIC
2130 case 0x2028:
2131 case 0x2029:
2132 #endif /* Not EBCDIC */
2133 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2134
2135 case CHAR_LF:
2136 ADD_NEW(state_offset + 1, 0);
2137 break;
2138
2139 case CHAR_CR:
2140 if (ptr + 1 >= end_subject)
2141 {
2142 ADD_NEW(state_offset + 1, 0);
2143 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2144 reset_could_continue = TRUE;
2145 }
2146 else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2147 {
2148 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2149 }
2150 else
2151 {
2152 ADD_NEW(state_offset + 1, 0);
2153 }
2154 break;
2155 }
2156 break;
2157
2158 /*-----------------------------------------------------------------*/
2159 case OP_NOT_VSPACE:
2160 if (clen > 0) switch(c)
2161 {
2162 VSPACE_CASES:
2163 break;
2164
2165 default:
2166 ADD_NEW(state_offset + 1, 0);
2167 break;
2168 }
2169 break;
2170
2171 /*-----------------------------------------------------------------*/
2172 case OP_VSPACE:
2173 if (clen > 0) switch(c)
2174 {
2175 VSPACE_CASES:
2176 ADD_NEW(state_offset + 1, 0);
2177 break;
2178
2179 default:
2180 break;
2181 }
2182 break;
2183
2184 /*-----------------------------------------------------------------*/
2185 case OP_NOT_HSPACE:
2186 if (clen > 0) switch(c)
2187 {
2188 HSPACE_CASES:
2189 break;
2190
2191 default:
2192 ADD_NEW(state_offset + 1, 0);
2193 break;
2194 }
2195 break;
2196
2197 /*-----------------------------------------------------------------*/
2198 case OP_HSPACE:
2199 if (clen > 0) switch(c)
2200 {
2201 HSPACE_CASES:
2202 ADD_NEW(state_offset + 1, 0);
2203 break;
2204
2205 default:
2206 break;
2207 }
2208 break;
2209
2210 /*-----------------------------------------------------------------*/
2211 /* Match a negated single character casefully. */
2212
2213 case OP_NOT:
2214 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2215 break;
2216
2217 /*-----------------------------------------------------------------*/
2218 /* Match a negated single character caselessly. */
2219
2220 case OP_NOTI:
2221 if (clen > 0)
2222 {
2223 unsigned int otherd;
2224 #ifdef SUPPORT_UNICODE
2225 if (utf && d >= 128)
2226 otherd = UCD_OTHERCASE(d);
2227 else
2228 #endif /* SUPPORT_UNICODE */
2229 otherd = TABLE_GET(d, fcc, d);
2230 if (c != d && c != otherd)
2231 { ADD_NEW(state_offset + dlen + 1, 0); }
2232 }
2233 break;
2234
2235 /*-----------------------------------------------------------------*/
2236 case OP_PLUSI:
2237 case OP_MINPLUSI:
2238 case OP_POSPLUSI:
2239 case OP_NOTPLUSI:
2240 case OP_NOTMINPLUSI:
2241 case OP_NOTPOSPLUSI:
2242 caseless = TRUE;
2243 codevalue -= OP_STARI - OP_STAR;
2244
2245 /* Fall through */
2246 case OP_PLUS:
2247 case OP_MINPLUS:
2248 case OP_POSPLUS:
2249 case OP_NOTPLUS:
2250 case OP_NOTMINPLUS:
2251 case OP_NOTPOSPLUS:
2252 count = current_state->count; /* Already matched */
2253 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2254 if (clen > 0)
2255 {
2256 uint32_t otherd = NOTACHAR;
2257 if (caseless)
2258 {
2259 #ifdef SUPPORT_UNICODE
2260 if (utf && d >= 128)
2261 otherd = UCD_OTHERCASE(d);
2262 else
2263 #endif /* SUPPORT_UNICODE */
2264 otherd = TABLE_GET(d, fcc, d);
2265 }
2266 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2267 {
2268 if (count > 0 &&
2269 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2270 {
2271 active_count--; /* Remove non-match possibility */
2272 next_active_state--;
2273 }
2274 count++;
2275 ADD_NEW(state_offset, count);
2276 }
2277 }
2278 break;
2279
2280 /*-----------------------------------------------------------------*/
2281 case OP_QUERYI:
2282 case OP_MINQUERYI:
2283 case OP_POSQUERYI:
2284 case OP_NOTQUERYI:
2285 case OP_NOTMINQUERYI:
2286 case OP_NOTPOSQUERYI:
2287 caseless = TRUE;
2288 codevalue -= OP_STARI - OP_STAR;
2289 /* Fall through */
2290 case OP_QUERY:
2291 case OP_MINQUERY:
2292 case OP_POSQUERY:
2293 case OP_NOTQUERY:
2294 case OP_NOTMINQUERY:
2295 case OP_NOTPOSQUERY:
2296 ADD_ACTIVE(state_offset + dlen + 1, 0);
2297 if (clen > 0)
2298 {
2299 uint32_t otherd = NOTACHAR;
2300 if (caseless)
2301 {
2302 #ifdef SUPPORT_UNICODE
2303 if (utf && d >= 128)
2304 otherd = UCD_OTHERCASE(d);
2305 else
2306 #endif /* SUPPORT_UNICODE */
2307 otherd = TABLE_GET(d, fcc, d);
2308 }
2309 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2310 {
2311 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2312 {
2313 active_count--; /* Remove non-match possibility */
2314 next_active_state--;
2315 }
2316 ADD_NEW(state_offset + dlen + 1, 0);
2317 }
2318 }
2319 break;
2320
2321 /*-----------------------------------------------------------------*/
2322 case OP_STARI:
2323 case OP_MINSTARI:
2324 case OP_POSSTARI:
2325 case OP_NOTSTARI:
2326 case OP_NOTMINSTARI:
2327 case OP_NOTPOSSTARI:
2328 caseless = TRUE;
2329 codevalue -= OP_STARI - OP_STAR;
2330 /* Fall through */
2331 case OP_STAR:
2332 case OP_MINSTAR:
2333 case OP_POSSTAR:
2334 case OP_NOTSTAR:
2335 case OP_NOTMINSTAR:
2336 case OP_NOTPOSSTAR:
2337 ADD_ACTIVE(state_offset + dlen + 1, 0);
2338 if (clen > 0)
2339 {
2340 uint32_t otherd = NOTACHAR;
2341 if (caseless)
2342 {
2343 #ifdef SUPPORT_UNICODE
2344 if (utf && d >= 128)
2345 otherd = UCD_OTHERCASE(d);
2346 else
2347 #endif /* SUPPORT_UNICODE */
2348 otherd = TABLE_GET(d, fcc, d);
2349 }
2350 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2351 {
2352 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2353 {
2354 active_count--; /* Remove non-match possibility */
2355 next_active_state--;
2356 }
2357 ADD_NEW(state_offset, 0);
2358 }
2359 }
2360 break;
2361
2362 /*-----------------------------------------------------------------*/
2363 case OP_EXACTI:
2364 case OP_NOTEXACTI:
2365 caseless = TRUE;
2366 codevalue -= OP_STARI - OP_STAR;
2367 /* Fall through */
2368 case OP_EXACT:
2369 case OP_NOTEXACT:
2370 count = current_state->count; /* Number already matched */
2371 if (clen > 0)
2372 {
2373 uint32_t otherd = NOTACHAR;
2374 if (caseless)
2375 {
2376 #ifdef SUPPORT_UNICODE
2377 if (utf && d >= 128)
2378 otherd = UCD_OTHERCASE(d);
2379 else
2380 #endif /* SUPPORT_UNICODE */
2381 otherd = TABLE_GET(d, fcc, d);
2382 }
2383 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2384 {
2385 if (++count >= (int)GET2(code, 1))
2386 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2387 else
2388 { ADD_NEW(state_offset, count); }
2389 }
2390 }
2391 break;
2392
2393 /*-----------------------------------------------------------------*/
2394 case OP_UPTOI:
2395 case OP_MINUPTOI:
2396 case OP_POSUPTOI:
2397 case OP_NOTUPTOI:
2398 case OP_NOTMINUPTOI:
2399 case OP_NOTPOSUPTOI:
2400 caseless = TRUE;
2401 codevalue -= OP_STARI - OP_STAR;
2402 /* Fall through */
2403 case OP_UPTO:
2404 case OP_MINUPTO:
2405 case OP_POSUPTO:
2406 case OP_NOTUPTO:
2407 case OP_NOTMINUPTO:
2408 case OP_NOTPOSUPTO:
2409 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2410 count = current_state->count; /* Number already matched */
2411 if (clen > 0)
2412 {
2413 uint32_t otherd = NOTACHAR;
2414 if (caseless)
2415 {
2416 #ifdef SUPPORT_UNICODE
2417 if (utf && d >= 128)
2418 otherd = UCD_OTHERCASE(d);
2419 else
2420 #endif /* SUPPORT_UNICODE */
2421 otherd = TABLE_GET(d, fcc, d);
2422 }
2423 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2424 {
2425 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2426 {
2427 active_count--; /* Remove non-match possibility */
2428 next_active_state--;
2429 }
2430 if (++count >= (int)GET2(code, 1))
2431 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2432 else
2433 { ADD_NEW(state_offset, count); }
2434 }
2435 }
2436 break;
2437
2438
2439 /* ========================================================================== */
2440 /* These are the class-handling opcodes */
2441
2442 case OP_CLASS:
2443 case OP_NCLASS:
2444 case OP_XCLASS:
2445 {
2446 BOOL isinclass = FALSE;
2447 int next_state_offset;
2448 PCRE2_SPTR ecode;
2449
2450 /* For a simple class, there is always just a 32-byte table, and we
2451 can set isinclass from it. */
2452
2453 if (codevalue != OP_XCLASS)
2454 {
2455 ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2456 if (clen > 0)
2457 {
2458 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2459 ((((uint8_t *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2460 }
2461 }
2462
2463 /* An extended class may have a table or a list of single characters,
2464 ranges, or both, and it may be positive or negative. There's a
2465 function that sorts all this out. */
2466
2467 else
2468 {
2469 ecode = code + GET(code, 1);
2470 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2471 }
2472
2473 /* At this point, isinclass is set for all kinds of class, and ecode
2474 points to the byte after the end of the class. If there is a
2475 quantifier, this is where it will be. */
2476
2477 next_state_offset = (int)(ecode - start_code);
2478
2479 switch (*ecode)
2480 {
2481 case OP_CRSTAR:
2482 case OP_CRMINSTAR:
2483 case OP_CRPOSSTAR:
2484 ADD_ACTIVE(next_state_offset + 1, 0);
2485 if (isinclass)
2486 {
2487 if (*ecode == OP_CRPOSSTAR)
2488 {
2489 active_count--; /* Remove non-match possibility */
2490 next_active_state--;
2491 }
2492 ADD_NEW(state_offset, 0);
2493 }
2494 break;
2495
2496 case OP_CRPLUS:
2497 case OP_CRMINPLUS:
2498 case OP_CRPOSPLUS:
2499 count = current_state->count; /* Already matched */
2500 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2501 if (isinclass)
2502 {
2503 if (count > 0 && *ecode == OP_CRPOSPLUS)
2504 {
2505 active_count--; /* Remove non-match possibility */
2506 next_active_state--;
2507 }
2508 count++;
2509 ADD_NEW(state_offset, count);
2510 }
2511 break;
2512
2513 case OP_CRQUERY:
2514 case OP_CRMINQUERY:
2515 case OP_CRPOSQUERY:
2516 ADD_ACTIVE(next_state_offset + 1, 0);
2517 if (isinclass)
2518 {
2519 if (*ecode == OP_CRPOSQUERY)
2520 {
2521 active_count--; /* Remove non-match possibility */
2522 next_active_state--;
2523 }
2524 ADD_NEW(next_state_offset + 1, 0);
2525 }
2526 break;
2527
2528 case OP_CRRANGE:
2529 case OP_CRMINRANGE:
2530 case OP_CRPOSRANGE:
2531 count = current_state->count; /* Already matched */
2532 if (count >= (int)GET2(ecode, 1))
2533 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2534 if (isinclass)
2535 {
2536 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2537 if (*ecode == OP_CRPOSRANGE)
2538 {
2539 active_count--; /* Remove non-match possibility */
2540 next_active_state--;
2541 }
2542 if (++count >= max && max != 0) /* Max 0 => no limit */
2543 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2544 else
2545 { ADD_NEW(state_offset, count); }
2546 }
2547 break;
2548
2549 default:
2550 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2551 break;
2552 }
2553 }
2554 break;
2555
2556 /* ========================================================================== */
2557 /* These are the opcodes for fancy brackets of various kinds. We have
2558 to use recursion in order to handle them. The "always failing" assertion
2559 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2560 though the other "backtracking verbs" are not supported. */
2561
2562 case OP_FAIL:
2563 forced_fail++; /* Count FAILs for multiple states */
2564 break;
2565
2566 case OP_ASSERT:
2567 case OP_ASSERT_NOT:
2568 case OP_ASSERTBACK:
2569 case OP_ASSERTBACK_NOT:
2570 {
2571 PCRE2_SPTR endasscode = code + GET(code, 1);
2572 PCRE2_SIZE local_offsets[2];
2573 int rc;
2574 int local_workspace[1000];
2575
2576 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2577
2578 rc = internal_dfa_match(
2579 mb, /* static match data */
2580 code, /* this subexpression's code */
2581 ptr, /* where we currently are */
2582 (int)(ptr - start_subject), /* start offset */
2583 local_offsets, /* offset vector */
2584 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2585 local_workspace, /* workspace vector */
2586 sizeof(local_workspace)/sizeof(int), /* size of same */
2587 rlevel); /* function recursion level */
2588
2589 if (rc == PCRE2_ERROR_DFA_UITEM) return rc;
2590 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2591 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2592 }
2593 break;
2594
2595 /*-----------------------------------------------------------------*/
2596 case OP_COND:
2597 case OP_SCOND:
2598 {
2599 PCRE2_SIZE local_offsets[1000];
2600 int local_workspace[1000];
2601 int codelink = GET(code, 1);
2602 int condcode;
2603
2604 /* Because of the way auto-callout works during compile, a callout item
2605 is inserted between OP_COND and an assertion condition. This does not
2606 happen for the other conditions. */
2607
2608 if (code[LINK_SIZE+1] == OP_CALLOUT)
2609 {
2610 rrc = 0;
2611 if (mb->callout != NULL)
2612 {
2613 pcre2_callout_block cb;
2614 cb.version = 0;
2615 cb.callout_number = code[LINK_SIZE+2];
2616 cb.capture_top = 1;
2617 cb.capture_last = 0;
2618 cb.offset_vector = offsets;
2619 cb.mark = NULL; /* No (*MARK) support */
2620 cb.subject = start_subject;
2621 cb.subject_length = (PCRE2_SIZE)(end_subject - start_subject);
2622 cb.start_match = (PCRE2_SIZE)(current_subject - start_subject);
2623 cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
2624 cb.pattern_position = GET(code, LINK_SIZE + 3);
2625 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2626 if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
2627 return rrc; /* Abandon */
2628 }
2629 if (rrc > 0) break; /* Fail this thread */
2630 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2631 }
2632
2633 condcode = code[LINK_SIZE+1];
2634
2635 /* Back reference conditions and duplicate named recursion conditions
2636 are not supported */
2637
2638 if (condcode == OP_CREF || condcode == OP_DNCREF ||
2639 condcode == OP_DNRREF)
2640 return PCRE2_ERROR_DFA_UCOND;
2641
2642 /* The DEFINE condition is always false */
2643
2644 if (condcode == OP_FALSE)
2645 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2646
2647 /* There is also an always-true condition */
2648
2649 if (condcode == OP_TRUE)
2650 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2651
2652 /* The only supported version of OP_RREF is for the value RREF_ANY,
2653 which means "test if in any recursion". We can't test for specifically
2654 recursed groups. */
2655
2656 else if (condcode == OP_RREF)
2657 {
2658 int value = GET2(code, LINK_SIZE + 2);
2659 if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2660 if (mb->recursive != NULL)
2661 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2662 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2663 }
2664
2665 /* Otherwise, the condition is an assertion */
2666
2667 else
2668 {
2669 int rc;
2670 PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2671 PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2672
2673 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2674
2675 rc = internal_dfa_match(
2676 mb, /* fixed match data */
2677 asscode, /* this subexpression's code */
2678 ptr, /* where we currently are */
2679 (int)(ptr - start_subject), /* start offset */
2680 local_offsets, /* offset vector */
2681 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2682 local_workspace, /* workspace vector */
2683 sizeof(local_workspace)/sizeof(int), /* size of same */
2684 rlevel); /* function recursion level */
2685
2686 if (rc == PCRE2_ERROR_DFA_UITEM) return rc;
2687 if ((rc >= 0) ==
2688 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2689 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2690 else
2691 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2692 }
2693 }
2694 break;
2695
2696 /*-----------------------------------------------------------------*/
2697 case OP_RECURSE:
2698 {
2699 dfa_recursion_info *ri;
2700 PCRE2_SIZE local_offsets[1000];
2701 int local_workspace[1000];
2702 PCRE2_SPTR callpat = start_code + GET(code, 1);
2703 uint32_t recno = (callpat == mb->start_code)? 0 :
2704 GET2(callpat, 1 + LINK_SIZE);
2705 int rc;
2706
2707 /* Check for repeating a recursion without advancing the subject
2708 pointer. This should catch convoluted mutual recursions. (Some simple
2709 cases are caught at compile time.) */
2710
2711 for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2712 if (recno == ri->group_num && ptr == ri->subject_position)
2713 return PCRE2_ERROR_RECURSELOOP;
2714
2715 /* Remember this recursion and where we started it so as to
2716 catch infinite loops. */
2717
2718 new_recursive.group_num = recno;
2719 new_recursive.subject_position = ptr;
2720 new_recursive.prevrec = mb->recursive;
2721 mb->recursive = &new_recursive;
2722
2723 rc = internal_dfa_match(
2724 mb, /* fixed match data */
2725 callpat, /* this subexpression's code */
2726 ptr, /* where we currently are */
2727 (int)(ptr - start_subject), /* start offset */
2728 local_offsets, /* offset vector */
2729 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2730 local_workspace, /* workspace vector */
2731 sizeof(local_workspace)/sizeof(int), /* size of same */
2732 rlevel); /* function recursion level */
2733
2734 mb->recursive = new_recursive.prevrec; /* Done this recursion */
2735
2736 /* Ran out of internal offsets */
2737
2738 if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2739
2740 /* For each successful matched substring, set up the next state with a
2741 count of characters to skip before trying it. Note that the count is in
2742 characters, not bytes. */
2743
2744 if (rc > 0)
2745 {
2746 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2747 {
2748 int charcount = local_offsets[rc+1] - local_offsets[rc];
2749 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2750 if (utf)
2751 {
2752 PCRE2_SPTR p = start_subject + local_offsets[rc];
2753 PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2754 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2755 }
2756 #endif
2757 if (charcount > 0)
2758 {
2759 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2760 }
2761 else
2762 {
2763 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2764 }
2765 }
2766 }
2767 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2768 }
2769 break;
2770
2771 /*-----------------------------------------------------------------*/
2772 case OP_BRAPOS:
2773 case OP_SBRAPOS:
2774 case OP_CBRAPOS:
2775 case OP_SCBRAPOS:
2776 case OP_BRAPOSZERO:
2777 {
2778 int charcount, matched_count;
2779 PCRE2_SPTR local_ptr = ptr;
2780 BOOL allow_zero;
2781
2782 if (codevalue == OP_BRAPOSZERO)
2783 {
2784 allow_zero = TRUE;
2785 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2786 }
2787 else allow_zero = FALSE;
2788
2789 /* Loop to match the subpattern as many times as possible as if it were
2790 a complete pattern. */
2791
2792 for (matched_count = 0;; matched_count++)
2793 {
2794 PCRE2_SIZE local_offsets[2];
2795 int local_workspace[1000];
2796
2797 int rc = internal_dfa_match(
2798 mb, /* fixed match data */
2799 code, /* this subexpression's code */
2800 local_ptr, /* where we currently are */
2801 (int)(ptr - start_subject), /* start offset */
2802 local_offsets, /* offset vector */
2803 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2804 local_workspace, /* workspace vector */
2805 sizeof(local_workspace)/sizeof(int), /* size of same */
2806 rlevel); /* function recursion level */
2807
2808 /* Failed to match */
2809
2810 if (rc < 0)
2811 {
2812 if (rc != PCRE2_ERROR_NOMATCH) return rc;
2813 break;
2814 }
2815
2816 /* Matched: break the loop if zero characters matched. */
2817
2818 charcount = local_offsets[1] - local_offsets[0];
2819 if (charcount == 0) break;
2820 local_ptr += charcount; /* Advance temporary position ptr */
2821 }
2822
2823 /* At this point we have matched the subpattern matched_count
2824 times, and local_ptr is pointing to the character after the end of the
2825 last match. */
2826
2827 if (matched_count > 0 || allow_zero)
2828 {
2829 PCRE2_SPTR end_subpattern = code;
2830 int next_state_offset;
2831
2832 do { end_subpattern += GET(end_subpattern, 1); }
2833 while (*end_subpattern == OP_ALT);
2834 next_state_offset =
2835 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2836
2837 /* Optimization: if there are no more active states, and there
2838 are no new states yet set up, then skip over the subject string
2839 right here, to save looping. Otherwise, set up the new state to swing
2840 into action when the end of the matched substring is reached. */
2841
2842 if (i + 1 >= active_count && new_count == 0)
2843 {
2844 ptr = local_ptr;
2845 clen = 0;
2846 ADD_NEW(next_state_offset, 0);
2847 }
2848 else
2849 {
2850 PCRE2_SPTR p = ptr;
2851 PCRE2_SPTR pp = local_ptr;
2852 charcount = (int)(pp - p);
2853 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2854 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2855 #endif
2856 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2857 }
2858 }
2859 }
2860 break;
2861
2862 /*-----------------------------------------------------------------*/
2863 case OP_ONCE:
2864 case OP_ONCE_NC:
2865 {
2866 PCRE2_SIZE local_offsets[2];
2867 int local_workspace[1000];
2868
2869 int rc = internal_dfa_match(
2870 mb, /* fixed match data */
2871 code, /* this subexpression's code */
2872 ptr, /* where we currently are */
2873 (int)(ptr - start_subject), /* start offset */
2874 local_offsets, /* offset vector */
2875 sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2876 local_workspace, /* workspace vector */
2877 sizeof(local_workspace)/sizeof(int), /* size of same */
2878 rlevel); /* function recursion level */
2879
2880 if (rc >= 0)
2881 {
2882 PCRE2_SPTR end_subpattern = code;
2883 int charcount = local_offsets[1] - local_offsets[0];
2884 int next_state_offset, repeat_state_offset;
2885
2886 do { end_subpattern += GET(end_subpattern, 1); }
2887 while (*end_subpattern == OP_ALT);
2888 next_state_offset =
2889 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2890
2891 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2892 arrange for the repeat state also to be added to the relevant list.
2893 Calculate the offset, or set -1 for no repeat. */
2894
2895 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2896 *end_subpattern == OP_KETRMIN)?
2897 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2898
2899 /* If we have matched an empty string, add the next state at the
2900 current character pointer. This is important so that the duplicate
2901 checking kicks in, which is what breaks infinite loops that match an
2902 empty string. */
2903
2904 if (charcount == 0)
2905 {
2906 ADD_ACTIVE(next_state_offset, 0);
2907 }
2908
2909 /* Optimization: if there are no more active states, and there
2910 are no new states yet set up, then skip over the subject string
2911 right here, to save looping. Otherwise, set up the new state to swing
2912 into action when the end of the matched substring is reached. */
2913
2914 else if (i + 1 >= active_count && new_count == 0)
2915 {
2916 ptr += charcount;
2917 clen = 0;
2918 ADD_NEW(next_state_offset, 0);
2919
2920 /* If we are adding a repeat state at the new character position,
2921 we must fudge things so that it is the only current state.
2922 Otherwise, it might be a duplicate of one we processed before, and
2923 that would cause it to be skipped. */
2924
2925 if (repeat_state_offset >= 0)
2926 {
2927 next_active_state = active_states;
2928 active_count = 0;
2929 i = -1;
2930 ADD_ACTIVE(repeat_state_offset, 0);
2931 }
2932 }
2933 else
2934 {
2935 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2936 if (utf)
2937 {
2938 PCRE2_SPTR p = start_subject + local_offsets[0];
2939 PCRE2_SPTR pp = start_subject + local_offsets[1];
2940 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2941 }
2942 #endif
2943 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2944 if (repeat_state_offset >= 0)
2945 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2946 }
2947 }
2948 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2949 }
2950 break;
2951
2952
2953 /* ========================================================================== */
2954 /* Handle callouts */
2955
2956 case OP_CALLOUT:
2957 rrc = 0;
2958 if (mb->callout != NULL)
2959 {
2960 pcre2_callout_block cb;
2961 cb.version = 0;
2962 cb.callout_number = code[1];
2963 cb.capture_top = 1;
2964 cb.capture_last = 0;
2965 cb.offset_vector = offsets;
2966 cb.mark = NULL; /* No (*MARK) support */
2967 cb.subject = start_subject;
2968 cb.subject_length = (PCRE2_SIZE)(end_subject - start_subject);
2969 cb.start_match = (PCRE2_SIZE)(current_subject - start_subject);
2970 cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
2971 cb.pattern_position = GET(code, 2);
2972 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2973 if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
2974 return rrc; /* Abandon */
2975 }
2976 if (rrc == 0)
2977 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2978 break;
2979
2980
2981 /* ========================================================================== */
2982 default: /* Unsupported opcode */
2983 return PCRE2_ERROR_DFA_UITEM;
2984 }
2985
2986 NEXT_ACTIVE_STATE: continue;
2987
2988 } /* End of loop scanning active states */
2989
2990 /* We have finished the processing at the current subject character. If no
2991 new states have been set for the next character, we have found all the
2992 matches that we are going to find. If we are at the top level and partial
2993 matching has been requested, check for appropriate conditions.
2994
2995 The "forced_fail" variable counts the number of (*F) encountered for the
2996 character. If it is equal to the original active_count (saved in
2997 workspace[1]) it means that (*F) was found on every active state. In this
2998 case we don't want to give a partial match.
2999
3000 The "could_continue" variable is true if a state could have continued but
3001 for the fact that the end of the subject was reached. */
3002
3003 if (new_count <= 0)
3004 {
3005 if (rlevel == 1 && /* Top level, and */
3006 could_continue && /* Some could go on, and */
3007 forced_fail != workspace[1] && /* Not all forced fail & */
3008 ( /* either... */
3009 (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */
3010 || /* or... */
3011 ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */
3012 match_count < 0) /* no matches */
3013 ) && /* And... */
3014 (
3015 partial_newline || /* Either partial NL */
3016 ( /* or ... */
3017 ptr >= end_subject && /* End of subject and */
3018 ptr > mb->start_used_ptr) /* Inspected non-empty string */
3019 )
3020 )
3021 match_count = PCRE2_ERROR_PARTIAL;
3022 break; /* In effect, "return", but see the comment below */
3023 }
3024
3025 /* One or more states are active for the next character. */
3026
3027 ptr += clen; /* Advance to next subject character */
3028 } /* Loop to move along the subject string */
3029
3030 /* Control gets here from "break" a few lines above. We do it this way because
3031 if we use "return" above, we have compiler trouble. Some compilers warn if
3032 there's nothing here because they think the function doesn't return a value. On
3033 the other hand, if we put a dummy statement here, some more clever compilers
3034 complain that it can't be reached. Sigh. */
3035
3036 return match_count;
3037 }
3038
3039
3040
3041 /*************************************************
3042 * Match a pattern using the DFA algorithm *
3043 *************************************************/
3044
3045 /* This function matches a compiled pattern to a subject string, using the
3046 alternate matching algorithm that finds all matches at once.
3047
3048 Arguments:
3049 code points to the compiled pattern
3050 subject subject string
3051 length length of subject string
3052 start_offset where to start matching in the subject
3053 options option bits
3054 match_data points to a match data structure
3055 mcontext points to a match context
3056 workspace pointer to workspace
3057 wscount size of workspace
3058
3059 Returns: > 0 => number of match offset pairs placed in offsets
3060 = 0 => offsets overflowed; longest matches are present
3061 -1 => failed to match
3062 < -1 => some kind of unexpected problem
3063 */
3064
3065 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,size_t wscount)3066 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3067 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3068 pcre2_match_context *mcontext, int *workspace, size_t wscount)
3069 {
3070 const pcre2_real_code *re = (const pcre2_real_code *)code;
3071
3072 PCRE2_SPTR start_match;
3073 PCRE2_SPTR end_subject;
3074 PCRE2_SPTR req_cu_ptr;
3075
3076 BOOL utf, anchored, startline, firstline;
3077
3078 BOOL has_first_cu = FALSE;
3079 BOOL has_req_cu = FALSE;
3080 PCRE2_UCHAR first_cu = 0;
3081 PCRE2_UCHAR first_cu2 = 0;
3082 PCRE2_UCHAR req_cu = 0;
3083 PCRE2_UCHAR req_cu2 = 0;
3084
3085 const uint8_t *start_bits = NULL;
3086
3087 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3088 is used below, and it expects NLBLOCK to be defined as a pointer. */
3089
3090 dfa_match_block actual_match_block;
3091 dfa_match_block *mb = &actual_match_block;
3092
3093 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3094 subject string. */
3095
3096 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
3097
3098 /* Plausibility checks */
3099
3100 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3101 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3102 return PCRE2_ERROR_NULL;
3103 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3104 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3105
3106 /* Check that the first field in the block is the magic number. If it is not,
3107 return with PCRE2_ERROR_BADMAGIC. */
3108
3109 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3110
3111 /* Check the code unit width. */
3112
3113 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3114 return PCRE2_ERROR_BADMODE;
3115
3116 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3117 options variable for this function. Users of PCRE2 who are not calling the
3118 function directly would like to have a way of setting these flags, in the same
3119 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3120 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3121 (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which can now be
3122 transferred to the options for this function. The bits are guaranteed to be
3123 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3124 that the match-time bits are not more significant than the flag bits. If by
3125 accident this is not the case, a compile-time division by zero error will
3126 occur. */
3127
3128 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3129 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3130 options |= (re->flags & FF) / ((FF & -FF) / (OO & -OO));
3131 #undef FF
3132 #undef OO
3133
3134 /* A NULL match context means "use a default context" */
3135
3136 if (mcontext == NULL)
3137 mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
3138
3139 /* If restarting after a partial match, do some sanity checks on the contents
3140 of the workspace. */
3141
3142 if ((options & PCRE2_DFA_RESTART) != 0)
3143 {
3144 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3145 workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3146 return PCRE2_ERROR_DFA_BADRESTART;
3147 }
3148
3149 /* Set some local values */
3150
3151 utf = (re->overall_options & PCRE2_UTF) != 0;
3152 start_match = subject + start_offset;
3153 end_subject = subject + length;
3154 req_cu_ptr = start_match - 1;
3155 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3156 (re->overall_options & PCRE2_ANCHORED) != 0;
3157
3158 /* The "must be at the start of a line" flags are used in a loop when finding
3159 where to start. */
3160
3161 startline = (re->flags & PCRE2_STARTLINE) != 0;
3162 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3163
3164 /* Fill in the fields in the match block. */
3165
3166 if (mcontext == NULL)
3167 {
3168 mb->callout = NULL;
3169 mb->memctl = re->memctl;
3170 }
3171 else
3172 {
3173 mb->callout = mcontext->callout;
3174 mb->callout_data = mcontext->callout_data;
3175 mb->memctl = mcontext->memctl;
3176 }
3177
3178 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3179 re->name_count * re->name_entry_size;
3180 mb->tables = re->tables;
3181 mb->start_subject = subject;
3182 mb->end_subject = end_subject;
3183 mb->start_offset = start_offset;
3184 mb->moptions = options;
3185 mb->poptions = re->overall_options;
3186
3187 /* Process the \R and newline settings. */
3188
3189 mb->bsr_convention = re->bsr_convention;
3190 mb->nltype = NLTYPE_FIXED;
3191 switch(re->newline_convention)
3192 {
3193 case PCRE2_NEWLINE_CR:
3194 mb->nllen = 1;
3195 mb->nl[0] = CHAR_CR;
3196 break;
3197
3198 case PCRE2_NEWLINE_LF:
3199 mb->nllen = 1;
3200 mb->nl[0] = CHAR_NL;
3201 break;
3202
3203 case PCRE2_NEWLINE_CRLF:
3204 mb->nllen = 2;
3205 mb->nl[0] = CHAR_CR;
3206 mb->nl[1] = CHAR_NL;
3207 break;
3208
3209 case PCRE2_NEWLINE_ANY:
3210 mb->nltype = NLTYPE_ANY;
3211 break;
3212
3213 case PCRE2_NEWLINE_ANYCRLF:
3214 mb->nltype = NLTYPE_ANYCRLF;
3215 break;
3216
3217 default: return PCRE2_ERROR_INTERNAL;
3218 }
3219
3220 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3221 we must also check that a starting offset does not point into the middle of a
3222 multiunit character. */
3223
3224 #ifdef SUPPORT_UNICODE
3225 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3226 {
3227 match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar));
3228 if (match_data->rc != 0) return match_data->rc;
3229 #if PCRE2_CODE_UNIT_WIDTH != 32
3230 if (start_offset > 0 && start_offset < length &&
3231 NOT_FIRSTCHAR(subject[start_offset]))
3232 return PCRE2_ERROR_BADUTFOFFSET;
3233 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
3234 }
3235 #endif /* SUPPORT_UNICODE */
3236
3237 /* Set up the first code unit to match, if available. The first_codeunit value
3238 is never set for an anchored regular expression, but the anchoring may be
3239 forced at run time, so we have to test for anchoring. The first code unit may
3240 be unset for an unanchored pattern, of course. If there's no first code unit
3241 there may be a bitmap of possible first characters. */
3242
3243 if (!anchored)
3244 {
3245 if ((re->flags & PCRE2_FIRSTSET) != 0)
3246 {
3247 has_first_cu = TRUE;
3248 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3249 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3250 {
3251 first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3252 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3253 if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
3254 #endif
3255 }
3256 }
3257 else
3258 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3259 start_bits = re->start_bitmap;
3260 }
3261
3262 /* For anchored or unanchored matches, there may be a "last known required
3263 character" set. */
3264
3265 if ((re->flags & PCRE2_LASTSET) != 0)
3266 {
3267 has_req_cu = TRUE;
3268 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3269 if ((re->flags & PCRE2_LASTCASELESS) != 0)
3270 {
3271 req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3272 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3273 if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
3274 #endif
3275 }
3276 }
3277
3278 /* Fill in fields that are always returned in the match data. */
3279
3280 match_data->code = re;
3281 match_data->subject = subject;
3282 match_data->mark = NULL;
3283 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3284
3285 /* Call the main matching function, looping for a non-anchored regex after a
3286 failed match. If not restarting, perform certain optimizations at the start of
3287 a match. */
3288
3289 for (;;)
3290 {
3291 int rc;
3292
3293 /* ----------------- Start of match optimizations ---------------- */
3294
3295 /* There are some optimizations that avoid running the match if a known
3296 starting point is not found, or if a known later code unit is not present.
3297 However, there is an option (settable at compile time) that disables
3298 these, for testing and for ensuring that all callouts do actually occur.
3299 The optimizations must also be avoided when restarting a DFA match. */
3300
3301 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3302 (options & PCRE2_DFA_RESTART) == 0)
3303 {
3304 PCRE2_SPTR save_end_subject = end_subject;
3305
3306 /* If firstline is TRUE, the start of the match is constrained to the first
3307 line of a multiline string. That is, the match must be before or at the
3308 first newline. Implement this by temporarily adjusting end_subject so that
3309 we stop the optimization scans at a newline. If the match fails at the
3310 newline, later code breaks this loop. */
3311
3312 if (firstline)
3313 {
3314 PCRE2_SPTR t = start_match;
3315 #ifdef SUPPORT_UNICODE
3316 if (utf)
3317 {
3318 while (t < mb->end_subject && !IS_NEWLINE(t))
3319 {
3320 t++;
3321 ACROSSCHAR(t < end_subject, *t, t++);
3322 }
3323 }
3324 else
3325 #endif
3326 while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
3327 end_subject = t;
3328 }
3329
3330 /* Advance to a unique first code unit if there is one. */
3331
3332 if (has_first_cu)
3333 {
3334 PCRE2_UCHAR smc;
3335 if (first_cu != first_cu2)
3336 while (start_match < end_subject &&
3337 (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
3338 start_match++;
3339 else
3340 while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
3341 start_match++;
3342 }
3343
3344 /* Or to just after a linebreak for a multiline match */
3345
3346 else if (startline)
3347 {
3348 if (start_match > mb->start_subject + start_offset)
3349 {
3350 #ifdef SUPPORT_UNICODE
3351 if (utf)
3352 {
3353 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3354 {
3355 start_match++;
3356 ACROSSCHAR(start_match < end_subject, *start_match,
3357 start_match++);
3358 }
3359 }
3360 else
3361 #endif
3362 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3363 start_match++;
3364
3365 /* If we have just passed a CR and the newline option is ANY or
3366 ANYCRLF, and we are now at a LF, advance the match position by one more
3367 code unit. */
3368
3369 if (start_match[-1] == CHAR_CR &&
3370 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3371 start_match < end_subject &&
3372 UCHAR21TEST(start_match) == CHAR_NL)
3373 start_match++;
3374 }
3375 }
3376
3377 /* Or to a non-unique first code unit if any have been identified. The
3378 bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all
3379 code units greater than 254 set the 255 bit. */
3380
3381 else if (start_bits != NULL)
3382 {
3383 while (start_match < end_subject)
3384 {
3385 register uint32_t c = UCHAR21TEST(start_match);
3386 #if PCRE2_CODE_UNIT_WIDTH != 8
3387 if (c > 255) c = 255;
3388 #endif
3389 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
3390 start_match++;
3391 }
3392 }
3393
3394 /* Restore fudged end_subject */
3395
3396 end_subject = save_end_subject;
3397
3398 /* The following two optimizations are disabled for partial matching. */
3399
3400 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3401 {
3402 /* The minimum matching length is a lower bound; there may be no string of
3403 exactly that length that matches the pattern. Although the value is, strictly,
3404 in characters, we treat it as code units to avoid spending too much time
3405 in this optimization. */
3406
3407 if (end_subject - start_match < re->minlength) return PCRE2_ERROR_NOMATCH;
3408
3409 /* If req_cu is set, we know that that code unit must appear in the
3410 subject for the match to succeed. If the first code unit is set, req_cu
3411 must be later in the subject; otherwise the test starts at the match
3412 point. This optimization can save a huge amount of backtracking in
3413 patterns with nested unlimited repeats that aren't going to match.
3414 Writing separate code for cased/caseless versions makes it go faster, as
3415 does using an autoincrement and backing off on a match.
3416
3417 HOWEVER: when the subject string is very, very long, searching to its end
3418 can take a long time, and give bad performance on quite ordinary
3419 patterns. This showed up when somebody was matching something like
3420 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3421 sufficiently long. */
3422
3423 if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
3424 {
3425 register PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
3426
3427 /* We don't need to repeat the search if we haven't yet reached the
3428 place we found it at last time. */
3429
3430 if (p > req_cu_ptr)
3431 {
3432 if (req_cu != req_cu2)
3433 {
3434 while (p < end_subject)
3435 {
3436 register uint32_t pp = UCHAR21INCTEST(p);
3437 if (pp == req_cu || pp == req_cu2) { p--; break; }
3438 }
3439 }
3440 else
3441 {
3442 while (p < end_subject)
3443 {
3444 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3445 }
3446 }
3447
3448 /* If we can't find the required code unit, break the matching loop,
3449 forcing a match failure. */
3450
3451 if (p >= end_subject) break;
3452
3453 /* If we have found the required code unit, save the point where we
3454 found it, so that we don't search again next time round the loop if
3455 the start hasn't passed this code unit yet. */
3456
3457 req_cu_ptr = p;
3458 }
3459 }
3460 }
3461 }
3462
3463 /* ------------ End of start of match optimizations ------------ */
3464
3465 /* OK, now we can do the business */
3466
3467 mb->start_used_ptr = start_match;
3468 mb->last_used_ptr = start_match;
3469 mb->recursive = NULL;
3470
3471 rc = internal_dfa_match(
3472 mb, /* fixed match data */
3473 mb->start_code, /* this subexpression's code */
3474 start_match, /* where we currently are */
3475 start_offset, /* start offset in subject */
3476 match_data->ovector, /* offset vector */
3477 match_data->oveccount * 2, /* actual size of same */
3478 workspace, /* workspace vector */
3479 wscount, /* size of same */
3480 0); /* function recurse level */
3481
3482 /* Anything other than "no match" means we are done, always; otherwise, carry
3483 on only if not anchored. */
3484
3485 if (rc != PCRE2_ERROR_NOMATCH || anchored)
3486 {
3487 if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3488 {
3489 match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3490 match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3491 }
3492 match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3493 match_data->rightchar = mb->last_used_ptr - subject;
3494 match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3495 match_data->rc = rc;
3496 return rc;
3497 }
3498
3499 /* Advance to the next subject character unless we are at the end of a line
3500 and firstline is set. */
3501
3502 if (firstline && IS_NEWLINE(start_match)) break;
3503 start_match++;
3504 #ifdef SUPPORT_UNICODE
3505 if (utf)
3506 {
3507 ACROSSCHAR(start_match < end_subject, *start_match,
3508 start_match++);
3509 }
3510 #endif
3511 if (start_match > end_subject) break;
3512
3513 /* If we have just passed a CR and we are now at a LF, and the pattern does
3514 not contain any explicit matches for \r or \n, and the newline option is CRLF
3515 or ANY or ANYCRLF, advance the match position by one more character. */
3516
3517 if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3518 start_match < end_subject &&
3519 UCHAR21TEST(start_match) == CHAR_NL &&
3520 (re->flags & PCRE2_HASCRORLF) == 0 &&
3521 (mb->nltype == NLTYPE_ANY ||
3522 mb->nltype == NLTYPE_ANYCRLF ||
3523 mb->nllen == 2))
3524 start_match++;
3525
3526 } /* "Bumpalong" loop */
3527
3528
3529 return PCRE2_ERROR_NOMATCH;
3530 }
3531
3532 /* End of pcre2_dfa_match.c */
3533