1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2019 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78
79 #define NLBLOCK mb /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
82
83 #include "pcre2_internal.h"
84
/* A single mask made by OR-ing together a list of PCRE2 option bits.
NOTE(review): going by the name, this is presumably the set of options that
callers are allowed to pass to pcre2_dfa_match(); the code that consults the
mask is not visible here, so confirm against its use. */

#define PUBLIC_DFA_MATCH_OPTIONS ( \
  PCRE2_ANCHORED             | \
  PCRE2_ENDANCHORED          | \
  PCRE2_NOTBOL               | \
  PCRE2_NOTEOL               | \
  PCRE2_NOTEMPTY             | \
  PCRE2_NOTEMPTY_ATSTART     | \
  PCRE2_NO_UTF_CHECK         | \
  PCRE2_PARTIAL_HARD         | \
  PCRE2_PARTIAL_SOFT         | \
  PCRE2_DFA_SHORTEST         | \
  PCRE2_DFA_RESTART          | \
  PCRE2_COPY_MATCHED_SUBJECT )
90
91
92 /*************************************************
93 * Code parameters and static tables *
94 *************************************************/
95
/* Offsets added to OP_TYPESTAR and its relatives to turn them into other,
private opcode values when the item being repeated needs special handling.
The results are never stored in compiled code, so they need not fit below 256;
they start at 300 to stay well clear of the real opcodes. Each block sits 20
above the one before it, which should be a big enough gap. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     (OP_PROP_EXTRA   + 20)
#define OP_ANYNL_EXTRA      (OP_EXTUNI_EXTRA + 20)
#define OP_HSPACE_EXTRA     (OP_ANYNL_EXTRA  + 20)
#define OP_VSPACE_EXTRA     (OP_HSPACE_EXTRA + 20)
106
107
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
115
116 static const uint8_t coptable[] = {
117 0, /* End */
118 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 0, 0, 0, /* Any, AllAny, Anybyte */
121 0, 0, /* \P, \p */
122 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 0, /* \X */
124 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
125 1, /* Char */
126 1, /* Chari */
127 1, /* not */
128 1, /* noti */
129 /* Positive single-char repeats */
130 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
132 1+IMM2_SIZE, /* exact */
133 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
134 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
135 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
136 1+IMM2_SIZE, /* exact I */
137 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
138 /* Negative single-char repeats - only for chars < 256 */
139 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
140 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
141 1+IMM2_SIZE, /* NOT exact */
142 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
143 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
144 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
145 1+IMM2_SIZE, /* NOT exact I */
146 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
147 /* Positive type repeats */
148 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
149 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
150 1+IMM2_SIZE, /* Type exact */
151 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
152 /* Character class & ref repeats */
153 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
154 0, 0, /* CRRANGE, CRMINRANGE */
155 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */
156 0, /* CLASS */
157 0, /* NCLASS */
158 0, /* XCLASS - variable length */
159 0, /* REF */
160 0, /* REFI */
161 0, /* DNREF */
162 0, /* DNREFI */
163 0, /* RECURSE */
164 0, /* CALLOUT */
165 0, /* CALLOUT_STR */
166 0, /* Alt */
167 0, /* Ket */
168 0, /* KetRmax */
169 0, /* KetRmin */
170 0, /* KetRpos */
171 0, /* Reverse */
172 0, /* Assert */
173 0, /* Assert not */
174 0, /* Assert behind */
175 0, /* Assert behind not */
176 0, /* NA assert */
177 0, /* NA assert behind */
178 0, /* ONCE */
179 0, /* SCRIPT_RUN */
180 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
181 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
182 0, 0, /* CREF, DNCREF */
183 0, 0, /* RREF, DNRREF */
184 0, 0, /* FALSE, TRUE */
185 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
186 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
187 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
188 0, 0, /* COMMIT, COMMIT_ARG */
189 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
190 0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
191 };
192
193 /* This table identifies those opcodes that inspect a character. It is used to
194 remember the fact that a character could have been inspected when the end of
195 the subject is reached. ***NOTE*** If the start of this table is modified, the
196 two tables that follow must also be modified. */
197
static const uint8_t poptable[] = {
  /* End */
  0,
  /* \A, \G, \K, \B, \b: only the word-boundary tests look at a character */
  0, 0, 0, 1, 1,
  /* \D, \d, \S, \s, \W, \w */
  1, 1, 1, 1, 1, 1,
  /* Any, AllAny, Anybyte */
  1, 1, 1,
  /* \P, \p */
  1, 1,
  /* \R, \H, \h, \V, \v */
  1, 1, 1, 1, 1,
  /* \X */
  1,
  /* \Z, \z, $, $M, ^, ^M */
  0, 0, 0, 0, 0, 0,
  /* Char, Chari, not, noti */
  1, 1, 1, 1,

  /* ---- Positive single-char repeats ---- */
  /* *, *?, +, +?, ?, ?? */
  1, 1, 1, 1, 1, 1,
  /* upto, minupto, exact */
  1, 1, 1,
  /* *+, ++, ?+, upto+ */
  1, 1, 1, 1,
  /* *I, *?I, +I, +?I, ?I, ??I */
  1, 1, 1, 1, 1, 1,
  /* upto I, minupto I, exact I */
  1, 1, 1,
  /* *+I, ++I, ?+I, upto+I */
  1, 1, 1, 1,

  /* ---- Negative single-char repeats - only for chars < 256 ---- */
  /* NOT *, *?, +, +?, ?, ?? */
  1, 1, 1, 1, 1, 1,
  /* NOT upto, minupto, exact */
  1, 1, 1,
  /* NOT *+, ++, ?+, upto+ */
  1, 1, 1, 1,
  /* NOT *I, *?I, +I, +?I, ?I, ??I */
  1, 1, 1, 1, 1, 1,
  /* NOT upto I, minupto I, exact I */
  1, 1, 1,
  /* NOT *+I, ++I, ?+I, upto+I */
  1, 1, 1, 1,

  /* ---- Positive type repeats ---- */
  /* Type *, *?, +, +?, ?, ?? */
  1, 1, 1, 1, 1, 1,
  /* Type upto, minupto, exact */
  1, 1, 1,
  /* Type *+, ++, ?+, upto+ */
  1, 1, 1, 1,

  /* ---- Character class & ref repeats ---- */
  /* *, *?, +, +?, ?, ?? */
  1, 1, 1, 1, 1, 1,
  /* CRRANGE, CRMINRANGE */
  1, 1,
  /* Possessive *+, ++, ?+, CRPOSRANGE */
  1, 1, 1, 1,
  /* CLASS, NCLASS, XCLASS (variable length) */
  1, 1, 1,

  /* ---- Nothing from here on is marked as inspecting a character ---- */
  /* REF, REFI, DNREF, DNREFI */
  0, 0, 0, 0,
  /* RECURSE, CALLOUT, CALLOUT_STR */
  0, 0, 0,
  /* Alt, Ket, KetRmax, KetRmin, KetRpos */
  0, 0, 0, 0, 0,
  /* Reverse */
  0,
  /* Assert, Assert not, Assert behind, Assert behind not */
  0, 0, 0, 0,
  /* NA assert, NA assert behind */
  0, 0,
  /* ONCE, SCRIPT_RUN */
  0, 0,
  /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
  0, 0, 0, 0, 0,
  /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
  0, 0, 0, 0, 0,
  /* CREF, DNCREF, RREF, DNRREF */
  0, 0, 0, 0,
  /* FALSE, TRUE */
  0, 0,
  /* BRAZERO, BRAMINZERO, BRAPOSZERO */
  0, 0, 0,
  /* MARK, PRUNE, PRUNE_ARG */
  0, 0, 0,
  /* SKIP, SKIP_ARG, THEN, THEN_ARG */
  0, 0, 0, 0,
  /* COMMIT, COMMIT_ARG */
  0, 0,
  /* FAIL, ACCEPT, ASSERT_ACCEPT */
  0, 0, 0,
  /* CLOSE, SKIPZERO, DEFINE */
  0, 0, 0
};
269
270 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
271 and \w */
272
273 static const uint8_t toptable1[] = {
274 0, 0, 0, 0, 0, 0,
275 ctype_digit, ctype_digit,
276 ctype_space, ctype_space,
277 ctype_word, ctype_word,
278 0, 0 /* OP_ANY, OP_ALLANY */
279 };
280
281 static const uint8_t toptable2[] = {
282 0, 0, 0, 0, 0, 0,
283 ctype_digit, 0,
284 ctype_space, 0,
285 ctype_word, 0,
286 1, 1 /* OP_ANY, OP_ALLANY */
287 };
288
289
/* Data held for one state, which is in effect the current position of one
active path through the match tree. Every member must be an int: these blocks
are laid out inside the working vector we are passed, and that is a vector of
ints. */

typedef struct stateblock stateblock;

struct stateblock {
  int offset;   /* Offset to the opcode. A negative value is a held-off state:
                   it is only entered (negated) once the number of characters
                   in the data field has been skipped. */
  int count;    /* Count for repeats */
  int data;     /* Extra data; only some states use it */
};

/* Number of ints taken up by one stateblock. The replacement list is wrapped
in its own parentheses so that the cast and its operand always stay together
as a single operand, whatever expression the macro is dropped into. */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
302
303
304 /* Before version 10.32 the recursive calls of internal_dfa_match() were passed
305 local working space and output vectors that were created on the stack. This has
306 caused issues for some patterns, especially in small-stack environments such as
307 Windows. A new scheme is now in use which sets up a vector on the stack, but if
308 this is too small, heap memory is used, up to the heap_limit. The main
309 parameters are all numbers of ints because the workspace is a vector of ints.
310
311 The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
312 defined in pcre2_internal.h so as to be available to pcre2test when it is
313 finding the minimum heap requirement for a match. */
314
/* Number of ints needed to hold one PCRE2_SIZE ovector element. */
#define OVEC_UNIT       (sizeof(PCRE2_SIZE)/sizeof(int))

/* Size of the starting stack vector, converted from bytes to ints. */
#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))

/* Work size, in ints, for a recursion. */
#define RWS_RSIZE       1000

/* Ovector size, in ints, for a recursion. */
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)

/* Ovector size, in ints, in the other cases. */
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)
321
322 /* This structure is at the start of each workspace block. */
323
typedef struct RWS_anchor RWS_anchor;

struct RWS_anchor {
  RWS_anchor *next;    /* Following workspace block, or NULL if none yet */
  uint32_t size;       /* Number of ints */
  uint32_t free;       /* Number of ints */
};

/* The space taken by the anchor itself, expressed in ints like the sizes
above. */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
331
332
333
334 /*************************************************
335 * Process a callout *
336 *************************************************/
337
338 /* This function is called to perform a callout.
339
340 Arguments:
341 code current code pointer
342 offsets points to current capture offsets
343 current_subject start of current subject match
344 ptr current position in subject
345 mb the match block
346 extracode extra code offset when called from condition
347 lengthptr where to return the callout length
348
349 Returns: the return from the callout
350 */
351
352 static int
do_callout(PCRE2_SPTR code,PCRE2_SIZE * offsets,PCRE2_SPTR current_subject,PCRE2_SPTR ptr,dfa_match_block * mb,PCRE2_SIZE extracode,PCRE2_SIZE * lengthptr)353 do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
354 PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
355 PCRE2_SIZE *lengthptr)
356 {
357 pcre2_callout_block *cb = mb->cb;
358
359 *lengthptr = (code[extracode] == OP_CALLOUT)?
360 (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
361 (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
362
363 if (mb->callout == NULL) return 0; /* No callout provided */
364
365 /* Fixed fields in the callout block are set once and for all at the start of
366 matching. */
367
368 cb->offset_vector = offsets;
369 cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject);
370 cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
371 cb->pattern_position = GET(code, 1 + extracode);
372 cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
373
374 if (code[extracode] == OP_CALLOUT)
375 {
376 cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
377 cb->callout_string_offset = 0;
378 cb->callout_string = NULL;
379 cb->callout_string_length = 0;
380 }
381 else
382 {
383 cb->callout_number = 0;
384 cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
385 cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
386 cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
387 }
388
389 return (mb->callout)(cb, mb->callout_data);
390 }
391
392
393
394 /*************************************************
395 * Expand local workspace memory *
396 *************************************************/
397
398 /* This function is called when internal_dfa_match() is about to be called
399 recursively and there is insufficient working space left in the current
400 workspace block. If there's an existing next block, use it; otherwise get a new
401 block unless the heap limit is reached.
402
403 Arguments:
404 rwsptr pointer to block pointer (updated)
405 ovecsize space needed for an ovector
406 mb the match block
407
408 Returns: 0 rwsptr has been updated
409 !0 an error code
410 */
411
412 static int
more_workspace(RWS_anchor ** rwsptr,unsigned int ovecsize,dfa_match_block * mb)413 more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
414 {
415 RWS_anchor *rws = *rwsptr;
416 RWS_anchor *new;
417
418 if (rws->next != NULL)
419 {
420 new = rws->next;
421 }
422
423 /* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
424 mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
425 overflow. */
426
427 else
428 {
429 uint32_t newsize = (rws->size >= UINT32_MAX/2)? UINT32_MAX/2 : rws->size * 2;
430 uint32_t newsizeK = newsize/(1024/sizeof(int));
431
432 if (newsizeK + mb->heap_used > mb->heap_limit)
433 newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
434 newsize = newsizeK*(1024/sizeof(int));
435
436 if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
437 return PCRE2_ERROR_HEAPLIMIT;
438 new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
439 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
440 mb->heap_used += newsizeK;
441 new->next = NULL;
442 new->size = newsize;
443 rws->next = new;
444 }
445
446 new->free = new->size - RWS_ANCHOR_SIZE;
447 *rwsptr = new;
448 return 0;
449 }
450
451
452
453 /*************************************************
454 * Match a Regular Expression - DFA engine *
455 *************************************************/
456
457 /* This internal function applies a compiled pattern to a subject string,
458 starting at a given point, using a DFA engine. This function is called from the
459 external one, possibly multiple times if the pattern is not anchored. The
460 function calls itself recursively for some kinds of subpattern.
461
462 Arguments:
463 mb the match_data block with fixed information
464 this_start_code the opening bracket of this subexpression's code
465 current_subject where we currently are in the subject string
466 start_offset start offset in the subject string
467 offsets vector to contain the matching string offsets
468 offsetcount size of same
469 workspace vector of workspace
470 wscount size of same
471 rlevel function call recursion level
472
473 Returns: > 0 => number of match offset pairs placed in offsets
474 = 0 => offsets overflowed; longest matches are present
475 -1 => failed to match
476 < -1 => some kind of unexpected problem
477
478 The following macros are used for adding states to the two state vectors (one
479 for the current character, one for the following character). */
480
/* Append a state with opcode offset x and repeat count y to the end of the
active list. The state's data field is not written. active_count is
incremented even when the list is already full, in which case the enclosing
function returns PCRE2_ERROR_DFA_WSSIZE. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)
489
/* As ADD_ACTIVE, but the new active state's data field is set to z as well.
active_count is incremented even when the list is already full, in which case
the enclosing function returns PCRE2_ERROR_DFA_WSSIZE. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state->data = (z); \
      next_active_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)
499
/* Append a state with opcode offset x and repeat count y to the end of the
new list, the one used for the following character. The state's data field is
not written. new_count is incremented even when the list is already full, in
which case the enclosing function returns PCRE2_ERROR_DFA_WSSIZE. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)
508
/* As ADD_NEW, but the new state's data field is set to z as well. new_count
is incremented even when the list is already full, in which case the enclosing
function returns PCRE2_ERROR_DFA_WSSIZE. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state->data = (z); \
      next_new_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)
518
519 /* And now, here is the code */
520
521 static int
internal_dfa_match(dfa_match_block * mb,PCRE2_SPTR this_start_code,PCRE2_SPTR current_subject,PCRE2_SIZE start_offset,PCRE2_SIZE * offsets,uint32_t offsetcount,int * workspace,int wscount,uint32_t rlevel,int * RWS)522 internal_dfa_match(
523 dfa_match_block *mb,
524 PCRE2_SPTR this_start_code,
525 PCRE2_SPTR current_subject,
526 PCRE2_SIZE start_offset,
527 PCRE2_SIZE *offsets,
528 uint32_t offsetcount,
529 int *workspace,
530 int wscount,
531 uint32_t rlevel,
532 int *RWS)
533 {
534 stateblock *active_states, *new_states, *temp_states;
535 stateblock *next_active_state, *next_new_state;
536 const uint8_t *ctypes, *lcc, *fcc;
537 PCRE2_SPTR ptr;
538 PCRE2_SPTR end_code;
539 dfa_recursion_info new_recursive;
540 int active_count, new_count, match_count;
541
542 /* Some fields in the mb block are frequently referenced, so we load them into
543 independent variables in the hope that this will perform better. */
544
545 PCRE2_SPTR start_subject = mb->start_subject;
546 PCRE2_SPTR end_subject = mb->end_subject;
547 PCRE2_SPTR start_code = mb->start_code;
548
549 #ifdef SUPPORT_UNICODE
550 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
551 #else
552 BOOL utf = FALSE;
553 #endif
554
555 BOOL reset_could_continue = FALSE;
556
557 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
558 if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
559 offsetcount &= (uint32_t)(-2); /* Round down */
560
561 wscount -= 2;
562 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
563 (2 * INTS_PER_STATEBLOCK);
564
565 ctypes = mb->tables + ctypes_offset;
566 lcc = mb->tables + lcc_offset;
567 fcc = mb->tables + fcc_offset;
568
569 match_count = PCRE2_ERROR_NOMATCH; /* A negative number */
570
571 active_states = (stateblock *)(workspace + 2);
572 next_new_state = new_states = active_states + wscount;
573 new_count = 0;
574
575 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
576 the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
578 matching internal ket rather than at the end.
579
580 If we are dealing with a backward assertion we have to find out the maximum
581 amount to move back, and set up each alternative appropriately. */
582
583 if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
584 {
585 size_t max_back = 0;
586 size_t gone_back;
587
588 end_code = this_start_code;
589 do
590 {
591 size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
592 if (back > max_back) max_back = back;
593 end_code += GET(end_code, 1);
594 }
595 while (*end_code == OP_ALT);
596
597 /* If we can't go back the amount required for the longest lookbehind
598 pattern, go back as far as we can; some alternatives may still be viable. */
599
600 #ifdef SUPPORT_UNICODE
601 /* In character mode we have to step back character by character */
602
603 if (utf)
604 {
605 for (gone_back = 0; gone_back < max_back; gone_back++)
606 {
607 if (current_subject <= start_subject) break;
608 current_subject--;
609 ACROSSCHAR(current_subject > start_subject, current_subject,
610 current_subject--);
611 }
612 }
613 else
614 #endif
615
616 /* In byte-mode we can do this quickly. */
617
618 {
619 size_t current_offset = (size_t)(current_subject - start_subject);
620 gone_back = (current_offset < max_back)? current_offset : max_back;
621 current_subject -= gone_back;
622 }
623
624 /* Save the earliest consulted character */
625
626 if (current_subject < mb->start_used_ptr)
627 mb->start_used_ptr = current_subject;
628
629 /* Now we can process the individual branches. There will be an OP_REVERSE at
630 the start of each branch, except when the length of the branch is zero. */
631
632 end_code = this_start_code;
633 do
634 {
635 uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
636 size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
637 if (back <= gone_back)
638 {
639 int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
640 ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
641 }
642 end_code += GET(end_code, 1);
643 }
644 while (*end_code == OP_ALT);
645 }
646
647 /* This is the code for a "normal" subpattern (not a backward assertion). The
648 start of a whole pattern is always one of these. If we are at the top level,
649 we may be asked to restart matching from the same point that we reached for a
650 previous partial match. We still have to scan through the top-level branches to
651 find the end state. */
652
653 else
654 {
655 end_code = this_start_code;
656
657 /* Restarting */
658
659 if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
660 {
661 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
662 new_count = workspace[1];
663 if (!workspace[0])
664 memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
665 }
666
667 /* Not restarting */
668
669 else
670 {
671 int length = 1 + LINK_SIZE +
672 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
673 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
674 ? IMM2_SIZE:0);
675 do
676 {
677 ADD_NEW((int)(end_code - start_code + length), 0);
678 end_code += GET(end_code, 1);
679 length = 1 + LINK_SIZE;
680 }
681 while (*end_code == OP_ALT);
682 }
683 }
684
685 workspace[0] = 0; /* Bit indicating which vector is current */
686
687 /* Loop for scanning the subject */
688
689 ptr = current_subject;
690 for (;;)
691 {
692 int i, j;
693 int clen, dlen;
694 uint32_t c, d;
695 int forced_fail = 0;
696 BOOL partial_newline = FALSE;
697 BOOL could_continue = reset_could_continue;
698 reset_could_continue = FALSE;
699
700 if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
701
702 /* Make the new state list into the active state list and empty the
703 new state list. */
704
705 temp_states = active_states;
706 active_states = new_states;
707 new_states = temp_states;
708 active_count = new_count;
709 new_count = 0;
710
711 workspace[0] ^= 1; /* Remember for the restarting feature */
712 workspace[1] = active_count;
713
714 /* Set the pointers for adding new states */
715
716 next_active_state = active_states + active_count;
717 next_new_state = new_states;
718
719 /* Load the current character from the subject outside the loop, as many
720 different states may want to look at it, and we assume that at least one
721 will. */
722
723 if (ptr < end_subject)
724 {
725 clen = 1; /* Number of data items in the character */
726 #ifdef SUPPORT_UNICODE
727 GETCHARLENTEST(c, ptr, clen);
728 #else
729 c = *ptr;
730 #endif /* SUPPORT_UNICODE */
731 }
732 else
733 {
734 clen = 0; /* This indicates the end of the subject */
735 c = NOTACHAR; /* This value should never actually be used */
736 }
737
738 /* Scan up the active states and act on each one. The result of an action
739 may be to add more states to the currently active list (e.g. on hitting a
740 parenthesis) or it may be to put states on the new list, for considering
741 when we move the character pointer on. */
742
743 for (i = 0; i < active_count; i++)
744 {
745 stateblock *current_state = active_states + i;
746 BOOL caseless = FALSE;
747 PCRE2_SPTR code;
748 uint32_t codevalue;
749 int state_offset = current_state->offset;
750 int rrc;
751 int count;
752
753 /* A negative offset is a special case meaning "hold off going to this
754 (negated) state until the number of characters in the data field have
755 been skipped". If the could_continue flag was passed over from a previous
state, arrange for it to be passed on. */
757
758 if (state_offset < 0)
759 {
760 if (current_state->data > 0)
761 {
762 ADD_NEW_DATA(state_offset, current_state->count,
763 current_state->data - 1);
764 if (could_continue) reset_could_continue = TRUE;
765 continue;
766 }
767 else
768 {
769 current_state->offset = state_offset = -state_offset;
770 }
771 }
772
773 /* Check for a duplicate state with the same count, and skip if found.
774 See the note at the head of this module about the possibility of improving
775 performance here. */
776
777 for (j = 0; j < i; j++)
778 {
779 if (active_states[j].offset == state_offset &&
780 active_states[j].count == current_state->count)
781 goto NEXT_ACTIVE_STATE;
782 }
783
784 /* The state offset is the offset to the opcode */
785
786 code = start_code + state_offset;
787 codevalue = *code;
788
789 /* If this opcode inspects a character, but we are at the end of the
790 subject, remember the fact for use when testing for a partial match. */
791
792 if (clen == 0 && poptable[codevalue] != 0)
793 could_continue = TRUE;
794
795 /* If this opcode is followed by an inline character, load it. It is
796 tempting to test for the presence of a subject character here, but that
797 is wrong, because sometimes zero repetitions of the subject are
798 permitted.
799
800 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
801 argument that is not a data character - but is always one byte long because
802 the values are small. We have to take special action to deal with \P, \p,
803 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
804 these ones to new opcodes. */
805
806 if (coptable[codevalue] > 0)
807 {
808 dlen = 1;
809 #ifdef SUPPORT_UNICODE
810 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
811 #endif /* SUPPORT_UNICODE */
812 d = code[coptable[codevalue]];
813 if (codevalue >= OP_TYPESTAR)
814 {
815 switch(d)
816 {
817 case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
818 case OP_NOTPROP:
819 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
820 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
821 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
822 case OP_NOT_HSPACE:
823 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
824 case OP_NOT_VSPACE:
825 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
826 default: break;
827 }
828 }
829 }
830 else
831 {
832 dlen = 0; /* Not strictly necessary, but compilers moan */
833 d = NOTACHAR; /* if these variables are not set. */
834 }
835
836
837 /* Now process the individual opcodes */
838
839 switch (codevalue)
840 {
841 /* ========================================================================== */
842 /* These cases are never obeyed. This is a fudge that causes a compile-
843 time error if the vectors coptable or poptable, which are indexed by
844 opcode, are not the correct length. It seems to be the only way to do
845 such a check at compile time, as the sizeof() operator does not work
846 in the C preprocessor. */
847
848 case OP_TABLE_LENGTH:
849 case OP_TABLE_LENGTH +
850 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
851 (sizeof(poptable) == OP_TABLE_LENGTH)):
852 return 0;
853
854 /* ========================================================================== */
855 /* Reached a closing bracket. If not at the end of the pattern, carry
856 on with the next opcode. For repeating opcodes, also add the repeat
857 state. Note that KETRPOS will always be encountered at the end of the
858 subpattern, because the possessive subpattern repeats are always handled
859 using recursive calls. Thus, it never adds any new states.
860
861 At the end of the (sub)pattern, unless we have an empty string and
862 PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
863 start of the subject, save the match data, shifting up all previous
864 matches so we always have the longest first. */
865
866 case OP_KET:
867 case OP_KETRMIN:
868 case OP_KETRMAX:
869 case OP_KETRPOS:
870 if (code != end_code)
871 {
872 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
873 if (codevalue != OP_KET)
874 {
875 ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
876 }
877 }
878 else
879 {
880 if (ptr > current_subject ||
881 ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
882 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
883 current_subject > start_subject + mb->start_offset)))
884 {
885 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
886 else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
887 match_count = 0;
888 count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
889 if (count > 0) (void)memmove(offsets + 2, offsets,
890 (size_t)count * sizeof(PCRE2_SIZE));
891 if (offsetcount >= 2)
892 {
893 offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
894 offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
895 }
896 if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
897 }
898 }
899 break;
900
901 /* ========================================================================== */
902 /* These opcodes add to the current list of states without looking
903 at the current character. */
904
905 /*-----------------------------------------------------------------*/
906 case OP_ALT:
907 do { code += GET(code, 1); } while (*code == OP_ALT);
908 ADD_ACTIVE((int)(code - start_code), 0);
909 break;
910
911 /*-----------------------------------------------------------------*/
912 case OP_BRA:
913 case OP_SBRA:
914 do
915 {
916 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
917 code += GET(code, 1);
918 }
919 while (*code == OP_ALT);
920 break;
921
922 /*-----------------------------------------------------------------*/
923 case OP_CBRA:
924 case OP_SCBRA:
925 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
926 code += GET(code, 1);
927 while (*code == OP_ALT)
928 {
929 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
930 code += GET(code, 1);
931 }
932 break;
933
934 /*-----------------------------------------------------------------*/
935 case OP_BRAZERO:
936 case OP_BRAMINZERO:
937 ADD_ACTIVE(state_offset + 1, 0);
938 code += 1 + GET(code, 2);
939 while (*code == OP_ALT) code += GET(code, 1);
940 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
941 break;
942
943 /*-----------------------------------------------------------------*/
944 case OP_SKIPZERO:
945 code += 1 + GET(code, 2);
946 while (*code == OP_ALT) code += GET(code, 1);
947 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
948 break;
949
950 /*-----------------------------------------------------------------*/
951 case OP_CIRC:
952 if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
953 { ADD_ACTIVE(state_offset + 1, 0); }
954 break;
955
956 /*-----------------------------------------------------------------*/
957 case OP_CIRCM:
958 if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
959 ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
960 && WAS_NEWLINE(ptr)))
961 { ADD_ACTIVE(state_offset + 1, 0); }
962 break;
963
964 /*-----------------------------------------------------------------*/
965 case OP_EOD:
966 if (ptr >= end_subject)
967 {
968 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
969 return PCRE2_ERROR_PARTIAL;
970 else { ADD_ACTIVE(state_offset + 1, 0); }
971 }
972 break;
973
974 /*-----------------------------------------------------------------*/
975 case OP_SOD:
976 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
977 break;
978
979 /*-----------------------------------------------------------------*/
980 case OP_SOM:
981 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
982 break;
983
984
985 /* ========================================================================== */
986 /* These opcodes inspect the next subject character, and sometimes
987 the previous one as well, but do not have an argument. The variable
988 clen contains the length of the current character and is zero if we are
989 at the end of the subject. */
990
991 /*-----------------------------------------------------------------*/
992 case OP_ANY:
993 if (clen > 0 && !IS_NEWLINE(ptr))
994 {
995 if (ptr + 1 >= mb->end_subject &&
996 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
997 NLBLOCK->nltype == NLTYPE_FIXED &&
998 NLBLOCK->nllen == 2 &&
999 c == NLBLOCK->nl[0])
1000 {
1001 could_continue = partial_newline = TRUE;
1002 }
1003 else
1004 {
1005 ADD_NEW(state_offset + 1, 0);
1006 }
1007 }
1008 break;
1009
1010 /*-----------------------------------------------------------------*/
1011 case OP_ALLANY:
1012 if (clen > 0)
1013 { ADD_NEW(state_offset + 1, 0); }
1014 break;
1015
1016 /*-----------------------------------------------------------------*/
1017 case OP_EODN:
1018 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1019 {
1020 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1021 return PCRE2_ERROR_PARTIAL;
1022 ADD_ACTIVE(state_offset + 1, 0);
1023 }
1024 break;
1025
1026 /*-----------------------------------------------------------------*/
1027 case OP_DOLL:
1028 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1029 {
1030 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1031 could_continue = TRUE;
1032 else if (clen == 0 ||
1033 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1034 (ptr == end_subject - mb->nllen)
1035 ))
1036 { ADD_ACTIVE(state_offset + 1, 0); }
1037 else if (ptr + 1 >= mb->end_subject &&
1038 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1039 NLBLOCK->nltype == NLTYPE_FIXED &&
1040 NLBLOCK->nllen == 2 &&
1041 c == NLBLOCK->nl[0])
1042 {
1043 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1044 {
1045 reset_could_continue = TRUE;
1046 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1047 }
1048 else could_continue = partial_newline = TRUE;
1049 }
1050 }
1051 break;
1052
1053 /*-----------------------------------------------------------------*/
1054 case OP_DOLLM:
1055 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1056 {
1057 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1058 could_continue = TRUE;
1059 else if (clen == 0 ||
1060 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1061 { ADD_ACTIVE(state_offset + 1, 0); }
1062 else if (ptr + 1 >= mb->end_subject &&
1063 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1064 NLBLOCK->nltype == NLTYPE_FIXED &&
1065 NLBLOCK->nllen == 2 &&
1066 c == NLBLOCK->nl[0])
1067 {
1068 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1069 {
1070 reset_could_continue = TRUE;
1071 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1072 }
1073 else could_continue = partial_newline = TRUE;
1074 }
1075 }
1076 else if (IS_NEWLINE(ptr))
1077 { ADD_ACTIVE(state_offset + 1, 0); }
1078 break;
1079
1080 /*-----------------------------------------------------------------*/
1081
1082 case OP_DIGIT:
1083 case OP_WHITESPACE:
1084 case OP_WORDCHAR:
1085 if (clen > 0 && c < 256 &&
1086 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1087 { ADD_NEW(state_offset + 1, 0); }
1088 break;
1089
1090 /*-----------------------------------------------------------------*/
1091 case OP_NOT_DIGIT:
1092 case OP_NOT_WHITESPACE:
1093 case OP_NOT_WORDCHAR:
1094 if (clen > 0 && (c >= 256 ||
1095 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1096 { ADD_NEW(state_offset + 1, 0); }
1097 break;
1098
1099 /*-----------------------------------------------------------------*/
1100 case OP_WORD_BOUNDARY:
1101 case OP_NOT_WORD_BOUNDARY:
1102 {
1103 int left_word, right_word;
1104
1105 if (ptr > start_subject)
1106 {
1107 PCRE2_SPTR temp = ptr - 1;
1108 if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1109 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1110 if (utf) { BACKCHAR(temp); }
1111 #endif
1112 GETCHARTEST(d, temp);
1113 #ifdef SUPPORT_UNICODE
1114 if ((mb->poptions & PCRE2_UCP) != 0)
1115 {
1116 if (d == '_') left_word = TRUE; else
1117 {
1118 uint32_t cat = UCD_CATEGORY(d);
1119 left_word = (cat == ucp_L || cat == ucp_N);
1120 }
1121 }
1122 else
1123 #endif
1124 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1125 }
1126 else left_word = FALSE;
1127
1128 if (clen > 0)
1129 {
1130 if (ptr >= mb->last_used_ptr)
1131 {
1132 PCRE2_SPTR temp = ptr + 1;
1133 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1134 if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1135 #endif
1136 mb->last_used_ptr = temp;
1137 }
1138 #ifdef SUPPORT_UNICODE
1139 if ((mb->poptions & PCRE2_UCP) != 0)
1140 {
1141 if (c == '_') right_word = TRUE; else
1142 {
1143 uint32_t cat = UCD_CATEGORY(c);
1144 right_word = (cat == ucp_L || cat == ucp_N);
1145 }
1146 }
1147 else
1148 #endif
1149 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1150 }
1151 else right_word = FALSE;
1152
1153 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1154 { ADD_ACTIVE(state_offset + 1, 0); }
1155 }
1156 break;
1157
1158
1159 /*-----------------------------------------------------------------*/
1160 /* Check the next character by Unicode property. We will get here only
1161 if the support is in the binary; otherwise a compile-time error occurs.
1162 */
1163
1164 #ifdef SUPPORT_UNICODE
1165 case OP_PROP:
1166 case OP_NOTPROP:
1167 if (clen > 0)
1168 {
1169 BOOL OK;
1170 const uint32_t *cp;
1171 const ucd_record * prop = GET_UCD(c);
1172 switch(code[1])
1173 {
1174 case PT_ANY:
1175 OK = TRUE;
1176 break;
1177
1178 case PT_LAMP:
1179 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1180 prop->chartype == ucp_Lt;
1181 break;
1182
1183 case PT_GC:
1184 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1185 break;
1186
1187 case PT_PC:
1188 OK = prop->chartype == code[2];
1189 break;
1190
1191 case PT_SC:
1192 OK = prop->script == code[2];
1193 break;
1194
1195 /* These are specials for combination cases. */
1196
1197 case PT_ALNUM:
1198 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1199 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1200 break;
1201
1202 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1203 which means that Perl space and POSIX space are now identical. PCRE
1204 was changed at release 8.34. */
1205
1206 case PT_SPACE: /* Perl space */
1207 case PT_PXSPACE: /* POSIX space */
1208 switch(c)
1209 {
1210 HSPACE_CASES:
1211 VSPACE_CASES:
1212 OK = TRUE;
1213 break;
1214
1215 default:
1216 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1217 break;
1218 }
1219 break;
1220
1221 case PT_WORD:
1222 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1223 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1224 c == CHAR_UNDERSCORE;
1225 break;
1226
1227 case PT_CLIST:
1228 cp = PRIV(ucd_caseless_sets) + code[2];
1229 for (;;)
1230 {
1231 if (c < *cp) { OK = FALSE; break; }
1232 if (c == *cp++) { OK = TRUE; break; }
1233 }
1234 break;
1235
1236 case PT_UCNC:
1237 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1238 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1239 c >= 0xe000;
1240 break;
1241
1242 /* Should never occur, but keep compilers from grumbling. */
1243
1244 default:
1245 OK = codevalue != OP_PROP;
1246 break;
1247 }
1248
1249 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1250 }
1251 break;
1252 #endif
1253
1254
1255
1256 /* ========================================================================== */
1257 /* These opcodes likewise inspect the subject character, but have an
1258 argument that is not a data character. It is one of these opcodes:
1259 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1260 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1261
1262 case OP_TYPEPLUS:
1263 case OP_TYPEMINPLUS:
1264 case OP_TYPEPOSPLUS:
1265 count = current_state->count; /* Already matched */
1266 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1267 if (clen > 0)
1268 {
1269 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1270 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1271 NLBLOCK->nltype == NLTYPE_FIXED &&
1272 NLBLOCK->nllen == 2 &&
1273 c == NLBLOCK->nl[0])
1274 {
1275 could_continue = partial_newline = TRUE;
1276 }
1277 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1278 (c < 256 &&
1279 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1280 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1281 {
1282 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1283 {
1284 active_count--; /* Remove non-match possibility */
1285 next_active_state--;
1286 }
1287 count++;
1288 ADD_NEW(state_offset, count);
1289 }
1290 }
1291 break;
1292
1293 /*-----------------------------------------------------------------*/
1294 case OP_TYPEQUERY:
1295 case OP_TYPEMINQUERY:
1296 case OP_TYPEPOSQUERY:
1297 ADD_ACTIVE(state_offset + 2, 0);
1298 if (clen > 0)
1299 {
1300 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1301 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1302 NLBLOCK->nltype == NLTYPE_FIXED &&
1303 NLBLOCK->nllen == 2 &&
1304 c == NLBLOCK->nl[0])
1305 {
1306 could_continue = partial_newline = TRUE;
1307 }
1308 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1309 (c < 256 &&
1310 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1311 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1312 {
1313 if (codevalue == OP_TYPEPOSQUERY)
1314 {
1315 active_count--; /* Remove non-match possibility */
1316 next_active_state--;
1317 }
1318 ADD_NEW(state_offset + 2, 0);
1319 }
1320 }
1321 break;
1322
1323 /*-----------------------------------------------------------------*/
1324 case OP_TYPESTAR:
1325 case OP_TYPEMINSTAR:
1326 case OP_TYPEPOSSTAR:
1327 ADD_ACTIVE(state_offset + 2, 0);
1328 if (clen > 0)
1329 {
1330 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1331 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1332 NLBLOCK->nltype == NLTYPE_FIXED &&
1333 NLBLOCK->nllen == 2 &&
1334 c == NLBLOCK->nl[0])
1335 {
1336 could_continue = partial_newline = TRUE;
1337 }
1338 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1339 (c < 256 &&
1340 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1341 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1342 {
1343 if (codevalue == OP_TYPEPOSSTAR)
1344 {
1345 active_count--; /* Remove non-match possibility */
1346 next_active_state--;
1347 }
1348 ADD_NEW(state_offset, 0);
1349 }
1350 }
1351 break;
1352
1353 /*-----------------------------------------------------------------*/
1354 case OP_TYPEEXACT:
1355 count = current_state->count; /* Number already matched */
1356 if (clen > 0)
1357 {
1358 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1359 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1360 NLBLOCK->nltype == NLTYPE_FIXED &&
1361 NLBLOCK->nllen == 2 &&
1362 c == NLBLOCK->nl[0])
1363 {
1364 could_continue = partial_newline = TRUE;
1365 }
1366 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1367 (c < 256 &&
1368 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1369 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1370 {
1371 if (++count >= (int)GET2(code, 1))
1372 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1373 else
1374 { ADD_NEW(state_offset, count); }
1375 }
1376 }
1377 break;
1378
1379 /*-----------------------------------------------------------------*/
1380 case OP_TYPEUPTO:
1381 case OP_TYPEMINUPTO:
1382 case OP_TYPEPOSUPTO:
1383 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1384 count = current_state->count; /* Number already matched */
1385 if (clen > 0)
1386 {
1387 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1388 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1389 NLBLOCK->nltype == NLTYPE_FIXED &&
1390 NLBLOCK->nllen == 2 &&
1391 c == NLBLOCK->nl[0])
1392 {
1393 could_continue = partial_newline = TRUE;
1394 }
1395 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1396 (c < 256 &&
1397 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1398 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1399 {
1400 if (codevalue == OP_TYPEPOSUPTO)
1401 {
1402 active_count--; /* Remove non-match possibility */
1403 next_active_state--;
1404 }
1405 if (++count >= (int)GET2(code, 1))
1406 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1407 else
1408 { ADD_NEW(state_offset, count); }
1409 }
1410 }
1411 break;
1412
1413 /* ========================================================================== */
1414 /* These are virtual opcodes that are used when something like
1415 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a vertical or
1416 horizontal space type as its argument. It keeps the code above fast for the
1417 other cases. The argument is in the d variable. */
1418
1419 #ifdef SUPPORT_UNICODE
1420 case OP_PROP_EXTRA + OP_TYPEPLUS:
1421 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1422 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1423 count = current_state->count; /* Already matched */
1424 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1425 if (clen > 0)
1426 {
1427 BOOL OK;
1428 const uint32_t *cp;
1429 const ucd_record * prop = GET_UCD(c);
1430 switch(code[2])
1431 {
1432 case PT_ANY:
1433 OK = TRUE;
1434 break;
1435
1436 case PT_LAMP:
1437 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1438 prop->chartype == ucp_Lt;
1439 break;
1440
1441 case PT_GC:
1442 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1443 break;
1444
1445 case PT_PC:
1446 OK = prop->chartype == code[3];
1447 break;
1448
1449 case PT_SC:
1450 OK = prop->script == code[3];
1451 break;
1452
1453 /* These are specials for combination cases. */
1454
1455 case PT_ALNUM:
1456 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1457 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1458 break;
1459
1460 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1461 which means that Perl space and POSIX space are now identical. PCRE
1462 was changed at release 8.34. */
1463
1464 case PT_SPACE: /* Perl space */
1465 case PT_PXSPACE: /* POSIX space */
1466 switch(c)
1467 {
1468 HSPACE_CASES:
1469 VSPACE_CASES:
1470 OK = TRUE;
1471 break;
1472
1473 default:
1474 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1475 break;
1476 }
1477 break;
1478
1479 case PT_WORD:
1480 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1481 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1482 c == CHAR_UNDERSCORE;
1483 break;
1484
1485 case PT_CLIST:
1486 cp = PRIV(ucd_caseless_sets) + code[3];
1487 for (;;)
1488 {
1489 if (c < *cp) { OK = FALSE; break; }
1490 if (c == *cp++) { OK = TRUE; break; }
1491 }
1492 break;
1493
1494 case PT_UCNC:
1495 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1496 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1497 c >= 0xe000;
1498 break;
1499
1500 /* Should never occur, but keep compilers from grumbling. */
1501
1502 default:
1503 OK = codevalue != OP_PROP;
1504 break;
1505 }
1506
1507 if (OK == (d == OP_PROP))
1508 {
1509 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1510 {
1511 active_count--; /* Remove non-match possibility */
1512 next_active_state--;
1513 }
1514 count++;
1515 ADD_NEW(state_offset, count);
1516 }
1517 }
1518 break;
1519
1520 /*-----------------------------------------------------------------*/
1521 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1522 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1523 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1524 count = current_state->count; /* Already matched */
1525 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1526 if (clen > 0)
1527 {
1528 int ncount = 0;
1529 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1530 {
1531 active_count--; /* Remove non-match possibility */
1532 next_active_state--;
1533 }
1534 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1535 &ncount);
1536 count++;
1537 ADD_NEW_DATA(-state_offset, count, ncount);
1538 }
1539 break;
1540 #endif
1541
1542 /*-----------------------------------------------------------------*/
1543 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1544 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1545 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1546 count = current_state->count; /* Already matched */
1547 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1548 if (clen > 0)
1549 {
1550 int ncount = 0;
1551 switch (c)
1552 {
1553 case CHAR_VT:
1554 case CHAR_FF:
1555 case CHAR_NEL:
1556 #ifndef EBCDIC
1557 case 0x2028:
1558 case 0x2029:
1559 #endif /* Not EBCDIC */
1560 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1561 goto ANYNL01;
1562
1563 case CHAR_CR:
1564 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1565 /* Fall through */
1566
1567 ANYNL01:
1568 case CHAR_LF:
1569 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1570 {
1571 active_count--; /* Remove non-match possibility */
1572 next_active_state--;
1573 }
1574 count++;
1575 ADD_NEW_DATA(-state_offset, count, ncount);
1576 break;
1577
1578 default:
1579 break;
1580 }
1581 }
1582 break;
1583
1584 /*-----------------------------------------------------------------*/
1585 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1586 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1587 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1588 count = current_state->count; /* Already matched */
1589 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1590 if (clen > 0)
1591 {
1592 BOOL OK;
1593 switch (c)
1594 {
1595 VSPACE_CASES:
1596 OK = TRUE;
1597 break;
1598
1599 default:
1600 OK = FALSE;
1601 break;
1602 }
1603
1604 if (OK == (d == OP_VSPACE))
1605 {
1606 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1607 {
1608 active_count--; /* Remove non-match possibility */
1609 next_active_state--;
1610 }
1611 count++;
1612 ADD_NEW_DATA(-state_offset, count, 0);
1613 }
1614 }
1615 break;
1616
1617 /*-----------------------------------------------------------------*/
1618 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1619 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1620 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1621 count = current_state->count; /* Already matched */
1622 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1623 if (clen > 0)
1624 {
1625 BOOL OK;
1626 switch (c)
1627 {
1628 HSPACE_CASES:
1629 OK = TRUE;
1630 break;
1631
1632 default:
1633 OK = FALSE;
1634 break;
1635 }
1636
1637 if (OK == (d == OP_HSPACE))
1638 {
1639 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1640 {
1641 active_count--; /* Remove non-match possibility */
1642 next_active_state--;
1643 }
1644 count++;
1645 ADD_NEW_DATA(-state_offset, count, 0);
1646 }
1647 }
1648 break;
1649
1650 /*-----------------------------------------------------------------*/
1651 #ifdef SUPPORT_UNICODE
1652 case OP_PROP_EXTRA + OP_TYPEQUERY:
1653 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1654 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1655 count = 4;
1656 goto QS1;
1657
1658 case OP_PROP_EXTRA + OP_TYPESTAR:
1659 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1660 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1661 count = 0;
1662
1663 QS1:
1664
1665 ADD_ACTIVE(state_offset + 4, 0);
1666 if (clen > 0)
1667 {
1668 BOOL OK;
1669 const uint32_t *cp;
1670 const ucd_record * prop = GET_UCD(c);
1671 switch(code[2])
1672 {
1673 case PT_ANY:
1674 OK = TRUE;
1675 break;
1676
1677 case PT_LAMP:
1678 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1679 prop->chartype == ucp_Lt;
1680 break;
1681
1682 case PT_GC:
1683 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1684 break;
1685
1686 case PT_PC:
1687 OK = prop->chartype == code[3];
1688 break;
1689
1690 case PT_SC:
1691 OK = prop->script == code[3];
1692 break;
1693
1694 /* These are specials for combination cases. */
1695
1696 case PT_ALNUM:
1697 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1698 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1699 break;
1700
1701 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1702 which means that Perl space and POSIX space are now identical. PCRE
1703 was changed at release 8.34. */
1704
1705 case PT_SPACE: /* Perl space */
1706 case PT_PXSPACE: /* POSIX space */
1707 switch(c)
1708 {
1709 HSPACE_CASES:
1710 VSPACE_CASES:
1711 OK = TRUE;
1712 break;
1713
1714 default:
1715 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1716 break;
1717 }
1718 break;
1719
1720 case PT_WORD:
1721 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1722 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1723 c == CHAR_UNDERSCORE;
1724 break;
1725
1726 case PT_CLIST:
1727 cp = PRIV(ucd_caseless_sets) + code[3];
1728 for (;;)
1729 {
1730 if (c < *cp) { OK = FALSE; break; }
1731 if (c == *cp++) { OK = TRUE; break; }
1732 }
1733 break;
1734
1735 case PT_UCNC:
1736 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1737 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1738 c >= 0xe000;
1739 break;
1740
1741 /* Should never occur, but keep compilers from grumbling. */
1742
1743 default:
1744 OK = codevalue != OP_PROP;
1745 break;
1746 }
1747
1748 if (OK == (d == OP_PROP))
1749 {
1750 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1751 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1752 {
1753 active_count--; /* Remove non-match possibility */
1754 next_active_state--;
1755 }
1756 ADD_NEW(state_offset + count, 0);
1757 }
1758 }
1759 break;
1760
1761 /*-----------------------------------------------------------------*/
1762 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1763 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1764 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1765 count = 2;
1766 goto QS2;
1767
1768 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1769 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1770 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1771 count = 0;
1772
1773 QS2:
1774
1775 ADD_ACTIVE(state_offset + 2, 0);
1776 if (clen > 0)
1777 {
1778 int ncount = 0;
1779 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1780 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1781 {
1782 active_count--; /* Remove non-match possibility */
1783 next_active_state--;
1784 }
1785 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1786 &ncount);
1787 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1788 }
1789 break;
1790 #endif
1791
1792 /*-----------------------------------------------------------------*/
1793 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1794 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1795 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1796 count = 2;
1797 goto QS3;
1798
1799 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1800 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1801 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1802 count = 0;
1803
1804 QS3:
1805 ADD_ACTIVE(state_offset + 2, 0);
1806 if (clen > 0)
1807 {
1808 int ncount = 0;
1809 switch (c)
1810 {
1811 case CHAR_VT:
1812 case CHAR_FF:
1813 case CHAR_NEL:
1814 #ifndef EBCDIC
1815 case 0x2028:
1816 case 0x2029:
1817 #endif /* Not EBCDIC */
1818 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1819 goto ANYNL02;
1820
1821 case CHAR_CR:
1822 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1823 /* Fall through */
1824
1825 ANYNL02:
1826 case CHAR_LF:
1827 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1828 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1829 {
1830 active_count--; /* Remove non-match possibility */
1831 next_active_state--;
1832 }
1833 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1834 break;
1835
1836 default:
1837 break;
1838 }
1839 }
1840 break;
1841
1842 /*-----------------------------------------------------------------*/
1843 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1844 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1845 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1846 count = 2;
1847 goto QS4;
1848
1849 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1850 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1851 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1852 count = 0;
1853
1854 QS4:
1855 ADD_ACTIVE(state_offset + 2, 0);
1856 if (clen > 0)
1857 {
1858 BOOL OK;
1859 switch (c)
1860 {
1861 VSPACE_CASES:
1862 OK = TRUE;
1863 break;
1864
1865 default:
1866 OK = FALSE;
1867 break;
1868 }
1869 if (OK == (d == OP_VSPACE))
1870 {
1871 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1872 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1873 {
1874 active_count--; /* Remove non-match possibility */
1875 next_active_state--;
1876 }
1877 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1878 }
1879 }
1880 break;
1881
1882 /*-----------------------------------------------------------------*/
1883 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1884 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1885 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1886 count = 2;
1887 goto QS5;
1888
1889 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1890 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1891 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1892 count = 0;
1893
1894 QS5:
1895 ADD_ACTIVE(state_offset + 2, 0);
1896 if (clen > 0)
1897 {
1898 BOOL OK;
1899 switch (c)
1900 {
1901 HSPACE_CASES:
1902 OK = TRUE;
1903 break;
1904
1905 default:
1906 OK = FALSE;
1907 break;
1908 }
1909
1910 if (OK == (d == OP_HSPACE))
1911 {
1912 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1913 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1914 {
1915 active_count--; /* Remove non-match possibility */
1916 next_active_state--;
1917 }
1918 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1919 }
1920 }
1921 break;
1922
1923 /*-----------------------------------------------------------------*/
1924 #ifdef SUPPORT_UNICODE
1925 case OP_PROP_EXTRA + OP_TYPEEXACT:
1926 case OP_PROP_EXTRA + OP_TYPEUPTO:
1927 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1928 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1929 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1930 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1931 count = current_state->count; /* Number already matched */
1932 if (clen > 0)
1933 {
1934 BOOL OK;
1935 const uint32_t *cp;
1936 const ucd_record * prop = GET_UCD(c);
1937 switch(code[1 + IMM2_SIZE + 1])
1938 {
1939 case PT_ANY:
1940 OK = TRUE;
1941 break;
1942
1943 case PT_LAMP:
1944 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1945 prop->chartype == ucp_Lt;
1946 break;
1947
1948 case PT_GC:
1949 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1950 break;
1951
1952 case PT_PC:
1953 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1954 break;
1955
1956 case PT_SC:
1957 OK = prop->script == code[1 + IMM2_SIZE + 2];
1958 break;
1959
1960 /* These are specials for combination cases. */
1961
1962 case PT_ALNUM:
1963 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1964 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1965 break;
1966
1967 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1968 which means that Perl space and POSIX space are now identical. PCRE
1969 was changed at release 8.34. */
1970
1971 case PT_SPACE: /* Perl space */
1972 case PT_PXSPACE: /* POSIX space */
1973 switch(c)
1974 {
1975 HSPACE_CASES:
1976 VSPACE_CASES:
1977 OK = TRUE;
1978 break;
1979
1980 default:
1981 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1982 break;
1983 }
1984 break;
1985
1986 case PT_WORD:
1987 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1988 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1989 c == CHAR_UNDERSCORE;
1990 break;
1991
1992 case PT_CLIST:
1993 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1994 for (;;)
1995 {
1996 if (c < *cp) { OK = FALSE; break; }
1997 if (c == *cp++) { OK = TRUE; break; }
1998 }
1999 break;
2000
2001 case PT_UCNC:
2002 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2003 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2004 c >= 0xe000;
2005 break;
2006
2007 /* Should never occur, but keep compilers from grumbling. */
2008
2009 default:
2010 OK = codevalue != OP_PROP;
2011 break;
2012 }
2013
2014 if (OK == (d == OP_PROP))
2015 {
2016 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2017 {
2018 active_count--; /* Remove non-match possibility */
2019 next_active_state--;
2020 }
2021 if (++count >= (int)GET2(code, 1))
2022 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2023 else
2024 { ADD_NEW(state_offset, count); }
2025 }
2026 }
2027 break;
2028
2029 /*-----------------------------------------------------------------*/
2030 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2031 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2032 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2033 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2034 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2035 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2036 count = current_state->count; /* Number already matched */
2037 if (clen > 0)
2038 {
2039 PCRE2_SPTR nptr;
2040 int ncount = 0;
2041 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2042 {
2043 active_count--; /* Remove non-match possibility */
2044 next_active_state--;
2045 }
2046 nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2047 &ncount);
2048 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2049 reset_could_continue = TRUE;
2050 if (++count >= (int)GET2(code, 1))
2051 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2052 else
2053 { ADD_NEW_DATA(-state_offset, count, ncount); }
2054 }
2055 break;
2056 #endif
2057
2058 /*-----------------------------------------------------------------*/
2059 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2060 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2061 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2062 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2063 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2064 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2065 count = current_state->count; /* Number already matched */
2066 if (clen > 0)
2067 {
2068 int ncount = 0;
2069 switch (c)
2070 {
2071 case CHAR_VT:
2072 case CHAR_FF:
2073 case CHAR_NEL:
2074 #ifndef EBCDIC
2075 case 0x2028:
2076 case 0x2029:
2077 #endif /* Not EBCDIC */
2078 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2079 goto ANYNL03;
2080
2081 case CHAR_CR:
2082 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2083 /* Fall through */
2084
2085 ANYNL03:
2086 case CHAR_LF:
2087 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2088 {
2089 active_count--; /* Remove non-match possibility */
2090 next_active_state--;
2091 }
2092 if (++count >= (int)GET2(code, 1))
2093 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2094 else
2095 { ADD_NEW_DATA(-state_offset, count, ncount); }
2096 break;
2097
2098 default:
2099 break;
2100 }
2101 }
2102 break;
2103
2104 /*-----------------------------------------------------------------*/
2105 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2106 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2107 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2108 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2109 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2110 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2111 count = current_state->count; /* Number already matched */
2112 if (clen > 0)
2113 {
2114 BOOL OK;
2115 switch (c)
2116 {
2117 VSPACE_CASES:
2118 OK = TRUE;
2119 break;
2120
2121 default:
2122 OK = FALSE;
2123 }
2124
2125 if (OK == (d == OP_VSPACE))
2126 {
2127 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2128 {
2129 active_count--; /* Remove non-match possibility */
2130 next_active_state--;
2131 }
2132 if (++count >= (int)GET2(code, 1))
2133 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2134 else
2135 { ADD_NEW_DATA(-state_offset, count, 0); }
2136 }
2137 }
2138 break;
2139
2140 /*-----------------------------------------------------------------*/
2141 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2142 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2143 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2144 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2145 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2146 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2147 count = current_state->count; /* Number already matched */
2148 if (clen > 0)
2149 {
2150 BOOL OK;
2151 switch (c)
2152 {
2153 HSPACE_CASES:
2154 OK = TRUE;
2155 break;
2156
2157 default:
2158 OK = FALSE;
2159 break;
2160 }
2161
2162 if (OK == (d == OP_HSPACE))
2163 {
2164 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2165 {
2166 active_count--; /* Remove non-match possibility */
2167 next_active_state--;
2168 }
2169 if (++count >= (int)GET2(code, 1))
2170 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2171 else
2172 { ADD_NEW_DATA(-state_offset, count, 0); }
2173 }
2174 }
2175 break;
2176
2177 /* ========================================================================== */
2178 /* These opcodes are followed by a character that is usually compared
2179 to the current subject character; it is loaded into d. We still get
2180 here even if there is no subject character, because in some cases zero
2181 repetitions are permitted. */
2182
2183 /*-----------------------------------------------------------------*/
2184 case OP_CHAR:
2185 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2186 break;
2187
2188 /*-----------------------------------------------------------------*/
2189 case OP_CHARI:
2190 if (clen == 0) break;
2191
2192 #ifdef SUPPORT_UNICODE
2193 if (utf)
2194 {
2195 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2196 {
2197 unsigned int othercase;
2198 if (c < 128)
2199 othercase = fcc[c];
2200 else
2201 othercase = UCD_OTHERCASE(c);
2202 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2203 }
2204 }
2205 else
2206 #endif /* SUPPORT_UNICODE */
2207 /* Not UTF mode */
2208 {
2209 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2210 { ADD_NEW(state_offset + 2, 0); }
2211 }
2212 break;
2213
2214
2215 #ifdef SUPPORT_UNICODE
2216 /*-----------------------------------------------------------------*/
2217 /* This is a tricky one because it can match more than one character.
2218 Find out how many characters to skip, and then set up a negative state
2219 to wait for them to pass before continuing. */
2220
2221 case OP_EXTUNI:
2222 if (clen > 0)
2223 {
2224 int ncount = 0;
2225 PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2226 end_subject, utf, &ncount);
2227 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2228 reset_could_continue = TRUE;
2229 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2230 }
2231 break;
2232 #endif
2233
2234 /*-----------------------------------------------------------------*/
2235 /* This is tricky, like EXTUNI, because it too can match more than one
2236 character (when CR is followed by LF). In this case, set up a negative
2237 state to wait for one character to pass before continuing. */
2238
2239 case OP_ANYNL:
2240 if (clen > 0) switch(c)
2241 {
2242 case CHAR_VT:
2243 case CHAR_FF:
2244 case CHAR_NEL:
2245 #ifndef EBCDIC
2246 case 0x2028:
2247 case 0x2029:
2248 #endif /* Not EBCDIC */
2249 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2250 /* Fall through */
2251
2252 case CHAR_LF:
2253 ADD_NEW(state_offset + 1, 0);
2254 break;
2255
2256 case CHAR_CR:
2257 if (ptr + 1 >= end_subject)
2258 {
2259 ADD_NEW(state_offset + 1, 0);
2260 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2261 reset_could_continue = TRUE;
2262 }
2263 else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2264 {
2265 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2266 }
2267 else
2268 {
2269 ADD_NEW(state_offset + 1, 0);
2270 }
2271 break;
2272 }
2273 break;
2274
2275 /*-----------------------------------------------------------------*/
2276 case OP_NOT_VSPACE:
2277 if (clen > 0) switch(c)
2278 {
2279 VSPACE_CASES:
2280 break;
2281
2282 default:
2283 ADD_NEW(state_offset + 1, 0);
2284 break;
2285 }
2286 break;
2287
2288 /*-----------------------------------------------------------------*/
2289 case OP_VSPACE:
2290 if (clen > 0) switch(c)
2291 {
2292 VSPACE_CASES:
2293 ADD_NEW(state_offset + 1, 0);
2294 break;
2295
2296 default:
2297 break;
2298 }
2299 break;
2300
2301 /*-----------------------------------------------------------------*/
2302 case OP_NOT_HSPACE:
2303 if (clen > 0) switch(c)
2304 {
2305 HSPACE_CASES:
2306 break;
2307
2308 default:
2309 ADD_NEW(state_offset + 1, 0);
2310 break;
2311 }
2312 break;
2313
2314 /*-----------------------------------------------------------------*/
2315 case OP_HSPACE:
2316 if (clen > 0) switch(c)
2317 {
2318 HSPACE_CASES:
2319 ADD_NEW(state_offset + 1, 0);
2320 break;
2321
2322 default:
2323 break;
2324 }
2325 break;
2326
2327 /*-----------------------------------------------------------------*/
2328 /* Match a negated single character casefully. */
2329
2330 case OP_NOT:
2331 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2332 break;
2333
2334 /*-----------------------------------------------------------------*/
2335 /* Match a negated single character caselessly. */
2336
2337 case OP_NOTI:
2338 if (clen > 0)
2339 {
2340 uint32_t otherd;
2341 #ifdef SUPPORT_UNICODE
2342 if (utf && d >= 128)
2343 otherd = UCD_OTHERCASE(d);
2344 else
2345 #endif /* SUPPORT_UNICODE */
2346 otherd = TABLE_GET(d, fcc, d);
2347 if (c != d && c != otherd)
2348 { ADD_NEW(state_offset + dlen + 1, 0); }
2349 }
2350 break;
2351
2352 /*-----------------------------------------------------------------*/
2353 case OP_PLUSI:
2354 case OP_MINPLUSI:
2355 case OP_POSPLUSI:
2356 case OP_NOTPLUSI:
2357 case OP_NOTMINPLUSI:
2358 case OP_NOTPOSPLUSI:
2359 caseless = TRUE;
2360 codevalue -= OP_STARI - OP_STAR;
2361
2362 /* Fall through */
2363 case OP_PLUS:
2364 case OP_MINPLUS:
2365 case OP_POSPLUS:
2366 case OP_NOTPLUS:
2367 case OP_NOTMINPLUS:
2368 case OP_NOTPOSPLUS:
2369 count = current_state->count; /* Already matched */
2370 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2371 if (clen > 0)
2372 {
2373 uint32_t otherd = NOTACHAR;
2374 if (caseless)
2375 {
2376 #ifdef SUPPORT_UNICODE
2377 if (utf && d >= 128)
2378 otherd = UCD_OTHERCASE(d);
2379 else
2380 #endif /* SUPPORT_UNICODE */
2381 otherd = TABLE_GET(d, fcc, d);
2382 }
2383 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2384 {
2385 if (count > 0 &&
2386 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2387 {
2388 active_count--; /* Remove non-match possibility */
2389 next_active_state--;
2390 }
2391 count++;
2392 ADD_NEW(state_offset, count);
2393 }
2394 }
2395 break;
2396
2397 /*-----------------------------------------------------------------*/
2398 case OP_QUERYI:
2399 case OP_MINQUERYI:
2400 case OP_POSQUERYI:
2401 case OP_NOTQUERYI:
2402 case OP_NOTMINQUERYI:
2403 case OP_NOTPOSQUERYI:
2404 caseless = TRUE;
2405 codevalue -= OP_STARI - OP_STAR;
2406 /* Fall through */
2407 case OP_QUERY:
2408 case OP_MINQUERY:
2409 case OP_POSQUERY:
2410 case OP_NOTQUERY:
2411 case OP_NOTMINQUERY:
2412 case OP_NOTPOSQUERY:
2413 ADD_ACTIVE(state_offset + dlen + 1, 0);
2414 if (clen > 0)
2415 {
2416 uint32_t otherd = NOTACHAR;
2417 if (caseless)
2418 {
2419 #ifdef SUPPORT_UNICODE
2420 if (utf && d >= 128)
2421 otherd = UCD_OTHERCASE(d);
2422 else
2423 #endif /* SUPPORT_UNICODE */
2424 otherd = TABLE_GET(d, fcc, d);
2425 }
2426 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2427 {
2428 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2429 {
2430 active_count--; /* Remove non-match possibility */
2431 next_active_state--;
2432 }
2433 ADD_NEW(state_offset + dlen + 1, 0);
2434 }
2435 }
2436 break;
2437
2438 /*-----------------------------------------------------------------*/
2439 case OP_STARI:
2440 case OP_MINSTARI:
2441 case OP_POSSTARI:
2442 case OP_NOTSTARI:
2443 case OP_NOTMINSTARI:
2444 case OP_NOTPOSSTARI:
2445 caseless = TRUE;
2446 codevalue -= OP_STARI - OP_STAR;
2447 /* Fall through */
2448 case OP_STAR:
2449 case OP_MINSTAR:
2450 case OP_POSSTAR:
2451 case OP_NOTSTAR:
2452 case OP_NOTMINSTAR:
2453 case OP_NOTPOSSTAR:
2454 ADD_ACTIVE(state_offset + dlen + 1, 0);
2455 if (clen > 0)
2456 {
2457 uint32_t otherd = NOTACHAR;
2458 if (caseless)
2459 {
2460 #ifdef SUPPORT_UNICODE
2461 if (utf && d >= 128)
2462 otherd = UCD_OTHERCASE(d);
2463 else
2464 #endif /* SUPPORT_UNICODE */
2465 otherd = TABLE_GET(d, fcc, d);
2466 }
2467 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2468 {
2469 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2470 {
2471 active_count--; /* Remove non-match possibility */
2472 next_active_state--;
2473 }
2474 ADD_NEW(state_offset, 0);
2475 }
2476 }
2477 break;
2478
2479 /*-----------------------------------------------------------------*/
2480 case OP_EXACTI:
2481 case OP_NOTEXACTI:
2482 caseless = TRUE;
2483 codevalue -= OP_STARI - OP_STAR;
2484 /* Fall through */
2485 case OP_EXACT:
2486 case OP_NOTEXACT:
2487 count = current_state->count; /* Number already matched */
2488 if (clen > 0)
2489 {
2490 uint32_t otherd = NOTACHAR;
2491 if (caseless)
2492 {
2493 #ifdef SUPPORT_UNICODE
2494 if (utf && d >= 128)
2495 otherd = UCD_OTHERCASE(d);
2496 else
2497 #endif /* SUPPORT_UNICODE */
2498 otherd = TABLE_GET(d, fcc, d);
2499 }
2500 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2501 {
2502 if (++count >= (int)GET2(code, 1))
2503 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2504 else
2505 { ADD_NEW(state_offset, count); }
2506 }
2507 }
2508 break;
2509
2510 /*-----------------------------------------------------------------*/
2511 case OP_UPTOI:
2512 case OP_MINUPTOI:
2513 case OP_POSUPTOI:
2514 case OP_NOTUPTOI:
2515 case OP_NOTMINUPTOI:
2516 case OP_NOTPOSUPTOI:
2517 caseless = TRUE;
2518 codevalue -= OP_STARI - OP_STAR;
2519 /* Fall through */
2520 case OP_UPTO:
2521 case OP_MINUPTO:
2522 case OP_POSUPTO:
2523 case OP_NOTUPTO:
2524 case OP_NOTMINUPTO:
2525 case OP_NOTPOSUPTO:
2526 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2527 count = current_state->count; /* Number already matched */
2528 if (clen > 0)
2529 {
2530 uint32_t otherd = NOTACHAR;
2531 if (caseless)
2532 {
2533 #ifdef SUPPORT_UNICODE
2534 if (utf && d >= 128)
2535 otherd = UCD_OTHERCASE(d);
2536 else
2537 #endif /* SUPPORT_UNICODE */
2538 otherd = TABLE_GET(d, fcc, d);
2539 }
2540 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2541 {
2542 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2543 {
2544 active_count--; /* Remove non-match possibility */
2545 next_active_state--;
2546 }
2547 if (++count >= (int)GET2(code, 1))
2548 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2549 else
2550 { ADD_NEW(state_offset, count); }
2551 }
2552 }
2553 break;
2554
2555
2556 /* ========================================================================== */
2557 /* These are the class-handling opcodes */
2558
2559 case OP_CLASS:
2560 case OP_NCLASS:
2561 case OP_XCLASS:
2562 {
2563 BOOL isinclass = FALSE;
2564 int next_state_offset;
2565 PCRE2_SPTR ecode;
2566
2567 /* For a simple class, there is always just a 32-byte table, and we
2568 can set isinclass from it. */
2569
2570 if (codevalue != OP_XCLASS)
2571 {
2572 ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2573 if (clen > 0)
2574 {
2575 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2576 ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2577 }
2578 }
2579
2580 /* An extended class may have a table or a list of single characters,
2581 ranges, or both, and it may be positive or negative. There's a
2582 function that sorts all this out. */
2583
2584 else
2585 {
2586 ecode = code + GET(code, 1);
2587 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2588 }
2589
2590 /* At this point, isinclass is set for all kinds of class, and ecode
2591 points to the byte after the end of the class. If there is a
2592 quantifier, this is where it will be. */
2593
2594 next_state_offset = (int)(ecode - start_code);
2595
2596 switch (*ecode)
2597 {
2598 case OP_CRSTAR:
2599 case OP_CRMINSTAR:
2600 case OP_CRPOSSTAR:
2601 ADD_ACTIVE(next_state_offset + 1, 0);
2602 if (isinclass)
2603 {
2604 if (*ecode == OP_CRPOSSTAR)
2605 {
2606 active_count--; /* Remove non-match possibility */
2607 next_active_state--;
2608 }
2609 ADD_NEW(state_offset, 0);
2610 }
2611 break;
2612
2613 case OP_CRPLUS:
2614 case OP_CRMINPLUS:
2615 case OP_CRPOSPLUS:
2616 count = current_state->count; /* Already matched */
2617 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2618 if (isinclass)
2619 {
2620 if (count > 0 && *ecode == OP_CRPOSPLUS)
2621 {
2622 active_count--; /* Remove non-match possibility */
2623 next_active_state--;
2624 }
2625 count++;
2626 ADD_NEW(state_offset, count);
2627 }
2628 break;
2629
2630 case OP_CRQUERY:
2631 case OP_CRMINQUERY:
2632 case OP_CRPOSQUERY:
2633 ADD_ACTIVE(next_state_offset + 1, 0);
2634 if (isinclass)
2635 {
2636 if (*ecode == OP_CRPOSQUERY)
2637 {
2638 active_count--; /* Remove non-match possibility */
2639 next_active_state--;
2640 }
2641 ADD_NEW(next_state_offset + 1, 0);
2642 }
2643 break;
2644
2645 case OP_CRRANGE:
2646 case OP_CRMINRANGE:
2647 case OP_CRPOSRANGE:
2648 count = current_state->count; /* Already matched */
2649 if (count >= (int)GET2(ecode, 1))
2650 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2651 if (isinclass)
2652 {
2653 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2654
2655 if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2656 {
2657 active_count--; /* Remove non-match possibility */
2658 next_active_state--;
2659 }
2660
2661 if (++count >= max && max != 0) /* Max 0 => no limit */
2662 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2663 else
2664 { ADD_NEW(state_offset, count); }
2665 }
2666 break;
2667
2668 default:
2669 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2670 break;
2671 }
2672 }
2673 break;
2674
2675 /* ========================================================================== */
2676 /* These are the opcodes for fancy brackets of various kinds. We have
2677 to use recursion in order to handle them. The "always failing" assertion
2678 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2679 though the other "backtracking verbs" are not supported. */
2680
2681 case OP_FAIL:
2682 forced_fail++; /* Count FAILs for multiple states */
2683 break;
2684
2685 case OP_ASSERT:
2686 case OP_ASSERT_NOT:
2687 case OP_ASSERTBACK:
2688 case OP_ASSERTBACK_NOT:
2689 {
2690 int rc;
2691 int *local_workspace;
2692 PCRE2_SIZE *local_offsets;
2693 PCRE2_SPTR endasscode = code + GET(code, 1);
2694 RWS_anchor *rws = (RWS_anchor *)RWS;
2695
2696 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2697 {
2698 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2699 if (rc != 0) return rc;
2700 RWS = (int *)rws;
2701 }
2702
2703 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2704 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2705 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2706
2707 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2708
2709 rc = internal_dfa_match(
2710 mb, /* static match data */
2711 code, /* this subexpression's code */
2712 ptr, /* where we currently are */
2713 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2714 local_offsets, /* offset vector */
2715 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2716 local_workspace, /* workspace vector */
2717 RWS_RSIZE, /* size of same */
2718 rlevel, /* function recursion level */
2719 RWS); /* recursion workspace */
2720
2721 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2722
2723 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2724 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2725 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2726 }
2727 break;
2728
2729 /*-----------------------------------------------------------------*/
2730 case OP_COND:
2731 case OP_SCOND:
2732 {
2733 int codelink = (int)GET(code, 1);
2734 PCRE2_UCHAR condcode;
2735
2736 /* Because of the way auto-callout works during compile, a callout item
2737 is inserted between OP_COND and an assertion condition. This does not
2738 happen for the other conditions. */
2739
2740 if (code[LINK_SIZE + 1] == OP_CALLOUT
2741 || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2742 {
2743 PCRE2_SIZE callout_length;
2744 rrc = do_callout(code, offsets, current_subject, ptr, mb,
2745 1 + LINK_SIZE, &callout_length);
2746 if (rrc < 0) return rrc; /* Abandon */
2747 if (rrc > 0) break; /* Fail this thread */
2748 code += callout_length; /* Skip callout data */
2749 }
2750
2751 condcode = code[LINK_SIZE+1];
2752
2753 /* Back reference conditions and duplicate named recursion conditions
2754 are not supported */
2755
2756 if (condcode == OP_CREF || condcode == OP_DNCREF ||
2757 condcode == OP_DNRREF)
2758 return PCRE2_ERROR_DFA_UCOND;
2759
2760 /* The DEFINE condition is always false, and the assertion (?!) is
2761 converted to OP_FAIL. */
2762
2763 if (condcode == OP_FALSE || condcode == OP_FAIL)
2764 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2765
2766 /* There is also an always-true condition */
2767
2768 else if (condcode == OP_TRUE)
2769 { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2770
2771 /* The only supported version of OP_RREF is for the value RREF_ANY,
2772 which means "test if in any recursion". We can't test for specifically
2773 recursed groups. */
2774
2775 else if (condcode == OP_RREF)
2776 {
2777 unsigned int value = GET2(code, LINK_SIZE + 2);
2778 if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2779 if (mb->recursive != NULL)
2780 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2781 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2782 }
2783
2784 /* Otherwise, the condition is an assertion */
2785
2786 else
2787 {
2788 int rc;
2789 int *local_workspace;
2790 PCRE2_SIZE *local_offsets;
2791 PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2792 PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2793 RWS_anchor *rws = (RWS_anchor *)RWS;
2794
2795 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2796 {
2797 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2798 if (rc != 0) return rc;
2799 RWS = (int *)rws;
2800 }
2801
2802 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2803 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2804 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2805
2806 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2807
2808 rc = internal_dfa_match(
2809 mb, /* fixed match data */
2810 asscode, /* this subexpression's code */
2811 ptr, /* where we currently are */
2812 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2813 local_offsets, /* offset vector */
2814 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2815 local_workspace, /* workspace vector */
2816 RWS_RSIZE, /* size of same */
2817 rlevel, /* function recursion level */
2818 RWS); /* recursion workspace */
2819
2820 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2821
2822 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2823 if ((rc >= 0) ==
2824 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2825 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2826 else
2827 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2828 }
2829 }
2830 break;
2831
2832 /*-----------------------------------------------------------------*/
2833 case OP_RECURSE:
2834 {
2835 int rc;
2836 int *local_workspace;
2837 PCRE2_SIZE *local_offsets;
2838 RWS_anchor *rws = (RWS_anchor *)RWS;
2839 dfa_recursion_info *ri;
2840 PCRE2_SPTR callpat = start_code + GET(code, 1);
2841 uint32_t recno = (callpat == mb->start_code)? 0 :
2842 GET2(callpat, 1 + LINK_SIZE);
2843
2844 if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2845 {
2846 rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2847 if (rc != 0) return rc;
2848 RWS = (int *)rws;
2849 }
2850
2851 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2852 local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2853 rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2854
2855 /* Check for repeating a recursion without advancing the subject
2856 pointer. This should catch convoluted mutual recursions. (Some simple
2857 cases are caught at compile time.) */
2858
2859 for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2860 if (recno == ri->group_num && ptr == ri->subject_position)
2861 return PCRE2_ERROR_RECURSELOOP;
2862
2863 /* Remember this recursion and where we started it so as to
2864 catch infinite loops. */
2865
2866 new_recursive.group_num = recno;
2867 new_recursive.subject_position = ptr;
2868 new_recursive.prevrec = mb->recursive;
2869 mb->recursive = &new_recursive;
2870
2871 rc = internal_dfa_match(
2872 mb, /* fixed match data */
2873 callpat, /* this subexpression's code */
2874 ptr, /* where we currently are */
2875 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2876 local_offsets, /* offset vector */
2877 RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */
2878 local_workspace, /* workspace vector */
2879 RWS_RSIZE, /* size of same */
2880 rlevel, /* function recursion level */
2881 RWS); /* recursion workspace */
2882
2883 rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2884 mb->recursive = new_recursive.prevrec; /* Done this recursion */
2885
2886 /* Ran out of internal offsets */
2887
2888 if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2889
2890 /* For each successful matched substring, set up the next state with a
2891 count of characters to skip before trying it. Note that the count is in
2892 characters, not bytes. */
2893
2894 if (rc > 0)
2895 {
2896 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2897 {
2898 PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2899 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2900 if (utf)
2901 {
2902 PCRE2_SPTR p = start_subject + local_offsets[rc];
2903 PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2904 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2905 }
2906 #endif
2907 if (charcount > 0)
2908 {
2909 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2910 (int)(charcount - 1));
2911 }
2912 else
2913 {
2914 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2915 }
2916 }
2917 }
2918 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2919 }
2920 break;
2921
2922 /*-----------------------------------------------------------------*/
2923 case OP_BRAPOS:
2924 case OP_SBRAPOS:
2925 case OP_CBRAPOS:
2926 case OP_SCBRAPOS:
2927 case OP_BRAPOSZERO:
2928 {
2929 int rc;
2930 int *local_workspace;
2931 PCRE2_SIZE *local_offsets;
2932 PCRE2_SIZE charcount, matched_count;
2933 PCRE2_SPTR local_ptr = ptr;
2934 RWS_anchor *rws = (RWS_anchor *)RWS;
2935 BOOL allow_zero;
2936
2937 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2938 {
2939 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2940 if (rc != 0) return rc;
2941 RWS = (int *)rws;
2942 }
2943
2944 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2945 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2946 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2947
2948 if (codevalue == OP_BRAPOSZERO)
2949 {
2950 allow_zero = TRUE;
2951 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2952 }
2953 else allow_zero = FALSE;
2954
2955 /* Loop to match the subpattern as many times as possible as if it were
2956 a complete pattern. */
2957
2958 for (matched_count = 0;; matched_count++)
2959 {
2960 rc = internal_dfa_match(
2961 mb, /* fixed match data */
2962 code, /* this subexpression's code */
2963 local_ptr, /* where we currently are */
2964 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2965 local_offsets, /* offset vector */
2966 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2967 local_workspace, /* workspace vector */
2968 RWS_RSIZE, /* size of same */
2969 rlevel, /* function recursion level */
2970 RWS); /* recursion workspace */
2971
2972 /* Failed to match */
2973
2974 if (rc < 0)
2975 {
2976 if (rc != PCRE2_ERROR_NOMATCH) return rc;
2977 break;
2978 }
2979
2980 /* Matched: break the loop if zero characters matched. */
2981
2982 charcount = local_offsets[1] - local_offsets[0];
2983 if (charcount == 0) break;
2984 local_ptr += charcount; /* Advance temporary position ptr */
2985 }
2986
2987 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2988
2989 /* At this point we have matched the subpattern matched_count
2990 times, and local_ptr is pointing to the character after the end of the
2991 last match. */
2992
2993 if (matched_count > 0 || allow_zero)
2994 {
2995 PCRE2_SPTR end_subpattern = code;
2996 int next_state_offset;
2997
2998 do { end_subpattern += GET(end_subpattern, 1); }
2999 while (*end_subpattern == OP_ALT);
3000 next_state_offset =
3001 (int)(end_subpattern - start_code + LINK_SIZE + 1);
3002
3003 /* Optimization: if there are no more active states, and there
3004 are no new states yet set up, then skip over the subject string
3005 right here, to save looping. Otherwise, set up the new state to swing
3006 into action when the end of the matched substring is reached. */
3007
3008 if (i + 1 >= active_count && new_count == 0)
3009 {
3010 ptr = local_ptr;
3011 clen = 0;
3012 ADD_NEW(next_state_offset, 0);
3013 }
3014 else
3015 {
3016 PCRE2_SPTR p = ptr;
3017 PCRE2_SPTR pp = local_ptr;
3018 charcount = (PCRE2_SIZE)(pp - p);
3019 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3020 if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3021 #endif
3022 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3023 }
3024 }
3025 }
3026 break;
3027
3028 /*-----------------------------------------------------------------*/
3029 case OP_ONCE:
3030 {
3031 int rc;
3032 int *local_workspace;
3033 PCRE2_SIZE *local_offsets;
3034 RWS_anchor *rws = (RWS_anchor *)RWS;
3035
3036 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3037 {
3038 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3039 if (rc != 0) return rc;
3040 RWS = (int *)rws;
3041 }
3042
3043 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3044 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3045 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3046
3047 rc = internal_dfa_match(
3048 mb, /* fixed match data */
3049 code, /* this subexpression's code */
3050 ptr, /* where we currently are */
3051 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
3052 local_offsets, /* offset vector */
3053 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3054 local_workspace, /* workspace vector */
3055 RWS_RSIZE, /* size of same */
3056 rlevel, /* function recursion level */
3057 RWS); /* recursion workspace */
3058
3059 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3060
3061 if (rc >= 0)
3062 {
3063 PCRE2_SPTR end_subpattern = code;
3064 PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3065 int next_state_offset, repeat_state_offset;
3066
3067 do { end_subpattern += GET(end_subpattern, 1); }
3068 while (*end_subpattern == OP_ALT);
3069 next_state_offset =
3070 (int)(end_subpattern - start_code + LINK_SIZE + 1);
3071
3072 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3073 arrange for the repeat state also to be added to the relevant list.
3074 Calculate the offset, or set -1 for no repeat. */
3075
3076 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3077 *end_subpattern == OP_KETRMIN)?
3078 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3079
3080 /* If we have matched an empty string, add the next state at the
3081 current character pointer. This is important so that the duplicate
3082 checking kicks in, which is what breaks infinite loops that match an
3083 empty string. */
3084
3085 if (charcount == 0)
3086 {
3087 ADD_ACTIVE(next_state_offset, 0);
3088 }
3089
3090 /* Optimization: if there are no more active states, and there
3091 are no new states yet set up, then skip over the subject string
3092 right here, to save looping. Otherwise, set up the new state to swing
3093 into action when the end of the matched substring is reached. */
3094
3095 else if (i + 1 >= active_count && new_count == 0)
3096 {
3097 ptr += charcount;
3098 clen = 0;
3099 ADD_NEW(next_state_offset, 0);
3100
3101 /* If we are adding a repeat state at the new character position,
3102 we must fudge things so that it is the only current state.
3103 Otherwise, it might be a duplicate of one we processed before, and
3104 that would cause it to be skipped. */
3105
3106 if (repeat_state_offset >= 0)
3107 {
3108 next_active_state = active_states;
3109 active_count = 0;
3110 i = -1;
3111 ADD_ACTIVE(repeat_state_offset, 0);
3112 }
3113 }
3114 else
3115 {
3116 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3117 if (utf)
3118 {
3119 PCRE2_SPTR p = start_subject + local_offsets[0];
3120 PCRE2_SPTR pp = start_subject + local_offsets[1];
3121 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3122 }
3123 #endif
3124 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3125 if (repeat_state_offset >= 0)
3126 { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3127 }
3128 }
3129 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3130 }
3131 break;
3132
3133
3134 /* ========================================================================== */
3135 /* Handle callouts */
3136
3137 case OP_CALLOUT:
3138 case OP_CALLOUT_STR:
3139 {
3140 PCRE2_SIZE callout_length;
3141 rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
3142 &callout_length);
3143 if (rrc < 0) return rrc; /* Abandon */
3144 if (rrc == 0)
3145 { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3146 }
3147 break;
3148
3149
3150 /* ========================================================================== */
3151 default: /* Unsupported opcode */
3152 return PCRE2_ERROR_DFA_UITEM;
3153 }
3154
3155 NEXT_ACTIVE_STATE: continue;
3156
3157 } /* End of loop scanning active states */
3158
3159 /* We have finished the processing at the current subject character. If no
3160 new states have been set for the next character, we have found all the
3161 matches that we are going to find. If partial matching has been requested,
3162 check for appropriate conditions.
3163
3164 The "forced_fail" variable counts the number of (*F) encountered for the
3165 character. If it is equal to the original active_count (saved in
3166 workspace[1]) it means that (*F) was found on every active state. In this
3167 case we don't want to give a partial match.
3168
3169 The "could_continue" variable is true if a state could have continued but
3170 for the fact that the end of the subject was reached. */
3171
3172 if (new_count <= 0)
3173 {
3174 if (could_continue && /* Some could go on, and */
3175 forced_fail != workspace[1] && /* Not all forced fail & */
3176 ( /* either... */
3177 (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */
3178 || /* or... */
3179 ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */
3180 match_count < 0) /* no matches */
3181 ) && /* And... */
3182 (
3183 partial_newline || /* Either partial NL */
3184 ( /* or ... */
3185 ptr >= end_subject && /* End of subject and */
3186 ( /* either */
3187 ptr > mb->start_used_ptr || /* Inspected non-empty string */
3188 mb->allowemptypartial /* or pattern has lookbehind */
3189 ) /* or could match empty */
3190 )
3191 ))
3192 match_count = PCRE2_ERROR_PARTIAL;
3193 break; /* Exit from loop along the subject string */
3194 }
3195
3196 /* One or more states are active for the next character. */
3197
3198 ptr += clen; /* Advance to next subject character */
3199 } /* Loop to move along the subject string */
3200
3201 /* Control gets here from "break" a few lines above. If we have a match and
3202 PCRE2_ENDANCHORED is set, the match fails. */
3203
3204 if (match_count >= 0 &&
3205 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3206 ptr < end_subject)
3207 match_count = PCRE2_ERROR_NOMATCH;
3208
3209 return match_count;
3210 }
3211
3212
3213
3214 /*************************************************
3215 * Match a pattern using the DFA algorithm *
3216 *************************************************/
3217
3218 /* This function matches a compiled pattern to a subject string, using the
3219 alternate matching algorithm that finds all matches at once.
3220
3221 Arguments:
3222 code points to the compiled pattern
3223 subject subject string
3224 length length of subject string
3225 startoffset where to start matching in the subject
3226 options option bits
3227 match_data points to a match data structure
3228 gcontext points to a match context
3229 workspace pointer to workspace
3230 wscount size of workspace
3231
3232 Returns: > 0 => number of match offset pairs placed in offsets
3233 = 0 => offsets overflowed; longest matches are present
3234 -1 => failed to match
3235 < -1 => some kind of unexpected problem
3236 */
3237
3238 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,PCRE2_SIZE wscount)3239 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3240 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3241 pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3242 {
3243 int rc;
3244 int was_zero_terminated = 0;
3245
3246 const pcre2_real_code *re = (const pcre2_real_code *)code;
3247
3248 PCRE2_SPTR start_match;
3249 PCRE2_SPTR end_subject;
3250 PCRE2_SPTR bumpalong_limit;
3251 PCRE2_SPTR req_cu_ptr;
3252
3253 BOOL utf, anchored, startline, firstline;
3254 BOOL has_first_cu = FALSE;
3255 BOOL has_req_cu = FALSE;
3256
3257 #if PCRE2_CODE_UNIT_WIDTH == 8
3258 BOOL memchr_not_found_first_cu = FALSE;
3259 BOOL memchr_not_found_first_cu2 = FALSE;
3260 #endif
3261
3262 PCRE2_UCHAR first_cu = 0;
3263 PCRE2_UCHAR first_cu2 = 0;
3264 PCRE2_UCHAR req_cu = 0;
3265 PCRE2_UCHAR req_cu2 = 0;
3266
3267 const uint8_t *start_bits = NULL;
3268
3269 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3270 is used below, and it expects NLBLOCK to be defined as a pointer. */
3271
3272 pcre2_callout_block cb;
3273 dfa_match_block actual_match_block;
3274 dfa_match_block *mb = &actual_match_block;
3275
3276 /* Set up a starting block of memory for use during recursive calls to
3277 internal_dfa_match(). By putting this on the stack, it minimizes resource use
3278 in the case when it is not needed. If this is too small, more memory is
3279 obtained from the heap. At the start of each block is an anchor structure.*/
3280
3281 int base_recursion_workspace[RWS_BASE_SIZE];
3282 RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3283 rws->next = NULL;
3284 rws->size = RWS_BASE_SIZE;
3285 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3286
3287 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3288 subject string. */
3289
3290 if (length == PCRE2_ZERO_TERMINATED)
3291 {
3292 length = PRIV(strlen)(subject);
3293 was_zero_terminated = 1;
3294 }
3295
3296 /* Plausibility checks */
3297
3298 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3299 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3300 return PCRE2_ERROR_NULL;
3301 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3302 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3303
3304 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3305 time. */
3306
3307 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3308 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3309 return PCRE2_ERROR_BADOPTION;
3310
3311 /* Invalid UTF support is not available for DFA matching. */
3312
3313 if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
3314 return PCRE2_ERROR_DFA_UINVALID_UTF;
3315
3316 /* Check that the first field in the block is the magic number. If it is not,
3317 return with PCRE2_ERROR_BADMAGIC. */
3318
3319 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3320
3321 /* Check the code unit width. */
3322
3323 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3324 return PCRE2_ERROR_BADMODE;
3325
3326 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3327 options variable for this function. Users of PCRE2 who are not calling the
3328 function directly would like to have a way of setting these flags, in the same
3329 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3330 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3331 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3332 transferred to the options for this function. The bits are guaranteed to be
3333 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3334 that the match-time bits are not more significant than the flag bits. If by
3335 accident this is not the case, a compile-time division by zero error will
3336 occur. */
3337
3338 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3339 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3340 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3341 #undef FF
3342 #undef OO
3343
3344 /* If restarting after a partial match, do some sanity checks on the contents
3345 of the workspace. */
3346
3347 if ((options & PCRE2_DFA_RESTART) != 0)
3348 {
3349 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3350 workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3351 return PCRE2_ERROR_DFA_BADRESTART;
3352 }
3353
3354 /* Set some local values */
3355
3356 utf = (re->overall_options & PCRE2_UTF) != 0;
3357 start_match = subject + start_offset;
3358 end_subject = subject + length;
3359 req_cu_ptr = start_match - 1;
3360 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3361 (re->overall_options & PCRE2_ANCHORED) != 0;
3362
3363 /* The "must be at the start of a line" flags are used in a loop when finding
3364 where to start. */
3365
3366 startline = (re->flags & PCRE2_STARTLINE) != 0;
3367 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3368 bumpalong_limit = end_subject;
3369
3370 /* Initialize and set up the fixed fields in the callout block, with a pointer
3371 in the match block. */
3372
3373 mb->cb = &cb;
3374 cb.version = 2;
3375 cb.subject = subject;
3376 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3377 cb.callout_flags = 0;
3378 cb.capture_top = 1; /* No capture support */
3379 cb.capture_last = 0;
3380 cb.mark = NULL; /* No (*MARK) support */
3381
3382 /* Get data from the match context, if present, and fill in the remaining
3383 fields in the match block. It is an error to set an offset limit without
3384 setting the flag at compile time. */
3385
3386 if (mcontext == NULL)
3387 {
3388 mb->callout = NULL;
3389 mb->memctl = re->memctl;
3390 mb->match_limit = PRIV(default_match_context).match_limit;
3391 mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3392 mb->heap_limit = PRIV(default_match_context).heap_limit;
3393 }
3394 else
3395 {
3396 if (mcontext->offset_limit != PCRE2_UNSET)
3397 {
3398 if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3399 return PCRE2_ERROR_BADOFFSETLIMIT;
3400 bumpalong_limit = subject + mcontext->offset_limit;
3401 }
3402 mb->callout = mcontext->callout;
3403 mb->callout_data = mcontext->callout_data;
3404 mb->memctl = mcontext->memctl;
3405 mb->match_limit = mcontext->match_limit;
3406 mb->match_limit_depth = mcontext->depth_limit;
3407 mb->heap_limit = mcontext->heap_limit;
3408 }
3409
3410 if (mb->match_limit > re->limit_match)
3411 mb->match_limit = re->limit_match;
3412
3413 if (mb->match_limit_depth > re->limit_depth)
3414 mb->match_limit_depth = re->limit_depth;
3415
3416 if (mb->heap_limit > re->limit_heap)
3417 mb->heap_limit = re->limit_heap;
3418
3419 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3420 re->name_count * re->name_entry_size;
3421 mb->tables = re->tables;
3422 mb->start_subject = subject;
3423 mb->end_subject = end_subject;
3424 mb->start_offset = start_offset;
3425 mb->allowemptypartial = (re->max_lookbehind > 0) ||
3426 (re->flags & PCRE2_MATCH_EMPTY) != 0;
3427 mb->moptions = options;
3428 mb->poptions = re->overall_options;
3429 mb->match_call_count = 0;
3430 mb->heap_used = 0;
3431
3432 /* Process the \R and newline settings. */
3433
3434 mb->bsr_convention = re->bsr_convention;
3435 mb->nltype = NLTYPE_FIXED;
3436 switch(re->newline_convention)
3437 {
3438 case PCRE2_NEWLINE_CR:
3439 mb->nllen = 1;
3440 mb->nl[0] = CHAR_CR;
3441 break;
3442
3443 case PCRE2_NEWLINE_LF:
3444 mb->nllen = 1;
3445 mb->nl[0] = CHAR_NL;
3446 break;
3447
3448 case PCRE2_NEWLINE_NUL:
3449 mb->nllen = 1;
3450 mb->nl[0] = CHAR_NUL;
3451 break;
3452
3453 case PCRE2_NEWLINE_CRLF:
3454 mb->nllen = 2;
3455 mb->nl[0] = CHAR_CR;
3456 mb->nl[1] = CHAR_NL;
3457 break;
3458
3459 case PCRE2_NEWLINE_ANY:
3460 mb->nltype = NLTYPE_ANY;
3461 break;
3462
3463 case PCRE2_NEWLINE_ANYCRLF:
3464 mb->nltype = NLTYPE_ANYCRLF;
3465 break;
3466
3467 default: return PCRE2_ERROR_INTERNAL;
3468 }
3469
3470 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3471 we must also check that a starting offset does not point into the middle of a
3472 multiunit character. We check only the portion of the subject that is going to
3473 be inspected during matching - from the offset minus the maximum back reference
3474 to the given length. This saves time when a small part of a large subject is
3475 being matched by the use of a starting offset. Note that the maximum lookbehind
3476 is a number of characters, not code units. */
3477
3478 #ifdef SUPPORT_UNICODE
3479 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3480 {
3481 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
3482
3483 if (start_offset > 0)
3484 {
3485 #if PCRE2_CODE_UNIT_WIDTH != 32
3486 unsigned int i;
3487 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3488 return PCRE2_ERROR_BADUTFOFFSET;
3489 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3490 {
3491 check_subject--;
3492 while (check_subject > subject &&
3493 #if PCRE2_CODE_UNIT_WIDTH == 8
3494 (*check_subject & 0xc0) == 0x80)
3495 #else /* 16-bit */
3496 (*check_subject & 0xfc00) == 0xdc00)
3497 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3498 check_subject--;
3499 }
3500 #else /* In the 32-bit library, one code unit equals one character. */
3501 check_subject -= re->max_lookbehind;
3502 if (check_subject < subject) check_subject = subject;
3503 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
3504 }
3505
3506 /* Validate the relevant portion of the subject. After an error, adjust the
3507 offset to be an absolute offset in the whole string. */
3508
3509 match_data->rc = PRIV(valid_utf)(check_subject,
3510 length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3511 if (match_data->rc != 0)
3512 {
3513 match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3514 return match_data->rc;
3515 }
3516 }
3517 #endif /* SUPPORT_UNICODE */
3518
3519 /* Set up the first code unit to match, if available. If there's no first code
3520 unit there may be a bitmap of possible first characters. */
3521
3522 if ((re->flags & PCRE2_FIRSTSET) != 0)
3523 {
3524 has_first_cu = TRUE;
3525 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3526 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3527 {
3528 first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3529 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3530 if (utf && first_cu > 127)
3531 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3532 #endif
3533 }
3534 }
3535 else
3536 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3537 start_bits = re->start_bitmap;
3538
3539 /* There may be a "last known required code unit" set. */
3540
3541 if ((re->flags & PCRE2_LASTSET) != 0)
3542 {
3543 has_req_cu = TRUE;
3544 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3545 if ((re->flags & PCRE2_LASTCASELESS) != 0)
3546 {
3547 req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3548 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3549 if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3550 #endif
3551 }
3552 }
3553
3554 /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3555 free the memory that was obtained. */
3556
3557 if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3558 {
3559 match_data->memctl.free((void *)match_data->subject,
3560 match_data->memctl.memory_data);
3561 match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3562 }
3563
3564 /* Fill in fields that are always returned in the match data. */
3565
3566 match_data->code = re;
3567 match_data->subject = NULL; /* Default for no match */
3568 match_data->mark = NULL;
3569 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3570
3571 /* Call the main matching function, looping for a non-anchored regex after a
3572 failed match. If not restarting, perform certain optimizations at the start of
3573 a match. */
3574
3575 for (;;)
3576 {
3577 /* ----------------- Start of match optimizations ---------------- */
3578
3579 /* There are some optimizations that avoid running the match if a known
3580 starting point is not found, or if a known later code unit is not present.
3581 However, there is an option (settable at compile time) that disables
3582 these, for testing and for ensuring that all callouts do actually occur.
3583 The optimizations must also be avoided when restarting a DFA match. */
3584
3585 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3586 (options & PCRE2_DFA_RESTART) == 0)
3587 {
3588 /* If firstline is TRUE, the start of the match is constrained to the first
3589 line of a multiline string. That is, the match must be before or at the
3590 first newline following the start of matching. Temporarily adjust
3591 end_subject so that we stop the optimization scans for a first code unit
3592 immediately after the first character of a newline (the first code unit can
3593 legitimately be a newline). If the match fails at the newline, later code
3594 breaks this loop. */
3595
3596 if (firstline)
3597 {
3598 PCRE2_SPTR t = start_match;
3599 #ifdef SUPPORT_UNICODE
3600 if (utf)
3601 {
3602 while (t < end_subject && !IS_NEWLINE(t))
3603 {
3604 t++;
3605 ACROSSCHAR(t < end_subject, t, t++);
3606 }
3607 }
3608 else
3609 #endif
3610 while (t < end_subject && !IS_NEWLINE(t)) t++;
3611 end_subject = t;
3612 }
3613
3614 /* Anchored: check the first code unit if one is recorded. This may seem
3615 pointless but it can help in detecting a no match case without scanning for
3616 the required code unit. */
3617
3618 if (anchored)
3619 {
3620 if (has_first_cu || start_bits != NULL)
3621 {
3622 BOOL ok = start_match < end_subject;
3623 if (ok)
3624 {
3625 PCRE2_UCHAR c = UCHAR21TEST(start_match);
3626 ok = has_first_cu && (c == first_cu || c == first_cu2);
3627 if (!ok && start_bits != NULL)
3628 {
3629 #if PCRE2_CODE_UNIT_WIDTH != 8
3630 if (c > 255) c = 255;
3631 #endif
3632 ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3633 }
3634 }
3635 if (!ok) break;
3636 }
3637 }
3638
3639 /* Not anchored. Advance to a unique first code unit if there is one. In
3640 8-bit mode, the use of memchr() gives a big speed up, even though we have
3641 to call it twice in caseless mode, in order to find the earliest occurrence
3642 of the character in either of its cases. If a call to memchr() that
3643 searches the rest of the subject fails to find one case, remember that in
3644 order not to keep on repeating the search. This can make a huge difference
3645 when the strings are very long and only one case is present. */
3646
3647 else
3648 {
3649 if (has_first_cu)
3650 {
3651 if (first_cu != first_cu2) /* Caseless */
3652 {
3653 #if PCRE2_CODE_UNIT_WIDTH != 8
3654 PCRE2_UCHAR smc;
3655 while (start_match < end_subject &&
3656 (smc = UCHAR21TEST(start_match)) != first_cu &&
3657 smc != first_cu2)
3658 start_match++;
3659
3660 #else /* 8-bit code units */
3661 PCRE2_SPTR pp1 = NULL;
3662 PCRE2_SPTR pp2 = NULL;
3663 PCRE2_SIZE cu2size = end_subject - start_match;
3664
3665 if (!memchr_not_found_first_cu)
3666 {
3667 pp1 = memchr(start_match, first_cu, end_subject - start_match);
3668 if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
3669 else cu2size = pp1 - start_match;
3670 }
3671
3672 /* If pp1 is not NULL, we have arranged to search only as far as pp1,
3673 to see if the other case is earlier, so we can set "not found" only
3674 when both searches have returned NULL. */
3675
3676 if (!memchr_not_found_first_cu2)
3677 {
3678 pp2 = memchr(start_match, first_cu2, cu2size);
3679 memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
3680 }
3681
3682 if (pp1 == NULL)
3683 start_match = (pp2 == NULL)? end_subject : pp2;
3684 else
3685 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3686 #endif
3687 }
3688
3689 /* The caseful case */
3690
3691 else
3692 {
3693 #if PCRE2_CODE_UNIT_WIDTH != 8
3694 while (start_match < end_subject && UCHAR21TEST(start_match) !=
3695 first_cu)
3696 start_match++;
3697 #else /* 8-bit code units */
3698 start_match = memchr(start_match, first_cu, end_subject - start_match);
3699 if (start_match == NULL) start_match = end_subject;
3700 #endif
3701 }
3702
3703 /* If we can't find the required code unit, having reached the true end
3704 of the subject, break the bumpalong loop, to force a match failure,
3705 except when doing partial matching, when we let the next cycle run at
3706 the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3707 which partially matches "abc", even though the string does not contain
3708 the starting character "d". If we have not reached the true end of the
3709 subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3710 we also let the cycle run, because the matching string is legitimately
3711 allowed to start with the first code unit of a newline. */
3712
3713 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3714 start_match >= mb->end_subject)
3715 break;
3716 }
3717
3718 /* If there's no first code unit, advance to just after a linebreak for a
3719 multiline match if required. */
3720
3721 else if (startline)
3722 {
3723 if (start_match > mb->start_subject + start_offset)
3724 {
3725 #ifdef SUPPORT_UNICODE
3726 if (utf)
3727 {
3728 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3729 {
3730 start_match++;
3731 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3732 }
3733 }
3734 else
3735 #endif
3736 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3737 start_match++;
3738
3739 /* If we have just passed a CR and the newline option is ANY or
3740 ANYCRLF, and we are now at a LF, advance the match position by one
3741 more code unit. */
3742
3743 if (start_match[-1] == CHAR_CR &&
3744 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3745 start_match < end_subject &&
3746 UCHAR21TEST(start_match) == CHAR_NL)
3747 start_match++;
3748 }
3749 }
3750
3751 /* If there's no first code unit or a requirement for a multiline line
3752 start, advance to a non-unique first code unit if any have been
3753 identified. The bitmap contains only 256 bits. When code units are 16 or
3754 32 bits wide, all code units greater than 254 set the 255 bit. */
3755
3756 else if (start_bits != NULL)
3757 {
3758 while (start_match < end_subject)
3759 {
3760 uint32_t c = UCHAR21TEST(start_match);
3761 #if PCRE2_CODE_UNIT_WIDTH != 8
3762 if (c > 255) c = 255;
3763 #endif
3764 if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3765 start_match++;
3766 }
3767
3768 /* See comment above in first_cu checking about the next line. */
3769
3770 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3771 start_match >= mb->end_subject)
3772 break;
3773 }
3774 } /* End of first code unit handling */
3775
3776 /* Restore fudged end_subject */
3777
3778 end_subject = mb->end_subject;
3779
3780 /* The following two optimizations are disabled for partial matching. */
3781
3782 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3783 {
3784 PCRE2_SPTR p;
3785
3786 /* The minimum matching length is a lower bound; no actual string of that
3787 length may actually match the pattern. Although the value is, strictly,
3788 in characters, we treat it as code units to avoid spending too much time
3789 in this optimization. */
3790
3791 if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3792
3793 /* If req_cu is set, we know that that code unit must appear in the
3794 subject for the match to succeed. If the first code unit is set, req_cu
3795 must be later in the subject; otherwise the test starts at the match
3796 point. This optimization can save a huge amount of backtracking in
3797 patterns with nested unlimited repeats that aren't going to match.
3798 Writing separate code for cased/caseless versions makes it go faster, as
3799 does using an autoincrement and backing off on a match. As in the case of
3800 the first code unit, using memchr() in the 8-bit library gives a big
3801 speed up. Unlike the first_cu check above, we do not need to call
3802 memchr() twice in the caseless case because we only need to check for the
3803 presence of the character in either case, not find the first occurrence.
3804
3805 The search can be skipped if the code unit was found later than the
3806 current starting point in a previous iteration of the bumpalong loop.
3807
3808 HOWEVER: when the subject string is very, very long, searching to its end
3809 can take a long time, and give bad performance on quite ordinary
3810 patterns. This showed up when somebody was matching something like
3811 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3812 sufficiently long, but it's worth searching a lot more for unanchored
3813 patterns. */
3814
3815 p = start_match + (has_first_cu? 1:0);
3816 if (has_req_cu && p > req_cu_ptr)
3817 {
3818 PCRE2_SIZE check_length = end_subject - start_match;
3819
3820 if (check_length < REQ_CU_MAX ||
3821 (!anchored && check_length < REQ_CU_MAX * 1000))
3822 {
3823 if (req_cu != req_cu2) /* Caseless */
3824 {
3825 #if PCRE2_CODE_UNIT_WIDTH != 8
3826 while (p < end_subject)
3827 {
3828 uint32_t pp = UCHAR21INCTEST(p);
3829 if (pp == req_cu || pp == req_cu2) { p--; break; }
3830 }
3831 #else /* 8-bit code units */
3832 PCRE2_SPTR pp = p;
3833 p = memchr(pp, req_cu, end_subject - pp);
3834 if (p == NULL)
3835 {
3836 p = memchr(pp, req_cu2, end_subject - pp);
3837 if (p == NULL) p = end_subject;
3838 }
3839 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3840 }
3841
3842 /* The caseful case */
3843
3844 else
3845 {
3846 #if PCRE2_CODE_UNIT_WIDTH != 8
3847 while (p < end_subject)
3848 {
3849 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3850 }
3851
3852 #else /* 8-bit code units */
3853 p = memchr(p, req_cu, end_subject - p);
3854 if (p == NULL) p = end_subject;
3855 #endif
3856 }
3857
3858 /* If we can't find the required code unit, break the matching loop,
3859 forcing a match failure. */
3860
3861 if (p >= end_subject) break;
3862
3863 /* If we have found the required code unit, save the point where we
3864 found it, so that we don't search again next time round the loop if
3865 the start hasn't passed this code unit yet. */
3866
3867 req_cu_ptr = p;
3868 }
3869 }
3870 }
3871 }
3872
3873 /* ------------ End of start of match optimizations ------------ */
3874
3875 /* Give no match if we have passed the bumpalong limit. */
3876
3877 if (start_match > bumpalong_limit) break;
3878
3879 /* OK, now we can do the business */
3880
3881 mb->start_used_ptr = start_match;
3882 mb->last_used_ptr = start_match;
3883 mb->recursive = NULL;
3884
3885 rc = internal_dfa_match(
3886 mb, /* fixed match data */
3887 mb->start_code, /* this subexpression's code */
3888 start_match, /* where we currently are */
3889 start_offset, /* start offset in subject */
3890 match_data->ovector, /* offset vector */
3891 (uint32_t)match_data->oveccount * 2, /* actual size of same */
3892 workspace, /* workspace vector */
3893 (int)wscount, /* size of same */
3894 0, /* function recurse level */
3895 base_recursion_workspace); /* initial workspace for recursion */
3896
3897 /* Anything other than "no match" means we are done, always; otherwise, carry
3898 on only if not anchored. */
3899
3900 if (rc != PCRE2_ERROR_NOMATCH || anchored)
3901 {
3902 if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3903 {
3904 match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3905 match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3906 }
3907 match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3908 match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3909 match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3910 match_data->rc = rc;
3911
3912 if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
3913 {
3914 length = CU2BYTES(length + was_zero_terminated);
3915 match_data->subject = match_data->memctl.malloc(length,
3916 match_data->memctl.memory_data);
3917 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
3918 memcpy((void *)match_data->subject, subject, length);
3919 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
3920 }
3921 else
3922 {
3923 if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
3924 }
3925 goto EXIT;
3926 }
3927
3928 /* Advance to the next subject character unless we are at the end of a line
3929 and firstline is set. */
3930
3931 if (firstline && IS_NEWLINE(start_match)) break;
3932 start_match++;
3933 #ifdef SUPPORT_UNICODE
3934 if (utf)
3935 {
3936 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3937 }
3938 #endif
3939 if (start_match > end_subject) break;
3940
3941 /* If we have just passed a CR and we are now at a LF, and the pattern does
3942 not contain any explicit matches for \r or \n, and the newline option is CRLF
3943 or ANY or ANYCRLF, advance the match position by one more character. */
3944
3945 if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3946 start_match < end_subject &&
3947 UCHAR21TEST(start_match) == CHAR_NL &&
3948 (re->flags & PCRE2_HASCRORLF) == 0 &&
3949 (mb->nltype == NLTYPE_ANY ||
3950 mb->nltype == NLTYPE_ANYCRLF ||
3951 mb->nllen == 2))
3952 start_match++;
3953
3954 } /* "Bumpalong" loop */
3955
3956 NOMATCH_EXIT:
3957 rc = PCRE2_ERROR_NOMATCH;
3958
3959 EXIT:
3960 while (rws->next != NULL)
3961 {
3962 RWS_anchor *next = rws->next;
3963 rws->next = next->next;
3964 mb->memctl.free(next, mb->memctl.memory_data);
3965 }
3966
3967 return rc;
3968 }
3969
3970 /* End of pcre2_dfa_match.c */
3971