/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
         New API code Copyright (c) 2014 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
40 
41 
/* This module contains the external function pcre2_dfa_match(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */
46 
47 
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
the performance of his patterns greatly. I could not use it as it stood, as it
was not thread safe, and made assumptions about pattern sizes. Also, it caused
test 7 to loop, and test 9 to crash with a segfault.

The issue is the check for duplicate states, which is done by a simple linear
search up the state list. (Grep for "duplicate" below to find the code.) For
many patterns, there will never be many states active at one time, so a simple
linear search is fine. In patterns that have many active states, it might be a
bottleneck. The suggested code used an indexing scheme to remember which states
had previously been used for each character, and avoided the linear search when
it knew there was no chance of a duplicate. This was implemented when adding
states to the state lists.

I wrote some thread-safe, not-limited code to try something similar at the time
of checking for duplicates (instead of when adding states), using index vectors
on the stack. It did give a 13% improvement with one specially constructed
pattern for certain subject strings, but on other strings and on many of the
simpler patterns in the test suite it did worse. The major problem, I think,
was the extra time to initialize the index. This had to be done for each call
of internal_dfa_match(). (The supplied patch used a static vector, initialized
only once - I suspect this was the cause of the problems with the tests.)

Overall, I concluded that the gains in some cases did not outweigh the losses
in others, so I abandoned this code. */
73 
74 
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78 
/* Name aliases that map generic identifiers onto the local variable names used
in this file. They are defined ahead of the pcre2_internal.h include;
NOTE(review): presumably the shared newline/string macros in that header expand
them - confirm against pcre2_internal.h. */

#define NLBLOCK mb             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */
82 
83 #include "pcre2_internal.h"
84 
/* Bitwise OR of the ten PCRE2_xxx option bits named in this macro: the
anchoring, not-BOL/EOL, not-empty, UTF-check, partial-matching and DFA-specific
(shortest, restart) options. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART)
89 
90 
91 /*************************************************
92 *      Code parameters and static tables         *
93 *************************************************/
94 
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions: when the item being repeated is \P/\p,
\X, \R, \H/\h or \V/\v, internal_dfa_match() adds the matching offset to the
opcode value before switching on it. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
105 
106 
107 /* This table identifies those opcodes that are followed immediately by a
108 character that is to be tested in some way. This makes it possible to
109 centralize the loading of these characters. In the case of Type * etc, the
110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111 small value. Non-zero values in the table are the offsets from the opcode where
112 the character is to be found. ***NOTE*** If the start of this table is
113 modified, the three tables that follow must also be modified. */
114 
115 static const uint8_t coptable[] = {
116   0,                             /* End                                    */
117   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
118   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
119   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
120   0, 0,                          /* \P, \p                                 */
121   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
122   0,                             /* \X                                     */
123   0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
124   1,                             /* Char                                   */
125   1,                             /* Chari                                  */
126   1,                             /* not                                    */
127   1,                             /* noti                                   */
128   /* Positive single-char repeats                                          */
129   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
130   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
131   1+IMM2_SIZE,                   /* exact                                  */
132   1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
133   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
135   1+IMM2_SIZE,                   /* exact I                                */
136   1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
137   /* Negative single-char repeats - only for chars < 256                   */
138   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
139   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
140   1+IMM2_SIZE,                   /* NOT exact                              */
141   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
142   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
143   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
144   1+IMM2_SIZE,                   /* NOT exact I                            */
145   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
146   /* Positive type repeats                                                 */
147   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
148   1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
149   1+IMM2_SIZE,                   /* Type exact                             */
150   1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
151   /* Character class & ref repeats                                         */
152   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
153   0, 0,                          /* CRRANGE, CRMINRANGE                    */
154   0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
155   0,                             /* CLASS                                  */
156   0,                             /* NCLASS                                 */
157   0,                             /* XCLASS - variable length               */
158   0,                             /* REF                                    */
159   0,                             /* REFI                                   */
160   0,                             /* DNREF                                  */
161   0,                             /* DNREFI                                 */
162   0,                             /* RECURSE                                */
163   0,                             /* CALLOUT                                */
164   0,                             /* Alt                                    */
165   0,                             /* Ket                                    */
166   0,                             /* KetRmax                                */
167   0,                             /* KetRmin                                */
168   0,                             /* KetRpos                                */
169   0,                             /* Reverse                                */
170   0,                             /* Assert                                 */
171   0,                             /* Assert not                             */
172   0,                             /* Assert behind                          */
173   0,                             /* Assert behind not                      */
174   0, 0,                          /* ONCE, ONCE_NC                          */
175   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
176   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
177   0, 0,                          /* CREF, DNCREF                           */
178   0, 0,                          /* RREF, DNRREF                           */
179   0, 0,                          /* FALSE, TRUE                            */
180   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
181   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
182   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
183   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
184   0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
185 };
186 
/* This table identifies those opcodes that inspect a character (non-zero
entry). It is used to remember the fact that a character could have been
inspected when the end of the subject is reached, which matters when testing
for a partial match. The table is indexed by opcode and must have exactly
OP_TABLE_LENGTH entries; internal_dfa_match() contains a compile-time check of
that. ***NOTE*** If the start of this table is modified, the two tables that
follow must also be modified. */

static const uint8_t poptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  1, 1,                          /* \P, \p                                 */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  1,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1,                             /* Char                                   */
  1,                             /* Chari                                  */
  1,                             /* not                                    */
  1,                             /* noti                                   */
  /* Positive single-char repeats                                          */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1, 1,                       /* upto, minupto, exact                   */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1, 1, 1,                       /* upto I, minupto I, exact I             */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
  /* Negative single-char repeats - only for chars < 256                   */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1, 1, 1,                       /* NOT upto, minupto, exact               */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
  /* Positive type repeats                                                 */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1, 1, 1,                       /* Type upto, minupto, exact              */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
  /* Character class & ref repeats                                         */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1,                          /* CRRANGE, CRMINRANGE                    */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  1,                             /* CLASS                                  */
  1,                             /* NCLASS                                 */
  1,                             /* XCLASS - variable length               */
  0,                             /* REF                                    */
  0,                             /* REFI                                   */
  0,                             /* DNREF                                  */
  0,                             /* DNREFI                                 */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* KetRpos                                */
  0,                             /* Reverse                                */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0, 0,                          /* ONCE, ONCE_NC                          */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0, 0,                          /* FALSE, TRUE                            */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
};
258 
259 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
260 and \w */
261 
262 static const uint8_t toptable1[] = {
263   0, 0, 0, 0, 0, 0,
264   ctype_digit, ctype_digit,
265   ctype_space, ctype_space,
266   ctype_word,  ctype_word,
267   0, 0                            /* OP_ANY, OP_ALLANY */
268 };
269 
270 static const uint8_t toptable2[] = {
271   0, 0, 0, 0, 0, 0,
272   ctype_digit, 0,
273   ctype_space, 0,
274   ctype_word,  0,
275   1, 1                            /* OP_ANY, OP_ALLANY */
276 };
277 
278 
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode (-ve has meaning) */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;
289 
/* Number of ints occupied by one stateblock; used to turn a workspace size in
ints into a number of states. The whole expansion is parenthesized so that it
behaves as a single operand wherever it is used. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
291 
292 
293 
/*************************************************
*     Match a Regular Expression - DFA engine    *
*************************************************/

/* This internal function applies a compiled pattern to a subject string,
starting at a given point, using a DFA engine. This function is called from the
external one, possibly multiple times if the pattern is not anchored. The
function calls itself recursively for some kinds of subpattern.

Arguments:
  mb                the match_data block with fixed information
  this_start_code   the opening bracket of this subexpression's code
  current_subject   where we currently are in the subject string
  start_offset      start offset in the subject string
  offsets           vector to contain the matching string offsets
  offsetcount       size of same
  workspace         vector of workspace
  wscount           size of same
  rlevel            function call recursion level

Returns:            > 0 => number of match offset pairs placed in offsets
                    = 0 => offsets overflowed; longest matches are present
                     -1 => failed to match
                   < -1 => some kind of unexpected problem

The following macros are used for adding states to the two state vectors (one
for the current character, one for the following character). */
321 
/* Append a state with opcode offset x and repeat count y to the active list
(the one for the current character). The data field of the slot is not written.
Expects the locals active_count, wscount and next_active_state to be in scope.
active_count is incremented whether or not there is room; if there is no room
the enclosing function returns PCRE2_ERROR_DFA_WSSIZE. The expansion is an
if/else statement without its final semicolon, so it must be followed by one. */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE
330 
/* As ADD_ACTIVE, but also stores z in the data field of the new state: append
{offset x, count y, data z} to the active list. Expects the locals
active_count, wscount and next_active_state to be in scope. active_count is
incremented whether or not there is room; if there is no room the enclosing
function returns PCRE2_ERROR_DFA_WSSIZE. Must be followed by a semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE
340 
/* Append a state with opcode offset x and repeat count y to the new list (the
one for the following character). The data field of the slot is not written.
Expects the locals new_count, wscount and next_new_state to be in scope.
new_count is incremented whether or not there is room; if there is no room the
enclosing function returns PCRE2_ERROR_DFA_WSSIZE. The expansion is an if/else
statement without its final semicolon, so it must be followed by one. */

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE
349 
/* As ADD_NEW, but also stores z in the data field of the new state: append
{offset x, count y, data z} to the new list. Expects the locals new_count,
wscount and next_new_state to be in scope. new_count is incremented whether or
not there is room; if there is no room the enclosing function returns
PCRE2_ERROR_DFA_WSSIZE. Must be followed by a semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    } \
  else return PCRE2_ERROR_DFA_WSSIZE
359 
360 /* And now, here is the code */
361 
362 static int
internal_dfa_match(dfa_match_block * mb,PCRE2_SPTR this_start_code,PCRE2_SPTR current_subject,PCRE2_SIZE start_offset,PCRE2_SIZE * offsets,uint32_t offsetcount,int * workspace,int wscount,int rlevel)363 internal_dfa_match(
364   dfa_match_block *mb,
365   PCRE2_SPTR this_start_code,
366   PCRE2_SPTR current_subject,
367   PCRE2_SIZE start_offset,
368   PCRE2_SIZE *offsets,
369   uint32_t offsetcount,
370   int *workspace,
371   int wscount,
372   int  rlevel)
373 {
374 stateblock *active_states, *new_states, *temp_states;
375 stateblock *next_active_state, *next_new_state;
376 
377 const uint8_t *ctypes, *lcc, *fcc;
378 PCRE2_SPTR ptr;
379 PCRE2_SPTR end_code;
380 PCRE2_SPTR first_op;
381 
382 dfa_recursion_info new_recursive;
383 
384 int active_count, new_count, match_count;
385 
386 /* Some fields in the mb block are frequently referenced, so we load them into
387 independent variables in the hope that this will perform better. */
388 
389 PCRE2_SPTR start_subject = mb->start_subject;
390 PCRE2_SPTR end_subject = mb->end_subject;
391 PCRE2_SPTR start_code = mb->start_code;
392 
393 #ifdef SUPPORT_UNICODE
394 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
395 #else
396 BOOL utf = FALSE;
397 #endif
398 
399 BOOL reset_could_continue = FALSE;
400 
401 rlevel++;
402 offsetcount &= (-2);
403 
404 wscount -= 2;
405 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
406           (2 * INTS_PER_STATEBLOCK);
407 
408 ctypes = mb->tables + ctypes_offset;
409 lcc = mb->tables + lcc_offset;
410 fcc = mb->tables + fcc_offset;
411 
412 match_count = PCRE2_ERROR_NOMATCH;   /* A negative number */
413 
414 active_states = (stateblock *)(workspace + 2);
415 next_new_state = new_states = active_states + wscount;
416 new_count = 0;
417 
418 first_op = this_start_code + 1 + LINK_SIZE +
419   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
420     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
421     ? IMM2_SIZE:0);
422 
423 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
424 the alternative states onto the list, and find out where the end is. This
425 makes is possible to use this function recursively, when we want to stop at a
426 matching internal ket rather than at the end.
427 
428 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
429 a backward assertion. In that case, we have to find out the maximum amount to
430 move back, and set up each alternative appropriately. */
431 
432 if (*first_op == OP_REVERSE)
433   {
434   int max_back = 0;
435   int gone_back;
436 
437   end_code = this_start_code;
438   do
439     {
440     int back = GET(end_code, 2+LINK_SIZE);
441     if (back > max_back) max_back = back;
442     end_code += GET(end_code, 1);
443     }
444   while (*end_code == OP_ALT);
445 
446   /* If we can't go back the amount required for the longest lookbehind
447   pattern, go back as far as we can; some alternatives may still be viable. */
448 
449 #ifdef SUPPORT_UNICODE
450   /* In character mode we have to step back character by character */
451 
452   if (utf)
453     {
454     for (gone_back = 0; gone_back < max_back; gone_back++)
455       {
456       if (current_subject <= start_subject) break;
457       current_subject--;
458       ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
459       }
460     }
461   else
462 #endif
463 
464   /* In byte-mode we can do this quickly. */
465 
466     {
467     gone_back = (current_subject - max_back < start_subject)?
468       (int)(current_subject - start_subject) : max_back;
469     current_subject -= gone_back;
470     }
471 
472   /* Save the earliest consulted character */
473 
474   if (current_subject < mb->start_used_ptr)
475     mb->start_used_ptr = current_subject;
476 
477   /* Now we can process the individual branches. */
478 
479   end_code = this_start_code;
480   do
481     {
482     int back = GET(end_code, 2+LINK_SIZE);
483     if (back <= gone_back)
484       {
485       int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
486       ADD_NEW_DATA(-bstate, 0, gone_back - back);
487       }
488     end_code += GET(end_code, 1);
489     }
490   while (*end_code == OP_ALT);
491  }
492 
493 /* This is the code for a "normal" subpattern (not a backward assertion). The
494 start of a whole pattern is always one of these. If we are at the top level,
495 we may be asked to restart matching from the same point that we reached for a
496 previous partial match. We still have to scan through the top-level branches to
497 find the end state. */
498 
499 else
500   {
501   end_code = this_start_code;
502 
503   /* Restarting */
504 
505   if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
506     {
507     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
508     new_count = workspace[1];
509     if (!workspace[0])
510       memcpy(new_states, active_states, new_count * sizeof(stateblock));
511     }
512 
513   /* Not restarting */
514 
515   else
516     {
517     int length = 1 + LINK_SIZE +
518       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
519         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
520         ? IMM2_SIZE:0);
521     do
522       {
523       ADD_NEW((int)(end_code - start_code + length), 0);
524       end_code += GET(end_code, 1);
525       length = 1 + LINK_SIZE;
526       }
527     while (*end_code == OP_ALT);
528     }
529   }
530 
531 workspace[0] = 0;    /* Bit indicating which vector is current */
532 
533 /* Loop for scanning the subject */
534 
535 ptr = current_subject;
536 for (;;)
537   {
538   int i, j;
539   int clen, dlen;
540   uint32_t c, d;
541   int forced_fail = 0;
542   BOOL partial_newline = FALSE;
543   BOOL could_continue = reset_could_continue;
544   reset_could_continue = FALSE;
545 
546   if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
547 
548   /* Make the new state list into the active state list and empty the
549   new state list. */
550 
551   temp_states = active_states;
552   active_states = new_states;
553   new_states = temp_states;
554   active_count = new_count;
555   new_count = 0;
556 
557   workspace[0] ^= 1;              /* Remember for the restarting feature */
558   workspace[1] = active_count;
559 
560   /* Set the pointers for adding new states */
561 
562   next_active_state = active_states + active_count;
563   next_new_state = new_states;
564 
565   /* Load the current character from the subject outside the loop, as many
566   different states may want to look at it, and we assume that at least one
567   will. */
568 
569   if (ptr < end_subject)
570     {
571     clen = 1;        /* Number of data items in the character */
572 #ifdef SUPPORT_UNICODE
573     GETCHARLENTEST(c, ptr, clen);
574 #else
575     c = *ptr;
576 #endif  /* SUPPORT_UNICODE */
577     }
578   else
579     {
580     clen = 0;        /* This indicates the end of the subject */
581     c = NOTACHAR;    /* This value should never actually be used */
582     }
583 
584   /* Scan up the active states and act on each one. The result of an action
585   may be to add more states to the currently active list (e.g. on hitting a
586   parenthesis) or it may be to put states on the new list, for considering
587   when we move the character pointer on. */
588 
589   for (i = 0; i < active_count; i++)
590     {
591     stateblock *current_state = active_states + i;
592     BOOL caseless = FALSE;
593     PCRE2_SPTR code;
594     int state_offset = current_state->offset;
595     int codevalue, rrc;
596     int count;
597 
598     /* A negative offset is a special case meaning "hold off going to this
599     (negated) state until the number of characters in the data field have
600     been skipped". If the could_continue flag was passed over from a previous
601     state, arrange for it to passed on. */
602 
603     if (state_offset < 0)
604       {
605       if (current_state->data > 0)
606         {
607         ADD_NEW_DATA(state_offset, current_state->count,
608           current_state->data - 1);
609         if (could_continue) reset_could_continue = TRUE;
610         continue;
611         }
612       else
613         {
614         current_state->offset = state_offset = -state_offset;
615         }
616       }
617 
618     /* Check for a duplicate state with the same count, and skip if found.
619     See the note at the head of this module about the possibility of improving
620     performance here. */
621 
622     for (j = 0; j < i; j++)
623       {
624       if (active_states[j].offset == state_offset &&
625           active_states[j].count == current_state->count)
626         goto NEXT_ACTIVE_STATE;
627       }
628 
629     /* The state offset is the offset to the opcode */
630 
631     code = start_code + state_offset;
632     codevalue = *code;
633 
634     /* If this opcode inspects a character, but we are at the end of the
635     subject, remember the fact for use when testing for a partial match. */
636 
637     if (clen == 0 && poptable[codevalue] != 0)
638       could_continue = TRUE;
639 
640     /* If this opcode is followed by an inline character, load it. It is
641     tempting to test for the presence of a subject character here, but that
642     is wrong, because sometimes zero repetitions of the subject are
643     permitted.
644 
645     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
646     argument that is not a data character - but is always one byte long because
647     the values are small. We have to take special action to deal with  \P, \p,
648     \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
649     these ones to new opcodes. */
650 
651     if (coptable[codevalue] > 0)
652       {
653       dlen = 1;
654 #ifdef SUPPORT_UNICODE
655       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
656 #endif  /* SUPPORT_UNICODE */
657       d = code[coptable[codevalue]];
658       if (codevalue >= OP_TYPESTAR)
659         {
660         switch(d)
661           {
662           case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
663           case OP_NOTPROP:
664           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
665           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
666           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
667           case OP_NOT_HSPACE:
668           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
669           case OP_NOT_VSPACE:
670           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
671           default: break;
672           }
673         }
674       }
675     else
676       {
677       dlen = 0;         /* Not strictly necessary, but compilers moan */
678       d = NOTACHAR;     /* if these variables are not set. */
679       }
680 
681 
682     /* Now process the individual opcodes */
683 
684     switch (codevalue)
685       {
686 /* ========================================================================== */
687       /* These cases are never obeyed. This is a fudge that causes a compile-
688       time error if the vectors coptable or poptable, which are indexed by
689       opcode, are not the correct length. It seems to be the only way to do
690       such a check at compile time, as the sizeof() operator does not work
691       in the C preprocessor. */
692 
693       case OP_TABLE_LENGTH:
694       case OP_TABLE_LENGTH +
695         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
696          (sizeof(poptable) == OP_TABLE_LENGTH)):
697       break;
698 
699 /* ========================================================================== */
700       /* Reached a closing bracket. If not at the end of the pattern, carry
701       on with the next opcode. For repeating opcodes, also add the repeat
702       state. Note that KETRPOS will always be encountered at the end of the
703       subpattern, because the possessive subpattern repeats are always handled
704       using recursive calls. Thus, it never adds any new states.
705 
706       At the end of the (sub)pattern, unless we have an empty string and
707       PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
708       start of the subject, save the match data, shifting up all previous
709       matches so we always have the longest first. */
710 
711       case OP_KET:
712       case OP_KETRMIN:
713       case OP_KETRMAX:
714       case OP_KETRPOS:
715       if (code != end_code)
716         {
717         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
718         if (codevalue != OP_KET)
719           {
720           ADD_ACTIVE(state_offset - GET(code, 1), 0);
721           }
722         }
723       else
724         {
725         if (ptr > current_subject ||
726             ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
727               ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
728                 current_subject > start_subject + mb->start_offset)))
729           {
730           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
731             else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
732               match_count = 0;
733           count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
734           if (count > 0) memmove(offsets + 2, offsets, count * sizeof(PCRE2_SIZE));
735           if (offsetcount >= 2)
736             {
737             offsets[0] = (int)(current_subject - start_subject);
738             offsets[1] = (int)(ptr - start_subject);
739             }
740           if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
741           }
742         }
743       break;
744 
745 /* ========================================================================== */
746       /* These opcodes add to the current list of states without looking
747       at the current character. */
748 
749       /*-----------------------------------------------------------------*/
750       case OP_ALT:
751       do { code += GET(code, 1); } while (*code == OP_ALT);
752       ADD_ACTIVE((int)(code - start_code), 0);
753       break;
754 
755       /*-----------------------------------------------------------------*/
756       case OP_BRA:
757       case OP_SBRA:
758       do
759         {
760         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
761         code += GET(code, 1);
762         }
763       while (*code == OP_ALT);
764       break;
765 
766       /*-----------------------------------------------------------------*/
767       case OP_CBRA:
768       case OP_SCBRA:
769       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
770       code += GET(code, 1);
771       while (*code == OP_ALT)
772         {
773         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
774         code += GET(code, 1);
775         }
776       break;
777 
778       /*-----------------------------------------------------------------*/
779       case OP_BRAZERO:
780       case OP_BRAMINZERO:
781       ADD_ACTIVE(state_offset + 1, 0);
782       code += 1 + GET(code, 2);
783       while (*code == OP_ALT) code += GET(code, 1);
784       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
785       break;
786 
787       /*-----------------------------------------------------------------*/
788       case OP_SKIPZERO:
789       code += 1 + GET(code, 2);
790       while (*code == OP_ALT) code += GET(code, 1);
791       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
792       break;
793 
794       /*-----------------------------------------------------------------*/
795       case OP_CIRC:
796       if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
797         { ADD_ACTIVE(state_offset + 1, 0); }
798       break;
799 
800       /*-----------------------------------------------------------------*/
801       case OP_CIRCM:
802       if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
803           (ptr != end_subject && WAS_NEWLINE(ptr)))
804         { ADD_ACTIVE(state_offset + 1, 0); }
805       break;
806 
807       /*-----------------------------------------------------------------*/
808       case OP_EOD:
809       if (ptr >= end_subject)
810         {
811         if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
812           could_continue = TRUE;
813         else { ADD_ACTIVE(state_offset + 1, 0); }
814         }
815       break;
816 
817       /*-----------------------------------------------------------------*/
818       case OP_SOD:
819       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
820       break;
821 
822       /*-----------------------------------------------------------------*/
823       case OP_SOM:
824       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
825       break;
826 
827 
828 /* ========================================================================== */
829       /* These opcodes inspect the next subject character, and sometimes
830       the previous one as well, but do not have an argument. The variable
831       clen contains the length of the current character and is zero if we are
832       at the end of the subject. */
833 
834       /*-----------------------------------------------------------------*/
835       case OP_ANY:
836       if (clen > 0 && !IS_NEWLINE(ptr))
837         {
838         if (ptr + 1 >= mb->end_subject &&
839             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
840             NLBLOCK->nltype == NLTYPE_FIXED &&
841             NLBLOCK->nllen == 2 &&
842             c == NLBLOCK->nl[0])
843           {
844           could_continue = partial_newline = TRUE;
845           }
846         else
847           {
848           ADD_NEW(state_offset + 1, 0);
849           }
850         }
851       break;
852 
853       /*-----------------------------------------------------------------*/
854       case OP_ALLANY:
855       if (clen > 0)
856         { ADD_NEW(state_offset + 1, 0); }
857       break;
858 
859       /*-----------------------------------------------------------------*/
860       case OP_EODN:
861       if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
862         could_continue = TRUE;
863       else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
864         { ADD_ACTIVE(state_offset + 1, 0); }
865       break;
866 
867       /*-----------------------------------------------------------------*/
868       case OP_DOLL:
869       if ((mb->moptions & PCRE2_NOTEOL) == 0)
870         {
871         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
872           could_continue = TRUE;
873         else if (clen == 0 ||
874             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
875                (ptr == end_subject - mb->nllen)
876             ))
877           { ADD_ACTIVE(state_offset + 1, 0); }
878         else if (ptr + 1 >= mb->end_subject &&
879                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
880                  NLBLOCK->nltype == NLTYPE_FIXED &&
881                  NLBLOCK->nllen == 2 &&
882                  c == NLBLOCK->nl[0])
883           {
884           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
885             {
886             reset_could_continue = TRUE;
887             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
888             }
889           else could_continue = partial_newline = TRUE;
890           }
891         }
892       break;
893 
894       /*-----------------------------------------------------------------*/
895       case OP_DOLLM:
896       if ((mb->moptions & PCRE2_NOTEOL) == 0)
897         {
898         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
899           could_continue = TRUE;
900         else if (clen == 0 ||
901             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
902           { ADD_ACTIVE(state_offset + 1, 0); }
903         else if (ptr + 1 >= mb->end_subject &&
904                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
905                  NLBLOCK->nltype == NLTYPE_FIXED &&
906                  NLBLOCK->nllen == 2 &&
907                  c == NLBLOCK->nl[0])
908           {
909           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
910             {
911             reset_could_continue = TRUE;
912             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
913             }
914           else could_continue = partial_newline = TRUE;
915           }
916         }
917       else if (IS_NEWLINE(ptr))
918         { ADD_ACTIVE(state_offset + 1, 0); }
919       break;
920 
921       /*-----------------------------------------------------------------*/
922 
923       case OP_DIGIT:
924       case OP_WHITESPACE:
925       case OP_WORDCHAR:
926       if (clen > 0 && c < 256 &&
927             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
928         { ADD_NEW(state_offset + 1, 0); }
929       break;
930 
931       /*-----------------------------------------------------------------*/
932       case OP_NOT_DIGIT:
933       case OP_NOT_WHITESPACE:
934       case OP_NOT_WORDCHAR:
935       if (clen > 0 && (c >= 256 ||
936             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
937         { ADD_NEW(state_offset + 1, 0); }
938       break;
939 
940       /*-----------------------------------------------------------------*/
941       case OP_WORD_BOUNDARY:
942       case OP_NOT_WORD_BOUNDARY:
943         {
944         int left_word, right_word;
945 
946         if (ptr > start_subject)
947           {
948           PCRE2_SPTR temp = ptr - 1;
949           if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
950 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
951           if (utf) { BACKCHAR(temp); }
952 #endif
953           GETCHARTEST(d, temp);
954 #ifdef SUPPORT_UNICODE
955           if ((mb->poptions & PCRE2_UCP) != 0)
956             {
957             if (d == '_') left_word = TRUE; else
958               {
959               int cat = UCD_CATEGORY(d);
960               left_word = (cat == ucp_L || cat == ucp_N);
961               }
962             }
963           else
964 #endif
965           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
966           }
967         else left_word = FALSE;
968 
969         if (clen > 0)
970           {
971           if (ptr >= mb->last_used_ptr)
972             {
973             PCRE2_SPTR temp = ptr + 1;
974 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
975             if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
976 #endif
977             mb->last_used_ptr = temp;
978             }
979 #ifdef SUPPORT_UNICODE
980           if ((mb->poptions & PCRE2_UCP) != 0)
981             {
982             if (c == '_') right_word = TRUE; else
983               {
984               int cat = UCD_CATEGORY(c);
985               right_word = (cat == ucp_L || cat == ucp_N);
986               }
987             }
988           else
989 #endif
990           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
991           }
992         else right_word = FALSE;
993 
994         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
995           { ADD_ACTIVE(state_offset + 1, 0); }
996         }
997       break;
998 
999 
1000       /*-----------------------------------------------------------------*/
1001       /* Check the next character by Unicode property. We will get here only
1002       if the support is in the binary; otherwise a compile-time error occurs.
1003       */
1004 
1005 #ifdef SUPPORT_UNICODE
1006       case OP_PROP:
1007       case OP_NOTPROP:
1008       if (clen > 0)
1009         {
1010         BOOL OK;
1011         const uint32_t *cp;
1012         const ucd_record * prop = GET_UCD(c);
1013         switch(code[1])
1014           {
1015           case PT_ANY:
1016           OK = TRUE;
1017           break;
1018 
1019           case PT_LAMP:
1020           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1021                prop->chartype == ucp_Lt;
1022           break;
1023 
1024           case PT_GC:
1025           OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1026           break;
1027 
1028           case PT_PC:
1029           OK = prop->chartype == code[2];
1030           break;
1031 
1032           case PT_SC:
1033           OK = prop->script == code[2];
1034           break;
1035 
1036           /* These are specials for combination cases. */
1037 
1038           case PT_ALNUM:
1039           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1040                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1041           break;
1042 
1043           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1044           which means that Perl space and POSIX space are now identical. PCRE
1045           was changed at release 8.34. */
1046 
1047           case PT_SPACE:    /* Perl space */
1048           case PT_PXSPACE:  /* POSIX space */
1049           switch(c)
1050             {
1051             HSPACE_CASES:
1052             VSPACE_CASES:
1053             OK = TRUE;
1054             break;
1055 
1056             default:
1057             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1058             break;
1059             }
1060           break;
1061 
1062           case PT_WORD:
1063           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1064                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1065                c == CHAR_UNDERSCORE;
1066           break;
1067 
1068           case PT_CLIST:
1069           cp = PRIV(ucd_caseless_sets) + code[2];
1070           for (;;)
1071             {
1072             if (c < *cp) { OK = FALSE; break; }
1073             if (c == *cp++) { OK = TRUE; break; }
1074             }
1075           break;
1076 
1077           case PT_UCNC:
1078           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1079                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1080                c >= 0xe000;
1081           break;
1082 
1083           /* Should never occur, but keep compilers from grumbling. */
1084 
1085           default:
1086           OK = codevalue != OP_PROP;
1087           break;
1088           }
1089 
1090         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1091         }
1092       break;
1093 #endif
1094 
1095 
1096 
1097 /* ========================================================================== */
1098       /* These opcodes likewise inspect the subject character, but have an
1099       argument that is not a data character. It is one of: OP_ANY, OP_ALLANY,
1100       OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE, OP_WORDCHAR,
1101       OP_NOT_WORDCHAR. The value is loaded into d. */
1102 
1103       case OP_TYPEPLUS:
1104       case OP_TYPEMINPLUS:
1105       case OP_TYPEPOSPLUS:
1106       count = current_state->count;  /* Already matched */
1107       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1108       if (clen > 0)
1109         {
1110         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1111             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1112             NLBLOCK->nltype == NLTYPE_FIXED &&
1113             NLBLOCK->nllen == 2 &&
1114             c == NLBLOCK->nl[0])
1115           {
1116           could_continue = partial_newline = TRUE;
1117           }
1118         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1119             (c < 256 &&
1120               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1121               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1122           {
1123           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1124             {
1125             active_count--;            /* Remove non-match possibility */
1126             next_active_state--;
1127             }
1128           count++;
1129           ADD_NEW(state_offset, count);
1130           }
1131         }
1132       break;
1133 
1134       /*-----------------------------------------------------------------*/
1135       case OP_TYPEQUERY:
1136       case OP_TYPEMINQUERY:
1137       case OP_TYPEPOSQUERY:
1138       ADD_ACTIVE(state_offset + 2, 0);
1139       if (clen > 0)
1140         {
1141         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1142             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1143             NLBLOCK->nltype == NLTYPE_FIXED &&
1144             NLBLOCK->nllen == 2 &&
1145             c == NLBLOCK->nl[0])
1146           {
1147           could_continue = partial_newline = TRUE;
1148           }
1149         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1150             (c < 256 &&
1151               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1152               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1153           {
1154           if (codevalue == OP_TYPEPOSQUERY)
1155             {
1156             active_count--;            /* Remove non-match possibility */
1157             next_active_state--;
1158             }
1159           ADD_NEW(state_offset + 2, 0);
1160           }
1161         }
1162       break;
1163 
1164       /*-----------------------------------------------------------------*/
1165       case OP_TYPESTAR:
1166       case OP_TYPEMINSTAR:
1167       case OP_TYPEPOSSTAR:
1168       ADD_ACTIVE(state_offset + 2, 0);
1169       if (clen > 0)
1170         {
1171         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1172             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1173             NLBLOCK->nltype == NLTYPE_FIXED &&
1174             NLBLOCK->nllen == 2 &&
1175             c == NLBLOCK->nl[0])
1176           {
1177           could_continue = partial_newline = TRUE;
1178           }
1179         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1180             (c < 256 &&
1181               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1182               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1183           {
1184           if (codevalue == OP_TYPEPOSSTAR)
1185             {
1186             active_count--;            /* Remove non-match possibility */
1187             next_active_state--;
1188             }
1189           ADD_NEW(state_offset, 0);
1190           }
1191         }
1192       break;
1193 
1194       /*-----------------------------------------------------------------*/
1195       case OP_TYPEEXACT:
1196       count = current_state->count;  /* Number already matched */
1197       if (clen > 0)
1198         {
1199         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1200             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1201             NLBLOCK->nltype == NLTYPE_FIXED &&
1202             NLBLOCK->nllen == 2 &&
1203             c == NLBLOCK->nl[0])
1204           {
1205           could_continue = partial_newline = TRUE;
1206           }
1207         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1208             (c < 256 &&
1209               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1210               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1211           {
1212           if (++count >= (int)GET2(code, 1))
1213             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1214           else
1215             { ADD_NEW(state_offset, count); }
1216           }
1217         }
1218       break;
1219 
1220       /*-----------------------------------------------------------------*/
1221       case OP_TYPEUPTO:
1222       case OP_TYPEMINUPTO:
1223       case OP_TYPEPOSUPTO:
1224       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1225       count = current_state->count;  /* Number already matched */
1226       if (clen > 0)
1227         {
1228         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1229             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1230             NLBLOCK->nltype == NLTYPE_FIXED &&
1231             NLBLOCK->nllen == 2 &&
1232             c == NLBLOCK->nl[0])
1233           {
1234           could_continue = partial_newline = TRUE;
1235           }
1236         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1237             (c < 256 &&
1238               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1239               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1240           {
1241           if (codevalue == OP_TYPEPOSUPTO)
1242             {
1243             active_count--;           /* Remove non-match possibility */
1244             next_active_state--;
1245             }
1246           if (++count >= (int)GET2(code, 1))
1247             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1248           else
1249             { ADD_NEW(state_offset, count); }
1250           }
1251         }
1252       break;
1253 
1254 /* ========================================================================== */
1255       /* These are virtual opcodes that are used when something like
1256       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
1257       OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. It keeps the
1258       code above fast for the other cases. The argument is in variable d. */
1259 
1260 #ifdef SUPPORT_UNICODE
1261       case OP_PROP_EXTRA + OP_TYPEPLUS:
1262       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1263       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1264       count = current_state->count;           /* Already matched */
1265       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1266       if (clen > 0)
1267         {
1268         BOOL OK;
1269         const uint32_t *cp;
1270         const ucd_record * prop = GET_UCD(c);
1271         switch(code[2])
1272           {
1273           case PT_ANY:
1274           OK = TRUE;
1275           break;
1276 
1277           case PT_LAMP:
1278           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1279             prop->chartype == ucp_Lt;
1280           break;
1281 
1282           case PT_GC:
1283           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1284           break;
1285 
1286           case PT_PC:
1287           OK = prop->chartype == code[3];
1288           break;
1289 
1290           case PT_SC:
1291           OK = prop->script == code[3];
1292           break;
1293 
1294           /* These are specials for combination cases. */
1295 
1296           case PT_ALNUM:
1297           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1298                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1299           break;
1300 
1301           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1302           which means that Perl space and POSIX space are now identical. PCRE
1303           was changed at release 8.34. */
1304 
1305           case PT_SPACE:    /* Perl space */
1306           case PT_PXSPACE:  /* POSIX space */
1307           switch(c)
1308             {
1309             HSPACE_CASES:
1310             VSPACE_CASES:
1311             OK = TRUE;
1312             break;
1313 
1314             default:
1315             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1316             break;
1317             }
1318           break;
1319 
1320           case PT_WORD:
1321           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1322                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1323                c == CHAR_UNDERSCORE;
1324           break;
1325 
1326           case PT_CLIST:
1327           cp = PRIV(ucd_caseless_sets) + code[3];
1328           for (;;)
1329             {
1330             if (c < *cp) { OK = FALSE; break; }
1331             if (c == *cp++) { OK = TRUE; break; }
1332             }
1333           break;
1334 
1335           case PT_UCNC:
1336           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1337                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1338                c >= 0xe000;
1339           break;
1340 
1341           /* Should never occur, but keep compilers from grumbling. */
1342 
1343           default:
1344           OK = codevalue != OP_PROP;
1345           break;
1346           }
1347 
1348         if (OK == (d == OP_PROP))
1349           {
1350           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1351             {
1352             active_count--;           /* Remove non-match possibility */
1353             next_active_state--;
1354             }
1355           count++;
1356           ADD_NEW(state_offset, count);
1357           }
1358         }
1359       break;
1360 
1361       /*-----------------------------------------------------------------*/
1362       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1363       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1364       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1365       count = current_state->count;  /* Already matched */
1366       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1367       if (clen > 0)
1368         {
1369         int lgb, rgb;
1370         PCRE2_SPTR nptr = ptr + clen;
1371         int ncount = 0;
1372         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1373           {
1374           active_count--;           /* Remove non-match possibility */
1375           next_active_state--;
1376           }
1377         lgb = UCD_GRAPHBREAK(c);
1378         while (nptr < end_subject)
1379           {
1380           dlen = 1;
1381           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1382           rgb = UCD_GRAPHBREAK(d);
1383           if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1384           ncount++;
1385           lgb = rgb;
1386           nptr += dlen;
1387           }
1388         count++;
1389         ADD_NEW_DATA(-state_offset, count, ncount);
1390         }
1391       break;
1392 #endif
1393 
1394       /*-----------------------------------------------------------------*/
1395       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1396       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1397       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1398       count = current_state->count;  /* Already matched */
1399       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1400       if (clen > 0)
1401         {
1402         int ncount = 0;
1403         switch (c)
1404           {
1405           case CHAR_VT:
1406           case CHAR_FF:
1407           case CHAR_NEL:
1408 #ifndef EBCDIC
1409           case 0x2028:
1410           case 0x2029:
1411 #endif  /* Not EBCDIC */
1412           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1413           goto ANYNL01;
1414 
1415           case CHAR_CR:
1416           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1417           /* Fall through */
1418 
1419           ANYNL01:
1420           case CHAR_LF:
1421           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1422             {
1423             active_count--;           /* Remove non-match possibility */
1424             next_active_state--;
1425             }
1426           count++;
1427           ADD_NEW_DATA(-state_offset, count, ncount);
1428           break;
1429 
1430           default:
1431           break;
1432           }
1433         }
1434       break;
1435 
1436       /*-----------------------------------------------------------------*/
1437       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1438       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1439       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1440       count = current_state->count;  /* Already matched */
1441       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1442       if (clen > 0)
1443         {
1444         BOOL OK;
1445         switch (c)
1446           {
1447           VSPACE_CASES:
1448           OK = TRUE;
1449           break;
1450 
1451           default:
1452           OK = FALSE;
1453           break;
1454           }
1455 
1456         if (OK == (d == OP_VSPACE))
1457           {
1458           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1459             {
1460             active_count--;           /* Remove non-match possibility */
1461             next_active_state--;
1462             }
1463           count++;
1464           ADD_NEW_DATA(-state_offset, count, 0);
1465           }
1466         }
1467       break;
1468 
1469       /*-----------------------------------------------------------------*/
1470       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1471       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1472       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1473       count = current_state->count;  /* Already matched */
1474       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1475       if (clen > 0)
1476         {
1477         BOOL OK;
1478         switch (c)
1479           {
1480           HSPACE_CASES:
1481           OK = TRUE;
1482           break;
1483 
1484           default:
1485           OK = FALSE;
1486           break;
1487           }
1488 
1489         if (OK == (d == OP_HSPACE))
1490           {
1491           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1492             {
1493             active_count--;           /* Remove non-match possibility */
1494             next_active_state--;
1495             }
1496           count++;
1497           ADD_NEW_DATA(-state_offset, count, 0);
1498           }
1499         }
1500       break;
1501 
1502       /*-----------------------------------------------------------------*/
1503 #ifdef SUPPORT_UNICODE
1504       case OP_PROP_EXTRA + OP_TYPEQUERY:
1505       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1506       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1507       count = 4;
1508       goto QS1;
1509 
1510       case OP_PROP_EXTRA + OP_TYPESTAR:
1511       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1512       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1513       count = 0;
1514 
1515       QS1:
1516 
1517       ADD_ACTIVE(state_offset + 4, 0);
1518       if (clen > 0)
1519         {
1520         BOOL OK;
1521         const uint32_t *cp;
1522         const ucd_record * prop = GET_UCD(c);
1523         switch(code[2])
1524           {
1525           case PT_ANY:
1526           OK = TRUE;
1527           break;
1528 
1529           case PT_LAMP:
1530           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1531             prop->chartype == ucp_Lt;
1532           break;
1533 
1534           case PT_GC:
1535           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1536           break;
1537 
1538           case PT_PC:
1539           OK = prop->chartype == code[3];
1540           break;
1541 
1542           case PT_SC:
1543           OK = prop->script == code[3];
1544           break;
1545 
1546           /* These are specials for combination cases. */
1547 
1548           case PT_ALNUM:
1549           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1550                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1551           break;
1552 
1553           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1554           which means that Perl space and POSIX space are now identical. PCRE
1555           was changed at release 8.34. */
1556 
1557           case PT_SPACE:    /* Perl space */
1558           case PT_PXSPACE:  /* POSIX space */
1559           switch(c)
1560             {
1561             HSPACE_CASES:
1562             VSPACE_CASES:
1563             OK = TRUE;
1564             break;
1565 
1566             default:
1567             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1568             break;
1569             }
1570           break;
1571 
1572           case PT_WORD:
1573           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1574                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1575                c == CHAR_UNDERSCORE;
1576           break;
1577 
1578           case PT_CLIST:
1579           cp = PRIV(ucd_caseless_sets) + code[3];
1580           for (;;)
1581             {
1582             if (c < *cp) { OK = FALSE; break; }
1583             if (c == *cp++) { OK = TRUE; break; }
1584             }
1585           break;
1586 
1587           case PT_UCNC:
1588           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1589                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1590                c >= 0xe000;
1591           break;
1592 
1593           /* Should never occur, but keep compilers from grumbling. */
1594 
1595           default:
1596           OK = codevalue != OP_PROP;
1597           break;
1598           }
1599 
1600         if (OK == (d == OP_PROP))
1601           {
1602           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1603               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1604             {
1605             active_count--;           /* Remove non-match possibility */
1606             next_active_state--;
1607             }
1608           ADD_NEW(state_offset + count, 0);
1609           }
1610         }
1611       break;
1612 
1613       /*-----------------------------------------------------------------*/
1614       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1615       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1616       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1617       count = 2;
1618       goto QS2;
1619 
1620       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1621       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1622       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1623       count = 0;
1624 
1625       QS2:
1626 
1627       ADD_ACTIVE(state_offset + 2, 0);
1628       if (clen > 0)
1629         {
1630         int lgb, rgb;
1631         PCRE2_SPTR nptr = ptr + clen;
1632         int ncount = 0;
1633         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1634             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1635           {
1636           active_count--;           /* Remove non-match possibility */
1637           next_active_state--;
1638           }
1639         lgb = UCD_GRAPHBREAK(c);
1640         while (nptr < end_subject)
1641           {
1642           dlen = 1;
1643           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1644           rgb = UCD_GRAPHBREAK(d);
1645           if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1646           ncount++;
1647           lgb = rgb;
1648           nptr += dlen;
1649           }
1650         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1651         }
1652       break;
1653 #endif
1654 
1655       /*-----------------------------------------------------------------*/
1656       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1657       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1658       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1659       count = 2;
1660       goto QS3;
1661 
1662       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1663       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1664       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1665       count = 0;
1666 
1667       QS3:
1668       ADD_ACTIVE(state_offset + 2, 0);
1669       if (clen > 0)
1670         {
1671         int ncount = 0;
1672         switch (c)
1673           {
1674           case CHAR_VT:
1675           case CHAR_FF:
1676           case CHAR_NEL:
1677 #ifndef EBCDIC
1678           case 0x2028:
1679           case 0x2029:
1680 #endif  /* Not EBCDIC */
1681           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1682           goto ANYNL02;
1683 
1684           case CHAR_CR:
1685           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1686           /* Fall through */
1687 
1688           ANYNL02:
1689           case CHAR_LF:
1690           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1691               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1692             {
1693             active_count--;           /* Remove non-match possibility */
1694             next_active_state--;
1695             }
1696           ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1697           break;
1698 
1699           default:
1700           break;
1701           }
1702         }
1703       break;
1704 
1705       /*-----------------------------------------------------------------*/
1706       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1707       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1708       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1709       count = 2;
1710       goto QS4;
1711 
1712       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1713       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1714       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1715       count = 0;
1716 
1717       QS4:
1718       ADD_ACTIVE(state_offset + 2, 0);
1719       if (clen > 0)
1720         {
1721         BOOL OK;
1722         switch (c)
1723           {
1724           VSPACE_CASES:
1725           OK = TRUE;
1726           break;
1727 
1728           default:
1729           OK = FALSE;
1730           break;
1731           }
1732         if (OK == (d == OP_VSPACE))
1733           {
1734           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1735               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1736             {
1737             active_count--;           /* Remove non-match possibility */
1738             next_active_state--;
1739             }
1740           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1741           }
1742         }
1743       break;
1744 
1745       /*-----------------------------------------------------------------*/
1746       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1747       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1748       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1749       count = 2;
1750       goto QS5;
1751 
1752       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1753       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1754       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1755       count = 0;
1756 
1757       QS5:
1758       ADD_ACTIVE(state_offset + 2, 0);
1759       if (clen > 0)
1760         {
1761         BOOL OK;
1762         switch (c)
1763           {
1764           HSPACE_CASES:
1765           OK = TRUE;
1766           break;
1767 
1768           default:
1769           OK = FALSE;
1770           break;
1771           }
1772 
1773         if (OK == (d == OP_HSPACE))
1774           {
1775           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1776               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1777             {
1778             active_count--;           /* Remove non-match possibility */
1779             next_active_state--;
1780             }
1781           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1782           }
1783         }
1784       break;
1785 
1786       /*-----------------------------------------------------------------*/
1787 #ifdef SUPPORT_UNICODE
1788       case OP_PROP_EXTRA + OP_TYPEEXACT:
1789       case OP_PROP_EXTRA + OP_TYPEUPTO:
1790       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1791       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1792       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1793         { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1794       count = current_state->count;  /* Number already matched */
1795       if (clen > 0)
1796         {
1797         BOOL OK;
1798         const uint32_t *cp;
1799         const ucd_record * prop = GET_UCD(c);
1800         switch(code[1 + IMM2_SIZE + 1])
1801           {
1802           case PT_ANY:
1803           OK = TRUE;
1804           break;
1805 
1806           case PT_LAMP:
1807           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1808             prop->chartype == ucp_Lt;
1809           break;
1810 
1811           case PT_GC:
1812           OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1813           break;
1814 
1815           case PT_PC:
1816           OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1817           break;
1818 
1819           case PT_SC:
1820           OK = prop->script == code[1 + IMM2_SIZE + 2];
1821           break;
1822 
1823           /* These are specials for combination cases. */
1824 
1825           case PT_ALNUM:
1826           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1827                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1828           break;
1829 
1830           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1831           which means that Perl space and POSIX space are now identical. PCRE
1832           was changed at release 8.34. */
1833 
1834           case PT_SPACE:    /* Perl space */
1835           case PT_PXSPACE:  /* POSIX space */
1836           switch(c)
1837             {
1838             HSPACE_CASES:
1839             VSPACE_CASES:
1840             OK = TRUE;
1841             break;
1842 
1843             default:
1844             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1845             break;
1846             }
1847           break;
1848 
1849           case PT_WORD:
1850           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1851                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1852                c == CHAR_UNDERSCORE;
1853           break;
1854 
1855           case PT_CLIST:
1856           cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1857           for (;;)
1858             {
1859             if (c < *cp) { OK = FALSE; break; }
1860             if (c == *cp++) { OK = TRUE; break; }
1861             }
1862           break;
1863 
1864           case PT_UCNC:
1865           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1866                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1867                c >= 0xe000;
1868           break;
1869 
1870           /* Should never occur, but keep compilers from grumbling. */
1871 
1872           default:
1873           OK = codevalue != OP_PROP;
1874           break;
1875           }
1876 
1877         if (OK == (d == OP_PROP))
1878           {
1879           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1880             {
1881             active_count--;           /* Remove non-match possibility */
1882             next_active_state--;
1883             }
1884           if (++count >= (int)GET2(code, 1))
1885             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1886           else
1887             { ADD_NEW(state_offset, count); }
1888           }
1889         }
1890       break;
1891 
1892       /*-----------------------------------------------------------------*/
1893       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1894       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1895       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1896       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1897       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1898         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1899       count = current_state->count;  /* Number already matched */
1900       if (clen > 0)
1901         {
1902         int lgb, rgb;
1903         PCRE2_SPTR nptr = ptr + clen;
1904         int ncount = 0;
1905         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1906           {
1907           active_count--;           /* Remove non-match possibility */
1908           next_active_state--;
1909           }
1910         lgb = UCD_GRAPHBREAK(c);
1911         while (nptr < end_subject)
1912           {
1913           dlen = 1;
1914           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1915           rgb = UCD_GRAPHBREAK(d);
1916           if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1917           ncount++;
1918           lgb = rgb;
1919           nptr += dlen;
1920           }
1921         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1922             reset_could_continue = TRUE;
1923         if (++count >= (int)GET2(code, 1))
1924           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1925         else
1926           { ADD_NEW_DATA(-state_offset, count, ncount); }
1927         }
1928       break;
1929 #endif
1930 
1931       /*-----------------------------------------------------------------*/
1932       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1933       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1934       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1935       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1936       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1937         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1938       count = current_state->count;  /* Number already matched */
1939       if (clen > 0)
1940         {
1941         int ncount = 0;
1942         switch (c)
1943           {
1944           case CHAR_VT:
1945           case CHAR_FF:
1946           case CHAR_NEL:
1947 #ifndef EBCDIC
1948           case 0x2028:
1949           case 0x2029:
1950 #endif  /* Not EBCDIC */
1951           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1952           goto ANYNL03;
1953 
1954           case CHAR_CR:
1955           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1956           /* Fall through */
1957 
1958           ANYNL03:
1959           case CHAR_LF:
1960           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1961             {
1962             active_count--;           /* Remove non-match possibility */
1963             next_active_state--;
1964             }
1965           if (++count >= (int)GET2(code, 1))
1966             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1967           else
1968             { ADD_NEW_DATA(-state_offset, count, ncount); }
1969           break;
1970 
1971           default:
1972           break;
1973           }
1974         }
1975       break;
1976 
1977       /*-----------------------------------------------------------------*/
1978       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1979       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1980       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1981       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1982       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1983         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1984       count = current_state->count;  /* Number already matched */
1985       if (clen > 0)
1986         {
1987         BOOL OK;
1988         switch (c)
1989           {
1990           VSPACE_CASES:
1991           OK = TRUE;
1992           break;
1993
1994           default:
1995           OK = FALSE;   /* Last label in this switch, so no break is needed */
1996           }
1997         /* Continue only when OK agrees with whether d is OP_VSPACE. */
1998         if (OK == (d == OP_VSPACE))
1999           {
2000           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2001             {
2002             active_count--;           /* Remove non-match possibility */
2003             next_active_state--;
2004             }
2005           if (++count >= (int)GET2(code, 1))   /* Limit read from code[1] reached */
2006             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2007           else                                 /* Same state, updated count */
2008             { ADD_NEW_DATA(-state_offset, count, 0); }
2009           }
2010         }
2011       break;
2012 
2013       /*-----------------------------------------------------------------*/
2014       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2015       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2016       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2017       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2018       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2019         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2020       count = current_state->count;  /* Number already matched */
2021       if (clen > 0)
2022         {
2023         BOOL OK;
2024         switch (c)
2025           {
2026           HSPACE_CASES:
2027           OK = TRUE;
2028           break;
2029 
2030           default:
2031           OK = FALSE;
2032           break;
2033           }
2034 
2035         if (OK == (d == OP_HSPACE))
2036           {
2037           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2038             {
2039             active_count--;           /* Remove non-match possibility */
2040             next_active_state--;
2041             }
2042           if (++count >= (int)GET2(code, 1))
2043             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2044           else
2045             { ADD_NEW_DATA(-state_offset, count, 0); }
2046           }
2047         }
2048       break;
2049 
2050 /* ========================================================================== */
2051       /* These opcodes are followed by a character that is usually compared
2052       to the current subject character; it is loaded into d. We still get
2053       here even if there is no subject character, because in some cases zero
2054       repetitions are permitted. */
2055 
2056       /*-----------------------------------------------------------------*/
2057       case OP_CHAR:
2058       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2059       break;
2060 
2061       /*-----------------------------------------------------------------*/
2062       case OP_CHARI:
2063       if (clen == 0) break;
2064 
2065 #ifdef SUPPORT_UNICODE
2066       if (utf)
2067         {
2068         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2069           {
2070           unsigned int othercase;
2071           if (c < 128)
2072             othercase = fcc[c];
2073           else
2074             othercase = UCD_OTHERCASE(c);
2075           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2076           }
2077         }
2078       else
2079 #endif  /* SUPPORT_UNICODE */
2080       /* Not UTF mode */
2081         {
2082         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2083           { ADD_NEW(state_offset + 2, 0); }
2084         }
2085       break;
2086 
2087 
2088 #ifdef SUPPORT_UNICODE
2089       /*-----------------------------------------------------------------*/
2090       /* This is a tricky one because it can match more than one character.
2091       Find out how many characters to skip, and then set up a negative state
2092       to wait for them to pass before continuing. */
2093 
2094       case OP_EXTUNI:
2095       if (clen > 0)
2096         {
2097         int lgb, rgb;
2098         PCRE2_SPTR nptr = ptr + clen;
2099         int ncount = 0;
2100         lgb = UCD_GRAPHBREAK(c);
2101         while (nptr < end_subject)
2102           {
2103           dlen = 1;
2104           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2105           rgb = UCD_GRAPHBREAK(d);
2106           if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2107           ncount++;
2108           lgb = rgb;
2109           nptr += dlen;
2110           }
2111         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2112             reset_could_continue = TRUE;
2113         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2114         }
2115       break;
2116 #endif
2117 
2118       /*-----------------------------------------------------------------*/
2119       /* This is tricky, like EXTUNI, because it too can match more than one
2120       character (when CR is followed by LF). In this case, set up a negative
2121       state to wait for one character to pass before continuing. */
2122 
2123       case OP_ANYNL:
2124       if (clen > 0) switch(c)
2125         {
2126         case CHAR_VT:
2127         case CHAR_FF:
2128         case CHAR_NEL:
2129 #ifndef EBCDIC
2130         case 0x2028:
2131         case 0x2029:
2132 #endif  /* Not EBCDIC */
2133         if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2134         /* Fall through */
2135         case CHAR_LF:
2136         ADD_NEW(state_offset + 1, 0);
2137         break;
2138
2139         case CHAR_CR:
2140         if (ptr + 1 >= end_subject)          /* Nothing follows the CR */
2141           {
2142           ADD_NEW(state_offset + 1, 0);
2143           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2144             reset_could_continue = TRUE;
2145           }
2146         else if (UCHAR21TEST(ptr + 1) == CHAR_LF)   /* CR is followed by LF */
2147           {
2148           ADD_NEW_DATA(-(state_offset + 1), 0, 1);  /* Negative state, data 1 */
2149           }
2150         else                                 /* CR followed by something else */
2151           {
2152           ADD_NEW(state_offset + 1, 0);
2153           }
2154         break;
2155         }
2156       break;
2157 
2158       /*-----------------------------------------------------------------*/
2159       case OP_NOT_VSPACE:
2160       if (clen > 0) switch(c)
2161         {
2162         VSPACE_CASES:
2163         break;
2164 
2165         default:
2166         ADD_NEW(state_offset + 1, 0);
2167         break;
2168         }
2169       break;
2170 
2171       /*-----------------------------------------------------------------*/
2172       case OP_VSPACE:
2173       if (clen > 0) switch(c)
2174         {
2175         VSPACE_CASES:
2176         ADD_NEW(state_offset + 1, 0);
2177         break;
2178 
2179         default:
2180         break;
2181         }
2182       break;
2183 
2184       /*-----------------------------------------------------------------*/
2185       case OP_NOT_HSPACE:
2186       if (clen > 0) switch(c)
2187         {
2188         HSPACE_CASES:
2189         break;
2190 
2191         default:
2192         ADD_NEW(state_offset + 1, 0);
2193         break;
2194         }
2195       break;
2196 
2197       /*-----------------------------------------------------------------*/
2198       case OP_HSPACE:
2199       if (clen > 0) switch(c)
2200         {
2201         HSPACE_CASES:
2202         ADD_NEW(state_offset + 1, 0);
2203         break;
2204 
2205         default:
2206         break;
2207         }
2208       break;
2209 
2210       /*-----------------------------------------------------------------*/
2211       /* Match a negated single character casefully. */
2212 
2213       case OP_NOT:
2214       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2215       break;
2216 
2217       /*-----------------------------------------------------------------*/
2218       /* Match a negated single character caselessly. */
2219 
2220       case OP_NOTI:
2221       if (clen > 0)
2222         {
2223         unsigned int otherd;
2224 #ifdef SUPPORT_UNICODE
2225         if (utf && d >= 128)
2226           otherd = UCD_OTHERCASE(d);
2227         else
2228 #endif  /* SUPPORT_UNICODE */
2229         otherd = TABLE_GET(d, fcc, d);
2230         if (c != d && c != otherd)
2231           { ADD_NEW(state_offset + dlen + 1, 0); }
2232         }
2233       break;
2234 
2235       /*-----------------------------------------------------------------*/
2236       case OP_PLUSI:
2237       case OP_MINPLUSI:
2238       case OP_POSPLUSI:
2239       case OP_NOTPLUSI:
2240       case OP_NOTMINPLUSI:
2241       case OP_NOTPOSPLUSI:
2242       caseless = TRUE;
2243       codevalue -= OP_STARI - OP_STAR;
2244 
2245       /* Fall through */
2246       case OP_PLUS:
2247       case OP_MINPLUS:
2248       case OP_POSPLUS:
2249       case OP_NOTPLUS:
2250       case OP_NOTMINPLUS:
2251       case OP_NOTPOSPLUS:
2252       count = current_state->count;  /* Already matched */
2253       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2254       if (clen > 0)
2255         {
2256         uint32_t otherd = NOTACHAR;
2257         if (caseless)
2258           {
2259 #ifdef SUPPORT_UNICODE
2260           if (utf && d >= 128)
2261             otherd = UCD_OTHERCASE(d);
2262           else
2263 #endif  /* SUPPORT_UNICODE */
2264           otherd = TABLE_GET(d, fcc, d);
2265           }
2266         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2267           {
2268           if (count > 0 &&
2269               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2270             {
2271             active_count--;             /* Remove non-match possibility */
2272             next_active_state--;
2273             }
2274           count++;
2275           ADD_NEW(state_offset, count);
2276           }
2277         }
2278       break;
2279 
2280       /*-----------------------------------------------------------------*/
2281       case OP_QUERYI:
2282       case OP_MINQUERYI:
2283       case OP_POSQUERYI:
2284       case OP_NOTQUERYI:
2285       case OP_NOTMINQUERYI:
2286       case OP_NOTPOSQUERYI:
2287       caseless = TRUE;
2288       codevalue -= OP_STARI - OP_STAR;
2289       /* Fall through */
2290       case OP_QUERY:
2291       case OP_MINQUERY:
2292       case OP_POSQUERY:
2293       case OP_NOTQUERY:
2294       case OP_NOTMINQUERY:
2295       case OP_NOTPOSQUERY:
2296       ADD_ACTIVE(state_offset + dlen + 1, 0);
2297       if (clen > 0)
2298         {
2299         uint32_t otherd = NOTACHAR;
2300         if (caseless)
2301           {
2302 #ifdef SUPPORT_UNICODE
2303           if (utf && d >= 128)
2304             otherd = UCD_OTHERCASE(d);
2305           else
2306 #endif  /* SUPPORT_UNICODE */
2307           otherd = TABLE_GET(d, fcc, d);
2308           }
2309         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2310           {
2311           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2312             {
2313             active_count--;            /* Remove non-match possibility */
2314             next_active_state--;
2315             }
2316           ADD_NEW(state_offset + dlen + 1, 0);
2317           }
2318         }
2319       break;
2320 
2321       /*-----------------------------------------------------------------*/
2322       case OP_STARI:
2323       case OP_MINSTARI:
2324       case OP_POSSTARI:
2325       case OP_NOTSTARI:
2326       case OP_NOTMINSTARI:
2327       case OP_NOTPOSSTARI:
2328       caseless = TRUE;
2329       codevalue -= OP_STARI - OP_STAR;
2330       /* Fall through */
2331       case OP_STAR:
2332       case OP_MINSTAR:
2333       case OP_POSSTAR:
2334       case OP_NOTSTAR:
2335       case OP_NOTMINSTAR:
2336       case OP_NOTPOSSTAR:
2337       ADD_ACTIVE(state_offset + dlen + 1, 0);
2338       if (clen > 0)
2339         {
2340         uint32_t otherd = NOTACHAR;
2341         if (caseless)
2342           {
2343 #ifdef SUPPORT_UNICODE
2344           if (utf && d >= 128)
2345             otherd = UCD_OTHERCASE(d);
2346           else
2347 #endif  /* SUPPORT_UNICODE */
2348           otherd = TABLE_GET(d, fcc, d);
2349           }
2350         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2351           {
2352           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2353             {
2354             active_count--;            /* Remove non-match possibility */
2355             next_active_state--;
2356             }
2357           ADD_NEW(state_offset, 0);
2358           }
2359         }
2360       break;
2361 
2362       /*-----------------------------------------------------------------*/
2363       case OP_EXACTI:
2364       case OP_NOTEXACTI:
2365       caseless = TRUE;
2366       codevalue -= OP_STARI - OP_STAR;
2367       /* Fall through */
2368       case OP_EXACT:
2369       case OP_NOTEXACT:
2370       count = current_state->count;  /* Number already matched */
2371       if (clen > 0)
2372         {
2373         uint32_t otherd = NOTACHAR;
2374         if (caseless)
2375           {
2376 #ifdef SUPPORT_UNICODE
2377           if (utf && d >= 128)
2378             otherd = UCD_OTHERCASE(d);
2379           else
2380 #endif  /* SUPPORT_UNICODE */
2381           otherd = TABLE_GET(d, fcc, d);
2382           }
2383         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2384           {
2385           if (++count >= (int)GET2(code, 1))
2386             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2387           else
2388             { ADD_NEW(state_offset, count); }
2389           }
2390         }
2391       break;
2392 
2393       /*-----------------------------------------------------------------*/
2394       case OP_UPTOI:
2395       case OP_MINUPTOI:
2396       case OP_POSUPTOI:
2397       case OP_NOTUPTOI:
2398       case OP_NOTMINUPTOI:
2399       case OP_NOTPOSUPTOI:
2400       caseless = TRUE;
2401       codevalue -= OP_STARI - OP_STAR;
2402       /* Fall through */
2403       case OP_UPTO:
2404       case OP_MINUPTO:
2405       case OP_POSUPTO:
2406       case OP_NOTUPTO:
2407       case OP_NOTMINUPTO:
2408       case OP_NOTPOSUPTO:
2409       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2410       count = current_state->count;  /* Number already matched */
2411       if (clen > 0)
2412         {
2413         uint32_t otherd = NOTACHAR;
2414         if (caseless)
2415           {
2416 #ifdef SUPPORT_UNICODE
2417           if (utf && d >= 128)
2418             otherd = UCD_OTHERCASE(d);
2419           else
2420 #endif  /* SUPPORT_UNICODE */
2421           otherd = TABLE_GET(d, fcc, d);
2422           }
2423         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2424           {
2425           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2426             {
2427             active_count--;             /* Remove non-match possibility */
2428             next_active_state--;
2429             }
2430           if (++count >= (int)GET2(code, 1))
2431             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2432           else
2433             { ADD_NEW(state_offset, count); }
2434           }
2435         }
2436       break;
2437 
2438 
2439 /* ========================================================================== */
2440       /* These are the class-handling opcodes */
2441 
2442       case OP_CLASS:
2443       case OP_NCLASS:
2444       case OP_XCLASS:
2445         {
2446         BOOL isinclass = FALSE;
2447         int next_state_offset;
2448         PCRE2_SPTR ecode;
2449 
2450         /* For a simple class, there is always just a 32-byte table, and we
2451         can set isinclass from it. */
2452 
2453         if (codevalue != OP_XCLASS)
2454           {
2455           ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2456           if (clen > 0)
2457             {
2458             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2459               ((((uint8_t *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2460             }
2461           }
2462 
2463         /* An extended class may have a table or a list of single characters,
2464         ranges, or both, and it may be positive or negative. There's a
2465         function that sorts all this out. */
2466 
2467         else
2468          {
2469          ecode = code + GET(code, 1);
2470          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2471          }
2472 
2473         /* At this point, isinclass is set for all kinds of class, and ecode
2474         points to the byte after the end of the class. If there is a
2475         quantifier, this is where it will be. */
2476 
2477         next_state_offset = (int)(ecode - start_code);
2478 
2479         switch (*ecode)
2480           {
2481           case OP_CRSTAR:
2482           case OP_CRMINSTAR:
2483           case OP_CRPOSSTAR:
2484           ADD_ACTIVE(next_state_offset + 1, 0);
2485           if (isinclass)
2486             {
2487             if (*ecode == OP_CRPOSSTAR)
2488               {
2489               active_count--;           /* Remove non-match possibility */
2490               next_active_state--;
2491               }
2492             ADD_NEW(state_offset, 0);
2493             }
2494           break;
2495 
2496           case OP_CRPLUS:
2497           case OP_CRMINPLUS:
2498           case OP_CRPOSPLUS:
2499           count = current_state->count;  /* Already matched */
2500           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2501           if (isinclass)
2502             {
2503             if (count > 0 && *ecode == OP_CRPOSPLUS)
2504               {
2505               active_count--;           /* Remove non-match possibility */
2506               next_active_state--;
2507               }
2508             count++;
2509             ADD_NEW(state_offset, count);
2510             }
2511           break;
2512 
2513           case OP_CRQUERY:
2514           case OP_CRMINQUERY:
2515           case OP_CRPOSQUERY:
2516           ADD_ACTIVE(next_state_offset + 1, 0);
2517           if (isinclass)
2518             {
2519             if (*ecode == OP_CRPOSQUERY)
2520               {
2521               active_count--;           /* Remove non-match possibility */
2522               next_active_state--;
2523               }
2524             ADD_NEW(next_state_offset + 1, 0);
2525             }
2526           break;
2527 
2528           case OP_CRRANGE:
2529           case OP_CRMINRANGE:
2530           case OP_CRPOSRANGE:
2531           count = current_state->count;  /* Already matched */
2532           if (count >= (int)GET2(ecode, 1))
2533             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2534           if (isinclass)
2535             {
2536             int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2537             if (*ecode == OP_CRPOSRANGE)
2538               {
2539               active_count--;           /* Remove non-match possibility */
2540               next_active_state--;
2541               }
2542             if (++count >= max && max != 0)   /* Max 0 => no limit */
2543               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2544             else
2545               { ADD_NEW(state_offset, count); }
2546             }
2547           break;
2548 
2549           default:
2550           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2551           break;
2552           }
2553         }
2554       break;
2555 
2556 /* ========================================================================== */
2557       /* These are the opcodes for fancy brackets of various kinds. We have
2558       to use recursion in order to handle them. The "always failing" assertion
2559       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2560       though the other "backtracking verbs" are not supported. */
2561 
2562       case OP_FAIL:
2563       forced_fail++;    /* Count FAILs for multiple states */
2564       break;
2565 
2566       case OP_ASSERT:
2567       case OP_ASSERT_NOT:
2568       case OP_ASSERTBACK:
2569       case OP_ASSERTBACK_NOT:
2570         {
2571         PCRE2_SPTR endasscode = code + GET(code, 1);
2572         PCRE2_SIZE local_offsets[2];
2573         int rc;
2574         int local_workspace[1000];
2575 
2576         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2577 
2578         rc = internal_dfa_match(
2579           mb,                                   /* static match data */
2580           code,                                 /* this subexpression's code */
2581           ptr,                                  /* where we currently are */
2582           (int)(ptr - start_subject),           /* start offset */
2583           local_offsets,                        /* offset vector */
2584           sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2585           local_workspace,                      /* workspace vector */
2586           sizeof(local_workspace)/sizeof(int),  /* size of same */
2587           rlevel);                              /* function recursion level */
2588 
2589         if (rc == PCRE2_ERROR_DFA_UITEM) return rc;
2590         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2591             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2592         }
2593       break;
2594 
2595       /*-----------------------------------------------------------------*/
2596       case OP_COND:
2597       case OP_SCOND:
2598         {
2599         PCRE2_SIZE local_offsets[1000];
2600         int local_workspace[1000];
2601         int codelink = GET(code, 1);
2602         int condcode;
2603 
2604         /* Because of the way auto-callout works during compile, a callout item
2605         is inserted between OP_COND and an assertion condition. This does not
2606         happen for the other conditions. */
2607 
2608         if (code[LINK_SIZE+1] == OP_CALLOUT)
2609           {
2610           rrc = 0;
2611           if (mb->callout != NULL)
2612             {
2613             pcre2_callout_block cb;
2614             cb.version          = 0;
2615             cb.callout_number   = code[LINK_SIZE+2];
2616             cb.capture_top      = 1;
2617             cb.capture_last     = 0;
2618             cb.offset_vector    = offsets;
2619             cb.mark             = NULL;   /* No (*MARK) support */
2620             cb.subject          = start_subject;
2621             cb.subject_length   = (PCRE2_SIZE)(end_subject - start_subject);
2622             cb.start_match      = (PCRE2_SIZE)(current_subject - start_subject);
2623             cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
2624             cb.pattern_position = GET(code, LINK_SIZE + 3);
2625             cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2626             if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
2627               return rrc;   /* Abandon */
2628             }
2629           if (rrc > 0) break;                      /* Fail this thread */
2630           code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2631           }
2632 
2633         condcode = code[LINK_SIZE+1];
2634 
2635         /* Back reference conditions and duplicate named recursion conditions
2636         are not supported */
2637 
2638         if (condcode == OP_CREF || condcode == OP_DNCREF ||
2639             condcode == OP_DNRREF)
2640           return PCRE2_ERROR_DFA_UCOND;
2641 
2642         /* The DEFINE condition is always false */
2643 
2644         if (condcode == OP_FALSE)
2645           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2646 
2647         /* There is also an always-true condition */
2648 
2649         if (condcode == OP_TRUE)
2650           { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2651 
2652         /* The only supported version of OP_RREF is for the value RREF_ANY,
2653         which means "test if in any recursion". We can't test for specifically
2654         recursed groups. */
2655 
2656         else if (condcode == OP_RREF)
2657           {
2658           int value = GET2(code, LINK_SIZE + 2);
2659           if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2660           if (mb->recursive != NULL)
2661             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2662           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2663           }
2664 
2665         /* Otherwise, the condition is an assertion */
2666 
2667         else
2668           {
2669           int rc;
2670           PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2671           PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2672 
2673           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2674 
2675           rc = internal_dfa_match(
2676             mb,                                   /* fixed match data */
2677             asscode,                              /* this subexpression's code */
2678             ptr,                                  /* where we currently are */
2679             (int)(ptr - start_subject),           /* start offset */
2680             local_offsets,                        /* offset vector */
2681             sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2682             local_workspace,                      /* workspace vector */
2683             sizeof(local_workspace)/sizeof(int),  /* size of same */
2684             rlevel);                              /* function recursion level */
2685 
2686           if (rc == PCRE2_ERROR_DFA_UITEM) return rc;
2687           if ((rc >= 0) ==
2688                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2689             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2690           else
2691             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2692           }
2693         }
2694       break;
2695 
2696       /*-----------------------------------------------------------------*/
2697       case OP_RECURSE:
2698         {
2699         dfa_recursion_info *ri;
2700         PCRE2_SIZE local_offsets[1000];
2701         int local_workspace[1000];
2702         PCRE2_SPTR callpat = start_code + GET(code, 1);
2703         uint32_t recno = (callpat == mb->start_code)? 0 :
2704           GET2(callpat, 1 + LINK_SIZE);
2705         int rc;
2706 
2707         /* Check for repeating a recursion without advancing the subject
2708         pointer. This should catch convoluted mutual recursions. (Some simple
2709         cases are caught at compile time.) */
2710 
2711         for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2712           if (recno == ri->group_num && ptr == ri->subject_position)
2713             return PCRE2_ERROR_RECURSELOOP;
2714 
2715         /* Remember this recursion and where we started it so as to
2716         catch infinite loops. */
2717 
2718         new_recursive.group_num = recno;
2719         new_recursive.subject_position = ptr;
2720         new_recursive.prevrec = mb->recursive;
2721         mb->recursive = &new_recursive;
2722 
2723         rc = internal_dfa_match(
2724           mb,                                   /* fixed match data */
2725           callpat,                              /* this subexpression's code */
2726           ptr,                                  /* where we currently are */
2727           (int)(ptr - start_subject),           /* start offset */
2728           local_offsets,                        /* offset vector */
2729           sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2730           local_workspace,                      /* workspace vector */
2731           sizeof(local_workspace)/sizeof(int),  /* size of same */
2732           rlevel);                              /* function recursion level */
2733 
2734         mb->recursive = new_recursive.prevrec;  /* Done this recursion */
2735 
2736         /* Ran out of internal offsets */
2737 
2738         if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2739 
2740         /* For each successful matched substring, set up the next state with a
2741         count of characters to skip before trying it. Note that the count is in
2742         characters, not bytes. */
2743 
2744         if (rc > 0)
2745           {
2746           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2747             {
2748             int charcount = local_offsets[rc+1] - local_offsets[rc];
2749 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2750             if (utf)
2751               {
2752               PCRE2_SPTR p = start_subject + local_offsets[rc];
2753               PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2754               while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2755               }
2756 #endif
2757             if (charcount > 0)
2758               {
2759               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2760               }
2761             else
2762               {
2763               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2764               }
2765             }
2766           }
2767         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2768         }
2769       break;
2770 
2771       /*-----------------------------------------------------------------*/
2772       case OP_BRAPOS:
2773       case OP_SBRAPOS:
2774       case OP_CBRAPOS:
2775       case OP_SCBRAPOS:
2776       case OP_BRAPOSZERO:
2777         {
2778         int charcount, matched_count;
2779         PCRE2_SPTR local_ptr = ptr;
2780         BOOL allow_zero;
2781 
2782         if (codevalue == OP_BRAPOSZERO)
2783           {
2784           allow_zero = TRUE;
2785           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2786           }
2787         else allow_zero = FALSE;
2788 
2789         /* Loop to match the subpattern as many times as possible as if it were
2790         a complete pattern. */
2791 
2792         for (matched_count = 0;; matched_count++)
2793           {
2794           PCRE2_SIZE local_offsets[2];
2795           int local_workspace[1000];
2796 
2797           int rc = internal_dfa_match(
2798             mb,                                   /* fixed match data */
2799             code,                                 /* this subexpression's code */
2800             local_ptr,                            /* where we currently are */
2801             (int)(ptr - start_subject),           /* start offset */
2802             local_offsets,                        /* offset vector */
2803             sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2804             local_workspace,                      /* workspace vector */
2805             sizeof(local_workspace)/sizeof(int),  /* size of same */
2806             rlevel);                              /* function recursion level */
2807 
2808           /* Failed to match */
2809 
2810           if (rc < 0)
2811             {
2812             if (rc != PCRE2_ERROR_NOMATCH) return rc;
2813             break;
2814             }
2815 
2816           /* Matched: break the loop if zero characters matched. */
2817 
2818           charcount = local_offsets[1] - local_offsets[0];
2819           if (charcount == 0) break;
2820           local_ptr += charcount;    /* Advance temporary position ptr */
2821           }
2822 
2823         /* At this point we have matched the subpattern matched_count
2824         times, and local_ptr is pointing to the character after the end of the
2825         last match. */
2826 
2827         if (matched_count > 0 || allow_zero)
2828           {
2829           PCRE2_SPTR end_subpattern = code;
2830           int next_state_offset;
2831 
2832           do { end_subpattern += GET(end_subpattern, 1); }
2833             while (*end_subpattern == OP_ALT);
2834           next_state_offset =
2835             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2836 
2837           /* Optimization: if there are no more active states, and there
2838           are no new states yet set up, then skip over the subject string
2839           right here, to save looping. Otherwise, set up the new state to swing
2840           into action when the end of the matched substring is reached. */
2841 
2842           if (i + 1 >= active_count && new_count == 0)
2843             {
2844             ptr = local_ptr;
2845             clen = 0;
2846             ADD_NEW(next_state_offset, 0);
2847             }
2848           else
2849             {
2850             PCRE2_SPTR p = ptr;
2851             PCRE2_SPTR pp = local_ptr;
2852             charcount = (int)(pp - p);
2853 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2854             if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2855 #endif
2856             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2857             }
2858           }
2859         }
2860       break;
2861 
2862       /*-----------------------------------------------------------------*/
2863       case OP_ONCE:
2864       case OP_ONCE_NC:
2865         {
2866         PCRE2_SIZE local_offsets[2];
2867         int local_workspace[1000];
2868 
2869         int rc = internal_dfa_match(
2870           mb,                                   /* fixed match data */
2871           code,                                 /* this subexpression's code */
2872           ptr,                                  /* where we currently are */
2873           (int)(ptr - start_subject),           /* start offset */
2874           local_offsets,                        /* offset vector */
2875           sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2876           local_workspace,                      /* workspace vector */
2877           sizeof(local_workspace)/sizeof(int),  /* size of same */
2878           rlevel);                              /* function recursion level */
2879 
2880         if (rc >= 0)
2881           {
2882           PCRE2_SPTR end_subpattern = code;
2883           int charcount = local_offsets[1] - local_offsets[0];
2884           int next_state_offset, repeat_state_offset;
2885 
2886           do { end_subpattern += GET(end_subpattern, 1); }
2887             while (*end_subpattern == OP_ALT);
2888           next_state_offset =
2889             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2890 
2891           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2892           arrange for the repeat state also to be added to the relevant list.
2893           Calculate the offset, or set -1 for no repeat. */
2894 
2895           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2896                                  *end_subpattern == OP_KETRMIN)?
2897             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2898 
2899           /* If we have matched an empty string, add the next state at the
2900           current character pointer. This is important so that the duplicate
2901           checking kicks in, which is what breaks infinite loops that match an
2902           empty string. */
2903 
2904           if (charcount == 0)
2905             {
2906             ADD_ACTIVE(next_state_offset, 0);
2907             }
2908 
2909           /* Optimization: if there are no more active states, and there
2910           are no new states yet set up, then skip over the subject string
2911           right here, to save looping. Otherwise, set up the new state to swing
2912           into action when the end of the matched substring is reached. */
2913 
2914           else if (i + 1 >= active_count && new_count == 0)
2915             {
2916             ptr += charcount;
2917             clen = 0;
2918             ADD_NEW(next_state_offset, 0);
2919 
2920             /* If we are adding a repeat state at the new character position,
2921             we must fudge things so that it is the only current state.
2922             Otherwise, it might be a duplicate of one we processed before, and
2923             that would cause it to be skipped. */
2924 
2925             if (repeat_state_offset >= 0)
2926               {
2927               next_active_state = active_states;
2928               active_count = 0;
2929               i = -1;
2930               ADD_ACTIVE(repeat_state_offset, 0);
2931               }
2932             }
2933           else
2934             {
2935 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2936             if (utf)
2937               {
2938               PCRE2_SPTR p = start_subject + local_offsets[0];
2939               PCRE2_SPTR pp = start_subject + local_offsets[1];
2940               while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2941               }
2942 #endif
2943             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2944             if (repeat_state_offset >= 0)
2945               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2946             }
2947           }
2948         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2949         }
2950       break;
2951 
2952 
2953 /* ========================================================================== */
2954       /* Handle callouts */
2955 
2956       case OP_CALLOUT:
2957       rrc = 0;
2958       if (mb->callout != NULL)
2959         {
2960         pcre2_callout_block cb;
2961         cb.version          = 0;
2962         cb.callout_number   = code[1];
2963         cb.capture_top      = 1;
2964         cb.capture_last     = 0;
2965         cb.offset_vector    = offsets;
2966         cb.mark             = NULL;   /* No (*MARK) support */
2967         cb.subject          = start_subject;
2968         cb.subject_length   = (PCRE2_SIZE)(end_subject - start_subject);
2969         cb.start_match      = (PCRE2_SIZE)(current_subject - start_subject);
2970         cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
2971         cb.pattern_position = GET(code, 2);
2972         cb.next_item_length = GET(code, 2 + LINK_SIZE);
2973         if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
2974           return rrc;   /* Abandon */
2975         }
2976       if (rrc == 0)
2977         { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2978       break;
2979 
2980 
2981 /* ========================================================================== */
2982       default:        /* Unsupported opcode */
2983       return PCRE2_ERROR_DFA_UITEM;
2984       }
2985 
2986     NEXT_ACTIVE_STATE: continue;
2987 
2988     }      /* End of loop scanning active states */
2989 
2990   /* We have finished the processing at the current subject character. If no
2991   new states have been set for the next character, we have found all the
2992   matches that we are going to find. If we are at the top level and partial
2993   matching has been requested, check for appropriate conditions.
2994 
2995   The "forced_fail" variable counts the number of (*F) encountered for the
2996   character. If it is equal to the original active_count (saved in
2997   workspace[1]) it means that (*F) was found on every active state. In this
2998   case we don't want to give a partial match.
2999 
3000   The "could_continue" variable is true if a state could have continued but
3001   for the fact that the end of the subject was reached. */
3002 
3003   if (new_count <= 0)
3004     {
3005     if (rlevel == 1 &&                               /* Top level, and */
3006         could_continue &&                            /* Some could go on, and */
3007         forced_fail != workspace[1] &&               /* Not all forced fail & */
3008         (                                            /* either... */
3009         (mb->moptions & PCRE2_PARTIAL_HARD) != 0      /* Hard partial */
3010         ||                                           /* or... */
3011         ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3012          match_count < 0)                            /* no matches */
3013         ) &&                                         /* And... */
3014         (
3015         partial_newline ||                           /* Either partial NL */
3016           (                                          /* or ... */
3017           ptr >= end_subject &&                /* End of subject and */
3018           ptr > mb->start_used_ptr)            /* Inspected non-empty string */
3019           )
3020         )
3021       match_count = PCRE2_ERROR_PARTIAL;
3022     break;        /* In effect, "return", but see the comment below */
3023     }
3024 
3025   /* One or more states are active for the next character. */
3026 
3027   ptr += clen;    /* Advance to next subject character */
3028   }               /* Loop to move along the subject string */
3029 
3030 /* Control gets here from "break" a few lines above. We do it this way because
3031 if we use "return" above, we have compiler trouble. Some compilers warn if
3032 there's nothing here because they think the function doesn't return a value. On
3033 the other hand, if we put a dummy statement here, some more clever compilers
3034 complain that it can't be reached. Sigh. */
3035 
3036 return match_count;
3037 }
3038 
3039 
3040 
3041 /*************************************************
3042 *     Match a pattern using the DFA algorithm    *
3043 *************************************************/
3044 
3045 /* This function matches a compiled pattern to a subject string, using the
3046 alternate matching algorithm that finds all matches at once.
3047 
3048 Arguments:
3049   code          points to the compiled pattern
3050   subject       subject string
3051   length        length of subject string
3052   start_offset  where to start matching in the subject
3053   options       option bits
3054   match_data    points to a match data structure
3055   mcontext      points to a match context (NULL => use the default context)
3056   workspace     pointer to workspace
3057   wscount       size of workspace
3058 
3059 Returns:        > 0 => number of match offset pairs placed in offsets
3060                 = 0 => offsets overflowed; longest matches are present
3061                  -1 => failed to match
3062                < -1 => some kind of unexpected problem
3063 */
3064 
3065 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,size_t wscount)3066 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3067   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3068   pcre2_match_context *mcontext, int *workspace, size_t wscount)
3069 {
3070 const pcre2_real_code *re = (const pcre2_real_code *)code;
3071 
3072 PCRE2_SPTR start_match;
3073 PCRE2_SPTR end_subject;
3074 PCRE2_SPTR req_cu_ptr;
3075 
3076 BOOL utf, anchored, startline, firstline;
3077 
3078 BOOL has_first_cu = FALSE;
3079 BOOL has_req_cu = FALSE;
3080 PCRE2_UCHAR first_cu = 0;
3081 PCRE2_UCHAR first_cu2 = 0;
3082 PCRE2_UCHAR req_cu = 0;
3083 PCRE2_UCHAR req_cu2 = 0;
3084 
3085 const uint8_t *start_bits = NULL;
3086 
3087 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3088 is used below, and it expects NLBLOCK to be defined as a pointer. */
3089 
3090 dfa_match_block actual_match_block;
3091 dfa_match_block *mb = &actual_match_block;
3092 
3093 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3094 subject string. */
3095 
3096 if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
3097 
3098 /* Plausibility checks */
3099 
3100 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3101 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3102   return PCRE2_ERROR_NULL;
3103 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3104 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3105 
3106 /* Check that the first field in the block is the magic number. If it is not,
3107 return with PCRE2_ERROR_BADMAGIC. */
3108 
3109 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3110 
3111 /* Check the code unit width. */
3112 
3113 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3114   return PCRE2_ERROR_BADMODE;
3115 
3116 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3117 options variable for this function. Users of PCRE2 who are not calling the
3118 function directly would like to have a way of setting these flags, in the same
3119 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3120 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3121 (*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which can now be
3122 transferred to the options for this function. The bits are guaranteed to be
3123 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3124 that the match-time bits are not more significant than the flag bits. If by
3125 accident this is not the case, a compile-time division by zero error will
3126 occur. */
3127 
3128 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3129 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3130 options |= (re->flags & FF) / ((FF & -FF) / (OO & -OO));
3131 #undef FF
3132 #undef OO
3133 
3134 /* A NULL match context means "use a default context" */
3135 
3136 if (mcontext == NULL)
3137   mcontext = (pcre2_match_context *)(&PRIV(default_match_context));
3138 
3139 /* If restarting after a partial match, do some sanity checks on the contents
3140 of the workspace. */
3141 
3142 if ((options & PCRE2_DFA_RESTART) != 0)
3143   {
3144   if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3145     workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3146       return PCRE2_ERROR_DFA_BADRESTART;
3147   }
3148 
3149 /* Set some local values */
3150 
3151 utf = (re->overall_options & PCRE2_UTF) != 0;
3152 start_match = subject + start_offset;
3153 end_subject = subject + length;
3154 req_cu_ptr = start_match - 1;
3155 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3156   (re->overall_options & PCRE2_ANCHORED) != 0;
3157 
3158 /* The "must be at the start of a line" flags are used in a loop when finding
3159 where to start. */
3160 
3161 startline = (re->flags & PCRE2_STARTLINE) != 0;
3162 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3163 
3164 /* Fill in the fields in the match block. */
3165 
3166 if (mcontext == NULL)
3167   {
3168   mb->callout = NULL;
3169   mb->memctl = re->memctl;
3170   }
3171 else
3172   {
3173   mb->callout = mcontext->callout;
3174   mb->callout_data = mcontext->callout_data;
3175   mb->memctl = mcontext->memctl;
3176   }
3177 
3178 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3179   re->name_count * re->name_entry_size;
3180 mb->tables = re->tables;
3181 mb->start_subject = subject;
3182 mb->end_subject = end_subject;
3183 mb->start_offset = start_offset;
3184 mb->moptions = options;
3185 mb->poptions = re->overall_options;
3186 
3187 /* Process the \R and newline settings. */
3188 
3189 mb->bsr_convention = re->bsr_convention;
3190 mb->nltype = NLTYPE_FIXED;
3191 switch(re->newline_convention)
3192   {
3193   case PCRE2_NEWLINE_CR:
3194   mb->nllen = 1;
3195   mb->nl[0] = CHAR_CR;
3196   break;
3197 
3198   case PCRE2_NEWLINE_LF:
3199   mb->nllen = 1;
3200   mb->nl[0] = CHAR_NL;
3201   break;
3202 
3203   case PCRE2_NEWLINE_CRLF:
3204   mb->nllen = 2;
3205   mb->nl[0] = CHAR_CR;
3206   mb->nl[1] = CHAR_NL;
3207   break;
3208 
3209   case PCRE2_NEWLINE_ANY:
3210   mb->nltype = NLTYPE_ANY;
3211   break;
3212 
3213   case PCRE2_NEWLINE_ANYCRLF:
3214   mb->nltype = NLTYPE_ANYCRLF;
3215   break;
3216 
3217   default: return PCRE2_ERROR_INTERNAL;
3218   }
3219 
3220 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3221 we must also check that a starting offset does not point into the middle of a
3222 multiunit character. */
3223 
3224 #ifdef SUPPORT_UNICODE
3225 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3226   {
3227   match_data->rc = PRIV(valid_utf)(subject, length, &(match_data->startchar));
3228   if (match_data->rc != 0) return match_data->rc;
3229 #if PCRE2_CODE_UNIT_WIDTH != 32
3230   if (start_offset > 0 && start_offset < length &&
3231       NOT_FIRSTCHAR(subject[start_offset]))
3232     return PCRE2_ERROR_BADUTFOFFSET;
3233 #endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
3234   }
3235 #endif  /* SUPPORT_UNICODE */
3236 
3237 /* Set up the first code unit to match, if available. The first_codeunit value
3238 is never set for an anchored regular expression, but the anchoring may be
3239 forced at run time, so we have to test for anchoring. The first code unit may
3240 be unset for an unanchored pattern, of course. If there's no first code unit
3241 there may be a bitmap of possible first characters. */
3242 
3243 if (!anchored)
3244   {
3245   if ((re->flags & PCRE2_FIRSTSET) != 0)
3246     {
3247     has_first_cu = TRUE;
3248     first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3249     if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3250       {
3251       first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3252 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3253       if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
3254 #endif
3255       }
3256     }
3257   else
3258     if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3259       start_bits = re->start_bitmap;
3260   }
3261 
3262 /* For anchored or unanchored matches, there may be a "last known required
3263 character" set. */
3264 
3265 if ((re->flags & PCRE2_LASTSET) != 0)
3266   {
3267   has_req_cu = TRUE;
3268   req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3269   if ((re->flags & PCRE2_LASTCASELESS) != 0)
3270     {
3271     req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3272 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3273     if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
3274 #endif
3275     }
3276   }
3277 
3278 /* Fill in fields that are always returned in the match data. */
3279 
3280 match_data->code = re;
3281 match_data->subject = subject;
3282 match_data->mark = NULL;
3283 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3284 
3285 /* Call the main matching function, looping for a non-anchored regex after a
3286 failed match. If not restarting, perform certain optimizations at the start of
3287 a match. */
3288 
3289 for (;;)
3290   {
3291   int rc;
3292 
3293   /* ----------------- Start of match optimizations ---------------- */
3294 
3295   /* There are some optimizations that avoid running the match if a known
3296   starting point is not found, or if a known later code unit is not present.
3297   However, there is an option (settable at compile time) that disables
3298   these, for testing and for ensuring that all callouts do actually occur.
3299   The optimizations must also be avoided when restarting a DFA match. */
3300 
3301   if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3302       (options & PCRE2_DFA_RESTART) == 0)
3303     {
3304     PCRE2_SPTR save_end_subject = end_subject;
3305 
3306     /* If firstline is TRUE, the start of the match is constrained to the first
3307     line of a multiline string. That is, the match must be before or at the
3308     first newline. Implement this by temporarily adjusting end_subject so that
3309     we stop the optimization scans at a newline. If the match fails at the
3310     newline, later code breaks this loop. */
3311 
3312     if (firstline)
3313       {
3314       PCRE2_SPTR t = start_match;
3315 #ifdef SUPPORT_UNICODE
3316       if (utf)
3317         {
3318         while (t < mb->end_subject && !IS_NEWLINE(t))
3319           {
3320           t++;
3321           ACROSSCHAR(t < end_subject, *t, t++);
3322           }
3323         }
3324       else
3325 #endif
3326       while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
3327       end_subject = t;
3328       }
3329 
3330     /* Advance to a unique first code unit if there is one. */
3331 
3332     if (has_first_cu)
3333       {
3334       PCRE2_UCHAR smc;
3335       if (first_cu != first_cu2)
3336         while (start_match < end_subject &&
3337           (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
3338           start_match++;
3339       else
3340         while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
3341           start_match++;
3342       }
3343 
3344     /* Or to just after a linebreak for a multiline match */
3345 
3346     else if (startline)
3347       {
3348       if (start_match > mb->start_subject + start_offset)
3349         {
3350 #ifdef SUPPORT_UNICODE
3351         if (utf)
3352           {
3353           while (start_match < end_subject && !WAS_NEWLINE(start_match))
3354             {
3355             start_match++;
3356             ACROSSCHAR(start_match < end_subject, *start_match,
3357               start_match++);
3358             }
3359           }
3360         else
3361 #endif
3362         while (start_match < end_subject && !WAS_NEWLINE(start_match))
3363           start_match++;
3364 
3365         /* If we have just passed a CR and the newline option is ANY or
3366         ANYCRLF, and we are now at a LF, advance the match position by one more
3367         code unit. */
3368 
3369         if (start_match[-1] == CHAR_CR &&
3370              (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3371              start_match < end_subject &&
3372              UCHAR21TEST(start_match) == CHAR_NL)
3373           start_match++;
3374         }
3375       }
3376 
3377     /* Or to a non-unique first code unit if any have been identified. The
3378     bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all
3379     code units greater than 254 set the 255 bit. */
3380 
3381     else if (start_bits != NULL)
3382       {
3383       while (start_match < end_subject)
3384         {
3385         register uint32_t c = UCHAR21TEST(start_match);
3386 #if PCRE2_CODE_UNIT_WIDTH != 8
3387         if (c > 255) c = 255;
3388 #endif
3389         if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
3390         start_match++;
3391         }
3392       }
3393 
3394     /* Restore fudged end_subject */
3395 
3396     end_subject = save_end_subject;
3397 
3398     /* The following two optimizations are disabled for partial matching. */
3399 
3400     if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3401       {
3402       /* The minimum matching length is a lower bound; there may be no string
3403       of that length that actually matches the pattern. Although the value is,
3404       strictly, in characters, we treat it as code units to avoid spending too
3405       much time in this optimization. */
3406 
3407       if (end_subject - start_match < re->minlength) return PCRE2_ERROR_NOMATCH;
3408 
3409       /* If req_cu is set, we know that that code unit must appear in the
3410       subject for the match to succeed. If the first code unit is set, req_cu
3411       must be later in the subject; otherwise the test starts at the match
3412       point. This optimization can save a huge amount of backtracking in
3413       patterns with nested unlimited repeats that aren't going to match.
3414       Writing separate code for cased/caseless versions makes it go faster, as
3415       does using an autoincrement and backing off on a match.
3416 
3417       HOWEVER: when the subject string is very, very long, searching to its end
3418       can take a long time, and give bad performance on quite ordinary
3419       patterns. This showed up when somebody was matching something like
3420       /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3421       sufficiently long. */
3422 
3423       if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
3424         {
3425         register PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
3426 
3427         /* We don't need to repeat the search if we haven't yet reached the
3428         place we found it at last time. */
3429 
3430         if (p > req_cu_ptr)
3431           {
3432           if (req_cu != req_cu2)
3433             {
3434             while (p < end_subject)
3435               {
3436               register uint32_t pp = UCHAR21INCTEST(p);
3437               if (pp == req_cu || pp == req_cu2) { p--; break; }
3438               }
3439             }
3440           else
3441             {
3442             while (p < end_subject)
3443               {
3444               if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3445               }
3446             }
3447 
3448           /* If we can't find the required code unit, break the matching loop,
3449           forcing a match failure. */
3450 
3451           if (p >= end_subject) break;
3452 
3453           /* If we have found the required code unit, save the point where we
3454           found it, so that we don't search again next time round the loop if
3455           the start hasn't passed this code unit yet. */
3456 
3457           req_cu_ptr = p;
3458           }
3459         }
3460       }
3461     }
3462 
3463   /* ------------ End of start of match optimizations ------------ */
3464 
3465   /* OK, now we can do the business */
3466 
3467   mb->start_used_ptr = start_match;
3468   mb->last_used_ptr = start_match;
3469   mb->recursive = NULL;
3470 
3471   rc = internal_dfa_match(
3472     mb,                           /* fixed match data */
3473     mb->start_code,               /* this subexpression's code */
3474     start_match,                  /* where we currently are */
3475     start_offset,                 /* start offset in subject */
3476     match_data->ovector,          /* offset vector */
3477     match_data->oveccount * 2,    /* actual size of same */
3478     workspace,                    /* workspace vector */
3479     wscount,                      /* size of same */
3480     0);                           /* function recurse level */
3481 
3482   /* Anything other than "no match" means we are done, always; otherwise, carry
3483   on only if not anchored. */
3484 
3485   if (rc != PCRE2_ERROR_NOMATCH || anchored)
3486     {
3487     if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3488       {
3489       match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3490       match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3491       }
3492     match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3493     match_data->rightchar = mb->last_used_ptr - subject;
3494     match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3495     match_data->rc = rc;
3496     return rc;
3497     }
3498 
3499   /* Advance to the next subject character unless we are at the end of a line
3500   and firstline is set. */
3501 
3502   if (firstline && IS_NEWLINE(start_match)) break;
3503   start_match++;
3504 #ifdef SUPPORT_UNICODE
3505   if (utf)
3506     {
3507     ACROSSCHAR(start_match < end_subject, *start_match,
3508       start_match++);
3509     }
3510 #endif
3511   if (start_match > end_subject) break;
3512 
3513   /* If we have just passed a CR and we are now at a LF, and the pattern does
3514   not contain any explicit matches for \r or \n, and the newline option is CRLF
3515   or ANY or ANYCRLF, advance the match position by one more character. */
3516 
3517   if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3518       start_match < end_subject &&
3519       UCHAR21TEST(start_match) == CHAR_NL &&
3520       (re->flags & PCRE2_HASCRORLF) == 0 &&
3521         (mb->nltype == NLTYPE_ANY ||
3522          mb->nltype == NLTYPE_ANYCRLF ||
3523          mb->nllen == 2))
3524     start_match++;
3525 
3526   }   /* "Bumpalong" loop */
3527 
3528 
3529 return PCRE2_ERROR_NOMATCH;
3530 }
3531 
3532 /* End of pcre2_dfa_match.c */
3533