1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2019 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46 
47 
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52 
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61 
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70 
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73 
74 
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78 
79 #define NLBLOCK mb             /* Block containing newline information */
80 #define PSSTART start_subject  /* Field containing processed string start */
81 #define PSEND   end_subject    /* Field containing processed string end */
82 
83 #include "pcre2_internal.h"
84 
/* Mask of the match-time option bits gathered under the "public" name for this
module (presumably the set accepted by pcre2_dfa_match() - confirm at the point
of use, which is outside this part of the file). One flag per line. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  (PCRE2_ANCHORED | \
   PCRE2_ENDANCHORED | \
   PCRE2_NOTBOL | \
   PCRE2_NOTEOL | \
   PCRE2_NOTEMPTY | \
   PCRE2_NOTEMPTY_ATSTART | \
   PCRE2_NO_UTF_CHECK | \
   PCRE2_PARTIAL_HARD | \
   PCRE2_PARTIAL_SOFT | \
   PCRE2_DFA_SHORTEST | \
   PCRE2_DFA_RESTART | \
   PCRE2_COPY_MATCHED_SUBJECT)
90 
91 
92 /*************************************************
93 *      Code parameters and static tables         *
94 *************************************************/
95 
96 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 into others, under special conditions. A gap of 20 between the blocks should be
98 enough. The resulting opcodes don't have to be less than 256 because they are
99 never stored, so we push them well clear of the normal opcodes. */
100 
/* The first block starts at 300; each later block sits 20 above the one before
it, giving 300, 320, 340, 360 and 380. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     (OP_PROP_EXTRA   + 20)
#define OP_ANYNL_EXTRA      (OP_EXTUNI_EXTRA + 20)
#define OP_HSPACE_EXTRA     (OP_ANYNL_EXTRA  + 20)
#define OP_VSPACE_EXTRA     (OP_HSPACE_EXTRA + 20)
106 
107 
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
115 
116 static const uint8_t coptable[] = {
117   0,                             /* End                                    */
118   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121   0, 0,                          /* \P, \p                                 */
122   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123   0,                             /* \X                                     */
124   0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
125   1,                             /* Char                                   */
126   1,                             /* Chari                                  */
127   1,                             /* not                                    */
128   1,                             /* noti                                   */
129   /* Positive single-char repeats                                          */
130   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
131   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
132   1+IMM2_SIZE,                   /* exact                                  */
133   1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
134   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
135   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
136   1+IMM2_SIZE,                   /* exact I                                */
137   1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
138   /* Negative single-char repeats - only for chars < 256                   */
139   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
140   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
141   1+IMM2_SIZE,                   /* NOT exact                              */
142   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
143   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
144   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
145   1+IMM2_SIZE,                   /* NOT exact I                            */
146   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
147   /* Positive type repeats                                                 */
148   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
149   1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
150   1+IMM2_SIZE,                   /* Type exact                             */
151   1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
152   /* Character class & ref repeats                                         */
153   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
154   0, 0,                          /* CRRANGE, CRMINRANGE                    */
155   0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
156   0,                             /* CLASS                                  */
157   0,                             /* NCLASS                                 */
158   0,                             /* XCLASS - variable length               */
159   0,                             /* REF                                    */
160   0,                             /* REFI                                   */
161   0,                             /* DNREF                                  */
162   0,                             /* DNREFI                                 */
163   0,                             /* RECURSE                                */
164   0,                             /* CALLOUT                                */
165   0,                             /* CALLOUT_STR                            */
166   0,                             /* Alt                                    */
167   0,                             /* Ket                                    */
168   0,                             /* KetRmax                                */
169   0,                             /* KetRmin                                */
170   0,                             /* KetRpos                                */
171   0,                             /* Reverse                                */
172   0,                             /* Assert                                 */
173   0,                             /* Assert not                             */
174   0,                             /* Assert behind                          */
175   0,                             /* Assert behind not                      */
176   0,                             /* NA assert                              */
177   0,                             /* NA assert behind                       */
178   0,                             /* ONCE                                   */
179   0,                             /* SCRIPT_RUN                             */
180   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
181   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
182   0, 0,                          /* CREF, DNCREF                           */
183   0, 0,                          /* RREF, DNRREF                           */
184   0, 0,                          /* FALSE, TRUE                            */
185   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
186   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
187   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
188   0, 0,                          /* COMMIT, COMMIT_ARG                     */
189   0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
190   0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
191 };
192 
193 /* This table identifies those opcodes that inspect a character. It is used to
194 remember the fact that a character could have been inspected when the end of
195 the subject is reached. ***NOTE*** If the start of this table is modified, the
196 two tables that follow must also be modified. */
197 
static const uint8_t poptable[] = {
  /* End, \A, \G, \K - no character inspected */
  0, 0, 0, 0,
  /* \B, \b */
  1, 1,
  /* \D, \d, \S, \s, \W, \w */
  1, 1, 1, 1, 1, 1,
  /* Any, AllAny, Anybyte, \P, \p */
  1, 1, 1, 1, 1,
  /* \R, \H, \h, \V, \v, \X */
  1, 1, 1, 1, 1, 1,
  /* \Z, \z, $, $M, ^, ^M */
  0, 0, 0, 0, 0, 0,
  /* Char, Chari, not, noti */
  1, 1, 1, 1,

  /* Five blocks of thirteen repeat opcodes each, all of which inspect a
  character: star, minstar, plus, minplus, query, minquery, upto, minupto,
  exact, posstar, posplus, posquery, posupto. */

  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* Positive single-char repeats  */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* Positive single-char, I forms */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* NOT repeats (chars < 256)     */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* NOT repeats, I forms          */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* Positive type repeats         */

  /* Character class & ref repeats: the six plain repeats, CRRANGE,
  CRMINRANGE, then the possessive forms and CRPOSRANGE */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* CLASS, NCLASS, XCLASS (variable length) */
  1, 1, 1,

  /* Nothing from here to the end of the table inspects a character */

  /* REF, REFI, DNREF, DNREFI */
  0, 0, 0, 0,
  /* RECURSE, CALLOUT, CALLOUT_STR */
  0, 0, 0,
  /* Alt, Ket, KetRmax, KetRmin, KetRpos, Reverse */
  0, 0, 0, 0, 0, 0,
  /* Assert, Assert not, Assert behind, Assert behind not */
  0, 0, 0, 0,
  /* NA assert, NA assert behind, ONCE, SCRIPT_RUN */
  0, 0, 0, 0,
  /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
  0, 0, 0, 0, 0,
  /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
  0, 0, 0, 0, 0,
  /* CREF, DNCREF, RREF, DNRREF */
  0, 0, 0, 0,
  /* FALSE, TRUE */
  0, 0,
  /* BRAZERO, BRAMINZERO, BRAPOSZERO */
  0, 0, 0,
  /* MARK, PRUNE, PRUNE_ARG */
  0, 0, 0,
  /* SKIP, SKIP_ARG, THEN, THEN_ARG */
  0, 0, 0, 0,
  /* COMMIT, COMMIT_ARG */
  0, 0,
  /* FAIL, ACCEPT, ASSERT_ACCEPT */
  0, 0, 0,
  /* CLOSE, SKIPZERO, DEFINE */
  0, 0, 0
};
269 
270 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
271 and \w */
272 
273 static const uint8_t toptable1[] = {
274   0, 0, 0, 0, 0, 0,
275   ctype_digit, ctype_digit,
276   ctype_space, ctype_space,
277   ctype_word,  ctype_word,
278   0, 0                            /* OP_ANY, OP_ALLANY */
279 };
280 
281 static const uint8_t toptable2[] = {
282   0, 0, 0, 0, 0, 0,
283   ctype_digit, 0,
284   ctype_space, 0,
285   ctype_word,  0,
286   1, 1                            /* OP_ANY, OP_ALLANY */
287 };
288 
289 
290 /* Structure for holding data about a particular state, which is in effect the
291 current data for an active path through the match tree. It must consist
292 entirely of ints because the working vector we are passed, and which we put
293 these structures in, is a vector of ints. */
294 
typedef struct stateblock {
  /* Offset to opcode (-ve has meaning) */
  int offset;
  /* Count for repeats */
  int count;
  /* Some use extra data */
  int data;
} stateblock;

/* How many ints one stateblock occupies in the int workspace vector. */

#define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
302 
303 
304 /* Before version 10.32 the recursive calls of internal_dfa_match() were passed
305 local working space and output vectors that were created on the stack. This has
306 caused issues for some patterns, especially in small-stack environments such as
307 Windows. A new scheme is now in use which sets up a vector on the stack, but if
308 this is too small, heap memory is used, up to the heap_limit. The main
309 parameters are all numbers of ints because the workspace is a vector of ints.
310 
311 The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
312 defined in pcre2_internal.h so as to be available to pcre2test when it is
313 finding the minimum heap requirement for a match. */
314 
/* Number of ints taken up by one PCRE2_SIZE value. */

#define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))

/* Stack vector */
#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))

/* Work size for recursion */
#define RWS_RSIZE       1000

/* Ovector for recursion */
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)

/* Ovector in other cases */
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)

/* This structure is at the start of each workspace block. The blocks form a
singly linked chain through the next field; size and free are both counts of
ints, not bytes. */

typedef struct RWS_anchor {
  struct RWS_anchor *next;
  /* Number of ints */
  uint32_t size;
  /* Number of ints */
  uint32_t free;
} RWS_anchor;

/* The anchor's own footprint, in ints. */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
331 
332 
333 
334 /*************************************************
335 *               Process a callout                *
336 *************************************************/
337 
338 /* This function is called to perform a callout.
339 
340 Arguments:
341   code              current code pointer
342   offsets           points to current capture offsets
343   current_subject   start of current subject match
344   ptr               current position in subject
345   mb                the match block
346   extracode         extra code offset when called from condition
347   lengthptr         where to return the callout length
348 
349 Returns:            the return from the callout
350 */
351 
352 static int
do_callout(PCRE2_SPTR code,PCRE2_SIZE * offsets,PCRE2_SPTR current_subject,PCRE2_SPTR ptr,dfa_match_block * mb,PCRE2_SIZE extracode,PCRE2_SIZE * lengthptr)353 do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
354   PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
355   PCRE2_SIZE *lengthptr)
356 {
357 pcre2_callout_block *cb = mb->cb;
358 
359 *lengthptr = (code[extracode] == OP_CALLOUT)?
360   (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
361   (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
362 
363 if (mb->callout == NULL) return 0;    /* No callout provided */
364 
365 /* Fixed fields in the callout block are set once and for all at the start of
366 matching. */
367 
368 cb->offset_vector    = offsets;
369 cb->start_match      = (PCRE2_SIZE)(current_subject - mb->start_subject);
370 cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
371 cb->pattern_position = GET(code, 1 + extracode);
372 cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
373 
374 if (code[extracode] == OP_CALLOUT)
375   {
376   cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
377   cb->callout_string_offset = 0;
378   cb->callout_string = NULL;
379   cb->callout_string_length = 0;
380   }
381 else
382   {
383   cb->callout_number = 0;
384   cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
385   cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
386   cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
387   }
388 
389 return (mb->callout)(cb, mb->callout_data);
390 }
391 
392 
393 
394 /*************************************************
395 *         Expand local workspace memory          *
396 *************************************************/
397 
398 /* This function is called when internal_dfa_match() is about to be called
399 recursively and there is insufficient working space left in the current
400 workspace block. If there's an existing next block, use it; otherwise get a new
401 block unless the heap limit is reached.
402 
403 Arguments:
404   rwsptr     pointer to block pointer (updated)
405   ovecsize   space needed for an ovector
406   mb         the match block
407 
408 Returns:     0 rwsptr has been updated
409             !0 an error code
410 */
411 
412 static int
more_workspace(RWS_anchor ** rwsptr,unsigned int ovecsize,dfa_match_block * mb)413 more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
414 {
415 RWS_anchor *rws = *rwsptr;
416 RWS_anchor *new;
417 
418 if (rws->next != NULL)
419   {
420   new = rws->next;
421   }
422 
423 /* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
424 mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
425 overflow. */
426 
427 else
428   {
429   uint32_t newsize = (rws->size >= UINT32_MAX/2)? UINT32_MAX/2 : rws->size * 2;
430   uint32_t newsizeK = newsize/(1024/sizeof(int));
431 
432   if (newsizeK + mb->heap_used > mb->heap_limit)
433     newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
434   newsize = newsizeK*(1024/sizeof(int));
435 
436   if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
437     return PCRE2_ERROR_HEAPLIMIT;
438   new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
439   if (new == NULL) return PCRE2_ERROR_NOMEMORY;
440   mb->heap_used += newsizeK;
441   new->next = NULL;
442   new->size = newsize;
443   rws->next = new;
444   }
445 
446 new->free = new->size - RWS_ANCHOR_SIZE;
447 *rwsptr = new;
448 return 0;
449 }
450 
451 
452 
453 /*************************************************
454 *     Match a Regular Expression - DFA engine    *
455 *************************************************/
456 
457 /* This internal function applies a compiled pattern to a subject string,
458 starting at a given point, using a DFA engine. This function is called from the
459 external one, possibly multiple times if the pattern is not anchored. The
460 function calls itself recursively for some kinds of subpattern.
461 
462 Arguments:
463   mb                the match_data block with fixed information
464   this_start_code   the opening bracket of this subexpression's code
465   current_subject   where we currently are in the subject string
466   start_offset      start offset in the subject string
467   offsets           vector to contain the matching string offsets
468   offsetcount       size of same
469   workspace         vector of workspace
470   wscount           size of same
471   rlevel            function call recursion level
472 
473 Returns:            > 0 => number of match offset pairs placed in offsets
474                     = 0 => offsets overflowed; longest matches are present
475                      -1 => failed to match
476                    < -1 => some kind of unexpected problem
477 
478 The following macros are used for adding states to the two state vectors (one
479 for the current character, one for the following character). */
480 
/* Each macro first counts the new state and checks the count against wscount.
If the list is already full, the enclosing function returns
PCRE2_ERROR_DFA_WSSIZE; note that the counter has been incremented even then.
Otherwise the state is written at the "next" pointer, which is then advanced.
The two-argument forms leave the data field of the slot untouched. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE2_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    } \
  while (0)
518 
519 /* And now, here is the code */
520 
521 static int
internal_dfa_match(dfa_match_block * mb,PCRE2_SPTR this_start_code,PCRE2_SPTR current_subject,PCRE2_SIZE start_offset,PCRE2_SIZE * offsets,uint32_t offsetcount,int * workspace,int wscount,uint32_t rlevel,int * RWS)522 internal_dfa_match(
523   dfa_match_block *mb,
524   PCRE2_SPTR this_start_code,
525   PCRE2_SPTR current_subject,
526   PCRE2_SIZE start_offset,
527   PCRE2_SIZE *offsets,
528   uint32_t offsetcount,
529   int *workspace,
530   int wscount,
531   uint32_t rlevel,
532   int *RWS)
533 {
534 stateblock *active_states, *new_states, *temp_states;
535 stateblock *next_active_state, *next_new_state;
536 const uint8_t *ctypes, *lcc, *fcc;
537 PCRE2_SPTR ptr;
538 PCRE2_SPTR end_code;
539 dfa_recursion_info new_recursive;
540 int active_count, new_count, match_count;
541 
542 /* Some fields in the mb block are frequently referenced, so we load them into
543 independent variables in the hope that this will perform better. */
544 
545 PCRE2_SPTR start_subject = mb->start_subject;
546 PCRE2_SPTR end_subject = mb->end_subject;
547 PCRE2_SPTR start_code = mb->start_code;
548 
549 #ifdef SUPPORT_UNICODE
550 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
551 #else
552 BOOL utf = FALSE;
553 #endif
554 
555 BOOL reset_could_continue = FALSE;
556 
557 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
558 if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
559 offsetcount &= (uint32_t)(-2);  /* Round down */
560 
561 wscount -= 2;
562 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
563           (2 * INTS_PER_STATEBLOCK);
564 
565 ctypes = mb->tables + ctypes_offset;
566 lcc = mb->tables + lcc_offset;
567 fcc = mb->tables + fcc_offset;
568 
569 match_count = PCRE2_ERROR_NOMATCH;   /* A negative number */
570 
571 active_states = (stateblock *)(workspace + 2);
572 next_new_state = new_states = active_states + wscount;
573 new_count = 0;
574 
575 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
576 the alternative states onto the list, and find out where the end is. This
577 makes it possible to use this function recursively, when we want to stop at a
578 matching internal ket rather than at the end.
579 
580 If we are dealing with a backward assertion we have to find out the maximum
581 amount to move back, and set up each alternative appropriately. */
582 
583 if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
584   {
585   size_t max_back = 0;
586   size_t gone_back;
587 
588   end_code = this_start_code;
589   do
590     {
591     size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
592     if (back > max_back) max_back = back;
593     end_code += GET(end_code, 1);
594     }
595   while (*end_code == OP_ALT);
596 
597   /* If we can't go back the amount required for the longest lookbehind
598   pattern, go back as far as we can; some alternatives may still be viable. */
599 
600 #ifdef SUPPORT_UNICODE
601   /* In character mode we have to step back character by character */
602 
603   if (utf)
604     {
605     for (gone_back = 0; gone_back < max_back; gone_back++)
606       {
607       if (current_subject <= start_subject) break;
608       current_subject--;
609       ACROSSCHAR(current_subject > start_subject, current_subject,
610         current_subject--);
611       }
612     }
613   else
614 #endif
615 
616   /* In byte-mode we can do this quickly. */
617 
618     {
619     size_t current_offset = (size_t)(current_subject - start_subject);
620     gone_back = (current_offset < max_back)? current_offset : max_back;
621     current_subject -= gone_back;
622     }
623 
624   /* Save the earliest consulted character */
625 
626   if (current_subject < mb->start_used_ptr)
627     mb->start_used_ptr = current_subject;
628 
629   /* Now we can process the individual branches. There will be an OP_REVERSE at
630   the start of each branch, except when the length of the branch is zero. */
631 
632   end_code = this_start_code;
633   do
634     {
635     uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
636     size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
637     if (back <= gone_back)
638       {
639       int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
640       ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
641       }
642     end_code += GET(end_code, 1);
643     }
644   while (*end_code == OP_ALT);
645  }
646 
647 /* This is the code for a "normal" subpattern (not a backward assertion). The
648 start of a whole pattern is always one of these. If we are at the top level,
649 we may be asked to restart matching from the same point that we reached for a
650 previous partial match. We still have to scan through the top-level branches to
651 find the end state. */
652 
653 else
654   {
655   end_code = this_start_code;
656 
657   /* Restarting */
658 
659   if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
660     {
661     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
662     new_count = workspace[1];
663     if (!workspace[0])
664       memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
665     }
666 
667   /* Not restarting */
668 
669   else
670     {
671     int length = 1 + LINK_SIZE +
672       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
673         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
674         ? IMM2_SIZE:0);
675     do
676       {
677       ADD_NEW((int)(end_code - start_code + length), 0);
678       end_code += GET(end_code, 1);
679       length = 1 + LINK_SIZE;
680       }
681     while (*end_code == OP_ALT);
682     }
683   }
684 
685 workspace[0] = 0;    /* Bit indicating which vector is current */
686 
687 /* Loop for scanning the subject */
688 
689 ptr = current_subject;
690 for (;;)
691   {
692   int i, j;
693   int clen, dlen;
694   uint32_t c, d;
695   int forced_fail = 0;
696   BOOL partial_newline = FALSE;
697   BOOL could_continue = reset_could_continue;
698   reset_could_continue = FALSE;
699 
700   if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
701 
702   /* Make the new state list into the active state list and empty the
703   new state list. */
704 
705   temp_states = active_states;
706   active_states = new_states;
707   new_states = temp_states;
708   active_count = new_count;
709   new_count = 0;
710 
711   workspace[0] ^= 1;              /* Remember for the restarting feature */
712   workspace[1] = active_count;
713 
714   /* Set the pointers for adding new states */
715 
716   next_active_state = active_states + active_count;
717   next_new_state = new_states;
718 
719   /* Load the current character from the subject outside the loop, as many
720   different states may want to look at it, and we assume that at least one
721   will. */
722 
723   if (ptr < end_subject)
724     {
725     clen = 1;        /* Number of data items in the character */
726 #ifdef SUPPORT_UNICODE
727     GETCHARLENTEST(c, ptr, clen);
728 #else
729     c = *ptr;
730 #endif  /* SUPPORT_UNICODE */
731     }
732   else
733     {
734     clen = 0;        /* This indicates the end of the subject */
735     c = NOTACHAR;    /* This value should never actually be used */
736     }
737 
738   /* Scan up the active states and act on each one. The result of an action
739   may be to add more states to the currently active list (e.g. on hitting a
740   parenthesis) or it may be to put states on the new list, for considering
741   when we move the character pointer on. */
742 
743   for (i = 0; i < active_count; i++)
744     {
745     stateblock *current_state = active_states + i;
746     BOOL caseless = FALSE;
747     PCRE2_SPTR code;
748     uint32_t codevalue;
749     int state_offset = current_state->offset;
750     int rrc;
751     int count;
752 
753     /* A negative offset is a special case meaning "hold off going to this
754     (negated) state until the number of characters in the data field have
755     been skipped". If the could_continue flag was passed over from a previous
756     state, arrange for it to be passed on. */
757 
758     if (state_offset < 0)
759       {
760       if (current_state->data > 0)
761         {
762         ADD_NEW_DATA(state_offset, current_state->count,
763           current_state->data - 1);
764         if (could_continue) reset_could_continue = TRUE;
765         continue;
766         }
767       else
768         {
769         current_state->offset = state_offset = -state_offset;
770         }
771       }
772 
773     /* Check for a duplicate state with the same count, and skip if found.
774     See the note at the head of this module about the possibility of improving
775     performance here. */
776 
777     for (j = 0; j < i; j++)
778       {
779       if (active_states[j].offset == state_offset &&
780           active_states[j].count == current_state->count)
781         goto NEXT_ACTIVE_STATE;
782       }
783 
784     /* The state offset is the offset to the opcode */
785 
786     code = start_code + state_offset;
787     codevalue = *code;
788 
789     /* If this opcode inspects a character, but we are at the end of the
790     subject, remember the fact for use when testing for a partial match. */
791 
792     if (clen == 0 && poptable[codevalue] != 0)
793       could_continue = TRUE;
794 
795     /* If this opcode is followed by an inline character, load it. It is
796     tempting to test for the presence of a subject character here, but that
797     is wrong, because sometimes zero repetitions of the subject are
798     permitted.
799 
800     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
801     argument that is not a data character - but is always one byte long because
802     the values are small. We have to take special action to deal with  \P, \p,
803     \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
804     these ones to new opcodes. */
805 
806     if (coptable[codevalue] > 0)
807       {
808       dlen = 1;
809 #ifdef SUPPORT_UNICODE
810       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
811 #endif  /* SUPPORT_UNICODE */
812       d = code[coptable[codevalue]];
813       if (codevalue >= OP_TYPESTAR)
814         {
815         switch(d)
816           {
817           case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
818           case OP_NOTPROP:
819           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
820           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
821           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
822           case OP_NOT_HSPACE:
823           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
824           case OP_NOT_VSPACE:
825           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
826           default: break;
827           }
828         }
829       }
830     else
831       {
832       dlen = 0;         /* Not strictly necessary, but compilers moan */
833       d = NOTACHAR;     /* if these variables are not set. */
834       }
835 
836 
837     /* Now process the individual opcodes */
838 
839     switch (codevalue)
840       {
841 /* ========================================================================== */
842       /* These cases are never obeyed. This is a fudge that causes a compile-
843       time error if the vectors coptable or poptable, which are indexed by
844       opcode, are not the correct length. It seems to be the only way to do
845       such a check at compile time, as the sizeof() operator does not work
846       in the C preprocessor. */
847 
848       case OP_TABLE_LENGTH:
849       case OP_TABLE_LENGTH +
850         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
851          (sizeof(poptable) == OP_TABLE_LENGTH)):
852       return 0;
853 
854 /* ========================================================================== */
855       /* Reached a closing bracket. If not at the end of the pattern, carry
856       on with the next opcode. For repeating opcodes, also add the repeat
857       state. Note that KETRPOS will always be encountered at the end of the
858       subpattern, because the possessive subpattern repeats are always handled
859       using recursive calls. Thus, it never adds any new states.
860 
861       At the end of the (sub)pattern, unless we have an empty string and
862       PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
863       start of the subject, save the match data, shifting up all previous
864       matches so we always have the longest first. */
865 
866       case OP_KET:
867       case OP_KETRMIN:
868       case OP_KETRMAX:
869       case OP_KETRPOS:
870       if (code != end_code)
871         {
872         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
873         if (codevalue != OP_KET)
874           {
875           ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
876           }
877         }
878       else
879         {
880         if (ptr > current_subject ||
881             ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
882               ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
883                 current_subject > start_subject + mb->start_offset)))
884           {
885           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
886             else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
887               match_count = 0;
888           count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
889           if (count > 0) (void)memmove(offsets + 2, offsets,
890             (size_t)count * sizeof(PCRE2_SIZE));
891           if (offsetcount >= 2)
892             {
893             offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
894             offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
895             }
896           if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
897           }
898         }
899       break;
900 
901 /* ========================================================================== */
902       /* These opcodes add to the current list of states without looking
903       at the current character. */
904 
905       /*-----------------------------------------------------------------*/
906       case OP_ALT:
907       do { code += GET(code, 1); } while (*code == OP_ALT);
908       ADD_ACTIVE((int)(code - start_code), 0);
909       break;
910 
911       /*-----------------------------------------------------------------*/
912       case OP_BRA:
913       case OP_SBRA:
914       do
915         {
916         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
917         code += GET(code, 1);
918         }
919       while (*code == OP_ALT);
920       break;
921 
922       /*-----------------------------------------------------------------*/
923       case OP_CBRA:
924       case OP_SCBRA:
925       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
926       code += GET(code, 1);
927       while (*code == OP_ALT)
928         {
929         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
930         code += GET(code, 1);
931         }
932       break;
933 
934       /*-----------------------------------------------------------------*/
935       case OP_BRAZERO:
936       case OP_BRAMINZERO:
937       ADD_ACTIVE(state_offset + 1, 0);
938       code += 1 + GET(code, 2);
939       while (*code == OP_ALT) code += GET(code, 1);
940       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
941       break;
942 
943       /*-----------------------------------------------------------------*/
944       case OP_SKIPZERO:
945       code += 1 + GET(code, 2);
946       while (*code == OP_ALT) code += GET(code, 1);
947       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
948       break;
949 
950       /*-----------------------------------------------------------------*/
951       case OP_CIRC:
952       if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
953         { ADD_ACTIVE(state_offset + 1, 0); }
954       break;
955 
956       /*-----------------------------------------------------------------*/
957       case OP_CIRCM:
958       if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
959           ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
960             && WAS_NEWLINE(ptr)))
961         { ADD_ACTIVE(state_offset + 1, 0); }
962       break;
963 
964       /*-----------------------------------------------------------------*/
965       case OP_EOD:
966       if (ptr >= end_subject)
967         {
968         if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
969           return PCRE2_ERROR_PARTIAL;
970         else { ADD_ACTIVE(state_offset + 1, 0); }
971         }
972       break;
973 
974       /*-----------------------------------------------------------------*/
975       case OP_SOD:
976       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
977       break;
978 
979       /*-----------------------------------------------------------------*/
980       case OP_SOM:
981       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
982       break;
983 
984 
985 /* ========================================================================== */
986       /* These opcodes inspect the next subject character, and sometimes
987       the previous one as well, but do not have an argument. The variable
988       clen contains the length of the current character and is zero if we are
989       at the end of the subject. */
990 
991       /*-----------------------------------------------------------------*/
992       case OP_ANY:
993       if (clen > 0 && !IS_NEWLINE(ptr))
994         {
995         if (ptr + 1 >= mb->end_subject &&
996             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
997             NLBLOCK->nltype == NLTYPE_FIXED &&
998             NLBLOCK->nllen == 2 &&
999             c == NLBLOCK->nl[0])
1000           {
1001           could_continue = partial_newline = TRUE;
1002           }
1003         else
1004           {
1005           ADD_NEW(state_offset + 1, 0);
1006           }
1007         }
1008       break;
1009 
1010       /*-----------------------------------------------------------------*/
1011       case OP_ALLANY:
1012       if (clen > 0)
1013         { ADD_NEW(state_offset + 1, 0); }
1014       break;
1015 
1016       /*-----------------------------------------------------------------*/
1017       case OP_EODN:
1018       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1019         {
1020         if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1021           return PCRE2_ERROR_PARTIAL;
1022         ADD_ACTIVE(state_offset + 1, 0);
1023         }
1024       break;
1025 
1026       /*-----------------------------------------------------------------*/
1027       case OP_DOLL:
1028       if ((mb->moptions & PCRE2_NOTEOL) == 0)
1029         {
1030         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1031           could_continue = TRUE;
1032         else if (clen == 0 ||
1033             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1034                (ptr == end_subject - mb->nllen)
1035             ))
1036           { ADD_ACTIVE(state_offset + 1, 0); }
1037         else if (ptr + 1 >= mb->end_subject &&
1038                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1039                  NLBLOCK->nltype == NLTYPE_FIXED &&
1040                  NLBLOCK->nllen == 2 &&
1041                  c == NLBLOCK->nl[0])
1042           {
1043           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1044             {
1045             reset_could_continue = TRUE;
1046             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1047             }
1048           else could_continue = partial_newline = TRUE;
1049           }
1050         }
1051       break;
1052 
1053       /*-----------------------------------------------------------------*/
1054       case OP_DOLLM:
1055       if ((mb->moptions & PCRE2_NOTEOL) == 0)
1056         {
1057         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1058           could_continue = TRUE;
1059         else if (clen == 0 ||
1060             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1061           { ADD_ACTIVE(state_offset + 1, 0); }
1062         else if (ptr + 1 >= mb->end_subject &&
1063                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1064                  NLBLOCK->nltype == NLTYPE_FIXED &&
1065                  NLBLOCK->nllen == 2 &&
1066                  c == NLBLOCK->nl[0])
1067           {
1068           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1069             {
1070             reset_could_continue = TRUE;
1071             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1072             }
1073           else could_continue = partial_newline = TRUE;
1074           }
1075         }
1076       else if (IS_NEWLINE(ptr))
1077         { ADD_ACTIVE(state_offset + 1, 0); }
1078       break;
1079 
1080       /*-----------------------------------------------------------------*/
1081 
1082       case OP_DIGIT:
1083       case OP_WHITESPACE:
1084       case OP_WORDCHAR:
1085       if (clen > 0 && c < 256 &&
1086             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1087         { ADD_NEW(state_offset + 1, 0); }
1088       break;
1089 
1090       /*-----------------------------------------------------------------*/
1091       case OP_NOT_DIGIT:
1092       case OP_NOT_WHITESPACE:
1093       case OP_NOT_WORDCHAR:
1094       if (clen > 0 && (c >= 256 ||
1095             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1096         { ADD_NEW(state_offset + 1, 0); }
1097       break;
1098 
1099       /*-----------------------------------------------------------------*/
1100       case OP_WORD_BOUNDARY:
1101       case OP_NOT_WORD_BOUNDARY:
1102         {
1103         int left_word, right_word;
1104 
1105         if (ptr > start_subject)
1106           {
1107           PCRE2_SPTR temp = ptr - 1;
1108           if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1109 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1110           if (utf) { BACKCHAR(temp); }
1111 #endif
1112           GETCHARTEST(d, temp);
1113 #ifdef SUPPORT_UNICODE
1114           if ((mb->poptions & PCRE2_UCP) != 0)
1115             {
1116             if (d == '_') left_word = TRUE; else
1117               {
1118               uint32_t cat = UCD_CATEGORY(d);
1119               left_word = (cat == ucp_L || cat == ucp_N);
1120               }
1121             }
1122           else
1123 #endif
1124           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1125           }
1126         else left_word = FALSE;
1127 
1128         if (clen > 0)
1129           {
1130           if (ptr >= mb->last_used_ptr)
1131             {
1132             PCRE2_SPTR temp = ptr + 1;
1133 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1134             if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1135 #endif
1136             mb->last_used_ptr = temp;
1137             }
1138 #ifdef SUPPORT_UNICODE
1139           if ((mb->poptions & PCRE2_UCP) != 0)
1140             {
1141             if (c == '_') right_word = TRUE; else
1142               {
1143               uint32_t cat = UCD_CATEGORY(c);
1144               right_word = (cat == ucp_L || cat == ucp_N);
1145               }
1146             }
1147           else
1148 #endif
1149           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1150           }
1151         else right_word = FALSE;
1152 
1153         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1154           { ADD_ACTIVE(state_offset + 1, 0); }
1155         }
1156       break;
1157 
1158 
1159       /*-----------------------------------------------------------------*/
1160       /* Check the next character by Unicode property. We will get here only
1161       if the support is in the binary; otherwise a compile-time error occurs.
1162       */
1163 
1164 #ifdef SUPPORT_UNICODE
1165       case OP_PROP:
1166       case OP_NOTPROP:
1167       if (clen > 0)
1168         {
1169         BOOL OK;
1170         const uint32_t *cp;
1171         const ucd_record * prop = GET_UCD(c);
1172         switch(code[1])
1173           {
1174           case PT_ANY:
1175           OK = TRUE;
1176           break;
1177 
1178           case PT_LAMP:
1179           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1180                prop->chartype == ucp_Lt;
1181           break;
1182 
1183           case PT_GC:
1184           OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1185           break;
1186 
1187           case PT_PC:
1188           OK = prop->chartype == code[2];
1189           break;
1190 
1191           case PT_SC:
1192           OK = prop->script == code[2];
1193           break;
1194 
1195           /* These are specials for combination cases. */
1196 
1197           case PT_ALNUM:
1198           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1199                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1200           break;
1201 
1202           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1203           which means that Perl space and POSIX space are now identical. PCRE
1204           was changed at release 8.34. */
1205 
1206           case PT_SPACE:    /* Perl space */
1207           case PT_PXSPACE:  /* POSIX space */
1208           switch(c)
1209             {
1210             HSPACE_CASES:
1211             VSPACE_CASES:
1212             OK = TRUE;
1213             break;
1214 
1215             default:
1216             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1217             break;
1218             }
1219           break;
1220 
1221           case PT_WORD:
1222           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1223                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1224                c == CHAR_UNDERSCORE;
1225           break;
1226 
1227           case PT_CLIST:
1228           cp = PRIV(ucd_caseless_sets) + code[2];
1229           for (;;)
1230             {
1231             if (c < *cp) { OK = FALSE; break; }
1232             if (c == *cp++) { OK = TRUE; break; }
1233             }
1234           break;
1235 
1236           case PT_UCNC:
1237           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1238                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1239                c >= 0xe000;
1240           break;
1241 
1242           /* Should never occur, but keep compilers from grumbling. */
1243 
1244           default:
1245           OK = codevalue != OP_PROP;
1246           break;
1247           }
1248 
1249         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1250         }
1251       break;
1252 #endif
1253 
1254 
1255 
1256 /* ========================================================================== */
1257       /* These opcodes likewise inspect the subject character, but have an
1258       argument that is not a data character. It is one of these opcodes:
1259       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1260       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1261 
1262       case OP_TYPEPLUS:
1263       case OP_TYPEMINPLUS:
1264       case OP_TYPEPOSPLUS:
1265       count = current_state->count;  /* Already matched */
1266       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1267       if (clen > 0)
1268         {
1269         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1270             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1271             NLBLOCK->nltype == NLTYPE_FIXED &&
1272             NLBLOCK->nllen == 2 &&
1273             c == NLBLOCK->nl[0])
1274           {
1275           could_continue = partial_newline = TRUE;
1276           }
1277         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1278             (c < 256 &&
1279               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1280               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1281           {
1282           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1283             {
1284             active_count--;            /* Remove non-match possibility */
1285             next_active_state--;
1286             }
1287           count++;
1288           ADD_NEW(state_offset, count);
1289           }
1290         }
1291       break;
1292 
1293       /*-----------------------------------------------------------------*/
1294       case OP_TYPEQUERY:
1295       case OP_TYPEMINQUERY:
1296       case OP_TYPEPOSQUERY:
1297       ADD_ACTIVE(state_offset + 2, 0);
1298       if (clen > 0)
1299         {
1300         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1301             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1302             NLBLOCK->nltype == NLTYPE_FIXED &&
1303             NLBLOCK->nllen == 2 &&
1304             c == NLBLOCK->nl[0])
1305           {
1306           could_continue = partial_newline = TRUE;
1307           }
1308         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1309             (c < 256 &&
1310               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1311               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1312           {
1313           if (codevalue == OP_TYPEPOSQUERY)
1314             {
1315             active_count--;            /* Remove non-match possibility */
1316             next_active_state--;
1317             }
1318           ADD_NEW(state_offset + 2, 0);
1319           }
1320         }
1321       break;
1322 
1323       /*-----------------------------------------------------------------*/
1324       case OP_TYPESTAR:
1325       case OP_TYPEMINSTAR:
1326       case OP_TYPEPOSSTAR:
1327       ADD_ACTIVE(state_offset + 2, 0);
1328       if (clen > 0)
1329         {
1330         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1331             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1332             NLBLOCK->nltype == NLTYPE_FIXED &&
1333             NLBLOCK->nllen == 2 &&
1334             c == NLBLOCK->nl[0])
1335           {
1336           could_continue = partial_newline = TRUE;
1337           }
1338         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1339             (c < 256 &&
1340               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1341               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1342           {
1343           if (codevalue == OP_TYPEPOSSTAR)
1344             {
1345             active_count--;            /* Remove non-match possibility */
1346             next_active_state--;
1347             }
1348           ADD_NEW(state_offset, 0);
1349           }
1350         }
1351       break;
1352 
1353       /*-----------------------------------------------------------------*/
1354       case OP_TYPEEXACT:
1355       count = current_state->count;  /* Number already matched */
1356       if (clen > 0)
1357         {
1358         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1359             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1360             NLBLOCK->nltype == NLTYPE_FIXED &&
1361             NLBLOCK->nllen == 2 &&
1362             c == NLBLOCK->nl[0])
1363           {
1364           could_continue = partial_newline = TRUE;
1365           }
1366         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1367             (c < 256 &&
1368               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1369               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1370           {
1371           if (++count >= (int)GET2(code, 1))
1372             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1373           else
1374             { ADD_NEW(state_offset, count); }
1375           }
1376         }
1377       break;
1378 
1379       /*-----------------------------------------------------------------*/
1380       case OP_TYPEUPTO:
1381       case OP_TYPEMINUPTO:
1382       case OP_TYPEPOSUPTO:
1383       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1384       count = current_state->count;  /* Number already matched */
1385       if (clen > 0)
1386         {
1387         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1388             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1389             NLBLOCK->nltype == NLTYPE_FIXED &&
1390             NLBLOCK->nllen == 2 &&
1391             c == NLBLOCK->nl[0])
1392           {
1393           could_continue = partial_newline = TRUE;
1394           }
1395         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1396             (c < 256 &&
1397               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1398               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1399           {
1400           if (codevalue == OP_TYPEPOSUPTO)
1401             {
1402             active_count--;           /* Remove non-match possibility */
1403             next_active_state--;
1404             }
1405           if (++count >= (int)GET2(code, 1))
1406             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1407           else
1408             { ADD_NEW(state_offset, count); }
1409           }
1410         }
1411       break;
1412 
1413 /* ========================================================================== */
1414       /* These are virtual opcodes that are used when something like
1415       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1416       argument. It keeps the code above fast for the other cases. The argument
1417       is in the d variable. */
1418 
1419 #ifdef SUPPORT_UNICODE
1420       case OP_PROP_EXTRA + OP_TYPEPLUS:
1421       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1422       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1423       count = current_state->count;           /* Already matched */
1424       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1425       if (clen > 0)
1426         {
1427         BOOL OK;
1428         const uint32_t *cp;
1429         const ucd_record * prop = GET_UCD(c);
1430         switch(code[2])
1431           {
1432           case PT_ANY:
1433           OK = TRUE;
1434           break;
1435 
1436           case PT_LAMP:
1437           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1438             prop->chartype == ucp_Lt;
1439           break;
1440 
1441           case PT_GC:
1442           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1443           break;
1444 
1445           case PT_PC:
1446           OK = prop->chartype == code[3];
1447           break;
1448 
1449           case PT_SC:
1450           OK = prop->script == code[3];
1451           break;
1452 
1453           /* These are specials for combination cases. */
1454 
1455           case PT_ALNUM:
1456           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1457                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1458           break;
1459 
1460           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1461           which means that Perl space and POSIX space are now identical. PCRE
1462           was changed at release 8.34. */
1463 
1464           case PT_SPACE:    /* Perl space */
1465           case PT_PXSPACE:  /* POSIX space */
1466           switch(c)
1467             {
1468             HSPACE_CASES:
1469             VSPACE_CASES:
1470             OK = TRUE;
1471             break;
1472 
1473             default:
1474             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1475             break;
1476             }
1477           break;
1478 
1479           case PT_WORD:
1480           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1481                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1482                c == CHAR_UNDERSCORE;
1483           break;
1484 
1485           case PT_CLIST:
1486           cp = PRIV(ucd_caseless_sets) + code[3];
1487           for (;;)
1488             {
1489             if (c < *cp) { OK = FALSE; break; }
1490             if (c == *cp++) { OK = TRUE; break; }
1491             }
1492           break;
1493 
1494           case PT_UCNC:
1495           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1496                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1497                c >= 0xe000;
1498           break;
1499 
1500           /* Should never occur, but keep compilers from grumbling. */
1501 
1502           default:
1503           OK = codevalue != OP_PROP;
1504           break;
1505           }
1506 
1507         if (OK == (d == OP_PROP))
1508           {
1509           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1510             {
1511             active_count--;           /* Remove non-match possibility */
1512             next_active_state--;
1513             }
1514           count++;
1515           ADD_NEW(state_offset, count);
1516           }
1517         }
1518       break;
1519 
1520       /*-----------------------------------------------------------------*/
      case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
      case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
      case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:

      /* OP_EXTUNI repeated by one of the PLUS forms. The state at
      state_offset + 2 is activated only after at least one repetition has
      been counted, so zero repetitions never lead onwards. */

      count = current_state->count;  /* Already matched */
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
      if (clen > 0)
        {
        int ncount = 0;

        /* Possessive form: the guard repeats the count > 0 test used for the
        ADD_ACTIVE above. NOTE(review): the two decrements appear to withdraw
        the entry that ADD_ACTIVE has just appended - confirm against the
        macro definition. */

        if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
          {
          active_count--;           /* Remove non-match possibility */
          next_active_state--;
          }

        /* Only ncount is wanted from the call; the returned pointer is
        discarded. No match test is made: any available character is taken.
        The state is re-queued with a negated offset, the incremented count,
        and ncount as its data. NOTE(review): presumably ncount is the number
        of further characters that must pass before the state is live again -
        confirm against PRIV(extuni) and ADD_NEW_DATA. */

        (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
          &ncount);
        count++;
        ADD_NEW_DATA(-state_offset, count, ncount);
        }
      break;
1540 #endif
1541 
1542       /*-----------------------------------------------------------------*/
      case OP_ANYNL_EXTRA + OP_TYPEPLUS:
      case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
      case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:

      /* OP_ANYNL repeated by one of the PLUS forms. The state at
      state_offset + 2 is activated only once count is non-zero. */

      count = current_state->count;  /* Already matched */
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
      if (clen > 0)
        {
        int ncount = 0;
        switch (c)
          {
          /* These characters are rejected when mb->bsr_convention is
          PCRE2_BSR_ANYCRLF; otherwise they are handled like LF. */

          case CHAR_VT:
          case CHAR_FF:
          case CHAR_NEL:
#ifndef EBCDIC
          case 0x2028:
          case 0x2029:
#endif  /* Not EBCDIC */
          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
          goto ANYNL01;

          /* CR with an LF immediately after it: ncount = 1 is passed on as the
          data of the new state. */

          case CHAR_CR:
          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
          /* Fall through */

          ANYNL01:
          case CHAR_LF:
          if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
            {
            active_count--;           /* Remove non-match possibility */
            next_active_state--;
            }
          count++;
          ADD_NEW_DATA(-state_offset, count, ncount);
          break;

          /* Any other character: no new state is added. */

          default:
          break;
          }
        }
      break;
1583 
1584       /*-----------------------------------------------------------------*/
      case OP_VSPACE_EXTRA + OP_TYPEPLUS:
      case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
      case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:

      /* Vertical-space test repeated by one of the PLUS forms. The state at
      state_offset + 2 is activated only once count is non-zero. */

      count = current_state->count;  /* Already matched */
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
      if (clen > 0)
        {
        BOOL OK;

        /* OK records whether c is one of the characters labelled by
        VSPACE_CASES. */

        switch (c)
          {
          VSPACE_CASES:
          OK = TRUE;
          break;

          default:
          OK = FALSE;
          break;
          }

        /* The new state is added when OK agrees with (d == OP_VSPACE), so the
        sense of the test is inverted when d is anything else.
        NOTE(review): d is set outside this block; presumably it holds the
        underlying character-type opcode (OP_VSPACE or OP_NOT_VSPACE). */

        if (OK == (d == OP_VSPACE))
          {
          if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
            {
            active_count--;           /* Remove non-match possibility */
            next_active_state--;
            }
          count++;
          ADD_NEW_DATA(-state_offset, count, 0);
          }
        }
      break;
1616 
1617       /*-----------------------------------------------------------------*/
      case OP_HSPACE_EXTRA + OP_TYPEPLUS:
      case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
      case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:

      /* Horizontal-space test repeated by one of the PLUS forms. The state at
      state_offset + 2 is activated only once count is non-zero. */

      count = current_state->count;  /* Already matched */
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
      if (clen > 0)
        {
        BOOL OK;

        /* OK records whether c is one of the characters labelled by
        HSPACE_CASES. */

        switch (c)
          {
          HSPACE_CASES:
          OK = TRUE;
          break;

          default:
          OK = FALSE;
          break;
          }

        /* The new state is added when OK agrees with (d == OP_HSPACE), so the
        sense of the test is inverted when d is anything else.
        NOTE(review): d is set outside this block; presumably it holds the
        underlying character-type opcode (OP_HSPACE or OP_NOT_HSPACE). */

        if (OK == (d == OP_HSPACE))
          {
          if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
            {
            active_count--;           /* Remove non-match possibility */
            next_active_state--;
            }
          count++;
          ADD_NEW_DATA(-state_offset, count, 0);
          }
        }
      break;
1649 
1650       /*-----------------------------------------------------------------*/
1651 #ifdef SUPPORT_UNICODE
      case OP_PROP_EXTRA + OP_TYPEQUERY:
      case OP_PROP_EXTRA + OP_TYPEMINQUERY:
      case OP_PROP_EXTRA + OP_TYPEPOSQUERY:

      /* Property test repeated by one of the QUERY or STAR forms. In this
      block count is not a repetition count: it is the distance added to
      state_offset for the state queued after a successful match. It is 4 for
      the QUERY forms (the same offset as the ADD_ACTIVE below) and 0 for the
      STAR forms (the same state again). */

      count = 4;
      goto QS1;

      case OP_PROP_EXTRA + OP_TYPESTAR:
      case OP_PROP_EXTRA + OP_TYPEMINSTAR:
      case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
      count = 0;

      QS1:

      /* The state at state_offset + 4 is always activated, whether or not the
      current character matches. */

      ADD_ACTIVE(state_offset + 4, 0);
      if (clen > 0)
        {
        BOOL OK;
        const uint32_t *cp;
        const ucd_record * prop = GET_UCD(c);

        /* code[2] selects the kind of property test and code[3] supplies the
        value it is compared with, where one is needed. */

        switch(code[2])
          {
          case PT_ANY:
          OK = TRUE;
          break;

          case PT_LAMP:
          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
            prop->chartype == ucp_Lt;
          break;

          case PT_GC:
          OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
          break;

          case PT_PC:
          OK = prop->chartype == code[3];
          break;

          case PT_SC:
          OK = prop->script == code[3];
          break;

          /* These are specials for combination cases. */

          case PT_ALNUM:
          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
               PRIV(ucp_gentype)[prop->chartype] == ucp_N;
          break;

          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
          which means that Perl space and POSIX space are now identical. PCRE
          was changed at release 8.34. */

          case PT_SPACE:    /* Perl space */
          case PT_PXSPACE:  /* POSIX space */
          switch(c)
            {
            HSPACE_CASES:
            VSPACE_CASES:
            OK = TRUE;
            break;

            default:
            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
            break;
            }
          break;

          case PT_WORD:
          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
               c == CHAR_UNDERSCORE;
          break;

          /* Scan the list that starts at index code[3] of ucd_caseless_sets.
          The loop stops at the first entry greater than c (no match) or equal
          to c (match). NOTE(review): this relies on each list being in
          ascending order and ending with a value greater than any c - confirm
          against the table definition. */

          case PT_CLIST:
          cp = PRIV(ucd_caseless_sets) + code[3];
          for (;;)
            {
            if (c < *cp) { OK = FALSE; break; }
            if (c == *cp++) { OK = TRUE; break; }
            }
          break;

          case PT_UCNC:
          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
               c >= 0xe000;
          break;

          /* Should never occur, but keep compilers from grumbling. */

          default:
          OK = codevalue != OP_PROP;
          break;
          }

        /* The new state is added when OK agrees with (d == OP_PROP), so the
        sense of the test is inverted when d is anything else. */

        if (OK == (d == OP_PROP))
          {
          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
              codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
            {
            active_count--;           /* Remove non-match possibility */
            next_active_state--;
            }
          ADD_NEW(state_offset + count, 0);
          }
        }
      break;
1760 
1761       /*-----------------------------------------------------------------*/
      case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
      case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
      case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:

      /* OP_EXTUNI repeated by one of the QUERY or STAR forms. In this block
      count is an offset, not a repetition count: 2 for the QUERY forms (the
      same offset as the ADD_ACTIVE below), 0 for the STAR forms (the same
      state again). */

      count = 2;
      goto QS2;

      case OP_EXTUNI_EXTRA + OP_TYPESTAR:
      case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
      case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
      count = 0;

      QS2:

      /* The state at state_offset + 2 is always activated. */

      ADD_ACTIVE(state_offset + 2, 0);
      if (clen > 0)
        {
        int ncount = 0;

        /* No match test is made here: whenever a character is available the
        possessive forms drop the entry just activated. */

        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
            codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
          {
          active_count--;           /* Remove non-match possibility */
          next_active_state--;
          }

        /* Only ncount is wanted from the call; it becomes the data of the new
        state, which is queued with a negated offset. */

        (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
          &ncount);
        ADD_NEW_DATA(-(state_offset + count), 0, ncount);
        }
      break;
1790 #endif
1791 
1792       /*-----------------------------------------------------------------*/
      case OP_ANYNL_EXTRA + OP_TYPEQUERY:
      case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
      case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:

      /* OP_ANYNL repeated by one of the QUERY or STAR forms. In this block
      count is an offset, not a repetition count: 2 for the QUERY forms, 0 for
      the STAR forms. */

      count = 2;
      goto QS3;

      case OP_ANYNL_EXTRA + OP_TYPESTAR:
      case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
      case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
      count = 0;

      QS3:
      /* The state at state_offset + 2 is always activated. */
      ADD_ACTIVE(state_offset + 2, 0);
      if (clen > 0)
        {
        int ncount = 0;
        switch (c)
          {
          /* These characters are rejected when mb->bsr_convention is
          PCRE2_BSR_ANYCRLF; otherwise they are handled like LF. */

          case CHAR_VT:
          case CHAR_FF:
          case CHAR_NEL:
#ifndef EBCDIC
          case 0x2028:
          case 0x2029:
#endif  /* Not EBCDIC */
          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
          goto ANYNL02;

          /* CR with an LF immediately after it: ncount = 1 is passed on as the
          data of the new state. */

          case CHAR_CR:
          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
          /* Fall through */

          ANYNL02:
          case CHAR_LF:
          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
              codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
            {
            active_count--;           /* Remove non-match possibility */
            next_active_state--;
            }
          ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
          break;

          /* Any other character: no new state is added. */

          default:
          break;
          }
        }
      break;
1841 
1842       /*-----------------------------------------------------------------*/
      case OP_VSPACE_EXTRA + OP_TYPEQUERY:
      case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
      case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:

      /* Vertical-space test repeated by one of the QUERY or STAR forms. In
      this block count is an offset, not a repetition count: 2 for the QUERY
      forms, 0 for the STAR forms. */

      count = 2;
      goto QS4;

      case OP_VSPACE_EXTRA + OP_TYPESTAR:
      case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
      case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
      count = 0;

      QS4:
      /* The state at state_offset + 2 is always activated. */
      ADD_ACTIVE(state_offset + 2, 0);
      if (clen > 0)
        {
        BOOL OK;

        /* OK records whether c is one of the characters labelled by
        VSPACE_CASES. */

        switch (c)
          {
          VSPACE_CASES:
          OK = TRUE;
          break;

          default:
          OK = FALSE;
          break;
          }

        /* Proceed when OK agrees with (d == OP_VSPACE); any other value of d
        inverts the sense of the test. */

        if (OK == (d == OP_VSPACE))
          {
          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
              codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
            {
            active_count--;           /* Remove non-match possibility */
            next_active_state--;
            }
          ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
          }
        }
      break;
1881 
1882       /*-----------------------------------------------------------------*/
      case OP_HSPACE_EXTRA + OP_TYPEQUERY:
      case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
      case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:

      /* Horizontal-space test repeated by one of the QUERY or STAR forms. In
      this block count is an offset, not a repetition count: 2 for the QUERY
      forms, 0 for the STAR forms. */

      count = 2;
      goto QS5;

      case OP_HSPACE_EXTRA + OP_TYPESTAR:
      case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
      case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
      count = 0;

      QS5:
      /* The state at state_offset + 2 is always activated. */
      ADD_ACTIVE(state_offset + 2, 0);
      if (clen > 0)
        {
        BOOL OK;

        /* OK records whether c is one of the characters labelled by
        HSPACE_CASES. */

        switch (c)
          {
          HSPACE_CASES:
          OK = TRUE;
          break;

          default:
          OK = FALSE;
          break;
          }

        /* Proceed when OK agrees with (d == OP_HSPACE); any other value of d
        inverts the sense of the test. */

        if (OK == (d == OP_HSPACE))
          {
          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
              codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
            {
            active_count--;           /* Remove non-match possibility */
            next_active_state--;
            }
          ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
          }
        }
      break;
1922 
1923       /*-----------------------------------------------------------------*/
1924 #ifdef SUPPORT_UNICODE
      case OP_PROP_EXTRA + OP_TYPEEXACT:
      case OP_PROP_EXTRA + OP_TYPEUPTO:
      case OP_PROP_EXTRA + OP_TYPEMINUPTO:
      case OP_PROP_EXTRA + OP_TYPEPOSUPTO:

      /* Property test with a repeat limit, read with GET2(code, 1). For every
      form except EXACT the state at state_offset + 1 + IMM2_SIZE + 3 is
      activated straight away, so stopping short of the limit is allowed; for
      EXACT that state is reached only when the limit is hit. */

      if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
        { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
      count = current_state->count;  /* Number already matched */
      if (clen > 0)
        {
        BOOL OK;
        const uint32_t *cp;
        const ucd_record * prop = GET_UCD(c);

        /* code[1 + IMM2_SIZE + 1] selects the kind of property test and
        code[1 + IMM2_SIZE + 2] supplies the value it is compared with, where
        one is needed. */

        switch(code[1 + IMM2_SIZE + 1])
          {
          case PT_ANY:
          OK = TRUE;
          break;

          case PT_LAMP:
          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
            prop->chartype == ucp_Lt;
          break;

          case PT_GC:
          OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
          break;

          case PT_PC:
          OK = prop->chartype == code[1 + IMM2_SIZE + 2];
          break;

          case PT_SC:
          OK = prop->script == code[1 + IMM2_SIZE + 2];
          break;

          /* These are specials for combination cases. */

          case PT_ALNUM:
          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
               PRIV(ucp_gentype)[prop->chartype] == ucp_N;
          break;

          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
          which means that Perl space and POSIX space are now identical. PCRE
          was changed at release 8.34. */

          case PT_SPACE:    /* Perl space */
          case PT_PXSPACE:  /* POSIX space */
          switch(c)
            {
            HSPACE_CASES:
            VSPACE_CASES:
            OK = TRUE;
            break;

            default:
            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
            break;
            }
          break;

          case PT_WORD:
          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
               c == CHAR_UNDERSCORE;
          break;

          /* Scan the list that starts at the given index of
          ucd_caseless_sets. The loop stops at the first entry greater than c
          (no match) or equal to c (match). NOTE(review): this relies on each
          list being in ascending order and ending with a value greater than
          any c - confirm against the table definition. */

          case PT_CLIST:
          cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
          for (;;)
            {
            if (c < *cp) { OK = FALSE; break; }
            if (c == *cp++) { OK = TRUE; break; }
            }
          break;

          case PT_UCNC:
          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
               c >= 0xe000;
          break;

          /* Should never occur, but keep compilers from grumbling. */

          default:
          OK = codevalue != OP_PROP;
          break;
          }

        /* Proceed when OK agrees with (d == OP_PROP); any other value of d
        inverts the sense of the test. */

        if (OK == (d == OP_PROP))
          {
          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
            {
            active_count--;           /* Remove non-match possibility */
            next_active_state--;
            }

          /* On reaching the limit, move to the following state with a zero
          count; otherwise stay on this state with the incremented count. */

          if (++count >= (int)GET2(code, 1))
            { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
          else
            { ADD_NEW(state_offset, count); }
          }
        }
      break;
2028 
2029       /*-----------------------------------------------------------------*/
      case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
      case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
      case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
      case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:

      /* OP_EXTUNI with a repeat limit, read with GET2(code, 1). For every
      form except EXACT the state at state_offset + 2 + IMM2_SIZE is activated
      straight away, so stopping short of the limit is allowed. */

      if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
      count = current_state->count;  /* Number already matched */
      if (clen > 0)
        {
        PCRE2_SPTR nptr;
        int ncount = 0;
        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
          {
          active_count--;           /* Remove non-match possibility */
          next_active_state--;
          }
        nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
          &ncount);

        /* If the pointer returned by extuni() is at or beyond the end of the
        subject and PCRE2_PARTIAL_HARD is set, raise reset_could_continue. */

        if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
            reset_could_continue = TRUE;

        /* On reaching the limit, move to the following state with a zero
        count; otherwise stay on this state with the incremented count. Both
        are queued with a negated offset and ncount as data. */

        if (++count >= (int)GET2(code, 1))
          { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
        else
          { ADD_NEW_DATA(-state_offset, count, ncount); }
        }
      break;
2056 #endif
2057 
2058       /*-----------------------------------------------------------------*/
      case OP_ANYNL_EXTRA + OP_TYPEEXACT:
      case OP_ANYNL_EXTRA + OP_TYPEUPTO:
      case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
      case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:

      /* OP_ANYNL with a repeat limit, read with GET2(code, 1). For every form
      except EXACT the state at state_offset + 2 + IMM2_SIZE is activated
      straight away, so stopping short of the limit is allowed. */

      if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
      count = current_state->count;  /* Number already matched */
      if (clen > 0)
        {
        int ncount = 0;
        switch (c)
          {
          /* These characters are rejected when mb->bsr_convention is
          PCRE2_BSR_ANYCRLF; otherwise they are handled like LF. */

          case CHAR_VT:
          case CHAR_FF:
          case CHAR_NEL:
#ifndef EBCDIC
          case 0x2028:
          case 0x2029:
#endif  /* Not EBCDIC */
          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
          goto ANYNL03;

          /* CR with an LF immediately after it: ncount = 1 is passed on as the
          data of the new state. */

          case CHAR_CR:
          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
          /* Fall through */

          ANYNL03:
          case CHAR_LF:
          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
            {
            active_count--;           /* Remove non-match possibility */
            next_active_state--;
            }

          /* On reaching the limit, move to the following state with a zero
          count; otherwise stay on this state with the incremented count. */

          if (++count >= (int)GET2(code, 1))
            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
          else
            { ADD_NEW_DATA(-state_offset, count, ncount); }
          break;

          /* Any other character: no new state is added. */

          default:
          break;
          }
        }
      break;
2103 
2104       /*-----------------------------------------------------------------*/
2105       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2106       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2107       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2108       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2109       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2110         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2111       count = current_state->count;  /* Number already matched */
2112       if (clen > 0)
2113         {
2114         BOOL OK;
2115         switch (c)
2116           {
2117           VSPACE_CASES:
2118           OK = TRUE;
2119           break;
2120 
2121           default:
2122           OK = FALSE;
2123           }
2124 
2125         if (OK == (d == OP_VSPACE))
2126           {
2127           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2128             {
2129             active_count--;           /* Remove non-match possibility */
2130             next_active_state--;
2131             }
2132           if (++count >= (int)GET2(code, 1))
2133             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2134           else
2135             { ADD_NEW_DATA(-state_offset, count, 0); }
2136           }
2137         }
2138       break;
2139 
2140       /*-----------------------------------------------------------------*/
      case OP_HSPACE_EXTRA + OP_TYPEEXACT:
      case OP_HSPACE_EXTRA + OP_TYPEUPTO:
      case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
      case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:

      /* Horizontal-space test with a repeat limit, read with GET2(code, 1).
      For every form except EXACT the state at state_offset + 2 + IMM2_SIZE is
      activated straight away, so stopping short of the limit is allowed. */

      if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
      count = current_state->count;  /* Number already matched */
      if (clen > 0)
        {
        BOOL OK;

        /* OK records whether c is one of the characters labelled by
        HSPACE_CASES. */

        switch (c)
          {
          HSPACE_CASES:
          OK = TRUE;
          break;

          default:
          OK = FALSE;
          break;
          }

        /* Proceed when OK agrees with (d == OP_HSPACE); any other value of d
        inverts the sense of the test. */

        if (OK == (d == OP_HSPACE))
          {
          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
            {
            active_count--;           /* Remove non-match possibility */
            next_active_state--;
            }

          /* On reaching the limit, move to the following state with a zero
          count; otherwise stay on this state with the incremented count. */

          if (++count >= (int)GET2(code, 1))
            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
          else
            { ADD_NEW_DATA(-state_offset, count, 0); }
          }
        }
      break;
2176 
2177 /* ========================================================================== */
2178       /* These opcodes are followed by a character that is usually compared
2179       to the current subject character; it is loaded into d. We still get
2180       here even if there is no subject character, because in some cases zero
2181       repetitions are permitted. */
2182 
2183       /*-----------------------------------------------------------------*/
      case OP_CHAR:
      /* Caseful single character: when a subject character exists and equals
      d, queue the state dlen + 1 further on. NOTE(review): dlen is set
      outside this block; presumably it is the encoded length of d. */
      if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
      break;
2187 
2188       /*-----------------------------------------------------------------*/
      case OP_CHARI:
      /* Caseless single character. Nothing can match without a subject
      character. */
      if (clen == 0) break;

#ifdef SUPPORT_UNICODE
      if (utf)
        {
        /* Try an exact match first; failing that, compare d with the other
        case of the subject character, taken from fcc[] for values below 128
        and from UCD_OTHERCASE() otherwise. */

        if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
          {
          unsigned int othercase;
          if (c < 128)
            othercase = fcc[c];
          else
            othercase = UCD_OTHERCASE(c);
          if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
          }
        }
      else
#endif  /* SUPPORT_UNICODE */
      /* Not UTF mode */
        {
        /* Both characters are mapped through lcc and then compared. The next
        state is given as state_offset + 2 rather than dlen + 1.
        NOTE(review): presumably dlen is always 1 outside UTF mode, which
        makes the two equal - confirm where dlen is set. */

        if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
          { ADD_NEW(state_offset + 2, 0); }
        }
      break;
2213 
2214 
2215 #ifdef SUPPORT_UNICODE
2216       /*-----------------------------------------------------------------*/
2217       /* This is a tricky one because it can match more than one character.
2218       Find out how many characters to skip, and then set up a negative state
2219       to wait for them to pass before continuing. */
2220 
      case OP_EXTUNI:
      if (clen > 0)
        {
        int ncount = 0;

        /* extuni() is given the current character and the position after it;
        it returns a pointer and fills in ncount. */

        PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
          end_subject, utf, &ncount);

        /* If the returned pointer is at or beyond the end of the subject and
        PCRE2_PARTIAL_HARD is set, raise reset_could_continue. */

        if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
            reset_could_continue = TRUE;

        /* Queue the following state (state_offset + 1) with a negated offset
        and ncount as its data. */

        ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
        }
      break;
2232 #endif
2233 
2234       /*-----------------------------------------------------------------*/
      /* This is tricky, like EXTUNI, because it too can match more than one
      character (when CR is followed by LF). In this case, set up a negative
      state to wait for one character to pass before continuing. */
2238 
      case OP_ANYNL:
      if (clen > 0) switch(c)
        {
        /* These characters are rejected when mb->bsr_convention is
        PCRE2_BSR_ANYCRLF; otherwise they are handled like LF. */

        case CHAR_VT:
        case CHAR_FF:
        case CHAR_NEL:
#ifndef EBCDIC
        case 0x2028:
        case 0x2029:
#endif  /* Not EBCDIC */
        if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
        /* Fall through */

        case CHAR_LF:
        ADD_NEW(state_offset + 1, 0);
        break;

        /* CR has three outcomes. At the very end of the subject it matches on
        its own, and reset_could_continue is raised if PCRE2_PARTIAL_HARD is
        set. Followed by LF, the pair is taken together by queueing a
        negative state with a data value of 1. Followed by anything else, it
        matches on its own. */

        case CHAR_CR:
        if (ptr + 1 >= end_subject)
          {
          ADD_NEW(state_offset + 1, 0);
          if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
            reset_could_continue = TRUE;
          }
        else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
          {
          ADD_NEW_DATA(-(state_offset + 1), 0, 1);
          }
        else
          {
          ADD_NEW(state_offset + 1, 0);
          }
        break;
        }
      break;
2274 
2275       /*-----------------------------------------------------------------*/
      case OP_NOT_VSPACE:
      /* Advance to the next state for any subject character that is not
      among the VSPACE_CASES labels. */
      if (clen > 0) switch(c)
        {
        VSPACE_CASES:
        break;

        default:
        ADD_NEW(state_offset + 1, 0);
        break;
        }
      break;
2287 
2288       /*-----------------------------------------------------------------*/
      case OP_VSPACE:
      /* Advance to the next state only for a subject character that is among
      the VSPACE_CASES labels. */
      if (clen > 0) switch(c)
        {
        VSPACE_CASES:
        ADD_NEW(state_offset + 1, 0);
        break;

        default:
        break;
        }
      break;
2300 
2301       /*-----------------------------------------------------------------*/
      case OP_NOT_HSPACE:
      /* Advance to the next state for any subject character that is not
      among the HSPACE_CASES labels. */
      if (clen > 0) switch(c)
        {
        HSPACE_CASES:
        break;

        default:
        ADD_NEW(state_offset + 1, 0);
        break;
        }
      break;
2313 
2314       /*-----------------------------------------------------------------*/
      case OP_HSPACE:
      /* Advance to the next state only for a subject character that is among
      the HSPACE_CASES labels. */
      if (clen > 0) switch(c)
        {
        HSPACE_CASES:
        ADD_NEW(state_offset + 1, 0);
        break;

        default:
        break;
        }
      break;
2326 
2327       /*-----------------------------------------------------------------*/
2328       /* Match a negated single character casefully. */
2329 
      case OP_NOT:
      /* When a subject character exists and differs from d, queue the state
      dlen + 1 further on. */
      if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
      break;
2333 
2334       /*-----------------------------------------------------------------*/
2335       /* Match a negated single character caselessly. */
2336 
      case OP_NOTI:
      if (clen > 0)
        {
        /* otherd is the other case of the pattern character d: taken from
        UCD_OTHERCASE() in UTF mode for values of 128 or more, otherwise
        looked up through fcc (with d itself as the third TABLE_GET
        argument). */

        uint32_t otherd;
#ifdef SUPPORT_UNICODE
        if (utf && d >= 128)
          otherd = UCD_OTHERCASE(d);
        else
#endif  /* SUPPORT_UNICODE */
        otherd = TABLE_GET(d, fcc, d);

        /* The subject character must differ from both cases of d. */

        if (c != d && c != otherd)
          { ADD_NEW(state_offset + dlen + 1, 0); }
        }
      break;
2351 
2352       /*-----------------------------------------------------------------*/
      case OP_PLUSI:
      case OP_MINPLUSI:
      case OP_POSPLUSI:
      case OP_NOTPLUSI:
      case OP_NOTMINPLUSI:
      case OP_NOTPOSPLUSI:

      /* The caseless opcodes set the caseless flag and shift codevalue by the
      distance between OP_STARI and OP_STAR, so that the tests below can be
      written against the caseful opcode names only. */

      caseless = TRUE;
      codevalue -= OP_STARI - OP_STAR;

      /* Fall through */
      case OP_PLUS:
      case OP_MINPLUS:
      case OP_POSPLUS:
      case OP_NOTPLUS:
      case OP_NOTMINPLUS:
      case OP_NOTPOSPLUS:

      /* Single character (or negated character) repeated by a PLUS form. The
      state at state_offset + dlen + 1 is activated only once count is
      non-zero. */

      count = current_state->count;  /* Already matched */
      if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
      if (clen > 0)
        {
        /* otherd stays NOTACHAR unless matching is caseless, in which case it
        becomes the other case of d. */

        uint32_t otherd = NOTACHAR;
        if (caseless)
          {
#ifdef SUPPORT_UNICODE
          if (utf && d >= 128)
            otherd = UCD_OTHERCASE(d);
          else
#endif  /* SUPPORT_UNICODE */
          otherd = TABLE_GET(d, fcc, d);
          }

        /* "c matches d" must agree with (codevalue < OP_NOTSTAR).
        NOTE(review): this presumably relies on every negated repeat opcode
        being numbered at or above OP_NOTSTAR - confirm against the opcode
        table. */

        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
          {
          if (count > 0 &&
              (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
            {
            active_count--;             /* Remove non-match possibility */
            next_active_state--;
            }
          count++;
          ADD_NEW(state_offset, count);
          }
        }
      break;
2396 
2397       /*-----------------------------------------------------------------*/
      case OP_QUERYI:
      case OP_MINQUERYI:
      case OP_POSQUERYI:
      case OP_NOTQUERYI:
      case OP_NOTMINQUERYI:
      case OP_NOTPOSQUERYI:
      /* Caseless opcodes: set the flag and shift codevalue onto the caseful
      opcode range so that the tests below use the caseful names only. */
      caseless = TRUE;
      codevalue -= OP_STARI - OP_STAR;
      /* Fall through */
      case OP_QUERY:
      case OP_MINQUERY:
      case OP_POSQUERY:
      case OP_NOTQUERY:
      case OP_NOTMINQUERY:
      case OP_NOTPOSQUERY:

      /* Single character (or negated character) repeated by a QUERY form. The
      state at state_offset + dlen + 1 is always activated, and the same
      state is queued again after a successful match. */

      ADD_ACTIVE(state_offset + dlen + 1, 0);
      if (clen > 0)
        {
        /* otherd stays NOTACHAR unless matching is caseless, in which case it
        becomes the other case of d. */

        uint32_t otherd = NOTACHAR;
        if (caseless)
          {
#ifdef SUPPORT_UNICODE
          if (utf && d >= 128)
            otherd = UCD_OTHERCASE(d);
          else
#endif  /* SUPPORT_UNICODE */
          otherd = TABLE_GET(d, fcc, d);
          }

        /* "c matches d" must agree with (codevalue < OP_NOTSTAR), which
        inverts the test for the negated opcodes. */

        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
          {
          if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
            {
            active_count--;            /* Remove non-match possibility */
            next_active_state--;
            }
          ADD_NEW(state_offset + dlen + 1, 0);
          }
        }
      break;
2437 
2438       /*-----------------------------------------------------------------*/
      case OP_STARI:
      case OP_MINSTARI:
      case OP_POSSTARI:
      case OP_NOTSTARI:
      case OP_NOTMINSTARI:
      case OP_NOTPOSSTARI:
      /* Caseless opcodes: set the flag and shift codevalue onto the caseful
      opcode range so that the tests below use the caseful names only. */
      caseless = TRUE;
      codevalue -= OP_STARI - OP_STAR;
      /* Fall through */
      case OP_STAR:
      case OP_MINSTAR:
      case OP_POSSTAR:
      case OP_NOTSTAR:
      case OP_NOTMINSTAR:
      case OP_NOTPOSSTAR:

      /* Single character (or negated character) repeated by a STAR form. The
      state at state_offset + dlen + 1 is always activated; after a
      successful match this same state (state_offset) is queued again. */

      ADD_ACTIVE(state_offset + dlen + 1, 0);
      if (clen > 0)
        {
        /* otherd stays NOTACHAR unless matching is caseless, in which case it
        becomes the other case of d. */

        uint32_t otherd = NOTACHAR;
        if (caseless)
          {
#ifdef SUPPORT_UNICODE
          if (utf && d >= 128)
            otherd = UCD_OTHERCASE(d);
          else
#endif  /* SUPPORT_UNICODE */
          otherd = TABLE_GET(d, fcc, d);
          }

        /* "c matches d" must agree with (codevalue < OP_NOTSTAR), which
        inverts the test for the negated opcodes. */

        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
          {
          if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
            {
            active_count--;            /* Remove non-match possibility */
            next_active_state--;
            }
          ADD_NEW(state_offset, 0);
          }
        }
      break;
2478 
2479       /*-----------------------------------------------------------------*/
2480       case OP_EXACTI:
2481       case OP_NOTEXACTI:
2482       caseless = TRUE;
2483       codevalue -= OP_STARI - OP_STAR;
2484       /* Fall through */
2485       case OP_EXACT:
2486       case OP_NOTEXACT:
2487       count = current_state->count;  /* Number already matched */
2488       if (clen > 0)
2489         {
2490         uint32_t otherd = NOTACHAR;
2491         if (caseless)
2492           {
2493 #ifdef SUPPORT_UNICODE
2494           if (utf && d >= 128)
2495             otherd = UCD_OTHERCASE(d);
2496           else
2497 #endif  /* SUPPORT_UNICODE */
2498           otherd = TABLE_GET(d, fcc, d);
2499           }
2500         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2501           {
2502           if (++count >= (int)GET2(code, 1))
2503             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2504           else
2505             { ADD_NEW(state_offset, count); }
2506           }
2507         }
2508       break;
2509 
2510       /*-----------------------------------------------------------------*/
2511       case OP_UPTOI:
2512       case OP_MINUPTOI:
2513       case OP_POSUPTOI:
2514       case OP_NOTUPTOI:
2515       case OP_NOTMINUPTOI:
2516       case OP_NOTPOSUPTOI:
2517       caseless = TRUE;
2518       codevalue -= OP_STARI - OP_STAR;
2519       /* Fall through */
2520       case OP_UPTO:
2521       case OP_MINUPTO:
2522       case OP_POSUPTO:
2523       case OP_NOTUPTO:
2524       case OP_NOTMINUPTO:
2525       case OP_NOTPOSUPTO:
2526       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2527       count = current_state->count;  /* Number already matched */
2528       if (clen > 0)
2529         {
2530         uint32_t otherd = NOTACHAR;
2531         if (caseless)
2532           {
2533 #ifdef SUPPORT_UNICODE
2534           if (utf && d >= 128)
2535             otherd = UCD_OTHERCASE(d);
2536           else
2537 #endif  /* SUPPORT_UNICODE */
2538           otherd = TABLE_GET(d, fcc, d);
2539           }
2540         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2541           {
2542           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2543             {
2544             active_count--;             /* Remove non-match possibility */
2545             next_active_state--;
2546             }
2547           if (++count >= (int)GET2(code, 1))
2548             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2549           else
2550             { ADD_NEW(state_offset, count); }
2551           }
2552         }
2553       break;
2554 
2555 
2556 /* ========================================================================== */
2557       /* These are the class-handling opcodes */
2558 
2559       case OP_CLASS:
2560       case OP_NCLASS:
2561       case OP_XCLASS:
2562         {
2563         BOOL isinclass = FALSE;
2564         int next_state_offset;
2565         PCRE2_SPTR ecode;
2566 
2567         /* For a simple class, there is always just a 32-byte table, and we
2568         can set isinclass from it. */
2569 
2570         if (codevalue != OP_XCLASS)
2571           {
2572           ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2573           if (clen > 0)
2574             {
2575             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2576               ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2577             }
2578           }
2579 
2580         /* An extended class may have a table or a list of single characters,
2581         ranges, or both, and it may be positive or negative. There's a
2582         function that sorts all this out. */
2583 
2584         else
2585          {
2586          ecode = code + GET(code, 1);
2587          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2588          }
2589 
2590         /* At this point, isinclass is set for all kinds of class, and ecode
2591         points to the byte after the end of the class. If there is a
2592         quantifier, this is where it will be. */
2593 
2594         next_state_offset = (int)(ecode - start_code);
2595 
2596         switch (*ecode)
2597           {
2598           case OP_CRSTAR:
2599           case OP_CRMINSTAR:
2600           case OP_CRPOSSTAR:
2601           ADD_ACTIVE(next_state_offset + 1, 0);
2602           if (isinclass)
2603             {
2604             if (*ecode == OP_CRPOSSTAR)
2605               {
2606               active_count--;           /* Remove non-match possibility */
2607               next_active_state--;
2608               }
2609             ADD_NEW(state_offset, 0);
2610             }
2611           break;
2612 
2613           case OP_CRPLUS:
2614           case OP_CRMINPLUS:
2615           case OP_CRPOSPLUS:
2616           count = current_state->count;  /* Already matched */
2617           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2618           if (isinclass)
2619             {
2620             if (count > 0 && *ecode == OP_CRPOSPLUS)
2621               {
2622               active_count--;           /* Remove non-match possibility */
2623               next_active_state--;
2624               }
2625             count++;
2626             ADD_NEW(state_offset, count);
2627             }
2628           break;
2629 
2630           case OP_CRQUERY:
2631           case OP_CRMINQUERY:
2632           case OP_CRPOSQUERY:
2633           ADD_ACTIVE(next_state_offset + 1, 0);
2634           if (isinclass)
2635             {
2636             if (*ecode == OP_CRPOSQUERY)
2637               {
2638               active_count--;           /* Remove non-match possibility */
2639               next_active_state--;
2640               }
2641             ADD_NEW(next_state_offset + 1, 0);
2642             }
2643           break;
2644 
2645           case OP_CRRANGE:
2646           case OP_CRMINRANGE:
2647           case OP_CRPOSRANGE:
2648           count = current_state->count;  /* Already matched */
2649           if (count >= (int)GET2(ecode, 1))
2650             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2651           if (isinclass)
2652             {
2653             int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2654 
2655             if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2656               {
2657               active_count--;           /* Remove non-match possibility */
2658               next_active_state--;
2659               }
2660 
2661             if (++count >= max && max != 0)   /* Max 0 => no limit */
2662               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2663             else
2664               { ADD_NEW(state_offset, count); }
2665             }
2666           break;
2667 
2668           default:
2669           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2670           break;
2671           }
2672         }
2673       break;
2674 
2675 /* ========================================================================== */
2676       /* These are the opcodes for fancy brackets of various kinds. We have
2677       to use recursion in order to handle them. The "always failing" assertion
2678       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2679       though the other "backtracking verbs" are not supported. */
2680 
2681       case OP_FAIL:
2682       forced_fail++;    /* Count FAILs for multiple states */
2683       break;
2684 
2685       case OP_ASSERT:
2686       case OP_ASSERT_NOT:
2687       case OP_ASSERTBACK:
2688       case OP_ASSERTBACK_NOT:
2689         {
2690         int rc;
2691         int *local_workspace;
2692         PCRE2_SIZE *local_offsets;
2693         PCRE2_SPTR endasscode = code + GET(code, 1);
2694         RWS_anchor *rws = (RWS_anchor *)RWS;
2695 
2696         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2697           {
2698           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2699           if (rc != 0) return rc;
2700           RWS = (int *)rws;
2701           }
2702 
2703         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2704         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2705         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2706 
2707         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2708 
2709         rc = internal_dfa_match(
2710           mb,                                   /* static match data */
2711           code,                                 /* this subexpression's code */
2712           ptr,                                  /* where we currently are */
2713           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2714           local_offsets,                        /* offset vector */
2715           RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2716           local_workspace,                      /* workspace vector */
2717           RWS_RSIZE,                            /* size of same */
2718           rlevel,                               /* function recursion level */
2719           RWS);                                 /* recursion workspace */
2720 
2721         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2722 
2723         if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2724         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2725             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2726         }
2727       break;
2728 
2729       /*-----------------------------------------------------------------*/
2730       case OP_COND:
2731       case OP_SCOND:
2732         {
2733         int codelink = (int)GET(code, 1);
2734         PCRE2_UCHAR condcode;
2735 
2736         /* Because of the way auto-callout works during compile, a callout item
2737         is inserted between OP_COND and an assertion condition. This does not
2738         happen for the other conditions. */
2739 
2740         if (code[LINK_SIZE + 1] == OP_CALLOUT
2741             || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2742           {
2743           PCRE2_SIZE callout_length;
2744           rrc = do_callout(code, offsets, current_subject, ptr, mb,
2745             1 + LINK_SIZE, &callout_length);
2746           if (rrc < 0) return rrc;                 /* Abandon */
2747           if (rrc > 0) break;                      /* Fail this thread */
2748           code += callout_length;                  /* Skip callout data */
2749           }
2750 
2751         condcode = code[LINK_SIZE+1];
2752 
2753         /* Back reference conditions and duplicate named recursion conditions
2754         are not supported */
2755 
2756         if (condcode == OP_CREF || condcode == OP_DNCREF ||
2757             condcode == OP_DNRREF)
2758           return PCRE2_ERROR_DFA_UCOND;
2759 
2760         /* The DEFINE condition is always false, and the assertion (?!) is
2761         converted to OP_FAIL. */
2762 
2763         if (condcode == OP_FALSE || condcode == OP_FAIL)
2764           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2765 
2766         /* There is also an always-true condition */
2767 
2768         else if (condcode == OP_TRUE)
2769           { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2770 
2771         /* The only supported version of OP_RREF is for the value RREF_ANY,
2772         which means "test if in any recursion". We can't test for specifically
2773         recursed groups. */
2774 
2775         else if (condcode == OP_RREF)
2776           {
2777           unsigned int value = GET2(code, LINK_SIZE + 2);
2778           if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2779           if (mb->recursive != NULL)
2780             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2781           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2782           }
2783 
2784         /* Otherwise, the condition is an assertion */
2785 
2786         else
2787           {
2788           int rc;
2789           int *local_workspace;
2790           PCRE2_SIZE *local_offsets;
2791           PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2792           PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2793           RWS_anchor *rws = (RWS_anchor *)RWS;
2794 
2795           if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2796             {
2797             rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2798             if (rc != 0) return rc;
2799             RWS = (int *)rws;
2800             }
2801 
2802           local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2803           local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2804           rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2805 
2806           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2807 
2808           rc = internal_dfa_match(
2809             mb,                                   /* fixed match data */
2810             asscode,                              /* this subexpression's code */
2811             ptr,                                  /* where we currently are */
2812             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2813             local_offsets,                        /* offset vector */
2814             RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2815             local_workspace,                      /* workspace vector */
2816             RWS_RSIZE,                            /* size of same */
2817             rlevel,                               /* function recursion level */
2818             RWS);                                 /* recursion workspace */
2819 
2820           rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2821 
2822           if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2823           if ((rc >= 0) ==
2824                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2825             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2826           else
2827             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2828           }
2829         }
2830       break;
2831 
2832       /*-----------------------------------------------------------------*/
2833       case OP_RECURSE:
2834         {
2835         int rc;
2836         int *local_workspace;
2837         PCRE2_SIZE *local_offsets;
2838         RWS_anchor *rws = (RWS_anchor *)RWS;
2839         dfa_recursion_info *ri;
2840         PCRE2_SPTR callpat = start_code + GET(code, 1);
2841         uint32_t recno = (callpat == mb->start_code)? 0 :
2842           GET2(callpat, 1 + LINK_SIZE);
2843 
2844         if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2845           {
2846           rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2847           if (rc != 0) return rc;
2848           RWS = (int *)rws;
2849           }
2850 
2851         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2852         local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2853         rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2854 
2855         /* Check for repeating a recursion without advancing the subject
2856         pointer. This should catch convoluted mutual recursions. (Some simple
2857         cases are caught at compile time.) */
2858 
2859         for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2860           if (recno == ri->group_num && ptr == ri->subject_position)
2861             return PCRE2_ERROR_RECURSELOOP;
2862 
2863         /* Remember this recursion and where we started it so as to
2864         catch infinite loops. */
2865 
2866         new_recursive.group_num = recno;
2867         new_recursive.subject_position = ptr;
2868         new_recursive.prevrec = mb->recursive;
2869         mb->recursive = &new_recursive;
2870 
2871         rc = internal_dfa_match(
2872           mb,                                   /* fixed match data */
2873           callpat,                              /* this subexpression's code */
2874           ptr,                                  /* where we currently are */
2875           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2876           local_offsets,                        /* offset vector */
2877           RWS_OVEC_RSIZE/OVEC_UNIT,             /* size of same */
2878           local_workspace,                      /* workspace vector */
2879           RWS_RSIZE,                            /* size of same */
2880           rlevel,                               /* function recursion level */
2881           RWS);                                 /* recursion workspace */
2882 
2883         rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2884         mb->recursive = new_recursive.prevrec;  /* Done this recursion */
2885 
2886         /* Ran out of internal offsets */
2887 
2888         if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2889 
2890         /* For each successful matched substring, set up the next state with a
2891         count of characters to skip before trying it. Note that the count is in
2892         characters, not bytes. */
2893 
2894         if (rc > 0)
2895           {
2896           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2897             {
2898             PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2899 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2900             if (utf)
2901               {
2902               PCRE2_SPTR p = start_subject + local_offsets[rc];
2903               PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2904               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2905               }
2906 #endif
2907             if (charcount > 0)
2908               {
2909               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2910                 (int)(charcount - 1));
2911               }
2912             else
2913               {
2914               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2915               }
2916             }
2917           }
2918         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2919         }
2920       break;
2921 
2922       /*-----------------------------------------------------------------*/
2923       case OP_BRAPOS:
2924       case OP_SBRAPOS:
2925       case OP_CBRAPOS:
2926       case OP_SCBRAPOS:
2927       case OP_BRAPOSZERO:
2928         {
2929         int rc;
2930         int *local_workspace;
2931         PCRE2_SIZE *local_offsets;
2932         PCRE2_SIZE charcount, matched_count;
2933         PCRE2_SPTR local_ptr = ptr;
2934         RWS_anchor *rws = (RWS_anchor *)RWS;
2935         BOOL allow_zero;
2936 
2937         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2938           {
2939           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2940           if (rc != 0) return rc;
2941           RWS = (int *)rws;
2942           }
2943 
2944         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2945         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2946         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2947 
2948         if (codevalue == OP_BRAPOSZERO)
2949           {
2950           allow_zero = TRUE;
2951           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2952           }
2953         else allow_zero = FALSE;
2954 
2955         /* Loop to match the subpattern as many times as possible as if it were
2956         a complete pattern. */
2957 
2958         for (matched_count = 0;; matched_count++)
2959           {
2960           rc = internal_dfa_match(
2961             mb,                                   /* fixed match data */
2962             code,                                 /* this subexpression's code */
2963             local_ptr,                            /* where we currently are */
2964             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2965             local_offsets,                        /* offset vector */
2966             RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2967             local_workspace,                      /* workspace vector */
2968             RWS_RSIZE,                            /* size of same */
2969             rlevel,                               /* function recursion level */
2970             RWS);                                 /* recursion workspace */
2971 
2972           /* Failed to match */
2973 
2974           if (rc < 0)
2975             {
2976             if (rc != PCRE2_ERROR_NOMATCH) return rc;
2977             break;
2978             }
2979 
2980           /* Matched: break the loop if zero characters matched. */
2981 
2982           charcount = local_offsets[1] - local_offsets[0];
2983           if (charcount == 0) break;
2984           local_ptr += charcount;    /* Advance temporary position ptr */
2985           }
2986 
2987         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2988 
2989         /* At this point we have matched the subpattern matched_count
2990         times, and local_ptr is pointing to the character after the end of the
2991         last match. */
2992 
2993         if (matched_count > 0 || allow_zero)
2994           {
2995           PCRE2_SPTR end_subpattern = code;
2996           int next_state_offset;
2997 
2998           do { end_subpattern += GET(end_subpattern, 1); }
2999             while (*end_subpattern == OP_ALT);
3000           next_state_offset =
3001             (int)(end_subpattern - start_code + LINK_SIZE + 1);
3002 
3003           /* Optimization: if there are no more active states, and there
3004           are no new states yet set up, then skip over the subject string
3005           right here, to save looping. Otherwise, set up the new state to swing
3006           into action when the end of the matched substring is reached. */
3007 
3008           if (i + 1 >= active_count && new_count == 0)
3009             {
3010             ptr = local_ptr;
3011             clen = 0;
3012             ADD_NEW(next_state_offset, 0);
3013             }
3014           else
3015             {
3016             PCRE2_SPTR p = ptr;
3017             PCRE2_SPTR pp = local_ptr;
3018             charcount = (PCRE2_SIZE)(pp - p);
3019 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3020             if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3021 #endif
3022             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3023             }
3024           }
3025         }
3026       break;
3027 
3028       /*-----------------------------------------------------------------*/
3029       case OP_ONCE:
3030         {
3031         int rc;
3032         int *local_workspace;
3033         PCRE2_SIZE *local_offsets;
3034         RWS_anchor *rws = (RWS_anchor *)RWS;
3035 
3036         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3037           {
3038           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3039           if (rc != 0) return rc;
3040           RWS = (int *)rws;
3041           }
3042 
3043         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3044         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3045         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3046 
3047         rc = internal_dfa_match(
3048           mb,                                   /* fixed match data */
3049           code,                                 /* this subexpression's code */
3050           ptr,                                  /* where we currently are */
3051           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3052           local_offsets,                        /* offset vector */
3053           RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3054           local_workspace,                      /* workspace vector */
3055           RWS_RSIZE,                            /* size of same */
3056           rlevel,                               /* function recursion level */
3057           RWS);                                 /* recursion workspace */
3058 
3059         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3060 
3061         if (rc >= 0)
3062           {
3063           PCRE2_SPTR end_subpattern = code;
3064           PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3065           int next_state_offset, repeat_state_offset;
3066 
3067           do { end_subpattern += GET(end_subpattern, 1); }
3068             while (*end_subpattern == OP_ALT);
3069           next_state_offset =
3070             (int)(end_subpattern - start_code + LINK_SIZE + 1);
3071 
3072           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3073           arrange for the repeat state also to be added to the relevant list.
3074           Calculate the offset, or set -1 for no repeat. */
3075 
3076           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3077                                  *end_subpattern == OP_KETRMIN)?
3078             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3079 
3080           /* If we have matched an empty string, add the next state at the
3081           current character pointer. This is important so that the duplicate
3082           checking kicks in, which is what breaks infinite loops that match an
3083           empty string. */
3084 
3085           if (charcount == 0)
3086             {
3087             ADD_ACTIVE(next_state_offset, 0);
3088             }
3089 
3090           /* Optimization: if there are no more active states, and there
3091           are no new states yet set up, then skip over the subject string
3092           right here, to save looping. Otherwise, set up the new state to swing
3093           into action when the end of the matched substring is reached. */
3094 
3095           else if (i + 1 >= active_count && new_count == 0)
3096             {
3097             ptr += charcount;
3098             clen = 0;
3099             ADD_NEW(next_state_offset, 0);
3100 
3101             /* If we are adding a repeat state at the new character position,
3102             we must fudge things so that it is the only current state.
3103             Otherwise, it might be a duplicate of one we processed before, and
3104             that would cause it to be skipped. */
3105 
3106             if (repeat_state_offset >= 0)
3107               {
3108               next_active_state = active_states;
3109               active_count = 0;
3110               i = -1;
3111               ADD_ACTIVE(repeat_state_offset, 0);
3112               }
3113             }
3114           else
3115             {
3116 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3117             if (utf)
3118               {
3119               PCRE2_SPTR p = start_subject + local_offsets[0];
3120               PCRE2_SPTR pp = start_subject + local_offsets[1];
3121               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3122               }
3123 #endif
3124             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3125             if (repeat_state_offset >= 0)
3126               { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3127             }
3128           }
3129         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3130         }
3131       break;
3132 
3133 
3134 /* ========================================================================== */
3135       /* Handle callouts */
3136 
3137       case OP_CALLOUT:
3138       case OP_CALLOUT_STR:
3139         {
3140         PCRE2_SIZE callout_length;
3141         rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
3142           &callout_length);
3143         if (rrc < 0) return rrc;   /* Abandon */
3144         if (rrc == 0)
3145           { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3146         }
3147       break;
3148 
3149 
3150 /* ========================================================================== */
3151       default:        /* Unsupported opcode */
3152       return PCRE2_ERROR_DFA_UITEM;
3153       }
3154 
3155     NEXT_ACTIVE_STATE: continue;
3156 
3157     }      /* End of loop scanning active states */
3158 
3159   /* We have finished the processing at the current subject character. If no
3160   new states have been set for the next character, we have found all the
3161   matches that we are going to find. If partial matching has been requested,
3162   check for appropriate conditions.
3163 
3164   The "forced_fail" variable counts the number of (*F) encountered for the
3165   character. If it is equal to the original active_count (saved in
3166   workspace[1]) it means that (*F) was found on every active state. In this
3167   case we don't want to give a partial match.
3168 
3169   The "could_continue" variable is true if a state could have continued but
3170   for the fact that the end of the subject was reached. */
3171 
3172   if (new_count <= 0)
3173     {
3174     if (could_continue &&                            /* Some could go on, and */
3175         forced_fail != workspace[1] &&               /* Not all forced fail & */
3176         (                                            /* either... */
3177         (mb->moptions & PCRE2_PARTIAL_HARD) != 0      /* Hard partial */
3178         ||                                           /* or... */
3179         ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3180          match_count < 0)                             /* no matches */
3181         ) &&                                         /* And... */
3182         (
3183         partial_newline ||                   /* Either partial NL */
3184           (                                  /* or ... */
3185           ptr >= end_subject &&              /* End of subject and */
3186             (                                  /* either */
3187             ptr > mb->start_used_ptr ||        /* Inspected non-empty string */
3188             mb->allowemptypartial              /* or pattern has lookbehind */
3189             )                                  /* or could match empty */
3190           )
3191         ))
3192       match_count = PCRE2_ERROR_PARTIAL;
3193     break;  /* Exit from loop along the subject string */
3194     }
3195 
3196   /* One or more states are active for the next character. */
3197 
3198   ptr += clen;    /* Advance to next subject character */
3199   }               /* Loop to move along the subject string */
3200 
3201 /* Control gets here from "break" a few lines above. If we have a match,
3202 PCRE2_ENDANCHORED is set, and ptr is short of end_subject, the match fails. */
3203 
3204 if (match_count >= 0 &&
3205     ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3206     ptr < end_subject)
3207   match_count = PCRE2_ERROR_NOMATCH;
3208 
3209 return match_count;
3210 }
3211 
3212 
3213 
3214 /*************************************************
3215 *     Match a pattern using the DFA algorithm    *
3216 *************************************************/
3217 
3218 /* This function matches a compiled pattern to a subject string, using the
3219 alternate matching algorithm that finds all matches at once.
3220 
3221 Arguments:
3222   code          points to the compiled pattern
3223   subject       subject string
3224   length        length of subject string
  start_offset  where to start matching in the subject
3226   options       option bits
3227   match_data    points to a match data structure
  mcontext      points to a match context (may be NULL)
3229   workspace     pointer to workspace
3230   wscount       size of workspace
3231 
3232 Returns:        > 0 => number of match offset pairs placed in offsets
3233                 = 0 => offsets overflowed; longest matches are present
3234                  -1 => failed to match
3235                < -1 => some kind of unexpected problem
3236 */
3237 
3238 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,PCRE2_SIZE wscount)3239 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3240   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3241   pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3242 {
3243 int rc;
3244 int was_zero_terminated = 0;
3245 
3246 const pcre2_real_code *re = (const pcre2_real_code *)code;
3247 
3248 PCRE2_SPTR start_match;
3249 PCRE2_SPTR end_subject;
3250 PCRE2_SPTR bumpalong_limit;
3251 PCRE2_SPTR req_cu_ptr;
3252 
3253 BOOL utf, anchored, startline, firstline;
3254 BOOL has_first_cu = FALSE;
3255 BOOL has_req_cu = FALSE;
3256 
3257 #if PCRE2_CODE_UNIT_WIDTH == 8
3258 BOOL memchr_not_found_first_cu = FALSE;
3259 BOOL memchr_not_found_first_cu2 = FALSE;
3260 #endif
3261 
3262 PCRE2_UCHAR first_cu = 0;
3263 PCRE2_UCHAR first_cu2 = 0;
3264 PCRE2_UCHAR req_cu = 0;
3265 PCRE2_UCHAR req_cu2 = 0;
3266 
3267 const uint8_t *start_bits = NULL;
3268 
3269 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3270 is used below, and it expects NLBLOCK to be defined as a pointer. */
3271 
3272 pcre2_callout_block cb;
3273 dfa_match_block actual_match_block;
3274 dfa_match_block *mb = &actual_match_block;
3275 
3276 /* Set up a starting block of memory for use during recursive calls to
3277 internal_dfa_match(). By putting this on the stack, it minimizes resource use
3278 in the case when it is not needed. If this is too small, more memory is
3279 obtained from the heap. At the start of each block is an anchor structure.*/
3280 
3281 int base_recursion_workspace[RWS_BASE_SIZE];
3282 RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3283 rws->next = NULL;
3284 rws->size = RWS_BASE_SIZE;
3285 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3286 
3287 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3288 subject string. */
3289 
3290 if (length == PCRE2_ZERO_TERMINATED)
3291   {
3292   length = PRIV(strlen)(subject);
3293   was_zero_terminated = 1;
3294   }
3295 
3296 /* Plausibility checks */
3297 
3298 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3299 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3300   return PCRE2_ERROR_NULL;
3301 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3302 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3303 
3304 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3305 time. */
3306 
3307 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3308    ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3309   return PCRE2_ERROR_BADOPTION;
3310 
3311 /* Invalid UTF support is not available for DFA matching. */
3312 
3313 if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
3314   return PCRE2_ERROR_DFA_UINVALID_UTF;
3315 
3316 /* Check that the first field in the block is the magic number. If it is not,
3317 return with PCRE2_ERROR_BADMAGIC. */
3318 
3319 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3320 
3321 /* Check the code unit width. */
3322 
3323 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3324   return PCRE2_ERROR_BADMODE;
3325 
3326 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3327 options variable for this function. Users of PCRE2 who are not calling the
3328 function directly would like to have a way of setting these flags, in the same
3329 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3330 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3331 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3332 transferred to the options for this function. The bits are guaranteed to be
3333 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3334 that the match-time bits are not more significant than the flag bits. If by
3335 accident this is not the case, a compile-time division by zero error will
3336 occur. */
3337 
3338 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3339 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3340 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3341 #undef FF
3342 #undef OO
3343 
3344 /* If restarting after a partial match, do some sanity checks on the contents
3345 of the workspace. */
3346 
3347 if ((options & PCRE2_DFA_RESTART) != 0)
3348   {
3349   if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3350     workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3351       return PCRE2_ERROR_DFA_BADRESTART;
3352   }
3353 
3354 /* Set some local values */
3355 
3356 utf = (re->overall_options & PCRE2_UTF) != 0;
3357 start_match = subject + start_offset;
3358 end_subject = subject + length;
3359 req_cu_ptr = start_match - 1;
3360 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3361   (re->overall_options & PCRE2_ANCHORED) != 0;
3362 
3363 /* The "must be at the start of a line" flags are used in a loop when finding
3364 where to start. */
3365 
3366 startline = (re->flags & PCRE2_STARTLINE) != 0;
3367 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3368 bumpalong_limit = end_subject;
3369 
3370 /* Initialize and set up the fixed fields in the callout block, with a pointer
3371 in the match block. */
3372 
3373 mb->cb = &cb;
3374 cb.version = 2;
3375 cb.subject = subject;
3376 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3377 cb.callout_flags = 0;
3378 cb.capture_top      = 1;      /* No capture support */
3379 cb.capture_last     = 0;
3380 cb.mark             = NULL;   /* No (*MARK) support */
3381 
3382 /* Get data from the match context, if present, and fill in the remaining
3383 fields in the match block. It is an error to set an offset limit without
3384 setting the flag at compile time. */
3385 
3386 if (mcontext == NULL)
3387   {
3388   mb->callout = NULL;
3389   mb->memctl = re->memctl;
3390   mb->match_limit = PRIV(default_match_context).match_limit;
3391   mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3392   mb->heap_limit = PRIV(default_match_context).heap_limit;
3393   }
3394 else
3395   {
3396   if (mcontext->offset_limit != PCRE2_UNSET)
3397     {
3398     if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3399       return PCRE2_ERROR_BADOFFSETLIMIT;
3400     bumpalong_limit = subject + mcontext->offset_limit;
3401     }
3402   mb->callout = mcontext->callout;
3403   mb->callout_data = mcontext->callout_data;
3404   mb->memctl = mcontext->memctl;
3405   mb->match_limit = mcontext->match_limit;
3406   mb->match_limit_depth = mcontext->depth_limit;
3407   mb->heap_limit = mcontext->heap_limit;
3408   }
3409 
3410 if (mb->match_limit > re->limit_match)
3411   mb->match_limit = re->limit_match;
3412 
3413 if (mb->match_limit_depth > re->limit_depth)
3414   mb->match_limit_depth = re->limit_depth;
3415 
3416 if (mb->heap_limit > re->limit_heap)
3417   mb->heap_limit = re->limit_heap;
3418 
3419 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3420   re->name_count * re->name_entry_size;
3421 mb->tables = re->tables;
3422 mb->start_subject = subject;
3423 mb->end_subject = end_subject;
3424 mb->start_offset = start_offset;
3425 mb->allowemptypartial = (re->max_lookbehind > 0) ||
3426   (re->flags & PCRE2_MATCH_EMPTY) != 0;
3427 mb->moptions = options;
3428 mb->poptions = re->overall_options;
3429 mb->match_call_count = 0;
3430 mb->heap_used = 0;
3431 
3432 /* Process the \R and newline settings. */
3433 
3434 mb->bsr_convention = re->bsr_convention;
3435 mb->nltype = NLTYPE_FIXED;
3436 switch(re->newline_convention)
3437   {
3438   case PCRE2_NEWLINE_CR:
3439   mb->nllen = 1;
3440   mb->nl[0] = CHAR_CR;
3441   break;
3442 
3443   case PCRE2_NEWLINE_LF:
3444   mb->nllen = 1;
3445   mb->nl[0] = CHAR_NL;
3446   break;
3447 
3448   case PCRE2_NEWLINE_NUL:
3449   mb->nllen = 1;
3450   mb->nl[0] = CHAR_NUL;
3451   break;
3452 
3453   case PCRE2_NEWLINE_CRLF:
3454   mb->nllen = 2;
3455   mb->nl[0] = CHAR_CR;
3456   mb->nl[1] = CHAR_NL;
3457   break;
3458 
3459   case PCRE2_NEWLINE_ANY:
3460   mb->nltype = NLTYPE_ANY;
3461   break;
3462 
3463   case PCRE2_NEWLINE_ANYCRLF:
3464   mb->nltype = NLTYPE_ANYCRLF;
3465   break;
3466 
3467   default: return PCRE2_ERROR_INTERNAL;
3468   }
3469 
3470 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3471 we must also check that a starting offset does not point into the middle of a
3472 multiunit character. We check only the portion of the subject that is going to
3473 be inspected during matching - from the offset minus the maximum back reference
3474 to the given length. This saves time when a small part of a large subject is
3475 being matched by the use of a starting offset. Note that the maximum lookbehind
3476 is a number of characters, not code units. */
3477 
3478 #ifdef SUPPORT_UNICODE
3479 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3480   {
3481   PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
3482 
3483   if (start_offset > 0)
3484     {
3485 #if PCRE2_CODE_UNIT_WIDTH != 32
3486     unsigned int i;
3487     if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3488       return PCRE2_ERROR_BADUTFOFFSET;
3489     for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3490       {
3491       check_subject--;
3492       while (check_subject > subject &&
3493 #if PCRE2_CODE_UNIT_WIDTH == 8
3494       (*check_subject & 0xc0) == 0x80)
3495 #else  /* 16-bit */
3496       (*check_subject & 0xfc00) == 0xdc00)
3497 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3498         check_subject--;
3499       }
3500 #else   /* In the 32-bit library, one code unit equals one character. */
3501     check_subject -= re->max_lookbehind;
3502     if (check_subject < subject) check_subject = subject;
3503 #endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
3504     }
3505 
3506   /* Validate the relevant portion of the subject. After an error, adjust the
3507   offset to be an absolute offset in the whole string. */
3508 
3509   match_data->rc = PRIV(valid_utf)(check_subject,
3510     length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3511   if (match_data->rc != 0)
3512     {
3513     match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3514     return match_data->rc;
3515     }
3516   }
3517 #endif  /* SUPPORT_UNICODE */
3518 
3519 /* Set up the first code unit to match, if available. If there's no first code
3520 unit there may be a bitmap of possible first characters. */
3521 
3522 if ((re->flags & PCRE2_FIRSTSET) != 0)
3523   {
3524   has_first_cu = TRUE;
3525   first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3526   if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3527     {
3528     first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3529 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3530     if (utf && first_cu > 127)
3531       first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3532 #endif
3533     }
3534   }
3535 else
3536   if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3537     start_bits = re->start_bitmap;
3538 
3539 /* There may be a "last known required code unit" set. */
3540 
3541 if ((re->flags & PCRE2_LASTSET) != 0)
3542   {
3543   has_req_cu = TRUE;
3544   req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3545   if ((re->flags & PCRE2_LASTCASELESS) != 0)
3546     {
3547     req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3548 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3549     if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3550 #endif
3551     }
3552   }
3553 
3554 /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3555 free the memory that was obtained. */
3556 
3557 if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3558   {
3559   match_data->memctl.free((void *)match_data->subject,
3560     match_data->memctl.memory_data);
3561   match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3562   }
3563 
3564 /* Fill in fields that are always returned in the match data. */
3565 
3566 match_data->code = re;
3567 match_data->subject = NULL;  /* Default for no match */
3568 match_data->mark = NULL;
3569 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3570 
3571 /* Call the main matching function, looping for a non-anchored regex after a
3572 failed match. If not restarting, perform certain optimizations at the start of
3573 a match. */
3574 
3575 for (;;)
3576   {
3577   /* ----------------- Start of match optimizations ---------------- */
3578 
3579   /* There are some optimizations that avoid running the match if a known
3580   starting point is not found, or if a known later code unit is not present.
3581   However, there is an option (settable at compile time) that disables
3582   these, for testing and for ensuring that all callouts do actually occur.
3583   The optimizations must also be avoided when restarting a DFA match. */
3584 
3585   if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3586       (options & PCRE2_DFA_RESTART) == 0)
3587     {
3588     /* If firstline is TRUE, the start of the match is constrained to the first
3589     line of a multiline string. That is, the match must be before or at the
3590     first newline following the start of matching. Temporarily adjust
3591     end_subject so that we stop the optimization scans for a first code unit
3592     immediately after the first character of a newline (the first code unit can
3593     legitimately be a newline). If the match fails at the newline, later code
3594     breaks this loop. */
3595 
3596     if (firstline)
3597       {
3598       PCRE2_SPTR t = start_match;
3599 #ifdef SUPPORT_UNICODE
3600       if (utf)
3601         {
3602         while (t < end_subject && !IS_NEWLINE(t))
3603           {
3604           t++;
3605           ACROSSCHAR(t < end_subject, t, t++);
3606           }
3607         }
3608       else
3609 #endif
3610       while (t < end_subject && !IS_NEWLINE(t)) t++;
3611       end_subject = t;
3612       }
3613 
3614     /* Anchored: check the first code unit if one is recorded. This may seem
3615     pointless but it can help in detecting a no match case without scanning for
3616     the required code unit. */
3617 
3618     if (anchored)
3619       {
3620       if (has_first_cu || start_bits != NULL)
3621         {
3622         BOOL ok = start_match < end_subject;
3623         if (ok)
3624           {
3625           PCRE2_UCHAR c = UCHAR21TEST(start_match);
3626           ok = has_first_cu && (c == first_cu || c == first_cu2);
3627           if (!ok && start_bits != NULL)
3628             {
3629 #if PCRE2_CODE_UNIT_WIDTH != 8
3630             if (c > 255) c = 255;
3631 #endif
3632             ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3633             }
3634           }
3635         if (!ok) break;
3636         }
3637       }
3638 
3639     /* Not anchored. Advance to a unique first code unit if there is one. In
3640     8-bit mode, the use of memchr() gives a big speed up, even though we have
3641     to call it twice in caseless mode, in order to find the earliest occurrence
3642     of the character in either of its cases. If a call to memchr() that
3643     searches the rest of the subject fails to find one case, remember that in
3644     order not to keep on repeating the search. This can make a huge difference
3645     when the strings are very long and only one case is present. */
3646 
3647     else
3648       {
3649       if (has_first_cu)
3650         {
3651         if (first_cu != first_cu2)  /* Caseless */
3652           {
3653 #if PCRE2_CODE_UNIT_WIDTH != 8
3654           PCRE2_UCHAR smc;
3655           while (start_match < end_subject &&
3656                 (smc = UCHAR21TEST(start_match)) != first_cu &&
3657                   smc != first_cu2)
3658             start_match++;
3659 
3660 #else  /* 8-bit code units */
3661           PCRE2_SPTR pp1 = NULL;
3662           PCRE2_SPTR pp2 = NULL;
3663           PCRE2_SIZE cu2size = end_subject - start_match;
3664 
3665           if (!memchr_not_found_first_cu)
3666             {
3667             pp1 = memchr(start_match, first_cu, end_subject - start_match);
3668             if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
3669               else cu2size = pp1 - start_match;
3670             }
3671 
3672           /* If pp1 is not NULL, we have arranged to search only as far as pp1,
3673           to see if the other case is earlier, so we can set "not found" only
3674           when both searches have returned NULL. */
3675 
3676           if (!memchr_not_found_first_cu2)
3677             {
3678             pp2 = memchr(start_match, first_cu2, cu2size);
3679             memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
3680             }
3681 
3682           if (pp1 == NULL)
3683             start_match = (pp2 == NULL)? end_subject : pp2;
3684           else
3685             start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3686 #endif
3687           }
3688 
3689         /* The caseful case */
3690 
3691         else
3692           {
3693 #if PCRE2_CODE_UNIT_WIDTH != 8
3694           while (start_match < end_subject && UCHAR21TEST(start_match) !=
3695                  first_cu)
3696             start_match++;
3697 #else  /* 8-bit code units */
3698           start_match = memchr(start_match, first_cu, end_subject - start_match);
3699           if (start_match == NULL) start_match = end_subject;
3700 #endif
3701           }
3702 
3703         /* If we can't find the required code unit, having reached the true end
3704         of the subject, break the bumpalong loop, to force a match failure,
3705         except when doing partial matching, when we let the next cycle run at
3706         the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3707         which partially matches "abc", even though the string does not contain
3708         the starting character "d". If we have not reached the true end of the
3709         subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3710         we also let the cycle run, because the matching string is legitimately
3711         allowed to start with the first code unit of a newline. */
3712 
3713         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3714             start_match >= mb->end_subject)
3715           break;
3716         }
3717 
3718       /* If there's no first code unit, advance to just after a linebreak for a
3719       multiline match if required. */
3720 
3721       else if (startline)
3722         {
3723         if (start_match > mb->start_subject + start_offset)
3724           {
3725 #ifdef SUPPORT_UNICODE
3726           if (utf)
3727             {
3728             while (start_match < end_subject && !WAS_NEWLINE(start_match))
3729               {
3730               start_match++;
3731               ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3732               }
3733             }
3734           else
3735 #endif
3736           while (start_match < end_subject && !WAS_NEWLINE(start_match))
3737             start_match++;
3738 
3739           /* If we have just passed a CR and the newline option is ANY or
3740           ANYCRLF, and we are now at a LF, advance the match position by one
3741           more code unit. */
3742 
3743           if (start_match[-1] == CHAR_CR &&
3744                (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3745                start_match < end_subject &&
3746                UCHAR21TEST(start_match) == CHAR_NL)
3747             start_match++;
3748           }
3749         }
3750 
3751       /* If there's no first code unit or a requirement for a multiline line
3752       start, advance to a non-unique first code unit if any have been
3753       identified. The bitmap contains only 256 bits. When code units are 16 or
3754       32 bits wide, all code units greater than 254 set the 255 bit. */
3755 
3756       else if (start_bits != NULL)
3757         {
3758         while (start_match < end_subject)
3759           {
3760           uint32_t c = UCHAR21TEST(start_match);
3761 #if PCRE2_CODE_UNIT_WIDTH != 8
3762           if (c > 255) c = 255;
3763 #endif
3764           if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3765           start_match++;
3766           }
3767 
3768         /* See comment above in first_cu checking about the next line. */
3769 
3770         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3771             start_match >= mb->end_subject)
3772           break;
3773         }
3774       }  /* End of first code unit handling */
3775 
3776     /* Restore fudged end_subject */
3777 
3778     end_subject = mb->end_subject;
3779 
3780     /* The following two optimizations are disabled for partial matching. */
3781 
3782     if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3783       {
3784       PCRE2_SPTR p;
3785 
3786       /* The minimum matching length is a lower bound; no actual string of that
3787       length may actually match the pattern. Although the value is, strictly,
3788       in characters, we treat it as code units to avoid spending too much time
3789       in this optimization. */
3790 
3791       if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3792 
3793       /* If req_cu is set, we know that that code unit must appear in the
3794       subject for the match to succeed. If the first code unit is set, req_cu
3795       must be later in the subject; otherwise the test starts at the match
3796       point. This optimization can save a huge amount of backtracking in
3797       patterns with nested unlimited repeats that aren't going to match.
3798       Writing separate code for cased/caseless versions makes it go faster, as
3799       does using an autoincrement and backing off on a match. As in the case of
3800       the first code unit, using memchr() in the 8-bit library gives a big
3801       speed up. Unlike the first_cu check above, we do not need to call
3802       memchr() twice in the caseless case because we only need to check for the
3803       presence of the character in either case, not find the first occurrence.
3804 
3805       The search can be skipped if the code unit was found later than the
3806       current starting point in a previous iteration of the bumpalong loop.
3807 
3808       HOWEVER: when the subject string is very, very long, searching to its end
3809       can take a long time, and give bad performance on quite ordinary
3810       patterns. This showed up when somebody was matching something like
3811       /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3812       sufficiently long, but it's worth searching a lot more for unanchored
3813       patterns. */
3814 
3815       p = start_match + (has_first_cu? 1:0);
3816       if (has_req_cu && p > req_cu_ptr)
3817         {
3818         PCRE2_SIZE check_length = end_subject - start_match;
3819 
3820         if (check_length < REQ_CU_MAX ||
3821               (!anchored && check_length < REQ_CU_MAX * 1000))
3822           {
3823           if (req_cu != req_cu2)  /* Caseless */
3824             {
3825 #if PCRE2_CODE_UNIT_WIDTH != 8
3826             while (p < end_subject)
3827               {
3828               uint32_t pp = UCHAR21INCTEST(p);
3829               if (pp == req_cu || pp == req_cu2) { p--; break; }
3830               }
3831 #else  /* 8-bit code units */
3832             PCRE2_SPTR pp = p;
3833             p = memchr(pp, req_cu, end_subject - pp);
3834             if (p == NULL)
3835               {
3836               p = memchr(pp, req_cu2, end_subject - pp);
3837               if (p == NULL) p = end_subject;
3838               }
3839 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3840             }
3841 
3842           /* The caseful case */
3843 
3844           else
3845             {
3846 #if PCRE2_CODE_UNIT_WIDTH != 8
3847             while (p < end_subject)
3848               {
3849               if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3850               }
3851 
3852 #else  /* 8-bit code units */
3853             p = memchr(p, req_cu, end_subject - p);
3854             if (p == NULL) p = end_subject;
3855 #endif
3856             }
3857 
3858           /* If we can't find the required code unit, break the matching loop,
3859           forcing a match failure. */
3860 
3861           if (p >= end_subject) break;
3862 
3863           /* If we have found the required code unit, save the point where we
3864           found it, so that we don't search again next time round the loop if
3865           the start hasn't passed this code unit yet. */
3866 
3867           req_cu_ptr = p;
3868           }
3869         }
3870       }
3871     }
3872 
3873   /* ------------ End of start of match optimizations ------------ */
3874 
3875   /* Give no match if we have passed the bumpalong limit. */
3876 
3877   if (start_match > bumpalong_limit) break;
3878 
3879   /* OK, now we can do the business */
3880 
3881   mb->start_used_ptr = start_match;
3882   mb->last_used_ptr = start_match;
3883   mb->recursive = NULL;
3884 
3885   rc = internal_dfa_match(
3886     mb,                           /* fixed match data */
3887     mb->start_code,               /* this subexpression's code */
3888     start_match,                  /* where we currently are */
3889     start_offset,                 /* start offset in subject */
3890     match_data->ovector,          /* offset vector */
3891     (uint32_t)match_data->oveccount * 2,  /* actual size of same */
3892     workspace,                    /* workspace vector */
3893     (int)wscount,                 /* size of same */
3894     0,                            /* function recurse level */
3895     base_recursion_workspace);    /* initial workspace for recursion */
3896 
3897   /* Anything other than "no match" means we are done, always; otherwise, carry
3898   on only if not anchored. */
3899 
3900   if (rc != PCRE2_ERROR_NOMATCH || anchored)
3901     {
3902     if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3903       {
3904       match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3905       match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3906       }
3907     match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3908     match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3909     match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3910     match_data->rc = rc;
3911 
3912     if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
3913       {
3914       length = CU2BYTES(length + was_zero_terminated);
3915       match_data->subject = match_data->memctl.malloc(length,
3916         match_data->memctl.memory_data);
3917       if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
3918       memcpy((void *)match_data->subject, subject, length);
3919       match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
3920       }
3921     else
3922       {
3923       if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
3924       }
3925     goto EXIT;
3926     }
3927 
3928   /* Advance to the next subject character unless we are at the end of a line
3929   and firstline is set. */
3930 
3931   if (firstline && IS_NEWLINE(start_match)) break;
3932   start_match++;
3933 #ifdef SUPPORT_UNICODE
3934   if (utf)
3935     {
3936     ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3937     }
3938 #endif
3939   if (start_match > end_subject) break;
3940 
3941   /* If we have just passed a CR and we are now at a LF, and the pattern does
3942   not contain any explicit matches for \r or \n, and the newline option is CRLF
3943   or ANY or ANYCRLF, advance the match position by one more character. */
3944 
3945   if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3946       start_match < end_subject &&
3947       UCHAR21TEST(start_match) == CHAR_NL &&
3948       (re->flags & PCRE2_HASCRORLF) == 0 &&
3949         (mb->nltype == NLTYPE_ANY ||
3950          mb->nltype == NLTYPE_ANYCRLF ||
3951          mb->nllen == 2))
3952     start_match++;
3953 
3954   }   /* "Bumpalong" loop */
3955 
3956 NOMATCH_EXIT:
3957 rc = PCRE2_ERROR_NOMATCH;
3958 
3959 EXIT:
3960 while (rws->next != NULL)
3961   {
3962   RWS_anchor *next = rws->next;
3963   rws->next = next->next;
3964   mb->memctl.free(next, mb->memctl.memory_data);
3965   }
3966 
3967 return rc;
3968 }
3969 
3970 /* End of pcre2_dfa_match.c */
3971