1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2020 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 /* This module contains the external function pcre2_dfa_match(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46 
47 
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52 
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61 
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_match(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70 
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73 
74 
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78 
79 #define NLBLOCK mb             /* Block containing newline information */
80 #define PSSTART start_subject  /* Field containing processed string start */
81 #define PSEND   end_subject    /* Field containing processed string end */
82 
83 #include "pcre2_internal.h"
84 
/* Mask made by OR-ing together the match-time option bits listed below. The
code that tests this mask is not in this part of the file; NOTE(review):
presumably it is the set of options that pcre2_dfa_match() accepts - confirm
against the caller. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART| \
   PCRE2_COPY_MATCHED_SUBJECT)
90 
91 
92 /*************************************************
93 *      Code parameters and static tables         *
94 *************************************************/
95 
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
106 
107 
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
115 
116 static const uint8_t coptable[] = {
117   0,                             /* End                                    */
118   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121   0, 0,                          /* \P, \p                                 */
122   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123   0,                             /* \X                                     */
124   0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
125   1,                             /* Char                                   */
126   1,                             /* Chari                                  */
127   1,                             /* not                                    */
128   1,                             /* noti                                   */
129   /* Positive single-char repeats                                          */
130   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
131   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
132   1+IMM2_SIZE,                   /* exact                                  */
133   1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
134   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
135   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
136   1+IMM2_SIZE,                   /* exact I                                */
137   1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
138   /* Negative single-char repeats - only for chars < 256                   */
139   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
140   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
141   1+IMM2_SIZE,                   /* NOT exact                              */
142   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
143   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
144   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
145   1+IMM2_SIZE,                   /* NOT exact I                            */
146   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
147   /* Positive type repeats                                                 */
148   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
149   1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
150   1+IMM2_SIZE,                   /* Type exact                             */
151   1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
152   /* Character class & ref repeats                                         */
153   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
154   0, 0,                          /* CRRANGE, CRMINRANGE                    */
155   0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
156   0,                             /* CLASS                                  */
157   0,                             /* NCLASS                                 */
158   0,                             /* XCLASS - variable length               */
159   0,                             /* REF                                    */
160   0,                             /* REFI                                   */
161   0,                             /* DNREF                                  */
162   0,                             /* DNREFI                                 */
163   0,                             /* RECURSE                                */
164   0,                             /* CALLOUT                                */
165   0,                             /* CALLOUT_STR                            */
166   0,                             /* Alt                                    */
167   0,                             /* Ket                                    */
168   0,                             /* KetRmax                                */
169   0,                             /* KetRmin                                */
170   0,                             /* KetRpos                                */
171   0,                             /* Reverse                                */
172   0,                             /* Assert                                 */
173   0,                             /* Assert not                             */
174   0,                             /* Assert behind                          */
175   0,                             /* Assert behind not                      */
176   0,                             /* NA assert                              */
177   0,                             /* NA assert behind                       */
178   0,                             /* ONCE                                   */
179   0,                             /* SCRIPT_RUN                             */
180   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
181   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
182   0, 0,                          /* CREF, DNCREF                           */
183   0, 0,                          /* RREF, DNRREF                           */
184   0, 0,                          /* FALSE, TRUE                            */
185   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
186   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
187   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
188   0, 0,                          /* COMMIT, COMMIT_ARG                     */
189   0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
190   0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
191 };
192 
/* This table identifies those opcodes that inspect a character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached. There is one entry per opcode, in the same order as the
entries in the preceding table. ***NOTE*** If the start of this table is
modified, the two tables that follow must also be modified. */

static const uint8_t poptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  1, 1,                          /* \P, \p                                 */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  1,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1,                             /* Char                                   */
  1,                             /* Chari                                  */
  1,                             /* not                                    */
  1,                             /* noti                                   */
  /* Positive single-char repeats                                          */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1, 1,                       /* upto, minupto, exact                   */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1, 1, 1,                       /* upto I, minupto I, exact I             */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
  /* Negative single-char repeats - only for chars < 256                   */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1, 1, 1,                       /* NOT upto, minupto, exact               */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
  /* Positive type repeats                                                 */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1, 1, 1,                       /* Type upto, minupto, exact              */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
  /* Character class & ref repeats                                         */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1,                          /* CRRANGE, CRMINRANGE                    */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  1,                             /* CLASS                                  */
  1,                             /* NCLASS                                 */
  1,                             /* XCLASS - variable length               */
  0,                             /* REF                                    */
  0,                             /* REFI                                   */
  0,                             /* DNREF                                  */
  0,                             /* DNREFI                                 */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* CALLOUT_STR                            */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* KetRpos                                */
  0,                             /* Reverse                                */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0,                             /* NA assert                              */
  0,                             /* NA assert behind                       */
  0,                             /* ONCE                                   */
  0,                             /* SCRIPT_RUN                             */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0, 0,                          /* FALSE, TRUE                            */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
};
269 
270 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
271 and \w */
272 
273 static const uint8_t toptable1[] = {
274   0, 0, 0, 0, 0, 0,
275   ctype_digit, ctype_digit,
276   ctype_space, ctype_space,
277   ctype_word,  ctype_word,
278   0, 0                            /* OP_ANY, OP_ALLANY */
279 };
280 
281 static const uint8_t toptable2[] = {
282   0, 0, 0, 0, 0, 0,
283   ctype_digit, 0,
284   ctype_space, 0,
285   ctype_word,  0,
286   1, 1                            /* OP_ANY, OP_ALLANY */
287 };
288 
289 
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode (-ve has meaning) */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. The replacement list is fully
parenthesized so that the cast cannot be affected by whatever surrounds the
macro where it is used. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
302 
303 
/* Before version 10.32 the recursive calls of internal_dfa_match() were passed
local working space and output vectors that were created on the stack. This has
caused issues for some patterns, especially in small-stack environments such as
Windows. A new scheme is now in use which sets up a vector on the stack, but if
this is too small, heap memory is used, up to the heap_limit. The main
parameters are all numbers of ints because the workspace is a vector of ints.

The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
defined in pcre2_internal.h so as to be available to pcre2test when it is
finding the minimum heap requirement for a match. */

/* Number of ints needed to hold one PCRE2_SIZE value. */

#define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))

#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))  /* Stack vector */
#define RWS_RSIZE       1000                    /* Work size for recursion */
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)        /* Ovector for recursion */
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)           /* Ovector in other cases */
321 
/* This structure is at the start of each workspace block. The blocks form a
singly linked chain through the next field. */

typedef struct RWS_anchor {
  struct RWS_anchor *next;  /* Following block in the chain, or NULL */
  uint32_t size;            /* Number of ints */
  uint32_t free;            /* Number of ints */
} RWS_anchor;

/* Number of ints taken up by the anchor itself. */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
331 
332 
333 
334 /*************************************************
335 *               Process a callout                *
336 *************************************************/
337 
338 /* This function is called to perform a callout.
339 
340 Arguments:
341   code              current code pointer
342   offsets           points to current capture offsets
343   current_subject   start of current subject match
344   ptr               current position in subject
345   mb                the match block
346   extracode         extra code offset when called from condition
347   lengthptr         where to return the callout length
348 
349 Returns:            the return from the callout
350 */
351 
352 static int
do_callout(PCRE2_SPTR code,PCRE2_SIZE * offsets,PCRE2_SPTR current_subject,PCRE2_SPTR ptr,dfa_match_block * mb,PCRE2_SIZE extracode,PCRE2_SIZE * lengthptr)353 do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
354   PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
355   PCRE2_SIZE *lengthptr)
356 {
357 pcre2_callout_block *cb = mb->cb;
358 
359 *lengthptr = (code[extracode] == OP_CALLOUT)?
360   (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
361   (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
362 
363 if (mb->callout == NULL) return 0;    /* No callout provided */
364 
365 /* Fixed fields in the callout block are set once and for all at the start of
366 matching. */
367 
368 cb->offset_vector    = offsets;
369 cb->start_match      = (PCRE2_SIZE)(current_subject - mb->start_subject);
370 cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
371 cb->pattern_position = GET(code, 1 + extracode);
372 cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
373 
374 if (code[extracode] == OP_CALLOUT)
375   {
376   cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
377   cb->callout_string_offset = 0;
378   cb->callout_string = NULL;
379   cb->callout_string_length = 0;
380   }
381 else
382   {
383   cb->callout_number = 0;
384   cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
385   cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
386   cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
387   }
388 
389 return (mb->callout)(cb, mb->callout_data);
390 }
391 
392 
393 
394 /*************************************************
395 *         Expand local workspace memory          *
396 *************************************************/
397 
398 /* This function is called when internal_dfa_match() is about to be called
399 recursively and there is insufficient working space left in the current
400 workspace block. If there's an existing next block, use it; otherwise get a new
401 block unless the heap limit is reached.
402 
403 Arguments:
404   rwsptr     pointer to block pointer (updated)
405   ovecsize   space needed for an ovector
406   mb         the match block
407 
408 Returns:     0 rwsptr has been updated
409             !0 an error code
410 */
411 
412 static int
more_workspace(RWS_anchor ** rwsptr,unsigned int ovecsize,dfa_match_block * mb)413 more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
414 {
415 RWS_anchor *rws = *rwsptr;
416 RWS_anchor *new;
417 
418 if (rws->next != NULL)
419   {
420   new = rws->next;
421   }
422 
423 /* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
424 mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
425 overflow. */
426 
427 else
428   {
429   uint32_t newsize = (rws->size >= UINT32_MAX/2)? UINT32_MAX/2 : rws->size * 2;
430   uint32_t newsizeK = newsize/(1024/sizeof(int));
431 
432   if (newsizeK + mb->heap_used > mb->heap_limit)
433     newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
434   newsize = newsizeK*(1024/sizeof(int));
435 
436   if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
437     return PCRE2_ERROR_HEAPLIMIT;
438   new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
439   if (new == NULL) return PCRE2_ERROR_NOMEMORY;
440   mb->heap_used += newsizeK;
441   new->next = NULL;
442   new->size = newsize;
443   rws->next = new;
444   }
445 
446 new->free = new->size - RWS_ANCHOR_SIZE;
447 *rwsptr = new;
448 return 0;
449 }
450 
451 
452 
453 /*************************************************
454 *     Match a Regular Expression - DFA engine    *
455 *************************************************/
456 
457 /* This internal function applies a compiled pattern to a subject string,
458 starting at a given point, using a DFA engine. This function is called from the
459 external one, possibly multiple times if the pattern is not anchored. The
460 function calls itself recursively for some kinds of subpattern.
461 
462 Arguments:
463   mb                the match_data block with fixed information
464   this_start_code   the opening bracket of this subexpression's code
465   current_subject   where we currently are in the subject string
466   start_offset      start offset in the subject string
467   offsets           vector to contain the matching string offsets
468   offsetcount       size of same
469   workspace         vector of workspace
470   wscount           size of same
471   rlevel            function call recursion level
472 
473 Returns:            > 0 => number of match offset pairs placed in offsets
474                     = 0 => offsets overflowed; longest matches are present
475                      -1 => failed to match
476                    < -1 => some kind of unexpected problem
477 
478 The following macros are used for adding states to the two state vectors (one
479 for the current character, one for the following character). */
480 
/* Each of these macros appends one state to a state list, or makes the
enclosing function return PCRE2_ERROR_DFA_WSSIZE when the list already holds
wscount states. They rely on variables of the function in which they are
expanded: active_count and next_active_state for the ADD_ACTIVE forms,
new_count and next_new_state for the ADD_NEW forms, and wscount for both. Note
that the counter is incremented even when the list is full. The _DATA forms
also set the data field; the other forms leave it untouched.

The bodies are wrapped in do { } while (0) so that each macro expands to exactly
one statement. A use must be followed by a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->data   = (z); \
      next_active_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->data   = (z); \
      next_new_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)
518 
519 /* And now, here is the code */
520 
521 static int
internal_dfa_match(dfa_match_block * mb,PCRE2_SPTR this_start_code,PCRE2_SPTR current_subject,PCRE2_SIZE start_offset,PCRE2_SIZE * offsets,uint32_t offsetcount,int * workspace,int wscount,uint32_t rlevel,int * RWS)522 internal_dfa_match(
523   dfa_match_block *mb,
524   PCRE2_SPTR this_start_code,
525   PCRE2_SPTR current_subject,
526   PCRE2_SIZE start_offset,
527   PCRE2_SIZE *offsets,
528   uint32_t offsetcount,
529   int *workspace,
530   int wscount,
531   uint32_t rlevel,
532   int *RWS)
533 {
534 stateblock *active_states, *new_states, *temp_states;
535 stateblock *next_active_state, *next_new_state;
536 const uint8_t *ctypes, *lcc, *fcc;
537 PCRE2_SPTR ptr;
538 PCRE2_SPTR end_code;
539 dfa_recursion_info new_recursive;
540 int active_count, new_count, match_count;
541 
542 /* Some fields in the mb block are frequently referenced, so we load them into
543 independent variables in the hope that this will perform better. */
544 
545 PCRE2_SPTR start_subject = mb->start_subject;
546 PCRE2_SPTR end_subject = mb->end_subject;
547 PCRE2_SPTR start_code = mb->start_code;
548 
549 #ifdef SUPPORT_UNICODE
550 BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
551 BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
552 #else
553 BOOL utf = FALSE;
554 #endif
555 
556 BOOL reset_could_continue = FALSE;
557 
558 if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
559 if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
560 offsetcount &= (uint32_t)(-2);  /* Round down */
561 
562 wscount -= 2;
563 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
564           (2 * INTS_PER_STATEBLOCK);
565 
566 ctypes = mb->tables + ctypes_offset;
567 lcc = mb->tables + lcc_offset;
568 fcc = mb->tables + fcc_offset;
569 
570 match_count = PCRE2_ERROR_NOMATCH;   /* A negative number */
571 
572 active_states = (stateblock *)(workspace + 2);
573 next_new_state = new_states = active_states + wscount;
574 new_count = 0;
575 
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
matching internal ket rather than at the end.

If we are dealing with a backward assertion we have to find out the maximum
amount to move back, and set up each alternative appropriately. */
583 
584 if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
585   {
586   size_t max_back = 0;
587   size_t gone_back;
588 
589   end_code = this_start_code;
590   do
591     {
592     size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
593     if (back > max_back) max_back = back;
594     end_code += GET(end_code, 1);
595     }
596   while (*end_code == OP_ALT);
597 
598   /* If we can't go back the amount required for the longest lookbehind
599   pattern, go back as far as we can; some alternatives may still be viable. */
600 
601 #ifdef SUPPORT_UNICODE
602   /* In character mode we have to step back character by character */
603 
604   if (utf)
605     {
606     for (gone_back = 0; gone_back < max_back; gone_back++)
607       {
608       if (current_subject <= start_subject) break;
609       current_subject--;
610       ACROSSCHAR(current_subject > start_subject, current_subject,
611         current_subject--);
612       }
613     }
614   else
615 #endif
616 
617   /* In byte-mode we can do this quickly. */
618 
619     {
620     size_t current_offset = (size_t)(current_subject - start_subject);
621     gone_back = (current_offset < max_back)? current_offset : max_back;
622     current_subject -= gone_back;
623     }
624 
625   /* Save the earliest consulted character */
626 
627   if (current_subject < mb->start_used_ptr)
628     mb->start_used_ptr = current_subject;
629 
630   /* Now we can process the individual branches. There will be an OP_REVERSE at
631   the start of each branch, except when the length of the branch is zero. */
632 
633   end_code = this_start_code;
634   do
635     {
636     uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
637     size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
638     if (back <= gone_back)
639       {
640       int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
641       ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
642       }
643     end_code += GET(end_code, 1);
644     }
645   while (*end_code == OP_ALT);
646  }
647 
648 /* This is the code for a "normal" subpattern (not a backward assertion). The
649 start of a whole pattern is always one of these. If we are at the top level,
650 we may be asked to restart matching from the same point that we reached for a
651 previous partial match. We still have to scan through the top-level branches to
652 find the end state. */
653 
654 else
655   {
656   end_code = this_start_code;
657 
658   /* Restarting */
659 
660   if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
661     {
662     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
663     new_count = workspace[1];
664     if (!workspace[0])
665       memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
666     }
667 
668   /* Not restarting */
669 
670   else
671     {
672     int length = 1 + LINK_SIZE +
673       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
674         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
675         ? IMM2_SIZE:0);
676     do
677       {
678       ADD_NEW((int)(end_code - start_code + length), 0);
679       end_code += GET(end_code, 1);
680       length = 1 + LINK_SIZE;
681       }
682     while (*end_code == OP_ALT);
683     }
684   }
685 
686 workspace[0] = 0;    /* Bit indicating which vector is current */
687 
688 /* Loop for scanning the subject */
689 
690 ptr = current_subject;
691 for (;;)
692   {
693   int i, j;
694   int clen, dlen;
695   uint32_t c, d;
696   int forced_fail = 0;
697   BOOL partial_newline = FALSE;
698   BOOL could_continue = reset_could_continue;
699   reset_could_continue = FALSE;
700 
701   if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
702 
703   /* Make the new state list into the active state list and empty the
704   new state list. */
705 
706   temp_states = active_states;
707   active_states = new_states;
708   new_states = temp_states;
709   active_count = new_count;
710   new_count = 0;
711 
712   workspace[0] ^= 1;              /* Remember for the restarting feature */
713   workspace[1] = active_count;
714 
715   /* Set the pointers for adding new states */
716 
717   next_active_state = active_states + active_count;
718   next_new_state = new_states;
719 
720   /* Load the current character from the subject outside the loop, as many
721   different states may want to look at it, and we assume that at least one
722   will. */
723 
724   if (ptr < end_subject)
725     {
726     clen = 1;        /* Number of data items in the character */
727 #ifdef SUPPORT_UNICODE
728     GETCHARLENTEST(c, ptr, clen);
729 #else
730     c = *ptr;
731 #endif  /* SUPPORT_UNICODE */
732     }
733   else
734     {
735     clen = 0;        /* This indicates the end of the subject */
736     c = NOTACHAR;    /* This value should never actually be used */
737     }
738 
739   /* Scan up the active states and act on each one. The result of an action
740   may be to add more states to the currently active list (e.g. on hitting a
741   parenthesis) or it may be to put states on the new list, for considering
742   when we move the character pointer on. */
743 
744   for (i = 0; i < active_count; i++)
745     {
746     stateblock *current_state = active_states + i;
747     BOOL caseless = FALSE;
748     PCRE2_SPTR code;
749     uint32_t codevalue;
750     int state_offset = current_state->offset;
751     int rrc;
752     int count;
753 
754     /* A negative offset is a special case meaning "hold off going to this
755     (negated) state until the number of characters in the data field have
756     been skipped". If the could_continue flag was passed over from a previous
    state, arrange for it to be passed on. */
758 
759     if (state_offset < 0)
760       {
761       if (current_state->data > 0)
762         {
763         ADD_NEW_DATA(state_offset, current_state->count,
764           current_state->data - 1);
765         if (could_continue) reset_could_continue = TRUE;
766         continue;
767         }
768       else
769         {
770         current_state->offset = state_offset = -state_offset;
771         }
772       }
773 
774     /* Check for a duplicate state with the same count, and skip if found.
775     See the note at the head of this module about the possibility of improving
776     performance here. */
777 
778     for (j = 0; j < i; j++)
779       {
780       if (active_states[j].offset == state_offset &&
781           active_states[j].count == current_state->count)
782         goto NEXT_ACTIVE_STATE;
783       }
784 
785     /* The state offset is the offset to the opcode */
786 
787     code = start_code + state_offset;
788     codevalue = *code;
789 
790     /* If this opcode inspects a character, but we are at the end of the
791     subject, remember the fact for use when testing for a partial match. */
792 
793     if (clen == 0 && poptable[codevalue] != 0)
794       could_continue = TRUE;
795 
796     /* If this opcode is followed by an inline character, load it. It is
797     tempting to test for the presence of a subject character here, but that
798     is wrong, because sometimes zero repetitions of the subject are
799     permitted.
800 
801     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
802     argument that is not a data character - but is always one byte long because
803     the values are small. We have to take special action to deal with  \P, \p,
804     \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
805     these ones to new opcodes. */
806 
807     if (coptable[codevalue] > 0)
808       {
809       dlen = 1;
810 #ifdef SUPPORT_UNICODE
811       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
812 #endif  /* SUPPORT_UNICODE */
813       d = code[coptable[codevalue]];
814       if (codevalue >= OP_TYPESTAR)
815         {
816         switch(d)
817           {
818           case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
819           case OP_NOTPROP:
820           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
821           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
822           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
823           case OP_NOT_HSPACE:
824           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
825           case OP_NOT_VSPACE:
826           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
827           default: break;
828           }
829         }
830       }
831     else
832       {
833       dlen = 0;         /* Not strictly necessary, but compilers moan */
834       d = NOTACHAR;     /* if these variables are not set. */
835       }
836 
837 
838     /* Now process the individual opcodes */
839 
840     switch (codevalue)
841       {
842 /* ========================================================================== */
843       /* These cases are never obeyed. This is a fudge that causes a compile-
844       time error if the vectors coptable or poptable, which are indexed by
845       opcode, are not the correct length. It seems to be the only way to do
846       such a check at compile time, as the sizeof() operator does not work
847       in the C preprocessor. */
848 
849       case OP_TABLE_LENGTH:
850       case OP_TABLE_LENGTH +
851         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
852          (sizeof(poptable) == OP_TABLE_LENGTH)):
853       return 0;
854 
855 /* ========================================================================== */
856       /* Reached a closing bracket. If not at the end of the pattern, carry
857       on with the next opcode. For repeating opcodes, also add the repeat
858       state. Note that KETRPOS will always be encountered at the end of the
859       subpattern, because the possessive subpattern repeats are always handled
860       using recursive calls. Thus, it never adds any new states.
861 
862       At the end of the (sub)pattern, unless we have an empty string and
863       PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
864       start of the subject, save the match data, shifting up all previous
865       matches so we always have the longest first. */
866 
867       case OP_KET:
868       case OP_KETRMIN:
869       case OP_KETRMAX:
870       case OP_KETRPOS:
871       if (code != end_code)
872         {
873         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
874         if (codevalue != OP_KET)
875           {
876           ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
877           }
878         }
879       else
880         {
881         if (ptr > current_subject ||
882             ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
883               ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
884                 current_subject > start_subject + mb->start_offset)))
885           {
886           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
887             else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
888               match_count = 0;
889           count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
890           if (count > 0) (void)memmove(offsets + 2, offsets,
891             (size_t)count * sizeof(PCRE2_SIZE));
892           if (offsetcount >= 2)
893             {
894             offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
895             offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
896             }
897           if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
898           }
899         }
900       break;
901 
902 /* ========================================================================== */
903       /* These opcodes add to the current list of states without looking
904       at the current character. */
905 
906       /*-----------------------------------------------------------------*/
907       case OP_ALT:
908       do { code += GET(code, 1); } while (*code == OP_ALT);
909       ADD_ACTIVE((int)(code - start_code), 0);
910       break;
911 
912       /*-----------------------------------------------------------------*/
913       case OP_BRA:
914       case OP_SBRA:
915       do
916         {
917         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
918         code += GET(code, 1);
919         }
920       while (*code == OP_ALT);
921       break;
922 
923       /*-----------------------------------------------------------------*/
924       case OP_CBRA:
925       case OP_SCBRA:
926       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
927       code += GET(code, 1);
928       while (*code == OP_ALT)
929         {
930         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
931         code += GET(code, 1);
932         }
933       break;
934 
935       /*-----------------------------------------------------------------*/
936       case OP_BRAZERO:
937       case OP_BRAMINZERO:
938       ADD_ACTIVE(state_offset + 1, 0);
939       code += 1 + GET(code, 2);
940       while (*code == OP_ALT) code += GET(code, 1);
941       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
942       break;
943 
944       /*-----------------------------------------------------------------*/
945       case OP_SKIPZERO:
946       code += 1 + GET(code, 2);
947       while (*code == OP_ALT) code += GET(code, 1);
948       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
949       break;
950 
951       /*-----------------------------------------------------------------*/
952       case OP_CIRC:
953       if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
954         { ADD_ACTIVE(state_offset + 1, 0); }
955       break;
956 
957       /*-----------------------------------------------------------------*/
958       case OP_CIRCM:
959       if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
960           ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
961             && WAS_NEWLINE(ptr)))
962         { ADD_ACTIVE(state_offset + 1, 0); }
963       break;
964 
965       /*-----------------------------------------------------------------*/
966       case OP_EOD:
967       if (ptr >= end_subject)
968         {
969         if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
970           return PCRE2_ERROR_PARTIAL;
971         else { ADD_ACTIVE(state_offset + 1, 0); }
972         }
973       break;
974 
975       /*-----------------------------------------------------------------*/
976       case OP_SOD:
977       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
978       break;
979 
980       /*-----------------------------------------------------------------*/
981       case OP_SOM:
982       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
983       break;
984 
985 
986 /* ========================================================================== */
987       /* These opcodes inspect the next subject character, and sometimes
988       the previous one as well, but do not have an argument. The variable
989       clen contains the length of the current character and is zero if we are
990       at the end of the subject. */
991 
992       /*-----------------------------------------------------------------*/
993       case OP_ANY:
994       if (clen > 0 && !IS_NEWLINE(ptr))
995         {
996         if (ptr + 1 >= mb->end_subject &&
997             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
998             NLBLOCK->nltype == NLTYPE_FIXED &&
999             NLBLOCK->nllen == 2 &&
1000             c == NLBLOCK->nl[0])
1001           {
1002           could_continue = partial_newline = TRUE;
1003           }
1004         else
1005           {
1006           ADD_NEW(state_offset + 1, 0);
1007           }
1008         }
1009       break;
1010 
1011       /*-----------------------------------------------------------------*/
1012       case OP_ALLANY:
1013       if (clen > 0)
1014         { ADD_NEW(state_offset + 1, 0); }
1015       break;
1016 
1017       /*-----------------------------------------------------------------*/
1018       case OP_EODN:
1019       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1020         {
1021         if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1022           return PCRE2_ERROR_PARTIAL;
1023         ADD_ACTIVE(state_offset + 1, 0);
1024         }
1025       break;
1026 
1027       /*-----------------------------------------------------------------*/
1028       case OP_DOLL:
1029       if ((mb->moptions & PCRE2_NOTEOL) == 0)
1030         {
1031         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1032           could_continue = TRUE;
1033         else if (clen == 0 ||
1034             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1035                (ptr == end_subject - mb->nllen)
1036             ))
1037           { ADD_ACTIVE(state_offset + 1, 0); }
1038         else if (ptr + 1 >= mb->end_subject &&
1039                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1040                  NLBLOCK->nltype == NLTYPE_FIXED &&
1041                  NLBLOCK->nllen == 2 &&
1042                  c == NLBLOCK->nl[0])
1043           {
1044           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1045             {
1046             reset_could_continue = TRUE;
1047             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1048             }
1049           else could_continue = partial_newline = TRUE;
1050           }
1051         }
1052       break;
1053 
1054       /*-----------------------------------------------------------------*/
1055       case OP_DOLLM:
1056       if ((mb->moptions & PCRE2_NOTEOL) == 0)
1057         {
1058         if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1059           could_continue = TRUE;
1060         else if (clen == 0 ||
1061             ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1062           { ADD_ACTIVE(state_offset + 1, 0); }
1063         else if (ptr + 1 >= mb->end_subject &&
1064                  (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
1065                  NLBLOCK->nltype == NLTYPE_FIXED &&
1066                  NLBLOCK->nllen == 2 &&
1067                  c == NLBLOCK->nl[0])
1068           {
1069           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1070             {
1071             reset_could_continue = TRUE;
1072             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1073             }
1074           else could_continue = partial_newline = TRUE;
1075           }
1076         }
1077       else if (IS_NEWLINE(ptr))
1078         { ADD_ACTIVE(state_offset + 1, 0); }
1079       break;
1080 
1081       /*-----------------------------------------------------------------*/
1082 
1083       case OP_DIGIT:
1084       case OP_WHITESPACE:
1085       case OP_WORDCHAR:
1086       if (clen > 0 && c < 256 &&
1087             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1088         { ADD_NEW(state_offset + 1, 0); }
1089       break;
1090 
1091       /*-----------------------------------------------------------------*/
1092       case OP_NOT_DIGIT:
1093       case OP_NOT_WHITESPACE:
1094       case OP_NOT_WORDCHAR:
1095       if (clen > 0 && (c >= 256 ||
1096             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1097         { ADD_NEW(state_offset + 1, 0); }
1098       break;
1099 
1100       /*-----------------------------------------------------------------*/
1101       case OP_WORD_BOUNDARY:
1102       case OP_NOT_WORD_BOUNDARY:
1103         {
1104         int left_word, right_word;
1105 
1106         if (ptr > start_subject)
1107           {
1108           PCRE2_SPTR temp = ptr - 1;
1109           if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1110 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1111           if (utf) { BACKCHAR(temp); }
1112 #endif
1113           GETCHARTEST(d, temp);
1114 #ifdef SUPPORT_UNICODE
1115           if ((mb->poptions & PCRE2_UCP) != 0)
1116             {
1117             if (d == '_') left_word = TRUE; else
1118               {
1119               uint32_t cat = UCD_CATEGORY(d);
1120               left_word = (cat == ucp_L || cat == ucp_N);
1121               }
1122             }
1123           else
1124 #endif
1125           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1126           }
1127         else left_word = FALSE;
1128 
1129         if (clen > 0)
1130           {
1131           if (ptr >= mb->last_used_ptr)
1132             {
1133             PCRE2_SPTR temp = ptr + 1;
1134 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1135             if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1136 #endif
1137             mb->last_used_ptr = temp;
1138             }
1139 #ifdef SUPPORT_UNICODE
1140           if ((mb->poptions & PCRE2_UCP) != 0)
1141             {
1142             if (c == '_') right_word = TRUE; else
1143               {
1144               uint32_t cat = UCD_CATEGORY(c);
1145               right_word = (cat == ucp_L || cat == ucp_N);
1146               }
1147             }
1148           else
1149 #endif
1150           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1151           }
1152         else right_word = FALSE;
1153 
1154         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1155           { ADD_ACTIVE(state_offset + 1, 0); }
1156         }
1157       break;
1158 
1159 
1160       /*-----------------------------------------------------------------*/
1161       /* Check the next character by Unicode property. We will get here only
1162       if the support is in the binary; otherwise a compile-time error occurs.
1163       */
1164 
1165 #ifdef SUPPORT_UNICODE
1166       case OP_PROP:
1167       case OP_NOTPROP:
1168       if (clen > 0)
1169         {
1170         BOOL OK;
1171         const uint32_t *cp;
1172         const ucd_record * prop = GET_UCD(c);
1173         switch(code[1])
1174           {
1175           case PT_ANY:
1176           OK = TRUE;
1177           break;
1178 
1179           case PT_LAMP:
1180           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1181                prop->chartype == ucp_Lt;
1182           break;
1183 
1184           case PT_GC:
1185           OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1186           break;
1187 
1188           case PT_PC:
1189           OK = prop->chartype == code[2];
1190           break;
1191 
1192           case PT_SC:
1193           OK = prop->script == code[2];
1194           break;
1195 
1196           /* These are specials for combination cases. */
1197 
1198           case PT_ALNUM:
1199           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1200                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1201           break;
1202 
1203           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1204           which means that Perl space and POSIX space are now identical. PCRE
1205           was changed at release 8.34. */
1206 
1207           case PT_SPACE:    /* Perl space */
1208           case PT_PXSPACE:  /* POSIX space */
1209           switch(c)
1210             {
1211             HSPACE_CASES:
1212             VSPACE_CASES:
1213             OK = TRUE;
1214             break;
1215 
1216             default:
1217             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1218             break;
1219             }
1220           break;
1221 
1222           case PT_WORD:
1223           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1224                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1225                c == CHAR_UNDERSCORE;
1226           break;
1227 
1228           case PT_CLIST:
1229           cp = PRIV(ucd_caseless_sets) + code[2];
1230           for (;;)
1231             {
1232             if (c < *cp) { OK = FALSE; break; }
1233             if (c == *cp++) { OK = TRUE; break; }
1234             }
1235           break;
1236 
1237           case PT_UCNC:
1238           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1239                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1240                c >= 0xe000;
1241           break;
1242 
1243           /* Should never occur, but keep compilers from grumbling. */
1244 
1245           default:
1246           OK = codevalue != OP_PROP;
1247           break;
1248           }
1249 
1250         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1251         }
1252       break;
1253 #endif
1254 
1255 
1256 
1257 /* ========================================================================== */
1258       /* These opcodes likewise inspect the subject character, but have an
1259       argument that is not a data character. It is one of these opcodes:
      OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE,
      OP_NOT_WHITESPACE, OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded
      into d. */
1262 
1263       case OP_TYPEPLUS:
1264       case OP_TYPEMINPLUS:
1265       case OP_TYPEPOSPLUS:
1266       count = current_state->count;  /* Already matched */
1267       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1268       if (clen > 0)
1269         {
1270         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1271             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1272             NLBLOCK->nltype == NLTYPE_FIXED &&
1273             NLBLOCK->nllen == 2 &&
1274             c == NLBLOCK->nl[0])
1275           {
1276           could_continue = partial_newline = TRUE;
1277           }
1278         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1279             (c < 256 &&
1280               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1281               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1282           {
1283           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1284             {
1285             active_count--;            /* Remove non-match possibility */
1286             next_active_state--;
1287             }
1288           count++;
1289           ADD_NEW(state_offset, count);
1290           }
1291         }
1292       break;
1293 
1294       /*-----------------------------------------------------------------*/
1295       case OP_TYPEQUERY:
1296       case OP_TYPEMINQUERY:
1297       case OP_TYPEPOSQUERY:
1298       ADD_ACTIVE(state_offset + 2, 0);
1299       if (clen > 0)
1300         {
1301         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1302             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1303             NLBLOCK->nltype == NLTYPE_FIXED &&
1304             NLBLOCK->nllen == 2 &&
1305             c == NLBLOCK->nl[0])
1306           {
1307           could_continue = partial_newline = TRUE;
1308           }
1309         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1310             (c < 256 &&
1311               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1312               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1313           {
1314           if (codevalue == OP_TYPEPOSQUERY)
1315             {
1316             active_count--;            /* Remove non-match possibility */
1317             next_active_state--;
1318             }
1319           ADD_NEW(state_offset + 2, 0);
1320           }
1321         }
1322       break;
1323 
1324       /*-----------------------------------------------------------------*/
1325       case OP_TYPESTAR:
1326       case OP_TYPEMINSTAR:
1327       case OP_TYPEPOSSTAR:
1328       ADD_ACTIVE(state_offset + 2, 0);
1329       if (clen > 0)
1330         {
1331         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1332             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1333             NLBLOCK->nltype == NLTYPE_FIXED &&
1334             NLBLOCK->nllen == 2 &&
1335             c == NLBLOCK->nl[0])
1336           {
1337           could_continue = partial_newline = TRUE;
1338           }
1339         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1340             (c < 256 &&
1341               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1342               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1343           {
1344           if (codevalue == OP_TYPEPOSSTAR)
1345             {
1346             active_count--;            /* Remove non-match possibility */
1347             next_active_state--;
1348             }
1349           ADD_NEW(state_offset, 0);
1350           }
1351         }
1352       break;
1353 
1354       /*-----------------------------------------------------------------*/
1355       case OP_TYPEEXACT:
1356       count = current_state->count;  /* Number already matched */
1357       if (clen > 0)
1358         {
1359         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1360             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1361             NLBLOCK->nltype == NLTYPE_FIXED &&
1362             NLBLOCK->nllen == 2 &&
1363             c == NLBLOCK->nl[0])
1364           {
1365           could_continue = partial_newline = TRUE;
1366           }
1367         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1368             (c < 256 &&
1369               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1370               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1371           {
1372           if (++count >= (int)GET2(code, 1))
1373             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1374           else
1375             { ADD_NEW(state_offset, count); }
1376           }
1377         }
1378       break;
1379 
1380       /*-----------------------------------------------------------------*/
1381       case OP_TYPEUPTO:
1382       case OP_TYPEMINUPTO:
1383       case OP_TYPEPOSUPTO:
1384       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1385       count = current_state->count;  /* Number already matched */
1386       if (clen > 0)
1387         {
1388         if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1389             (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1390             NLBLOCK->nltype == NLTYPE_FIXED &&
1391             NLBLOCK->nllen == 2 &&
1392             c == NLBLOCK->nl[0])
1393           {
1394           could_continue = partial_newline = TRUE;
1395           }
1396         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1397             (c < 256 &&
1398               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1399               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1400           {
1401           if (codevalue == OP_TYPEPOSUPTO)
1402             {
1403             active_count--;           /* Remove non-match possibility */
1404             next_active_state--;
1405             }
1406           if (++count >= (int)GET2(code, 1))
1407             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1408           else
1409             { ADD_NEW(state_offset, count); }
1410           }
1411         }
1412       break;
1413 
1414 /* ========================================================================== */
      /* These are virtual opcodes that are used when something like
      OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
      OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. It keeps the
      code above fast for the other cases. The argument is in the d
      variable. */
1419 
1420 #ifdef SUPPORT_UNICODE
1421       case OP_PROP_EXTRA + OP_TYPEPLUS:
1422       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1423       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1424       count = current_state->count;           /* Already matched */
1425       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1426       if (clen > 0)
1427         {
1428         BOOL OK;
1429         const uint32_t *cp;
1430         const ucd_record * prop = GET_UCD(c);
1431         switch(code[2])
1432           {
1433           case PT_ANY:
1434           OK = TRUE;
1435           break;
1436 
1437           case PT_LAMP:
1438           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1439             prop->chartype == ucp_Lt;
1440           break;
1441 
1442           case PT_GC:
1443           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1444           break;
1445 
1446           case PT_PC:
1447           OK = prop->chartype == code[3];
1448           break;
1449 
1450           case PT_SC:
1451           OK = prop->script == code[3];
1452           break;
1453 
1454           /* These are specials for combination cases. */
1455 
1456           case PT_ALNUM:
1457           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1458                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1459           break;
1460 
1461           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1462           which means that Perl space and POSIX space are now identical. PCRE
1463           was changed at release 8.34. */
1464 
1465           case PT_SPACE:    /* Perl space */
1466           case PT_PXSPACE:  /* POSIX space */
1467           switch(c)
1468             {
1469             HSPACE_CASES:
1470             VSPACE_CASES:
1471             OK = TRUE;
1472             break;
1473 
1474             default:
1475             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1476             break;
1477             }
1478           break;
1479 
1480           case PT_WORD:
1481           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1482                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1483                c == CHAR_UNDERSCORE;
1484           break;
1485 
1486           case PT_CLIST:
1487           cp = PRIV(ucd_caseless_sets) + code[3];
1488           for (;;)
1489             {
1490             if (c < *cp) { OK = FALSE; break; }
1491             if (c == *cp++) { OK = TRUE; break; }
1492             }
1493           break;
1494 
1495           case PT_UCNC:
1496           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1497                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1498                c >= 0xe000;
1499           break;
1500 
1501           /* Should never occur, but keep compilers from grumbling. */
1502 
1503           default:
1504           OK = codevalue != OP_PROP;
1505           break;
1506           }
1507 
1508         if (OK == (d == OP_PROP))
1509           {
1510           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1511             {
1512             active_count--;           /* Remove non-match possibility */
1513             next_active_state--;
1514             }
1515           count++;
1516           ADD_NEW(state_offset, count);
1517           }
1518         }
1519       break;
1520 
1521       /*-----------------------------------------------------------------*/
1522       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1523       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1524       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1525       count = current_state->count;  /* Already matched */
1526       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1527       if (clen > 0)
1528         {
1529         int ncount = 0;
1530         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1531           {
1532           active_count--;           /* Remove non-match possibility */
1533           next_active_state--;
1534           }
1535         (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1536           &ncount);
1537         count++;
1538         ADD_NEW_DATA(-state_offset, count, ncount);
1539         }
1540       break;
1541 #endif
1542 
1543       /*-----------------------------------------------------------------*/
1544       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1545       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1546       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1547       count = current_state->count;  /* Already matched */
1548       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1549       if (clen > 0)
1550         {
1551         int ncount = 0;
1552         switch (c)
1553           {
1554           case CHAR_VT:
1555           case CHAR_FF:
1556           case CHAR_NEL:
1557 #ifndef EBCDIC
1558           case 0x2028:
1559           case 0x2029:
1560 #endif  /* Not EBCDIC */
1561           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1562           goto ANYNL01;
1563 
1564           case CHAR_CR:
1565           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1566           /* Fall through */
1567 
1568           ANYNL01:
1569           case CHAR_LF:
1570           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1571             {
1572             active_count--;           /* Remove non-match possibility */
1573             next_active_state--;
1574             }
1575           count++;
1576           ADD_NEW_DATA(-state_offset, count, ncount);
1577           break;
1578 
1579           default:
1580           break;
1581           }
1582         }
1583       break;
1584 
1585       /*-----------------------------------------------------------------*/
1586       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1587       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1588       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1589       count = current_state->count;  /* Already matched */
1590       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1591       if (clen > 0)
1592         {
1593         BOOL OK;
1594         switch (c)
1595           {
1596           VSPACE_CASES:
1597           OK = TRUE;
1598           break;
1599 
1600           default:
1601           OK = FALSE;
1602           break;
1603           }
1604 
1605         if (OK == (d == OP_VSPACE))
1606           {
1607           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1608             {
1609             active_count--;           /* Remove non-match possibility */
1610             next_active_state--;
1611             }
1612           count++;
1613           ADD_NEW_DATA(-state_offset, count, 0);
1614           }
1615         }
1616       break;
1617 
1618       /*-----------------------------------------------------------------*/
1619       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1620       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1621       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1622       count = current_state->count;  /* Already matched */
1623       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1624       if (clen > 0)
1625         {
1626         BOOL OK;
1627         switch (c)
1628           {
1629           HSPACE_CASES:
1630           OK = TRUE;
1631           break;
1632 
1633           default:
1634           OK = FALSE;
1635           break;
1636           }
1637 
1638         if (OK == (d == OP_HSPACE))
1639           {
1640           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1641             {
1642             active_count--;           /* Remove non-match possibility */
1643             next_active_state--;
1644             }
1645           count++;
1646           ADD_NEW_DATA(-state_offset, count, 0);
1647           }
1648         }
1649       break;
1650 
1651       /*-----------------------------------------------------------------*/
1652 #ifdef SUPPORT_UNICODE
1653       case OP_PROP_EXTRA + OP_TYPEQUERY:
1654       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1655       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1656       count = 4;
1657       goto QS1;
1658 
1659       case OP_PROP_EXTRA + OP_TYPESTAR:
1660       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1661       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1662       count = 0;
1663 
1664       QS1:
1665 
1666       ADD_ACTIVE(state_offset + 4, 0);
1667       if (clen > 0)
1668         {
1669         BOOL OK;
1670         const uint32_t *cp;
1671         const ucd_record * prop = GET_UCD(c);
1672         switch(code[2])
1673           {
1674           case PT_ANY:
1675           OK = TRUE;
1676           break;
1677 
1678           case PT_LAMP:
1679           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1680             prop->chartype == ucp_Lt;
1681           break;
1682 
1683           case PT_GC:
1684           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1685           break;
1686 
1687           case PT_PC:
1688           OK = prop->chartype == code[3];
1689           break;
1690 
1691           case PT_SC:
1692           OK = prop->script == code[3];
1693           break;
1694 
1695           /* These are specials for combination cases. */
1696 
1697           case PT_ALNUM:
1698           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1699                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1700           break;
1701 
1702           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1703           which means that Perl space and POSIX space are now identical. PCRE
1704           was changed at release 8.34. */
1705 
1706           case PT_SPACE:    /* Perl space */
1707           case PT_PXSPACE:  /* POSIX space */
1708           switch(c)
1709             {
1710             HSPACE_CASES:
1711             VSPACE_CASES:
1712             OK = TRUE;
1713             break;
1714 
1715             default:
1716             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1717             break;
1718             }
1719           break;
1720 
1721           case PT_WORD:
1722           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1723                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1724                c == CHAR_UNDERSCORE;
1725           break;
1726 
1727           case PT_CLIST:
1728           cp = PRIV(ucd_caseless_sets) + code[3];
1729           for (;;)
1730             {
1731             if (c < *cp) { OK = FALSE; break; }
1732             if (c == *cp++) { OK = TRUE; break; }
1733             }
1734           break;
1735 
1736           case PT_UCNC:
1737           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1738                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1739                c >= 0xe000;
1740           break;
1741 
1742           /* Should never occur, but keep compilers from grumbling. */
1743 
1744           default:
1745           OK = codevalue != OP_PROP;
1746           break;
1747           }
1748 
1749         if (OK == (d == OP_PROP))
1750           {
1751           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1752               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1753             {
1754             active_count--;           /* Remove non-match possibility */
1755             next_active_state--;
1756             }
1757           ADD_NEW(state_offset + count, 0);
1758           }
1759         }
1760       break;
1761 
1762       /*-----------------------------------------------------------------*/
1763       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1764       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1765       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1766       count = 2;
1767       goto QS2;
1768 
1769       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1770       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1771       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1772       count = 0;
1773 
1774       QS2:
1775 
1776       ADD_ACTIVE(state_offset + 2, 0);
1777       if (clen > 0)
1778         {
1779         int ncount = 0;
1780         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1781             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1782           {
1783           active_count--;           /* Remove non-match possibility */
1784           next_active_state--;
1785           }
1786         (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1787           &ncount);
1788         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1789         }
1790       break;
1791 #endif
1792 
1793       /*-----------------------------------------------------------------*/
1794       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1795       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1796       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1797       count = 2;
1798       goto QS3;
1799 
1800       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1801       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1802       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1803       count = 0;
1804 
1805       QS3:
1806       ADD_ACTIVE(state_offset + 2, 0);
1807       if (clen > 0)
1808         {
1809         int ncount = 0;
1810         switch (c)
1811           {
1812           case CHAR_VT:
1813           case CHAR_FF:
1814           case CHAR_NEL:
1815 #ifndef EBCDIC
1816           case 0x2028:
1817           case 0x2029:
1818 #endif  /* Not EBCDIC */
1819           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1820           goto ANYNL02;
1821 
1822           case CHAR_CR:
1823           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1824           /* Fall through */
1825 
1826           ANYNL02:
1827           case CHAR_LF:
1828           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1829               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1830             {
1831             active_count--;           /* Remove non-match possibility */
1832             next_active_state--;
1833             }
1834           ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1835           break;
1836 
1837           default:
1838           break;
1839           }
1840         }
1841       break;
1842 
1843       /*-----------------------------------------------------------------*/
1844       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1845       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1846       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1847       count = 2;
1848       goto QS4;
1849 
1850       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1851       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1852       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1853       count = 0;
1854 
1855       QS4:
1856       ADD_ACTIVE(state_offset + 2, 0);
1857       if (clen > 0)
1858         {
1859         BOOL OK;
1860         switch (c)
1861           {
1862           VSPACE_CASES:
1863           OK = TRUE;
1864           break;
1865 
1866           default:
1867           OK = FALSE;
1868           break;
1869           }
1870         if (OK == (d == OP_VSPACE))
1871           {
1872           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1873               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1874             {
1875             active_count--;           /* Remove non-match possibility */
1876             next_active_state--;
1877             }
1878           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1879           }
1880         }
1881       break;
1882 
1883       /*-----------------------------------------------------------------*/
1884       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1885       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1886       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1887       count = 2;
1888       goto QS5;
1889 
1890       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1891       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1892       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1893       count = 0;
1894 
1895       QS5:
1896       ADD_ACTIVE(state_offset + 2, 0);
1897       if (clen > 0)
1898         {
1899         BOOL OK;
1900         switch (c)
1901           {
1902           HSPACE_CASES:
1903           OK = TRUE;
1904           break;
1905 
1906           default:
1907           OK = FALSE;
1908           break;
1909           }
1910 
1911         if (OK == (d == OP_HSPACE))
1912           {
1913           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1914               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1915             {
1916             active_count--;           /* Remove non-match possibility */
1917             next_active_state--;
1918             }
1919           ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1920           }
1921         }
1922       break;
1923 
1924       /*-----------------------------------------------------------------*/
1925 #ifdef SUPPORT_UNICODE
1926       case OP_PROP_EXTRA + OP_TYPEEXACT:
1927       case OP_PROP_EXTRA + OP_TYPEUPTO:
1928       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1929       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1930       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1931         { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1932       count = current_state->count;  /* Number already matched */
1933       if (clen > 0)
1934         {
1935         BOOL OK;
1936         const uint32_t *cp;
1937         const ucd_record * prop = GET_UCD(c);
1938         switch(code[1 + IMM2_SIZE + 1])
1939           {
1940           case PT_ANY:
1941           OK = TRUE;
1942           break;
1943 
1944           case PT_LAMP:
1945           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1946             prop->chartype == ucp_Lt;
1947           break;
1948 
1949           case PT_GC:
1950           OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1951           break;
1952 
1953           case PT_PC:
1954           OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1955           break;
1956 
1957           case PT_SC:
1958           OK = prop->script == code[1 + IMM2_SIZE + 2];
1959           break;
1960 
1961           /* These are specials for combination cases. */
1962 
1963           case PT_ALNUM:
1964           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1965                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1966           break;
1967 
1968           /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1969           which means that Perl space and POSIX space are now identical. PCRE
1970           was changed at release 8.34. */
1971 
1972           case PT_SPACE:    /* Perl space */
1973           case PT_PXSPACE:  /* POSIX space */
1974           switch(c)
1975             {
1976             HSPACE_CASES:
1977             VSPACE_CASES:
1978             OK = TRUE;
1979             break;
1980 
1981             default:
1982             OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1983             break;
1984             }
1985           break;
1986 
1987           case PT_WORD:
1988           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1989                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1990                c == CHAR_UNDERSCORE;
1991           break;
1992 
1993           case PT_CLIST:
1994           cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1995           for (;;)
1996             {
1997             if (c < *cp) { OK = FALSE; break; }
1998             if (c == *cp++) { OK = TRUE; break; }
1999             }
2000           break;
2001 
2002           case PT_UCNC:
2003           OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2004                c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2005                c >= 0xe000;
2006           break;
2007 
2008           /* Should never occur, but keep compilers from grumbling. */
2009 
2010           default:
2011           OK = codevalue != OP_PROP;
2012           break;
2013           }
2014 
2015         if (OK == (d == OP_PROP))
2016           {
2017           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2018             {
2019             active_count--;           /* Remove non-match possibility */
2020             next_active_state--;
2021             }
2022           if (++count >= (int)GET2(code, 1))
2023             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2024           else
2025             { ADD_NEW(state_offset, count); }
2026           }
2027         }
2028       break;
2029 
2030       /*-----------------------------------------------------------------*/
2031       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
2032       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
2033       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
2034       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
2035       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2036         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2037       count = current_state->count;  /* Number already matched */
2038       if (clen > 0)
2039         {
2040         PCRE2_SPTR nptr;
2041         int ncount = 0;
2042         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2043           {
2044           active_count--;           /* Remove non-match possibility */
2045           next_active_state--;
2046           }
2047         nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2048           &ncount);
2049         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2050             reset_could_continue = TRUE;
2051         if (++count >= (int)GET2(code, 1))
2052           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2053         else
2054           { ADD_NEW_DATA(-state_offset, count, ncount); }
2055         }
2056       break;
2057 #endif
2058 
2059       /*-----------------------------------------------------------------*/
2060       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
2061       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
2062       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
2063       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
2064       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2065         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2066       count = current_state->count;  /* Number already matched */
2067       if (clen > 0)
2068         {
2069         int ncount = 0;
2070         switch (c)
2071           {
2072           case CHAR_VT:
2073           case CHAR_FF:
2074           case CHAR_NEL:
2075 #ifndef EBCDIC
2076           case 0x2028:
2077           case 0x2029:
2078 #endif  /* Not EBCDIC */
2079           if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2080           goto ANYNL03;
2081 
2082           case CHAR_CR:
2083           if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2084           /* Fall through */
2085 
2086           ANYNL03:
2087           case CHAR_LF:
2088           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2089             {
2090             active_count--;           /* Remove non-match possibility */
2091             next_active_state--;
2092             }
2093           if (++count >= (int)GET2(code, 1))
2094             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2095           else
2096             { ADD_NEW_DATA(-state_offset, count, ncount); }
2097           break;
2098 
2099           default:
2100           break;
2101           }
2102         }
2103       break;
2104 
2105       /*-----------------------------------------------------------------*/
2106       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2107       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2108       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2109       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2110       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2111         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2112       count = current_state->count;  /* Number already matched */
2113       if (clen > 0)
2114         {
2115         BOOL OK;
2116         switch (c)
2117           {
2118           VSPACE_CASES:
2119           OK = TRUE;
2120           break;
2121 
2122           default:
2123           OK = FALSE;
2124           }
2125 
2126         if (OK == (d == OP_VSPACE))
2127           {
2128           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2129             {
2130             active_count--;           /* Remove non-match possibility */
2131             next_active_state--;
2132             }
2133           if (++count >= (int)GET2(code, 1))
2134             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2135           else
2136             { ADD_NEW_DATA(-state_offset, count, 0); }
2137           }
2138         }
2139       break;
2140 
2141       /*-----------------------------------------------------------------*/
2142       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2143       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2144       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2145       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2146       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2147         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2148       count = current_state->count;  /* Number already matched */
2149       if (clen > 0)
2150         {
2151         BOOL OK;
2152         switch (c)
2153           {
2154           HSPACE_CASES:
2155           OK = TRUE;
2156           break;
2157 
2158           default:
2159           OK = FALSE;
2160           break;
2161           }
2162 
2163         if (OK == (d == OP_HSPACE))
2164           {
2165           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2166             {
2167             active_count--;           /* Remove non-match possibility */
2168             next_active_state--;
2169             }
2170           if (++count >= (int)GET2(code, 1))
2171             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2172           else
2173             { ADD_NEW_DATA(-state_offset, count, 0); }
2174           }
2175         }
2176       break;
2177 
2178 /* ========================================================================== */
2179       /* These opcodes are followed by a character that is usually compared
2180       to the current subject character; it is loaded into d. We still get
2181       here even if there is no subject character, because in some cases zero
2182       repetitions are permitted. */
2183 
2184       /*-----------------------------------------------------------------*/
2185       case OP_CHAR:
2186       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2187       break;
2188 
2189       /*-----------------------------------------------------------------*/
2190       case OP_CHARI:
2191       if (clen == 0) break;
2192 
2193 #ifdef SUPPORT_UNICODE
2194       if (utf_or_ucp)
2195         {
2196         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2197           {
2198           unsigned int othercase;
2199           if (c < 128)
2200             othercase = fcc[c];
2201           else
2202             othercase = UCD_OTHERCASE(c);
2203           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2204           }
2205         }
2206       else
2207 #endif  /* SUPPORT_UNICODE */
2208       /* Not UTF or UCP mode */
2209         {
2210         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2211           { ADD_NEW(state_offset + 2, 0); }
2212         }
2213       break;
2214 
2215 
2216 #ifdef SUPPORT_UNICODE
2217       /*-----------------------------------------------------------------*/
2218       /* This is a tricky one because it can match more than one character.
2219       Find out how many characters to skip, and then set up a negative state
2220       to wait for them to pass before continuing. */
2221 
2222       case OP_EXTUNI:
2223       if (clen > 0)
2224         {
2225         int ncount = 0;
2226         PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2227           end_subject, utf, &ncount);
2228         if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2229             reset_could_continue = TRUE;
2230         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2231         }
2232       break;
2233 #endif
2234 
2235       /*-----------------------------------------------------------------*/
2236       /* This is tricky, like EXTUNI, because it too can match more than one
2237       character (when CR is followed by LF). In this case, set up a negative
2238       state to wait for one character to pass before continuing. */
2239 
2240       case OP_ANYNL:
2241       if (clen > 0) switch(c)
2242         {
2243         case CHAR_VT:
2244         case CHAR_FF:
2245         case CHAR_NEL:
2246 #ifndef EBCDIC
2247         case 0x2028:
2248         case 0x2029:
2249 #endif  /* Not EBCDIC */
2250         if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2251         /* Fall through */
2252 
2253         case CHAR_LF:
2254         ADD_NEW(state_offset + 1, 0);
2255         break;
2256 
2257         case CHAR_CR:
2258         if (ptr + 1 >= end_subject)
2259           {
2260           ADD_NEW(state_offset + 1, 0);
2261           if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2262             reset_could_continue = TRUE;
2263           }
2264         else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2265           {
2266           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2267           }
2268         else
2269           {
2270           ADD_NEW(state_offset + 1, 0);
2271           }
2272         break;
2273         }
2274       break;
2275 
2276       /*-----------------------------------------------------------------*/
2277       case OP_NOT_VSPACE:
2278       if (clen > 0) switch(c)
2279         {
2280         VSPACE_CASES:
2281         break;
2282 
2283         default:
2284         ADD_NEW(state_offset + 1, 0);
2285         break;
2286         }
2287       break;
2288 
2289       /*-----------------------------------------------------------------*/
2290       case OP_VSPACE:
2291       if (clen > 0) switch(c)
2292         {
2293         VSPACE_CASES:
2294         ADD_NEW(state_offset + 1, 0);
2295         break;
2296 
2297         default:
2298         break;
2299         }
2300       break;
2301 
2302       /*-----------------------------------------------------------------*/
2303       case OP_NOT_HSPACE:
2304       if (clen > 0) switch(c)
2305         {
2306         HSPACE_CASES:
2307         break;
2308 
2309         default:
2310         ADD_NEW(state_offset + 1, 0);
2311         break;
2312         }
2313       break;
2314 
2315       /*-----------------------------------------------------------------*/
2316       case OP_HSPACE:
2317       if (clen > 0) switch(c)
2318         {
2319         HSPACE_CASES:
2320         ADD_NEW(state_offset + 1, 0);
2321         break;
2322 
2323         default:
2324         break;
2325         }
2326       break;
2327 
2328       /*-----------------------------------------------------------------*/
2329       /* Match a negated single character casefully. */
2330 
2331       case OP_NOT:
2332       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2333       break;
2334 
2335       /*-----------------------------------------------------------------*/
2336       /* Match a negated single character caselessly. */
2337 
2338       case OP_NOTI:
2339       if (clen > 0)
2340         {
2341         uint32_t otherd;
2342 #ifdef SUPPORT_UNICODE
2343         if (utf_or_ucp && d >= 128)
2344           otherd = UCD_OTHERCASE(d);
2345         else
2346 #endif  /* SUPPORT_UNICODE */
2347         otherd = TABLE_GET(d, fcc, d);
2348         if (c != d && c != otherd)
2349           { ADD_NEW(state_offset + dlen + 1, 0); }
2350         }
2351       break;
2352 
2353       /*-----------------------------------------------------------------*/
2354       case OP_PLUSI:
2355       case OP_MINPLUSI:
2356       case OP_POSPLUSI:
2357       case OP_NOTPLUSI:
2358       case OP_NOTMINPLUSI:
2359       case OP_NOTPOSPLUSI:
2360       caseless = TRUE;
2361       codevalue -= OP_STARI - OP_STAR;
2362 
2363       /* Fall through */
2364       case OP_PLUS:
2365       case OP_MINPLUS:
2366       case OP_POSPLUS:
2367       case OP_NOTPLUS:
2368       case OP_NOTMINPLUS:
2369       case OP_NOTPOSPLUS:
2370       count = current_state->count;  /* Already matched */
2371       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2372       if (clen > 0)
2373         {
2374         uint32_t otherd = NOTACHAR;
2375         if (caseless)
2376           {
2377 #ifdef SUPPORT_UNICODE
2378           if (utf_or_ucp && d >= 128)
2379             otherd = UCD_OTHERCASE(d);
2380           else
2381 #endif  /* SUPPORT_UNICODE */
2382           otherd = TABLE_GET(d, fcc, d);
2383           }
2384         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2385           {
2386           if (count > 0 &&
2387               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2388             {
2389             active_count--;             /* Remove non-match possibility */
2390             next_active_state--;
2391             }
2392           count++;
2393           ADD_NEW(state_offset, count);
2394           }
2395         }
2396       break;
2397 
2398       /*-----------------------------------------------------------------*/
2399       case OP_QUERYI:
2400       case OP_MINQUERYI:
2401       case OP_POSQUERYI:
2402       case OP_NOTQUERYI:
2403       case OP_NOTMINQUERYI:
2404       case OP_NOTPOSQUERYI:
2405       caseless = TRUE;
2406       codevalue -= OP_STARI - OP_STAR;
2407       /* Fall through */
2408       case OP_QUERY:
2409       case OP_MINQUERY:
2410       case OP_POSQUERY:
2411       case OP_NOTQUERY:
2412       case OP_NOTMINQUERY:
2413       case OP_NOTPOSQUERY:
2414       ADD_ACTIVE(state_offset + dlen + 1, 0);
2415       if (clen > 0)
2416         {
2417         uint32_t otherd = NOTACHAR;
2418         if (caseless)
2419           {
2420 #ifdef SUPPORT_UNICODE
2421           if (utf_or_ucp && d >= 128)
2422             otherd = UCD_OTHERCASE(d);
2423           else
2424 #endif  /* SUPPORT_UNICODE */
2425           otherd = TABLE_GET(d, fcc, d);
2426           }
2427         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2428           {
2429           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2430             {
2431             active_count--;            /* Remove non-match possibility */
2432             next_active_state--;
2433             }
2434           ADD_NEW(state_offset + dlen + 1, 0);
2435           }
2436         }
2437       break;
2438 
2439       /*-----------------------------------------------------------------*/
2440       case OP_STARI:
2441       case OP_MINSTARI:
2442       case OP_POSSTARI:
2443       case OP_NOTSTARI:
2444       case OP_NOTMINSTARI:
2445       case OP_NOTPOSSTARI:
2446       caseless = TRUE;
2447       codevalue -= OP_STARI - OP_STAR;
2448       /* Fall through */
2449       case OP_STAR:
2450       case OP_MINSTAR:
2451       case OP_POSSTAR:
2452       case OP_NOTSTAR:
2453       case OP_NOTMINSTAR:
2454       case OP_NOTPOSSTAR:
2455       ADD_ACTIVE(state_offset + dlen + 1, 0);
2456       if (clen > 0)
2457         {
2458         uint32_t otherd = NOTACHAR;
2459         if (caseless)
2460           {
2461 #ifdef SUPPORT_UNICODE
2462           if (utf_or_ucp && d >= 128)
2463             otherd = UCD_OTHERCASE(d);
2464           else
2465 #endif  /* SUPPORT_UNICODE */
2466           otherd = TABLE_GET(d, fcc, d);
2467           }
2468         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2469           {
2470           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2471             {
2472             active_count--;            /* Remove non-match possibility */
2473             next_active_state--;
2474             }
2475           ADD_NEW(state_offset, 0);
2476           }
2477         }
2478       break;
2479 
2480       /*-----------------------------------------------------------------*/
2481       case OP_EXACTI:
2482       case OP_NOTEXACTI:
2483       caseless = TRUE;
2484       codevalue -= OP_STARI - OP_STAR;
2485       /* Fall through */
2486       case OP_EXACT:
2487       case OP_NOTEXACT:
2488       count = current_state->count;  /* Number already matched */
2489       if (clen > 0)
2490         {
2491         uint32_t otherd = NOTACHAR;
2492         if (caseless)
2493           {
2494 #ifdef SUPPORT_UNICODE
2495           if (utf_or_ucp && d >= 128)
2496             otherd = UCD_OTHERCASE(d);
2497           else
2498 #endif  /* SUPPORT_UNICODE */
2499           otherd = TABLE_GET(d, fcc, d);
2500           }
2501         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2502           {
2503           if (++count >= (int)GET2(code, 1))
2504             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2505           else
2506             { ADD_NEW(state_offset, count); }
2507           }
2508         }
2509       break;
2510 
2511       /*-----------------------------------------------------------------*/
2512       case OP_UPTOI:
2513       case OP_MINUPTOI:
2514       case OP_POSUPTOI:
2515       case OP_NOTUPTOI:
2516       case OP_NOTMINUPTOI:
2517       case OP_NOTPOSUPTOI:
2518       caseless = TRUE;
2519       codevalue -= OP_STARI - OP_STAR;
2520       /* Fall through */
2521       case OP_UPTO:
2522       case OP_MINUPTO:
2523       case OP_POSUPTO:
2524       case OP_NOTUPTO:
2525       case OP_NOTMINUPTO:
2526       case OP_NOTPOSUPTO:
2527       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2528       count = current_state->count;  /* Number already matched */
2529       if (clen > 0)
2530         {
2531         uint32_t otherd = NOTACHAR;
2532         if (caseless)
2533           {
2534 #ifdef SUPPORT_UNICODE
2535           if (utf_or_ucp && d >= 128)
2536             otherd = UCD_OTHERCASE(d);
2537           else
2538 #endif  /* SUPPORT_UNICODE */
2539           otherd = TABLE_GET(d, fcc, d);
2540           }
2541         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2542           {
2543           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2544             {
2545             active_count--;             /* Remove non-match possibility */
2546             next_active_state--;
2547             }
2548           if (++count >= (int)GET2(code, 1))
2549             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2550           else
2551             { ADD_NEW(state_offset, count); }
2552           }
2553         }
2554       break;
2555 
2556 
2557 /* ========================================================================== */
2558       /* These are the class-handling opcodes */
2559 
2560       case OP_CLASS:
2561       case OP_NCLASS:
2562       case OP_XCLASS:
2563         {
2564         BOOL isinclass = FALSE;
2565         int next_state_offset;
2566         PCRE2_SPTR ecode;
2567 
2568         /* For a simple class, there is always just a 32-byte table, and we
2569         can set isinclass from it. */
2570 
2571         if (codevalue != OP_XCLASS)
2572           {
2573           ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2574           if (clen > 0)
2575             {
2576             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2577               ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2578             }
2579           }
2580 
2581         /* An extended class may have a table or a list of single characters,
2582         ranges, or both, and it may be positive or negative. There's a
2583         function that sorts all this out. */
2584 
2585         else
2586          {
2587          ecode = code + GET(code, 1);
2588          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2589          }
2590 
2591         /* At this point, isinclass is set for all kinds of class, and ecode
2592         points to the byte after the end of the class. If there is a
2593         quantifier, this is where it will be. */
2594 
2595         next_state_offset = (int)(ecode - start_code);
2596 
2597         switch (*ecode)
2598           {
2599           case OP_CRSTAR:
2600           case OP_CRMINSTAR:
2601           case OP_CRPOSSTAR:
2602           ADD_ACTIVE(next_state_offset + 1, 0);
2603           if (isinclass)
2604             {
2605             if (*ecode == OP_CRPOSSTAR)
2606               {
2607               active_count--;           /* Remove non-match possibility */
2608               next_active_state--;
2609               }
2610             ADD_NEW(state_offset, 0);
2611             }
2612           break;
2613 
2614           case OP_CRPLUS:
2615           case OP_CRMINPLUS:
2616           case OP_CRPOSPLUS:
2617           count = current_state->count;  /* Already matched */
2618           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2619           if (isinclass)
2620             {
2621             if (count > 0 && *ecode == OP_CRPOSPLUS)
2622               {
2623               active_count--;           /* Remove non-match possibility */
2624               next_active_state--;
2625               }
2626             count++;
2627             ADD_NEW(state_offset, count);
2628             }
2629           break;
2630 
2631           case OP_CRQUERY:
2632           case OP_CRMINQUERY:
2633           case OP_CRPOSQUERY:
2634           ADD_ACTIVE(next_state_offset + 1, 0);
2635           if (isinclass)
2636             {
2637             if (*ecode == OP_CRPOSQUERY)
2638               {
2639               active_count--;           /* Remove non-match possibility */
2640               next_active_state--;
2641               }
2642             ADD_NEW(next_state_offset + 1, 0);
2643             }
2644           break;
2645 
2646           case OP_CRRANGE:
2647           case OP_CRMINRANGE:
2648           case OP_CRPOSRANGE:
2649           count = current_state->count;  /* Already matched */
2650           if (count >= (int)GET2(ecode, 1))
2651             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2652           if (isinclass)
2653             {
2654             int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2655 
2656             if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2657               {
2658               active_count--;           /* Remove non-match possibility */
2659               next_active_state--;
2660               }
2661 
2662             if (++count >= max && max != 0)   /* Max 0 => no limit */
2663               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2664             else
2665               { ADD_NEW(state_offset, count); }
2666             }
2667           break;
2668 
2669           default:
2670           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2671           break;
2672           }
2673         }
2674       break;
2675 
2676 /* ========================================================================== */
2677       /* These are the opcodes for fancy brackets of various kinds. We have
2678       to use recursion in order to handle them. The "always failing" assertion
2679       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2680       though the other "backtracking verbs" are not supported. */
2681 
2682       case OP_FAIL:
2683       forced_fail++;    /* Count FAILs for multiple states */
2684       break;
2685 
2686       case OP_ASSERT:
2687       case OP_ASSERT_NOT:
2688       case OP_ASSERTBACK:
2689       case OP_ASSERTBACK_NOT:
2690         {
2691         int rc;
2692         int *local_workspace;
2693         PCRE2_SIZE *local_offsets;
2694         PCRE2_SPTR endasscode = code + GET(code, 1);
2695         RWS_anchor *rws = (RWS_anchor *)RWS;
2696 
2697         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2698           {
2699           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2700           if (rc != 0) return rc;
2701           RWS = (int *)rws;
2702           }
2703 
2704         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2705         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2706         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2707 
2708         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2709 
2710         rc = internal_dfa_match(
2711           mb,                                   /* static match data */
2712           code,                                 /* this subexpression's code */
2713           ptr,                                  /* where we currently are */
2714           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2715           local_offsets,                        /* offset vector */
2716           RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2717           local_workspace,                      /* workspace vector */
2718           RWS_RSIZE,                            /* size of same */
2719           rlevel,                               /* function recursion level */
2720           RWS);                                 /* recursion workspace */
2721 
2722         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2723 
2724         if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2725         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2726             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2727         }
2728       break;
2729 
2730       /*-----------------------------------------------------------------*/
2731       case OP_COND:
2732       case OP_SCOND:
2733         {
2734         int codelink = (int)GET(code, 1);
2735         PCRE2_UCHAR condcode;
2736 
2737         /* Because of the way auto-callout works during compile, a callout item
2738         is inserted between OP_COND and an assertion condition. This does not
2739         happen for the other conditions. */
2740 
2741         if (code[LINK_SIZE + 1] == OP_CALLOUT
2742             || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2743           {
2744           PCRE2_SIZE callout_length;
2745           rrc = do_callout(code, offsets, current_subject, ptr, mb,
2746             1 + LINK_SIZE, &callout_length);
2747           if (rrc < 0) return rrc;                 /* Abandon */
2748           if (rrc > 0) break;                      /* Fail this thread */
2749           code += callout_length;                  /* Skip callout data */
2750           }
2751 
2752         condcode = code[LINK_SIZE+1];
2753 
2754         /* Back reference conditions and duplicate named recursion conditions
2755         are not supported */
2756 
2757         if (condcode == OP_CREF || condcode == OP_DNCREF ||
2758             condcode == OP_DNRREF)
2759           return PCRE2_ERROR_DFA_UCOND;
2760 
2761         /* The DEFINE condition is always false, and the assertion (?!) is
2762         converted to OP_FAIL. */
2763 
2764         if (condcode == OP_FALSE || condcode == OP_FAIL)
2765           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2766 
2767         /* There is also an always-true condition */
2768 
2769         else if (condcode == OP_TRUE)
2770           { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2771 
2772         /* The only supported version of OP_RREF is for the value RREF_ANY,
2773         which means "test if in any recursion". We can't test for specifically
2774         recursed groups. */
2775 
2776         else if (condcode == OP_RREF)
2777           {
2778           unsigned int value = GET2(code, LINK_SIZE + 2);
2779           if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2780           if (mb->recursive != NULL)
2781             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2782           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2783           }
2784 
2785         /* Otherwise, the condition is an assertion */
2786 
2787         else
2788           {
2789           int rc;
2790           int *local_workspace;
2791           PCRE2_SIZE *local_offsets;
2792           PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2793           PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2794           RWS_anchor *rws = (RWS_anchor *)RWS;
2795 
2796           if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2797             {
2798             rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2799             if (rc != 0) return rc;
2800             RWS = (int *)rws;
2801             }
2802 
2803           local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2804           local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2805           rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2806 
2807           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2808 
2809           rc = internal_dfa_match(
2810             mb,                                   /* fixed match data */
2811             asscode,                              /* this subexpression's code */
2812             ptr,                                  /* where we currently are */
2813             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2814             local_offsets,                        /* offset vector */
2815             RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2816             local_workspace,                      /* workspace vector */
2817             RWS_RSIZE,                            /* size of same */
2818             rlevel,                               /* function recursion level */
2819             RWS);                                 /* recursion workspace */
2820 
2821           rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2822 
2823           if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2824           if ((rc >= 0) ==
2825                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2826             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2827           else
2828             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2829           }
2830         }
2831       break;
2832 
2833       /*-----------------------------------------------------------------*/
2834       case OP_RECURSE:
2835         {
2836         int rc;
2837         int *local_workspace;
2838         PCRE2_SIZE *local_offsets;
2839         RWS_anchor *rws = (RWS_anchor *)RWS;
2840         dfa_recursion_info *ri;
2841         PCRE2_SPTR callpat = start_code + GET(code, 1);
2842         uint32_t recno = (callpat == mb->start_code)? 0 :
2843           GET2(callpat, 1 + LINK_SIZE);
2844 
2845         if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2846           {
2847           rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2848           if (rc != 0) return rc;
2849           RWS = (int *)rws;
2850           }
2851 
2852         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2853         local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2854         rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2855 
2856         /* Check for repeating a recursion without advancing the subject
2857         pointer. This should catch convoluted mutual recursions. (Some simple
2858         cases are caught at compile time.) */
2859 
2860         for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2861           if (recno == ri->group_num && ptr == ri->subject_position)
2862             return PCRE2_ERROR_RECURSELOOP;
2863 
2864         /* Remember this recursion and where we started it so as to
2865         catch infinite loops. */
2866 
2867         new_recursive.group_num = recno;
2868         new_recursive.subject_position = ptr;
2869         new_recursive.prevrec = mb->recursive;
2870         mb->recursive = &new_recursive;
2871 
2872         rc = internal_dfa_match(
2873           mb,                                   /* fixed match data */
2874           callpat,                              /* this subexpression's code */
2875           ptr,                                  /* where we currently are */
2876           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2877           local_offsets,                        /* offset vector */
2878           RWS_OVEC_RSIZE/OVEC_UNIT,             /* size of same */
2879           local_workspace,                      /* workspace vector */
2880           RWS_RSIZE,                            /* size of same */
2881           rlevel,                               /* function recursion level */
2882           RWS);                                 /* recursion workspace */
2883 
2884         rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2885         mb->recursive = new_recursive.prevrec;  /* Done this recursion */
2886 
2887         /* Ran out of internal offsets */
2888 
2889         if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2890 
2891         /* For each successful matched substring, set up the next state with a
2892         count of characters to skip before trying it. Note that the count is in
2893         characters, not bytes. */
2894 
2895         if (rc > 0)
2896           {
2897           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2898             {
2899             PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2900 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2901             if (utf)
2902               {
2903               PCRE2_SPTR p = start_subject + local_offsets[rc];
2904               PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2905               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2906               }
2907 #endif
2908             if (charcount > 0)
2909               {
2910               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2911                 (int)(charcount - 1));
2912               }
2913             else
2914               {
2915               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2916               }
2917             }
2918           }
2919         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2920         }
2921       break;
2922 
2923       /*-----------------------------------------------------------------*/
2924       case OP_BRAPOS:
2925       case OP_SBRAPOS:
2926       case OP_CBRAPOS:
2927       case OP_SCBRAPOS:
2928       case OP_BRAPOSZERO:
2929         {
2930         int rc;
2931         int *local_workspace;
2932         PCRE2_SIZE *local_offsets;
2933         PCRE2_SIZE charcount, matched_count;
2934         PCRE2_SPTR local_ptr = ptr;
2935         RWS_anchor *rws = (RWS_anchor *)RWS;
2936         BOOL allow_zero;
2937 
2938         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2939           {
2940           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2941           if (rc != 0) return rc;
2942           RWS = (int *)rws;
2943           }
2944 
2945         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2946         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2947         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2948 
2949         if (codevalue == OP_BRAPOSZERO)
2950           {
2951           allow_zero = TRUE;
2952           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2953           }
2954         else allow_zero = FALSE;
2955 
2956         /* Loop to match the subpattern as many times as possible as if it were
2957         a complete pattern. */
2958 
2959         for (matched_count = 0;; matched_count++)
2960           {
2961           rc = internal_dfa_match(
2962             mb,                                   /* fixed match data */
2963             code,                                 /* this subexpression's code */
2964             local_ptr,                            /* where we currently are */
2965             (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2966             local_offsets,                        /* offset vector */
2967             RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
2968             local_workspace,                      /* workspace vector */
2969             RWS_RSIZE,                            /* size of same */
2970             rlevel,                               /* function recursion level */
2971             RWS);                                 /* recursion workspace */
2972 
2973           /* Failed to match */
2974 
2975           if (rc < 0)
2976             {
2977             if (rc != PCRE2_ERROR_NOMATCH) return rc;
2978             break;
2979             }
2980 
2981           /* Matched: break the loop if zero characters matched. */
2982 
2983           charcount = local_offsets[1] - local_offsets[0];
2984           if (charcount == 0) break;
2985           local_ptr += charcount;    /* Advance temporary position ptr */
2986           }
2987 
2988         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2989 
2990         /* At this point we have matched the subpattern matched_count
2991         times, and local_ptr is pointing to the character after the end of the
2992         last match. */
2993 
2994         if (matched_count > 0 || allow_zero)
2995           {
2996           PCRE2_SPTR end_subpattern = code;
2997           int next_state_offset;
2998 
2999           do { end_subpattern += GET(end_subpattern, 1); }
3000             while (*end_subpattern == OP_ALT);
3001           next_state_offset =
3002             (int)(end_subpattern - start_code + LINK_SIZE + 1);
3003 
3004           /* Optimization: if there are no more active states, and there
3005           are no new states yet set up, then skip over the subject string
3006           right here, to save looping. Otherwise, set up the new state to swing
3007           into action when the end of the matched substring is reached. */
3008 
3009           if (i + 1 >= active_count && new_count == 0)
3010             {
3011             ptr = local_ptr;
3012             clen = 0;
3013             ADD_NEW(next_state_offset, 0);
3014             }
3015           else
3016             {
3017             PCRE2_SPTR p = ptr;
3018             PCRE2_SPTR pp = local_ptr;
3019             charcount = (PCRE2_SIZE)(pp - p);
3020 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3021             if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3022 #endif
3023             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3024             }
3025           }
3026         }
3027       break;
3028 
3029       /*-----------------------------------------------------------------*/
3030       case OP_ONCE:
3031         {
3032         int rc;
3033         int *local_workspace;
3034         PCRE2_SIZE *local_offsets;
3035         RWS_anchor *rws = (RWS_anchor *)RWS;
3036 
3037         if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3038           {
3039           rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3040           if (rc != 0) return rc;
3041           RWS = (int *)rws;
3042           }
3043 
3044         local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3045         local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3046         rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3047 
3048         rc = internal_dfa_match(
3049           mb,                                   /* fixed match data */
3050           code,                                 /* this subexpression's code */
3051           ptr,                                  /* where we currently are */
3052           (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
3053           local_offsets,                        /* offset vector */
3054           RWS_OVEC_OSIZE/OVEC_UNIT,             /* size of same */
3055           local_workspace,                      /* workspace vector */
3056           RWS_RSIZE,                            /* size of same */
3057           rlevel,                               /* function recursion level */
3058           RWS);                                 /* recursion workspace */
3059 
3060         rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3061 
3062         if (rc >= 0)
3063           {
3064           PCRE2_SPTR end_subpattern = code;
3065           PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3066           int next_state_offset, repeat_state_offset;
3067 
3068           do { end_subpattern += GET(end_subpattern, 1); }
3069             while (*end_subpattern == OP_ALT);
3070           next_state_offset =
3071             (int)(end_subpattern - start_code + LINK_SIZE + 1);
3072 
3073           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3074           arrange for the repeat state also to be added to the relevant list.
3075           Calculate the offset, or set -1 for no repeat. */
3076 
3077           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3078                                  *end_subpattern == OP_KETRMIN)?
3079             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3080 
3081           /* If we have matched an empty string, add the next state at the
3082           current character pointer. This is important so that the duplicate
3083           checking kicks in, which is what breaks infinite loops that match an
3084           empty string. */
3085 
3086           if (charcount == 0)
3087             {
3088             ADD_ACTIVE(next_state_offset, 0);
3089             }
3090 
3091           /* Optimization: if there are no more active states, and there
3092           are no new states yet set up, then skip over the subject string
3093           right here, to save looping. Otherwise, set up the new state to swing
3094           into action when the end of the matched substring is reached. */
3095 
3096           else if (i + 1 >= active_count && new_count == 0)
3097             {
3098             ptr += charcount;
3099             clen = 0;
3100             ADD_NEW(next_state_offset, 0);
3101 
3102             /* If we are adding a repeat state at the new character position,
3103             we must fudge things so that it is the only current state.
3104             Otherwise, it might be a duplicate of one we processed before, and
3105             that would cause it to be skipped. */
3106 
3107             if (repeat_state_offset >= 0)
3108               {
3109               next_active_state = active_states;
3110               active_count = 0;
3111               i = -1;
3112               ADD_ACTIVE(repeat_state_offset, 0);
3113               }
3114             }
3115           else
3116             {
3117 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3118             if (utf)
3119               {
3120               PCRE2_SPTR p = start_subject + local_offsets[0];
3121               PCRE2_SPTR pp = start_subject + local_offsets[1];
3122               while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3123               }
3124 #endif
3125             ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3126             if (repeat_state_offset >= 0)
3127               { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3128             }
3129           }
3130         else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3131         }
3132       break;
3133 
3134 
3135 /* ========================================================================== */
3136       /* Handle callouts */
3137 
3138       case OP_CALLOUT:
3139       case OP_CALLOUT_STR:
3140         {
3141         PCRE2_SIZE callout_length;
3142         rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
3143           &callout_length);
3144         if (rrc < 0) return rrc;   /* Abandon */
3145         if (rrc == 0)
3146           { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3147         }
3148       break;
3149 
3150 
3151 /* ========================================================================== */
3152       default:        /* Unsupported opcode */
3153       return PCRE2_ERROR_DFA_UITEM;
3154       }
3155 
3156     NEXT_ACTIVE_STATE: continue;
3157 
3158     }      /* End of loop scanning active states */
3159 
3160   /* We have finished the processing at the current subject character. If no
3161   new states have been set for the next character, we have found all the
3162   matches that we are going to find. If partial matching has been requested,
3163   check for appropriate conditions.
3164 
3165   The "forced_fail" variable counts the number of (*F) encountered for the
3166   character. If it is equal to the original active_count (saved in
3167   workspace[1]) it means that (*F) was found on every active state. In this
3168   case we don't want to give a partial match.
3169 
3170   The "could_continue" variable is true if a state could have continued but
3171   for the fact that the end of the subject was reached. */
3172 
3173   if (new_count <= 0)
3174     {
3175     if (could_continue &&                            /* Some could go on, and */
3176         forced_fail != workspace[1] &&               /* Not all forced fail & */
3177         (                                            /* either... */
3178         (mb->moptions & PCRE2_PARTIAL_HARD) != 0      /* Hard partial */
3179         ||                                           /* or... */
3180         ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3181          match_count < 0)                             /* no matches */
3182         ) &&                                         /* And... */
3183         (
3184         partial_newline ||                   /* Either partial NL */
3185           (                                  /* or ... */
3186           ptr >= end_subject &&              /* End of subject and */
3187             (                                  /* either */
3188             ptr > mb->start_used_ptr ||        /* Inspected non-empty string */
3189             mb->allowemptypartial              /* or pattern has lookbehind */
3190             )                                  /* or could match empty */
3191           )
3192         ))
3193       match_count = PCRE2_ERROR_PARTIAL;
3194     break;  /* Exit from loop along the subject string */
3195     }
3196 
3197   /* One or more states are active for the next character. */
3198 
3199   ptr += clen;    /* Advance to next subject character */
3200   }               /* Loop to move along the subject string */
3201 
3202 /* Control gets here from "break" a few lines above. If we have a match but
3203 PCRE2_ENDANCHORED is set and ptr is before end_subject, the match fails. */
3204 
3205 if (match_count >= 0 &&
3206     ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3207     ptr < end_subject)
3208   match_count = PCRE2_ERROR_NOMATCH;
3209 
3210 return match_count;
3211 }
3212 
3213 
3214 
3215 /*************************************************
3216 *     Match a pattern using the DFA algorithm    *
3217 *************************************************/
3218 
3219 /* This function matches a compiled pattern to a subject string, using the
3220 alternate matching algorithm that finds all matches at once.
3221 
3222 Arguments:
3223   code          points to the compiled pattern
3224   subject       subject string
3225   length        length of subject string
3226   startoffset   where to start matching in the subject
3227   options       option bits
3228   match_data    points to a match data structure
3229   gcontext      points to a match context
3230   workspace     pointer to workspace
3231   wscount       size of workspace
3232 
3233 Returns:        > 0 => number of match offset pairs placed in offsets
3234                 = 0 => offsets overflowed; longest matches are present
3235                  -1 => failed to match
3236                < -1 => some kind of unexpected problem
3237 */
3238 
3239 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_dfa_match(const pcre2_code * code,PCRE2_SPTR subject,PCRE2_SIZE length,PCRE2_SIZE start_offset,uint32_t options,pcre2_match_data * match_data,pcre2_match_context * mcontext,int * workspace,PCRE2_SIZE wscount)3240 pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3241   PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3242   pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3243 {
3244 int rc;
3245 int was_zero_terminated = 0;
3246 
3247 const pcre2_real_code *re = (const pcre2_real_code *)code;
3248 
3249 PCRE2_SPTR start_match;
3250 PCRE2_SPTR end_subject;
3251 PCRE2_SPTR bumpalong_limit;
3252 PCRE2_SPTR req_cu_ptr;
3253 
3254 BOOL utf, anchored, startline, firstline;
3255 BOOL has_first_cu = FALSE;
3256 BOOL has_req_cu = FALSE;
3257 
3258 #if PCRE2_CODE_UNIT_WIDTH == 8
3259 BOOL memchr_not_found_first_cu = FALSE;
3260 BOOL memchr_not_found_first_cu2 = FALSE;
3261 #endif
3262 
3263 PCRE2_UCHAR first_cu = 0;
3264 PCRE2_UCHAR first_cu2 = 0;
3265 PCRE2_UCHAR req_cu = 0;
3266 PCRE2_UCHAR req_cu2 = 0;
3267 
3268 const uint8_t *start_bits = NULL;
3269 
3270 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3271 is used below, and it expects NLBLOCK to be defined as a pointer. */
3272 
3273 pcre2_callout_block cb;
3274 dfa_match_block actual_match_block;
3275 dfa_match_block *mb = &actual_match_block;
3276 
3277 /* Set up a starting block of memory for use during recursive calls to
3278 internal_dfa_match(). By putting this on the stack, it minimizes resource use
3279 in the case when it is not needed. If this is too small, more memory is
3280 obtained from the heap. At the start of each block is an anchor structure.*/
3281 
3282 int base_recursion_workspace[RWS_BASE_SIZE];
3283 RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3284 rws->next = NULL;
3285 rws->size = RWS_BASE_SIZE;
3286 rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
3287 
3288 /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3289 subject string. */
3290 
3291 if (length == PCRE2_ZERO_TERMINATED)
3292   {
3293   length = PRIV(strlen)(subject);
3294   was_zero_terminated = 1;
3295   }
3296 
3297 /* Plausibility checks */
3298 
3299 if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3300 if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3301   return PCRE2_ERROR_NULL;
3302 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3303 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3304 
3305 /* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3306 time. */
3307 
3308 if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
3309    ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3310   return PCRE2_ERROR_BADOPTION;
3311 
3312 /* Invalid UTF support is not available for DFA matching. */
3313 
3314 if ((re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0)
3315   return PCRE2_ERROR_DFA_UINVALID_UTF;
3316 
3317 /* Check that the first field in the block is the magic number. If it is not,
3318 return with PCRE2_ERROR_BADMAGIC. */
3319 
3320 if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3321 
3322 /* Check the code unit width. */
3323 
3324 if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3325   return PCRE2_ERROR_BADMODE;
3326 
3327 /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3328 options variable for this function. Users of PCRE2 who are not calling the
3329 function directly would like to have a way of setting these flags, in the same
3330 way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3331 constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3332 (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be
3333 transferred to the options for this function. The bits are guaranteed to be
3334 adjacent, but do not have the same values. This bit of Boolean trickery assumes
3335 that the match-time bits are not more significant than the flag bits. If by
3336 accident this is not the case, a compile-time division by zero error will
3337 occur. */
3338 
3339 #define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3340 #define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3341 options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3342 #undef FF
3343 #undef OO
3344 
3345 /* If restarting after a partial match, do some sanity checks on the contents
3346 of the workspace. */
3347 
3348 if ((options & PCRE2_DFA_RESTART) != 0)
3349   {
3350   if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3351     workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3352       return PCRE2_ERROR_DFA_BADRESTART;
3353   }
3354 
3355 /* Set some local values */
3356 
3357 utf = (re->overall_options & PCRE2_UTF) != 0;
3358 start_match = subject + start_offset;
3359 end_subject = subject + length;
3360 req_cu_ptr = start_match - 1;
3361 anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3362   (re->overall_options & PCRE2_ANCHORED) != 0;
3363 
3364 /* The "must be at the start of a line" flags are used in a loop when finding
3365 where to start. */
3366 
3367 startline = (re->flags & PCRE2_STARTLINE) != 0;
3368 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3369 bumpalong_limit = end_subject;
3370 
3371 /* Initialize and set up the fixed fields in the callout block, with a pointer
3372 in the match block. */
3373 
3374 mb->cb = &cb;
3375 cb.version = 2;
3376 cb.subject = subject;
3377 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3378 cb.callout_flags = 0;
3379 cb.capture_top      = 1;      /* No capture support */
3380 cb.capture_last     = 0;
3381 cb.mark             = NULL;   /* No (*MARK) support */
3382 
3383 /* Get data from the match context, if present, and fill in the remaining
3384 fields in the match block. It is an error to set an offset limit without
3385 setting the flag at compile time. */
3386 
3387 if (mcontext == NULL)
3388   {
3389   mb->callout = NULL;
3390   mb->memctl = re->memctl;
3391   mb->match_limit = PRIV(default_match_context).match_limit;
3392   mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3393   mb->heap_limit = PRIV(default_match_context).heap_limit;
3394   }
3395 else
3396   {
3397   if (mcontext->offset_limit != PCRE2_UNSET)
3398     {
3399     if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3400       return PCRE2_ERROR_BADOFFSETLIMIT;
3401     bumpalong_limit = subject + mcontext->offset_limit;
3402     }
3403   mb->callout = mcontext->callout;
3404   mb->callout_data = mcontext->callout_data;
3405   mb->memctl = mcontext->memctl;
3406   mb->match_limit = mcontext->match_limit;
3407   mb->match_limit_depth = mcontext->depth_limit;
3408   mb->heap_limit = mcontext->heap_limit;
3409   }
3410 
3411 if (mb->match_limit > re->limit_match)
3412   mb->match_limit = re->limit_match;
3413 
3414 if (mb->match_limit_depth > re->limit_depth)
3415   mb->match_limit_depth = re->limit_depth;
3416 
3417 if (mb->heap_limit > re->limit_heap)
3418   mb->heap_limit = re->limit_heap;
3419 
3420 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3421   re->name_count * re->name_entry_size;
3422 mb->tables = re->tables;
3423 mb->start_subject = subject;
3424 mb->end_subject = end_subject;
3425 mb->start_offset = start_offset;
3426 mb->allowemptypartial = (re->max_lookbehind > 0) ||
3427   (re->flags & PCRE2_MATCH_EMPTY) != 0;
3428 mb->moptions = options;
3429 mb->poptions = re->overall_options;
3430 mb->match_call_count = 0;
3431 mb->heap_used = 0;
3432 
3433 /* Process the \R and newline settings. */
3434 
3435 mb->bsr_convention = re->bsr_convention;
3436 mb->nltype = NLTYPE_FIXED;
3437 switch(re->newline_convention)
3438   {
3439   case PCRE2_NEWLINE_CR:
3440   mb->nllen = 1;
3441   mb->nl[0] = CHAR_CR;
3442   break;
3443 
3444   case PCRE2_NEWLINE_LF:
3445   mb->nllen = 1;
3446   mb->nl[0] = CHAR_NL;
3447   break;
3448 
3449   case PCRE2_NEWLINE_NUL:
3450   mb->nllen = 1;
3451   mb->nl[0] = CHAR_NUL;
3452   break;
3453 
3454   case PCRE2_NEWLINE_CRLF:
3455   mb->nllen = 2;
3456   mb->nl[0] = CHAR_CR;
3457   mb->nl[1] = CHAR_NL;
3458   break;
3459 
3460   case PCRE2_NEWLINE_ANY:
3461   mb->nltype = NLTYPE_ANY;
3462   break;
3463 
3464   case PCRE2_NEWLINE_ANYCRLF:
3465   mb->nltype = NLTYPE_ANYCRLF;
3466   break;
3467 
3468   default: return PCRE2_ERROR_INTERNAL;
3469   }
3470 
3471 /* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3472 we must also check that a starting offset does not point into the middle of a
3473 multiunit character. We check only the portion of the subject that is going to
3474 be inspected during matching - from the offset minus the maximum back reference
3475 to the given length. This saves time when a small part of a large subject is
3476 being matched by the use of a starting offset. Note that the maximum lookbehind
3477 is a number of characters, not code units. */
3478 
3479 #ifdef SUPPORT_UNICODE
3480 if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3481   {
3482   PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
3483 
3484   if (start_offset > 0)
3485     {
3486 #if PCRE2_CODE_UNIT_WIDTH != 32
3487     unsigned int i;
3488     if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3489       return PCRE2_ERROR_BADUTFOFFSET;
3490     for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3491       {
3492       check_subject--;
3493       while (check_subject > subject &&
3494 #if PCRE2_CODE_UNIT_WIDTH == 8
3495       (*check_subject & 0xc0) == 0x80)
3496 #else  /* 16-bit */
3497       (*check_subject & 0xfc00) == 0xdc00)
3498 #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3499         check_subject--;
3500       }
3501 #else   /* In the 32-bit library, one code unit equals one character. */
3502     check_subject -= re->max_lookbehind;
3503     if (check_subject < subject) check_subject = subject;
3504 #endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
3505     }
3506 
3507   /* Validate the relevant portion of the subject. After an error, adjust the
3508   offset to be an absolute offset in the whole string. */
3509 
3510   match_data->rc = PRIV(valid_utf)(check_subject,
3511     length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3512   if (match_data->rc != 0)
3513     {
3514     match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3515     return match_data->rc;
3516     }
3517   }
3518 #endif  /* SUPPORT_UNICODE */
3519 
3520 /* Set up the first code unit to match, if available. If there's no first code
3521 unit there may be a bitmap of possible first characters. */
3522 
3523 if ((re->flags & PCRE2_FIRSTSET) != 0)
3524   {
3525   has_first_cu = TRUE;
3526   first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3527   if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3528     {
3529     first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3530 #ifdef SUPPORT_UNICODE
3531 #if PCRE2_CODE_UNIT_WIDTH == 8
3532     if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3533       first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3534 #else
3535     if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3536       first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3537 #endif
3538 #endif  /* SUPPORT_UNICODE */
3539     }
3540   }
3541 else
3542   if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3543     start_bits = re->start_bitmap;
3544 
3545 /* There may be a "last known required code unit" set. */
3546 
3547 if ((re->flags & PCRE2_LASTSET) != 0)
3548   {
3549   has_req_cu = TRUE;
3550   req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3551   if ((re->flags & PCRE2_LASTCASELESS) != 0)
3552     {
3553     req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3554 #ifdef SUPPORT_UNICODE
3555 #if PCRE2_CODE_UNIT_WIDTH == 8
3556     if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3557       req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3558 #else
3559     if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3560       req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3561 #endif
3562 #endif  /* SUPPORT_UNICODE */
3563     }
3564   }
3565 
3566 /* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3567 free the memory that was obtained. */
3568 
3569 if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3570   {
3571   match_data->memctl.free((void *)match_data->subject,
3572     match_data->memctl.memory_data);
3573   match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3574   }
3575 
3576 /* Fill in fields that are always returned in the match data. */
3577 
3578 match_data->code = re;
3579 match_data->subject = NULL;  /* Default for no match */
3580 match_data->mark = NULL;
3581 match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3582 
3583 /* Call the main matching function, looping for a non-anchored regex after a
3584 failed match. If not restarting, perform certain optimizations at the start of
3585 a match. */
3586 
3587 for (;;)
3588   {
3589   /* ----------------- Start of match optimizations ---------------- */
3590 
3591   /* There are some optimizations that avoid running the match if a known
3592   starting point is not found, or if a known later code unit is not present.
3593   However, there is an option (settable at compile time) that disables
3594   these, for testing and for ensuring that all callouts do actually occur.
3595   The optimizations must also be avoided when restarting a DFA match. */
3596 
3597   if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3598       (options & PCRE2_DFA_RESTART) == 0)
3599     {
3600     /* If firstline is TRUE, the start of the match is constrained to the first
3601     line of a multiline string. That is, the match must be before or at the
3602     first newline following the start of matching. Temporarily adjust
3603     end_subject so that we stop the optimization scans for a first code unit
3604     immediately after the first character of a newline (the first code unit can
3605     legitimately be a newline). If the match fails at the newline, later code
3606     breaks this loop. */
3607 
3608     if (firstline)
3609       {
3610       PCRE2_SPTR t = start_match;
3611 #ifdef SUPPORT_UNICODE
3612       if (utf)
3613         {
3614         while (t < end_subject && !IS_NEWLINE(t))
3615           {
3616           t++;
3617           ACROSSCHAR(t < end_subject, t, t++);
3618           }
3619         }
3620       else
3621 #endif
3622       while (t < end_subject && !IS_NEWLINE(t)) t++;
3623       end_subject = t;
3624       }
3625 
3626     /* Anchored: check the first code unit if one is recorded. This may seem
3627     pointless but it can help in detecting a no match case without scanning for
3628     the required code unit. */
3629 
3630     if (anchored)
3631       {
3632       if (has_first_cu || start_bits != NULL)
3633         {
3634         BOOL ok = start_match < end_subject;
3635         if (ok)
3636           {
3637           PCRE2_UCHAR c = UCHAR21TEST(start_match);
3638           ok = has_first_cu && (c == first_cu || c == first_cu2);
3639           if (!ok && start_bits != NULL)
3640             {
3641 #if PCRE2_CODE_UNIT_WIDTH != 8
3642             if (c > 255) c = 255;
3643 #endif
3644             ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3645             }
3646           }
3647         if (!ok) break;
3648         }
3649       }
3650 
3651     /* Not anchored. Advance to a unique first code unit if there is one. In
3652     8-bit mode, the use of memchr() gives a big speed up, even though we have
3653     to call it twice in caseless mode, in order to find the earliest occurrence
3654     of the character in either of its cases. If a call to memchr() that
3655     searches the rest of the subject fails to find one case, remember that in
3656     order not to keep on repeating the search. This can make a huge difference
3657     when the strings are very long and only one case is present. */
3658 
3659     else
3660       {
3661       if (has_first_cu)
3662         {
3663         if (first_cu != first_cu2)  /* Caseless */
3664           {
3665 #if PCRE2_CODE_UNIT_WIDTH != 8
3666           PCRE2_UCHAR smc;
3667           while (start_match < end_subject &&
3668                 (smc = UCHAR21TEST(start_match)) != first_cu &&
3669                   smc != first_cu2)
3670             start_match++;
3671 
3672 #else  /* 8-bit code units */
3673           PCRE2_SPTR pp1 = NULL;
3674           PCRE2_SPTR pp2 = NULL;
3675           PCRE2_SIZE cu2size = end_subject - start_match;
3676 
3677           if (!memchr_not_found_first_cu)
3678             {
3679             pp1 = memchr(start_match, first_cu, end_subject - start_match);
3680             if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
3681               else cu2size = pp1 - start_match;
3682             }
3683 
3684           /* If pp1 is not NULL, we have arranged to search only as far as pp1,
3685           to see if the other case is earlier, so we can set "not found" only
3686           when both searches have returned NULL. */
3687 
3688           if (!memchr_not_found_first_cu2)
3689             {
3690             pp2 = memchr(start_match, first_cu2, cu2size);
3691             memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
3692             }
3693 
3694           if (pp1 == NULL)
3695             start_match = (pp2 == NULL)? end_subject : pp2;
3696           else
3697             start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3698 #endif
3699           }
3700 
3701         /* The caseful case */
3702 
3703         else
3704           {
3705 #if PCRE2_CODE_UNIT_WIDTH != 8
3706           while (start_match < end_subject && UCHAR21TEST(start_match) !=
3707                  first_cu)
3708             start_match++;
3709 #else  /* 8-bit code units */
3710           start_match = memchr(start_match, first_cu, end_subject - start_match);
3711           if (start_match == NULL) start_match = end_subject;
3712 #endif
3713           }
3714 
3715         /* If we can't find the required code unit, having reached the true end
3716         of the subject, break the bumpalong loop, to force a match failure,
3717         except when doing partial matching, when we let the next cycle run at
3718         the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3719         which partially matches "abc", even though the string does not contain
3720         the starting character "d". If we have not reached the true end of the
3721         subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3722         we also let the cycle run, because the matching string is legitimately
3723         allowed to start with the first code unit of a newline. */
3724 
3725         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3726             start_match >= mb->end_subject)
3727           break;
3728         }
3729 
3730       /* If there's no first code unit, advance to just after a linebreak for a
3731       multiline match if required. */
3732 
3733       else if (startline)
3734         {
3735         if (start_match > mb->start_subject + start_offset)
3736           {
3737 #ifdef SUPPORT_UNICODE
3738           if (utf)
3739             {
3740             while (start_match < end_subject && !WAS_NEWLINE(start_match))
3741               {
3742               start_match++;
3743               ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3744               }
3745             }
3746           else
3747 #endif
3748           while (start_match < end_subject && !WAS_NEWLINE(start_match))
3749             start_match++;
3750 
3751           /* If we have just passed a CR and the newline option is ANY or
3752           ANYCRLF, and we are now at a LF, advance the match position by one
3753           more code unit. */
3754 
3755           if (start_match[-1] == CHAR_CR &&
3756                (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3757                start_match < end_subject &&
3758                UCHAR21TEST(start_match) == CHAR_NL)
3759             start_match++;
3760           }
3761         }
3762 
3763       /* If there's no first code unit or a requirement for a multiline line
3764       start, advance to a non-unique first code unit if any have been
3765       identified. The bitmap contains only 256 bits. When code units are 16 or
3766       32 bits wide, all code units greater than 254 set the 255 bit. */
3767 
3768       else if (start_bits != NULL)
3769         {
3770         while (start_match < end_subject)
3771           {
3772           uint32_t c = UCHAR21TEST(start_match);
3773 #if PCRE2_CODE_UNIT_WIDTH != 8
3774           if (c > 255) c = 255;
3775 #endif
3776           if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3777           start_match++;
3778           }
3779 
3780         /* See comment above in first_cu checking about the next line. */
3781 
3782         if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3783             start_match >= mb->end_subject)
3784           break;
3785         }
3786       }  /* End of first code unit handling */
3787 
3788     /* Restore fudged end_subject */
3789 
3790     end_subject = mb->end_subject;
3791 
3792     /* The following two optimizations are disabled for partial matching. */
3793 
3794     if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3795       {
3796       PCRE2_SPTR p;
3797 
3798       /* The minimum matching length is a lower bound; no actual string of that
3799       length may actually match the pattern. Although the value is, strictly,
3800       in characters, we treat it as code units to avoid spending too much time
3801       in this optimization. */
3802 
3803       if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3804 
3805       /* If req_cu is set, we know that that code unit must appear in the
3806       subject for the match to succeed. If the first code unit is set, req_cu
3807       must be later in the subject; otherwise the test starts at the match
3808       point. This optimization can save a huge amount of backtracking in
3809       patterns with nested unlimited repeats that aren't going to match.
3810       Writing separate code for cased/caseless versions makes it go faster, as
3811       does using an autoincrement and backing off on a match. As in the case of
3812       the first code unit, using memchr() in the 8-bit library gives a big
3813       speed up. Unlike the first_cu check above, we do not need to call
3814       memchr() twice in the caseless case because we only need to check for the
3815       presence of the character in either case, not find the first occurrence.
3816 
3817       The search can be skipped if the code unit was found later than the
3818       current starting point in a previous iteration of the bumpalong loop.
3819 
3820       HOWEVER: when the subject string is very, very long, searching to its end
3821       can take a long time, and give bad performance on quite ordinary
3822       patterns. This showed up when somebody was matching something like
3823       /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3824       sufficiently long, but it's worth searching a lot more for unanchored
3825       patterns. */
3826 
3827       p = start_match + (has_first_cu? 1:0);
3828       if (has_req_cu && p > req_cu_ptr)
3829         {
3830         PCRE2_SIZE check_length = end_subject - start_match;
3831 
3832         if (check_length < REQ_CU_MAX ||
3833               (!anchored && check_length < REQ_CU_MAX * 1000))
3834           {
3835           if (req_cu != req_cu2)  /* Caseless */
3836             {
3837 #if PCRE2_CODE_UNIT_WIDTH != 8
3838             while (p < end_subject)
3839               {
3840               uint32_t pp = UCHAR21INCTEST(p);
3841               if (pp == req_cu || pp == req_cu2) { p--; break; }
3842               }
3843 #else  /* 8-bit code units */
3844             PCRE2_SPTR pp = p;
3845             p = memchr(pp, req_cu, end_subject - pp);
3846             if (p == NULL)
3847               {
3848               p = memchr(pp, req_cu2, end_subject - pp);
3849               if (p == NULL) p = end_subject;
3850               }
3851 #endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3852             }
3853 
3854           /* The caseful case */
3855 
3856           else
3857             {
3858 #if PCRE2_CODE_UNIT_WIDTH != 8
3859             while (p < end_subject)
3860               {
3861               if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3862               }
3863 
3864 #else  /* 8-bit code units */
3865             p = memchr(p, req_cu, end_subject - p);
3866             if (p == NULL) p = end_subject;
3867 #endif
3868             }
3869 
3870           /* If we can't find the required code unit, break the matching loop,
3871           forcing a match failure. */
3872 
3873           if (p >= end_subject) break;
3874 
3875           /* If we have found the required code unit, save the point where we
3876           found it, so that we don't search again next time round the loop if
3877           the start hasn't passed this code unit yet. */
3878 
3879           req_cu_ptr = p;
3880           }
3881         }
3882       }
3883     }
3884 
3885   /* ------------ End of start of match optimizations ------------ */
3886 
3887   /* Give no match if we have passed the bumpalong limit. */
3888 
3889   if (start_match > bumpalong_limit) break;
3890 
3891   /* OK, now we can do the business */
3892 
3893   mb->start_used_ptr = start_match;
3894   mb->last_used_ptr = start_match;
3895   mb->recursive = NULL;
3896 
3897   rc = internal_dfa_match(
3898     mb,                           /* fixed match data */
3899     mb->start_code,               /* this subexpression's code */
3900     start_match,                  /* where we currently are */
3901     start_offset,                 /* start offset in subject */
3902     match_data->ovector,          /* offset vector */
3903     (uint32_t)match_data->oveccount * 2,  /* actual size of same */
3904     workspace,                    /* workspace vector */
3905     (int)wscount,                 /* size of same */
3906     0,                            /* function recurse level */
3907     base_recursion_workspace);    /* initial workspace for recursion */
3908 
3909   /* Anything other than "no match" means we are done, always; otherwise, carry
3910   on only if not anchored. */
3911 
3912   if (rc != PCRE2_ERROR_NOMATCH || anchored)
3913     {
3914     if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3915       {
3916       match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3917       match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3918       }
3919     match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3920     match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3921     match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3922     match_data->rc = rc;
3923 
3924     if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
3925       {
3926       length = CU2BYTES(length + was_zero_terminated);
3927       match_data->subject = match_data->memctl.malloc(length,
3928         match_data->memctl.memory_data);
3929       if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
3930       memcpy((void *)match_data->subject, subject, length);
3931       match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
3932       }
3933     else
3934       {
3935       if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
3936       }
3937     goto EXIT;
3938     }
3939 
3940   /* Advance to the next subject character unless we are at the end of a line
3941   and firstline is set. */
3942 
3943   if (firstline && IS_NEWLINE(start_match)) break;
3944   start_match++;
3945 #ifdef SUPPORT_UNICODE
3946   if (utf)
3947     {
3948     ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3949     }
3950 #endif
3951   if (start_match > end_subject) break;
3952 
3953   /* If we have just passed a CR and we are now at a LF, and the pattern does
3954   not contain any explicit matches for \r or \n, and the newline option is CRLF
3955   or ANY or ANYCRLF, advance the match position by one more character. */
3956 
3957   if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3958       start_match < end_subject &&
3959       UCHAR21TEST(start_match) == CHAR_NL &&
3960       (re->flags & PCRE2_HASCRORLF) == 0 &&
3961         (mb->nltype == NLTYPE_ANY ||
3962          mb->nltype == NLTYPE_ANYCRLF ||
3963          mb->nllen == 2))
3964     start_match++;
3965 
3966   }   /* "Bumpalong" loop */
3967 
3968 NOMATCH_EXIT:
3969 rc = PCRE2_ERROR_NOMATCH;
3970 
3971 EXIT:
3972 while (rws->next != NULL)
3973   {
3974   RWS_anchor *next = rws->next;
3975   rws->next = next->next;
3976   mb->memctl.free(next, mb->memctl.memory_data);
3977   }
3978 
3979 return rc;
3980 }
3981 
3982 /* End of pcre2_dfa_match.c */
3983