1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2017 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
45
46 /* %ExternalCopyright% */
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75
76 #ifdef HAVE_CONFIG_H
77 #include "config.h"
78 #endif
79
80 #define NLBLOCK md /* Block containing newline information */
81 #define PSSTART start_subject /* Field containing processed string start */
82 #define PSEND end_subject /* Field containing processed string end */
83
84 #include "pcre_internal.h"
85
86
87 /* For use to indent debugging output */
88
89 #define SP " "
90
91
92 /*************************************************
93 * Code parameters and static tables *
94 *************************************************/
95
96 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 into others, under special conditions. A gap of 20 between the blocks should be
98 enough. The resulting opcodes don't have to be less than 256 because they are
99 never stored, so we push them well clear of the normal opcodes. */
100
101 #define OP_PROP_EXTRA 300
102 #define OP_EXTUNI_EXTRA 320
103 #define OP_ANYNL_EXTRA 340
104 #define OP_HSPACE_EXTRA 360
105 #define OP_VSPACE_EXTRA 380
106
107
/* This table identifies those opcodes that are followed immediately by a
character that is to be tested in some way. This makes it possible to
centralize the loading of these characters. In the case of Type * etc, the
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
small value. Non-zero values in the table are the offsets from the opcode where
the character is to be found. ***NOTE*** If the start of this table is
modified, the three tables that follow must also be modified.

The table is indexed by opcode value. internal_dfa_exec() consults it once per
active state: when the entry is greater than zero, the item at that offset from
the opcode is loaded into a local variable before the opcode is dispatched; a
zero entry means nothing is pre-loaded.

An entry of 1 means the item directly follows the opcode. The upto, minupto,
exact and possessive-upto forms use 1+IMM2_SIZE instead (presumably to step
over a repeat count stored between the opcode and the character - IMM2_SIZE is
defined outside this file, so confirm there).

The table must contain exactly OP_TABLE_LENGTH entries. This is checked at
compile time by the case labels at the top of the opcode switch in
internal_dfa_exec(), which collide if sizeof(coptable) differs from
OP_TABLE_LENGTH. */

static const pcre_uint8 coptable[] = {
0, /* End */
0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
0, 0, 0, /* Any, AllAny, Anybyte */
0, 0, /* \P, \p */
0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
0, /* \X */
0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
1, /* Char */
1, /* Chari */
1, /* not */
1, /* noti */
/* Positive single-char repeats */
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
1+IMM2_SIZE, /* exact */
1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
1+IMM2_SIZE, /* exact I */
1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
/* Negative single-char repeats - only for chars < 256 */
1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
1+IMM2_SIZE, /* NOT exact */
1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
1+IMM2_SIZE, /* NOT exact I */
1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
/* Positive type repeats */
1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
1+IMM2_SIZE, /* Type exact */
1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
/* Character class & ref repeats */
0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
0, 0, /* CRRANGE, CRMINRANGE */
0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */
0, /* CLASS */
0, /* NCLASS */
0, /* XCLASS - variable length */
0, /* REF */
0, /* REFI */
0, /* DNREF */
0, /* DNREFI */
0, /* RECURSE */
0, /* CALLOUT */
0, /* Alt */
0, /* Ket */
0, /* KetRmax */
0, /* KetRmin */
0, /* KetRpos */
0, /* Reverse */
0, /* Assert */
0, /* Assert not */
0, /* Assert behind */
0, /* Assert behind not */
0, 0, /* ONCE, ONCE_NC */
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
0, 0, /* CREF, DNCREF */
0, 0, /* RREF, DNRREF */
0, /* DEF */
0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
0, 0 /* CLOSE, SKIPZERO */
};
187
/* This table identifies those opcodes that inspect a character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached. ***NOTE*** If the start of this table is modified, the
two tables that follow must also be modified.

The table is indexed by opcode value; a non-zero entry marks an opcode that
looks at a subject character. internal_dfa_exec() tests the entry for the
current opcode when there is no current character (end of subject) and, if it
is non-zero, sets its could_continue flag, which its comments say is used when
testing for a partial match.

The table must contain exactly OP_TABLE_LENGTH entries. This is checked at
compile time by the case labels at the top of the opcode switch in
internal_dfa_exec(), which collide if sizeof(poptable) differs from
OP_TABLE_LENGTH. */

static const pcre_uint8 poptable[] = {
0, /* End */
0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
1, 1, 1, /* Any, AllAny, Anybyte */
1, 1, /* \P, \p */
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
1, /* \X */
0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
1, /* Char */
1, /* Chari */
1, /* not */
1, /* noti */
/* Positive single-char repeats */
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
1, 1, 1, /* upto, minupto, exact */
1, 1, 1, 1, /* *+, ++, ?+, upto+ */
1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
1, 1, 1, /* upto I, minupto I, exact I */
1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
/* Negative single-char repeats - only for chars < 256 */
1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
1, 1, 1, /* NOT upto, minupto, exact */
1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
1, 1, 1, /* NOT upto I, minupto I, exact I */
1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
/* Positive type repeats */
1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
1, 1, 1, /* Type upto, minupto, exact */
1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
/* Character class & ref repeats */
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
1, 1, /* CRRANGE, CRMINRANGE */
1, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */
1, /* CLASS */
1, /* NCLASS */
1, /* XCLASS - variable length */
0, /* REF */
0, /* REFI */
0, /* DNREF */
0, /* DNREFI */
0, /* RECURSE */
0, /* CALLOUT */
0, /* Alt */
0, /* Ket */
0, /* KetRmax */
0, /* KetRmin */
0, /* KetRpos */
0, /* Reverse */
0, /* Assert */
0, /* Assert not */
0, /* Assert behind */
0, /* Assert behind not */
0, 0, /* ONCE, ONCE_NC */
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
0, 0, /* CREF, DNCREF */
0, 0, /* RREF, DNRREF */
0, /* DEF */
0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
0, 0 /* CLOSE, SKIPZERO */
};
259
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
and \w. Both are laid out in the same order as the start of coptable and
poptable (which is why those tables warn that a change to their start must be
mirrored here): six leading entries for End, \A, \G, \K, \B and \b, then \D,
\d, \S, \s, \W, \w, and finally OP_ANY and OP_ALLANY.

toptable1 holds the ctype bit associated with each of the six type escapes,
the same bit for the negated and the positive form. toptable2 holds that bit
only for the negated forms (\D, \S, \W), zero for the positive forms, and 1
for OP_ANY and OP_ALLANY.

NOTE(review): the code that consumes these tables is outside this excerpt.
The layout suggests a test of the form
((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0 - confirm against the
opcode handlers further down the file. */

static const pcre_uint8 toptable1[] = {
0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
ctype_digit, ctype_digit, /* \D, \d */
ctype_space, ctype_space, /* \S, \s */
ctype_word, ctype_word, /* \W, \w */
0, 0 /* OP_ANY, OP_ALLANY */
};

static const pcre_uint8 toptable2[] = {
0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
ctype_digit, 0, /* \D, \d */
ctype_space, 0, /* \S, \s */
ctype_word, 0, /* \W, \w */
1, 1 /* OP_ANY, OP_ALLANY */
};
278
279
/* One stateblock describes a single active path through the match tree. The
blocks are laid out directly in the caller-supplied workspace, which is a
vector of ints, so every member must be an int and nothing else may be added
that would break that layout. */

struct stateblock {
  int offset;     /* Offset to opcode */
  int count;      /* Count for repeats */
  int data;       /* Some use extra data */
};

typedef struct stateblock stateblock;

/* Number of workspace ints occupied by one stateblock. */

#define INTS_PER_STATEBLOCK (int)(sizeof(stateblock) / sizeof(int))
292
293
294 #ifdef PCRE_DEBUG
295 /*************************************************
296 * Print character string *
297 *************************************************/
298
299 /* Character string printing function for debugging.
300
301 Arguments:
302 p points to string
303 length number of bytes
304 f where to print
305
306 Returns: nothing
307 */
308
309 static void
pchars(const pcre_uchar * p,int length,FILE * f)310 pchars(const pcre_uchar *p, int length, FILE *f)
311 {
312 pcre_uint32 c;
313 while (length-- > 0)
314 {
315 if (isprint(c = *(p++)))
316 fprintf(f, "%c", c);
317 else
318 fprintf(f, "\\x{%02x}", c);
319 }
320 }
321 #endif
322
323
324
325 /*************************************************
326 * Execute a Regular Expression - DFA engine *
327 *************************************************/
328
329 /* This internal function applies a compiled pattern to a subject string,
330 starting at a given point, using a DFA engine. This function is called from the
331 external one, possibly multiple times if the pattern is not anchored. The
332 function calls itself recursively for some kinds of subpattern.
333
334 Arguments:
335 md the match_data block with fixed information
336 this_start_code the opening bracket of this subexpression's code
337 current_subject where we currently are in the subject string
338 start_offset start offset in the subject string
339 offsets vector to contain the matching string offsets
340 offsetcount size of same
341 workspace vector of workspace
342 wscount size of same
343 rlevel function call recursion level
344
345 Returns: > 0 => number of match offset pairs placed in offsets
346 = 0 => offsets overflowed; longest matches are present
347 -1 => failed to match
348 < -1 => some kind of unexpected problem
349
350 The following macros are used for adding states to the two state vectors (one
351 for the current character, one for the following character). */
352
/* All four macros expand inside internal_dfa_exec() and use its local
variables directly: the counters active_count / new_count, the write pointers
next_active_state / next_new_state, the per-list capacity wscount, and rlevel
(only for indenting debug output). Each one post-increments the relevant
counter and, if the list was not already full, fills in the next stateblock and
advances the write pointer. If the list was full, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE immediately. Each use must be followed by a
semicolon, which terminates the trailing "else return". The arguments appear
again in the DPRINTF trace, so avoid passing expressions with side effects. */

/* Append state (offset x, count y) to the active list, i.e. the list being
processed for the current character. The data field is left unset. */

#define ADD_ACTIVE(x,y) \
if (active_count++ < wscount) \
{ \
next_active_state->offset = (x); \
next_active_state->count = (y); \
next_active_state++; \
DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
} \
else return PCRE_ERROR_DFA_WSSIZE

/* As ADD_ACTIVE, but also stores z in the data field. */

#define ADD_ACTIVE_DATA(x,y,z) \
if (active_count++ < wscount) \
{ \
next_active_state->offset = (x); \
next_active_state->count = (y); \
next_active_state->data = (z); \
next_active_state++; \
DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
} \
else return PCRE_ERROR_DFA_WSSIZE

/* Append state (offset x, count y) to the new list, i.e. the list that becomes
active for the following character. The data field is left unset. */

#define ADD_NEW(x,y) \
if (new_count++ < wscount) \
{ \
next_new_state->offset = (x); \
next_new_state->count = (y); \
next_new_state++; \
DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
} \
else return PCRE_ERROR_DFA_WSSIZE

/* As ADD_NEW, but also stores z in the data field. The debug trace for this
variant additionally reports the source line of the expansion. */

#define ADD_NEW_DATA(x,y,z) \
if (new_count++ < wscount) \
{ \
next_new_state->offset = (x); \
next_new_state->count = (y); \
next_new_state->data = (z); \
next_new_state++; \
DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
(x), (y), (z), __LINE__)); \
} \
else return PCRE_ERROR_DFA_WSSIZE
395
396 /* And now, here is the code */
397
398 static int
internal_dfa_exec(dfa_match_data * md,const pcre_uchar * this_start_code,const pcre_uchar * current_subject,int start_offset,int * offsets,int offsetcount,int * workspace,int wscount,int rlevel)399 internal_dfa_exec(
400 dfa_match_data *md,
401 const pcre_uchar *this_start_code,
402 const pcre_uchar *current_subject,
403 int start_offset,
404 int *offsets,
405 int offsetcount,
406 int *workspace,
407 int wscount,
408 int rlevel)
409 {
410 stateblock *active_states, *new_states, *temp_states;
411 stateblock *next_active_state, *next_new_state;
412
413 const pcre_uint8 *ctypes, *lcc, *fcc;
414 const pcre_uchar *ptr;
415 const pcre_uchar *end_code, *first_op;
416
417 dfa_recursion_info new_recursive;
418
419 int active_count, new_count, match_count;
420
421 /* Some fields in the md block are frequently referenced, so we load them into
422 independent variables in the hope that this will perform better. */
423
424 const pcre_uchar *start_subject = md->start_subject;
425 const pcre_uchar *end_subject = md->end_subject;
426 const pcre_uchar *start_code = md->start_code;
427
428 #ifdef SUPPORT_UTF
429 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
430 #else
431 BOOL utf = FALSE;
432 #endif
433
434 BOOL reset_could_continue = FALSE;
435
436 rlevel++;
437 offsetcount &= (-2);
438
439 wscount -= 2;
440 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
441 (2 * INTS_PER_STATEBLOCK);
442
443 DPRINTF(("\n%.*s---------------------\n"
444 "%.*sCall to internal_dfa_exec f=%d\n",
445 rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
446
447 ctypes = md->tables + ctypes_offset;
448 lcc = md->tables + lcc_offset;
449 fcc = md->tables + fcc_offset;
450
451 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
452
453 active_states = (stateblock *)(workspace + 2);
454 next_new_state = new_states = active_states + wscount;
455 new_count = 0;
456
457 first_op = this_start_code + 1 + LINK_SIZE +
458 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
459 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
460 ? IMM2_SIZE:0);
461
462 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
463 the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
465 matching internal ket rather than at the end.
466
467 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
468 a backward assertion. In that case, we have to find out the maximum amount to
469 move back, and set up each alternative appropriately. */
470
471 if (*first_op == OP_REVERSE)
472 {
473 int max_back = 0;
474 int gone_back;
475
476 end_code = this_start_code;
477 do
478 {
479 int back = GET(end_code, 2+LINK_SIZE);
480 if (back > max_back) max_back = back;
481 end_code += GET(end_code, 1);
482 }
483 while (*end_code == OP_ALT);
484
485 /* If we can't go back the amount required for the longest lookbehind
486 pattern, go back as far as we can; some alternatives may still be viable. */
487
488 #ifdef SUPPORT_UTF
489 /* In character mode we have to step back character by character */
490
491 if (utf)
492 {
493 for (gone_back = 0; gone_back < max_back; gone_back++)
494 {
495 if (current_subject <= start_subject) break;
496 current_subject--;
497 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
498 }
499 }
500 else
501 #endif
502
503 /* In byte-mode we can do this quickly. */
504
505 {
506 gone_back = (current_subject - max_back < start_subject)?
507 (int)(current_subject - start_subject) : max_back;
508 current_subject -= gone_back;
509 }
510
511 /* Save the earliest consulted character */
512
513 if (current_subject < md->start_used_ptr)
514 md->start_used_ptr = current_subject;
515
516 /* Now we can process the individual branches. */
517
518 end_code = this_start_code;
519 do
520 {
521 int back = GET(end_code, 2+LINK_SIZE);
522 if (back <= gone_back)
523 {
524 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
525 ADD_NEW_DATA(-bstate, 0, gone_back - back);
526 }
527 end_code += GET(end_code, 1);
528 }
529 while (*end_code == OP_ALT);
530 }
531
532 /* This is the code for a "normal" subpattern (not a backward assertion). The
533 start of a whole pattern is always one of these. If we are at the top level,
534 we may be asked to restart matching from the same point that we reached for a
535 previous partial match. We still have to scan through the top-level branches to
536 find the end state. */
537
538 else
539 {
540 end_code = this_start_code;
541
542 /* Restarting */
543
544 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
545 {
546 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
547 new_count = workspace[1];
548 if (!workspace[0])
549 memcpy(new_states, active_states, new_count * sizeof(stateblock));
550 }
551
552 /* Not restarting */
553
554 else
555 {
556 int length = 1 + LINK_SIZE +
557 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
558 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
559 ? IMM2_SIZE:0);
560 do
561 {
562 ADD_NEW((int)(end_code - start_code + length), 0);
563 end_code += GET(end_code, 1);
564 length = 1 + LINK_SIZE;
565 }
566 while (*end_code == OP_ALT);
567 }
568 }
569
570 workspace[0] = 0; /* Bit indicating which vector is current */
571
572 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
573
574 /* Loop for scanning the subject */
575
576 ptr = current_subject;
577 for (;;)
578 {
579 int i, j;
580 int clen, dlen;
581 pcre_uint32 c, d;
582 int forced_fail = 0;
583 BOOL partial_newline = FALSE;
584 BOOL could_continue = reset_could_continue;
585 reset_could_continue = FALSE;
586
587 /* Make the new state list into the active state list and empty the
588 new state list. */
589
590 temp_states = active_states;
591 active_states = new_states;
592 new_states = temp_states;
593 active_count = new_count;
594 new_count = 0;
595
596 workspace[0] ^= 1; /* Remember for the restarting feature */
597 workspace[1] = active_count;
598
599 #ifdef PCRE_DEBUG
600 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
601 pchars(ptr, STRLEN_UC(ptr), stdout);
602 printf("\"\n");
603
604 printf("%.*sActive states: ", rlevel*2-2, SP);
605 for (i = 0; i < active_count; i++)
606 printf("%d/%d ", active_states[i].offset, active_states[i].count);
607 printf("\n");
608 #endif
609
610 /* Set the pointers for adding new states */
611
612 next_active_state = active_states + active_count;
613 next_new_state = new_states;
614
615 /* Load the current character from the subject outside the loop, as many
616 different states may want to look at it, and we assume that at least one
617 will. */
618
619 if (ptr < end_subject)
620 {
621 clen = 1; /* Number of data items in the character */
622 #ifdef SUPPORT_UTF
623 GETCHARLENTEST(c, ptr, clen);
624 #else
625 c = *ptr;
626 #endif /* SUPPORT_UTF */
627 }
628 else
629 {
630 clen = 0; /* This indicates the end of the subject */
631 c = NOTACHAR; /* This value should never actually be used */
632 }
633
634 /* Scan up the active states and act on each one. The result of an action
635 may be to add more states to the currently active list (e.g. on hitting a
636 parenthesis) or it may be to put states on the new list, for considering
637 when we move the character pointer on. */
638
639 for (i = 0; i < active_count; i++)
640 {
641 stateblock *current_state = active_states + i;
642 BOOL caseless = FALSE;
643 const pcre_uchar *code;
644 int state_offset = current_state->offset;
645 int codevalue, rrc;
646 int count;
647
648 #ifdef PCRE_DEBUG
649 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
650 if (clen == 0) printf("EOL\n");
651 else if (c > 32 && c < 127) printf("'%c'\n", c);
652 else printf("0x%02x\n", c);
653 #endif
654
655 /* A negative offset is a special case meaning "hold off going to this
656 (negated) state until the number of characters in the data field have
657 been skipped". If the could_continue flag was passed over from a previous
state, arrange for it to be passed on. */
659
660 if (state_offset < 0)
661 {
662 if (current_state->data > 0)
663 {
664 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
665 ADD_NEW_DATA(state_offset, current_state->count,
666 current_state->data - 1);
667 if (could_continue) reset_could_continue = TRUE;
668 continue;
669 }
670 else
671 {
672 current_state->offset = state_offset = -state_offset;
673 }
674 }
675
676 /* Check for a duplicate state with the same count, and skip if found.
677 See the note at the head of this module about the possibility of improving
678 performance here. */
679
680 for (j = 0; j < i; j++)
681 {
682 if (active_states[j].offset == state_offset &&
683 active_states[j].count == current_state->count)
684 {
685 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
686 goto NEXT_ACTIVE_STATE;
687 }
688 }
689
690 /* The state offset is the offset to the opcode */
691
692 code = start_code + state_offset;
693 codevalue = *code;
694
695 /* If this opcode inspects a character, but we are at the end of the
696 subject, remember the fact for use when testing for a partial match. */
697
698 if (clen == 0 && poptable[codevalue] != 0)
699 could_continue = TRUE;
700
701 /* If this opcode is followed by an inline character, load it. It is
702 tempting to test for the presence of a subject character here, but that
703 is wrong, because sometimes zero repetitions of the subject are
704 permitted.
705
706 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
707 argument that is not a data character - but is always one byte long because
708 the values are small. We have to take special action to deal with \P, \p,
709 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
710 these ones to new opcodes. */
711
712 if (coptable[codevalue] > 0)
713 {
714 dlen = 1;
715 #ifdef SUPPORT_UTF
716 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
717 #endif /* SUPPORT_UTF */
718 d = code[coptable[codevalue]];
719 if (codevalue >= OP_TYPESTAR)
720 {
721 switch(d)
722 {
723 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
724 case OP_NOTPROP:
725 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
726 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
727 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
728 case OP_NOT_HSPACE:
729 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
730 case OP_NOT_VSPACE:
731 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
732 default: break;
733 }
734 }
735 }
736 else
737 {
738 dlen = 0; /* Not strictly necessary, but compilers moan */
739 d = NOTACHAR; /* if these variables are not set. */
740 }
741
742
743 /* Now process the individual opcodes */
744
745 switch (codevalue)
746 {
747 /* ========================================================================== */
748 /* These cases are never obeyed. This is a fudge that causes a compile-
749 time error if the vectors coptable or poptable, which are indexed by
750 opcode, are not the correct length. It seems to be the only way to do
751 such a check at compile time, as the sizeof() operator does not work
752 in the C preprocessor. */
753
754 case OP_TABLE_LENGTH:
755 case OP_TABLE_LENGTH +
756 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
757 (sizeof(poptable) == OP_TABLE_LENGTH)):
758 break;
759
760 /* ========================================================================== */
761 /* Reached a closing bracket. If not at the end of the pattern, carry
762 on with the next opcode. For repeating opcodes, also add the repeat
763 state. Note that KETRPOS will always be encountered at the end of the
764 subpattern, because the possessive subpattern repeats are always handled
765 using recursive calls. Thus, it never adds any new states.
766
767 At the end of the (sub)pattern, unless we have an empty string and
768 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
769 start of the subject, save the match data, shifting up all previous
770 matches so we always have the longest first. */
771
772 case OP_KET:
773 case OP_KETRMIN:
774 case OP_KETRMAX:
775 case OP_KETRPOS:
776 if (code != end_code)
777 {
778 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
779 if (codevalue != OP_KET)
780 {
781 ADD_ACTIVE(state_offset - GET(code, 1), 0);
782 }
783 }
784 else
785 {
786 if (ptr > current_subject ||
787 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
788 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
789 current_subject > start_subject + md->start_offset)))
790 {
791 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
792 else if (match_count > 0 && ++match_count * 2 > offsetcount)
793 match_count = 0;
794 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
795 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
796 if (offsetcount >= 2)
797 {
798 offsets[0] = (int)(current_subject - start_subject);
799 offsets[1] = (int)(ptr - start_subject);
800 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
801 offsets[1] - offsets[0], (char *)current_subject));
802 }
803 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
804 {
805 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
806 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
807 match_count, rlevel*2-2, SP));
808 return match_count;
809 }
810 }
811 }
812 break;
813
814 /* ========================================================================== */
815 /* These opcodes add to the current list of states without looking
816 at the current character. */
817
818 /*-----------------------------------------------------------------*/
819 case OP_ALT:
820 do { code += GET(code, 1); } while (*code == OP_ALT);
821 ADD_ACTIVE((int)(code - start_code), 0);
822 break;
823
824 /*-----------------------------------------------------------------*/
825 case OP_BRA:
826 case OP_SBRA:
827 do
828 {
829 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
830 code += GET(code, 1);
831 }
832 while (*code == OP_ALT);
833 break;
834
835 /*-----------------------------------------------------------------*/
836 case OP_CBRA:
837 case OP_SCBRA:
838 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
839 code += GET(code, 1);
840 while (*code == OP_ALT)
841 {
842 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
843 code += GET(code, 1);
844 }
845 break;
846
847 /*-----------------------------------------------------------------*/
848 case OP_BRAZERO:
849 case OP_BRAMINZERO:
850 ADD_ACTIVE(state_offset + 1, 0);
851 code += 1 + GET(code, 2);
852 while (*code == OP_ALT) code += GET(code, 1);
853 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
854 break;
855
856 /*-----------------------------------------------------------------*/
857 case OP_SKIPZERO:
858 code += 1 + GET(code, 2);
859 while (*code == OP_ALT) code += GET(code, 1);
860 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
861 break;
862
863 /*-----------------------------------------------------------------*/
864 case OP_CIRC:
865 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
866 { ADD_ACTIVE(state_offset + 1, 0); }
867 break;
868
869 /*-----------------------------------------------------------------*/
870 case OP_CIRCM:
871 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
872 (ptr != end_subject && WAS_NEWLINE(ptr)))
873 { ADD_ACTIVE(state_offset + 1, 0); }
874 break;
875
876 /*-----------------------------------------------------------------*/
877 case OP_EOD:
878 if (ptr >= end_subject)
879 {
880 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
881 could_continue = TRUE;
882 else { ADD_ACTIVE(state_offset + 1, 0); }
883 }
884 break;
885
886 /*-----------------------------------------------------------------*/
887 case OP_SOD:
888 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
889 break;
890
891 /*-----------------------------------------------------------------*/
892 case OP_SOM:
893 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
894 break;
895
896
897 /* ========================================================================== */
898 /* These opcodes inspect the next subject character, and sometimes
899 the previous one as well, but do not have an argument. The variable
900 clen contains the length of the current character and is zero if we are
901 at the end of the subject. */
902
903 /*-----------------------------------------------------------------*/
904 case OP_ANY:
905 if (clen > 0 && !IS_NEWLINE(ptr))
906 {
907 if (ptr + 1 >= md->end_subject &&
908 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
909 NLBLOCK->nltype == NLTYPE_FIXED &&
910 NLBLOCK->nllen == 2 &&
911 c == NLBLOCK->nl[0])
912 {
913 could_continue = partial_newline = TRUE;
914 }
915 else
916 {
917 ADD_NEW(state_offset + 1, 0);
918 }
919 }
920 break;
921
922 /*-----------------------------------------------------------------*/
923 case OP_ALLANY:
924 if (clen > 0)
925 { ADD_NEW(state_offset + 1, 0); }
926 break;
927
928 /*-----------------------------------------------------------------*/
929 case OP_EODN:
930 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
931 could_continue = TRUE;
932 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
933 { ADD_ACTIVE(state_offset + 1, 0); }
934 break;
935
936 /*-----------------------------------------------------------------*/
937 case OP_DOLL:
938 if ((md->moptions & PCRE_NOTEOL) == 0)
939 {
940 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
941 could_continue = TRUE;
942 else if (clen == 0 ||
943 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
944 (ptr == end_subject - md->nllen)
945 ))
946 { ADD_ACTIVE(state_offset + 1, 0); }
947 else if (ptr + 1 >= md->end_subject &&
948 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
949 NLBLOCK->nltype == NLTYPE_FIXED &&
950 NLBLOCK->nllen == 2 &&
951 c == NLBLOCK->nl[0])
952 {
953 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
954 {
955 reset_could_continue = TRUE;
956 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
957 }
958 else could_continue = partial_newline = TRUE;
959 }
960 }
961 break;
962
963 /*-----------------------------------------------------------------*/
964 case OP_DOLLM:
965 if ((md->moptions & PCRE_NOTEOL) == 0)
966 {
967 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
968 could_continue = TRUE;
969 else if (clen == 0 ||
970 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
971 { ADD_ACTIVE(state_offset + 1, 0); }
972 else if (ptr + 1 >= md->end_subject &&
973 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
974 NLBLOCK->nltype == NLTYPE_FIXED &&
975 NLBLOCK->nllen == 2 &&
976 c == NLBLOCK->nl[0])
977 {
978 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
979 {
980 reset_could_continue = TRUE;
981 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
982 }
983 else could_continue = partial_newline = TRUE;
984 }
985 }
986 else if (IS_NEWLINE(ptr))
987 { ADD_ACTIVE(state_offset + 1, 0); }
988 break;
989
990 /*-----------------------------------------------------------------*/
991
992 case OP_DIGIT:
993 case OP_WHITESPACE:
994 case OP_WORDCHAR:
995 if (clen > 0 && c < 256 &&
996 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
997 { ADD_NEW(state_offset + 1, 0); }
998 break;
999
1000 /*-----------------------------------------------------------------*/
1001 case OP_NOT_DIGIT:
1002 case OP_NOT_WHITESPACE:
1003 case OP_NOT_WORDCHAR:
1004 if (clen > 0 && (c >= 256 ||
1005 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1006 { ADD_NEW(state_offset + 1, 0); }
1007 break;
1008
1009 /*-----------------------------------------------------------------*/
1010 case OP_WORD_BOUNDARY:
1011 case OP_NOT_WORD_BOUNDARY:
1012 {
1013 int left_word, right_word;
1014
1015 if (ptr > start_subject)
1016 {
1017 const pcre_uchar *temp = ptr - 1;
1018 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1019 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1020 if (utf) { BACKCHAR(temp); }
1021 #endif
1022 GETCHARTEST(d, temp);
1023 #ifdef SUPPORT_UCP
1024 if ((md->poptions & PCRE_UCP) != 0)
1025 {
1026 if (d == '_') left_word = TRUE; else
1027 {
1028 int cat = UCD_CATEGORY(d);
1029 left_word = (cat == ucp_L || cat == ucp_N);
1030 }
1031 }
1032 else
1033 #endif
1034 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1035 }
1036 else left_word = FALSE;
1037
1038 if (clen > 0)
1039 {
1040 #ifdef SUPPORT_UCP
1041 if ((md->poptions & PCRE_UCP) != 0)
1042 {
1043 if (c == '_') right_word = TRUE; else
1044 {
1045 int cat = UCD_CATEGORY(c);
1046 right_word = (cat == ucp_L || cat == ucp_N);
1047 }
1048 }
1049 else
1050 #endif
1051 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1052 }
1053 else right_word = FALSE;
1054
1055 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1056 { ADD_ACTIVE(state_offset + 1, 0); }
1057 }
1058 break;
1059
1060
1061 /*-----------------------------------------------------------------*/
1062 /* Check the next character by Unicode property. We will get here only
1063 if the support is in the binary; otherwise a compile-time error occurs.
1064 */
1065
1066 #ifdef SUPPORT_UCP
1067 case OP_PROP:
1068 case OP_NOTPROP:
1069 if (clen > 0)
1070 {
1071 BOOL OK;
1072 const pcre_uint32 *cp;
1073 const ucd_record * prop = GET_UCD(c);
1074 switch(code[1])
1075 {
1076 case PT_ANY:
1077 OK = TRUE;
1078 break;
1079
1080 case PT_LAMP:
1081 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1082 prop->chartype == ucp_Lt;
1083 break;
1084
1085 case PT_GC:
1086 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1087 break;
1088
1089 case PT_PC:
1090 OK = prop->chartype == code[2];
1091 break;
1092
1093 case PT_SC:
1094 OK = prop->script == code[2];
1095 break;
1096
1097 /* These are specials for combination cases. */
1098
1099 case PT_ALNUM:
1100 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1101 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1102 break;
1103
1104 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1105 which means that Perl space and POSIX space are now identical. PCRE
1106 was changed at release 8.34. */
1107
1108 case PT_SPACE: /* Perl space */
1109 case PT_PXSPACE: /* POSIX space */
1110 switch(c)
1111 {
1112 HSPACE_CASES:
1113 VSPACE_CASES:
1114 OK = TRUE;
1115 break;
1116
1117 default:
1118 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1119 break;
1120 }
1121 break;
1122
1123 case PT_WORD:
1124 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1125 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1126 c == CHAR_UNDERSCORE;
1127 break;
1128
1129 case PT_CLIST:
1130 cp = PRIV(ucd_caseless_sets) + code[2];
1131 for (;;)
1132 {
1133 if (c < *cp) { OK = FALSE; break; }
1134 if (c == *cp++) { OK = TRUE; break; }
1135 }
1136 break;
1137
1138 case PT_UCNC:
1139 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1140 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1141 c >= 0xe000;
1142 break;
1143
1144 /* Should never occur, but keep compilers from grumbling. */
1145
1146 default:
1147 OK = codevalue != OP_PROP;
1148 break;
1149 }
1150
1151 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1152 }
1153 break;
1154 #endif
1155
1156
1157
1158 /* ========================================================================== */
1159 /* These opcodes likewise inspect the subject character, but have an
1160 argument that is not a data character. It is one of these opcodes:
1161 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1162 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1163
1164 case OP_TYPEPLUS:
1165 case OP_TYPEMINPLUS:
1166 case OP_TYPEPOSPLUS:
1167 count = current_state->count; /* Already matched */
1168 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1169 if (clen > 0)
1170 {
1171 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1172 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1173 NLBLOCK->nltype == NLTYPE_FIXED &&
1174 NLBLOCK->nllen == 2 &&
1175 c == NLBLOCK->nl[0])
1176 {
1177 could_continue = partial_newline = TRUE;
1178 }
1179 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1180 (c < 256 &&
1181 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1182 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1183 {
1184 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1185 {
1186 active_count--; /* Remove non-match possibility */
1187 next_active_state--;
1188 }
1189 count++;
1190 ADD_NEW(state_offset, count);
1191 }
1192 }
1193 break;
1194
1195 /*-----------------------------------------------------------------*/
1196 case OP_TYPEQUERY:
1197 case OP_TYPEMINQUERY:
1198 case OP_TYPEPOSQUERY:
1199 ADD_ACTIVE(state_offset + 2, 0);
1200 if (clen > 0)
1201 {
1202 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1203 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1204 NLBLOCK->nltype == NLTYPE_FIXED &&
1205 NLBLOCK->nllen == 2 &&
1206 c == NLBLOCK->nl[0])
1207 {
1208 could_continue = partial_newline = TRUE;
1209 }
1210 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1211 (c < 256 &&
1212 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1213 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1214 {
1215 if (codevalue == OP_TYPEPOSQUERY)
1216 {
1217 active_count--; /* Remove non-match possibility */
1218 next_active_state--;
1219 }
1220 ADD_NEW(state_offset + 2, 0);
1221 }
1222 }
1223 break;
1224
1225 /*-----------------------------------------------------------------*/
1226 case OP_TYPESTAR:
1227 case OP_TYPEMINSTAR:
1228 case OP_TYPEPOSSTAR:
1229 ADD_ACTIVE(state_offset + 2, 0);
1230 if (clen > 0)
1231 {
1232 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1233 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1234 NLBLOCK->nltype == NLTYPE_FIXED &&
1235 NLBLOCK->nllen == 2 &&
1236 c == NLBLOCK->nl[0])
1237 {
1238 could_continue = partial_newline = TRUE;
1239 }
1240 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1241 (c < 256 &&
1242 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1243 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1244 {
1245 if (codevalue == OP_TYPEPOSSTAR)
1246 {
1247 active_count--; /* Remove non-match possibility */
1248 next_active_state--;
1249 }
1250 ADD_NEW(state_offset, 0);
1251 }
1252 }
1253 break;
1254
1255 /*-----------------------------------------------------------------*/
1256 case OP_TYPEEXACT:
1257 count = current_state->count; /* Number already matched */
1258 if (clen > 0)
1259 {
1260 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1261 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1262 NLBLOCK->nltype == NLTYPE_FIXED &&
1263 NLBLOCK->nllen == 2 &&
1264 c == NLBLOCK->nl[0])
1265 {
1266 could_continue = partial_newline = TRUE;
1267 }
1268 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1269 (c < 256 &&
1270 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1271 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1272 {
1273 if (++count >= (int)GET2(code, 1))
1274 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1275 else
1276 { ADD_NEW(state_offset, count); }
1277 }
1278 }
1279 break;
1280
1281 /*-----------------------------------------------------------------*/
1282 case OP_TYPEUPTO:
1283 case OP_TYPEMINUPTO:
1284 case OP_TYPEPOSUPTO:
1285 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1286 count = current_state->count; /* Number already matched */
1287 if (clen > 0)
1288 {
1289 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1290 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1291 NLBLOCK->nltype == NLTYPE_FIXED &&
1292 NLBLOCK->nllen == 2 &&
1293 c == NLBLOCK->nl[0])
1294 {
1295 could_continue = partial_newline = TRUE;
1296 }
1297 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1298 (c < 256 &&
1299 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1300 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1301 {
1302 if (codevalue == OP_TYPEPOSUPTO)
1303 {
1304 active_count--; /* Remove non-match possibility */
1305 next_active_state--;
1306 }
1307 if (++count >= (int)GET2(code, 1))
1308 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1309 else
1310 { ADD_NEW(state_offset, count); }
1311 }
1312 }
1313 break;
1314
1315 /* ========================================================================== */
1316 /* These are virtual opcodes that are used when something like
1317 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
1318 OP_VSPACE/OP_HSPACE family as its argument. It keeps the code above fast
1319 for the other cases. The argument is in the d variable. */
1320
1321 #ifdef SUPPORT_UCP
1322 case OP_PROP_EXTRA + OP_TYPEPLUS:
1323 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1324 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1325 count = current_state->count; /* Already matched */
1326 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1327 if (clen > 0)
1328 {
1329 BOOL OK;
1330 const pcre_uint32 *cp;
1331 const ucd_record * prop = GET_UCD(c);
1332 switch(code[2])
1333 {
1334 case PT_ANY:
1335 OK = TRUE;
1336 break;
1337
1338 case PT_LAMP:
1339 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1340 prop->chartype == ucp_Lt;
1341 break;
1342
1343 case PT_GC:
1344 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1345 break;
1346
1347 case PT_PC:
1348 OK = prop->chartype == code[3];
1349 break;
1350
1351 case PT_SC:
1352 OK = prop->script == code[3];
1353 break;
1354
1355 /* These are specials for combination cases. */
1356
1357 case PT_ALNUM:
1358 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1359 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1360 break;
1361
1362 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1363 which means that Perl space and POSIX space are now identical. PCRE
1364 was changed at release 8.34. */
1365
1366 case PT_SPACE: /* Perl space */
1367 case PT_PXSPACE: /* POSIX space */
1368 switch(c)
1369 {
1370 HSPACE_CASES:
1371 VSPACE_CASES:
1372 OK = TRUE;
1373 break;
1374
1375 default:
1376 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1377 break;
1378 }
1379 break;
1380
1381 case PT_WORD:
1382 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1383 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1384 c == CHAR_UNDERSCORE;
1385 break;
1386
1387 case PT_CLIST:
1388 cp = PRIV(ucd_caseless_sets) + code[3];
1389 for (;;)
1390 {
1391 if (c < *cp) { OK = FALSE; break; }
1392 if (c == *cp++) { OK = TRUE; break; }
1393 }
1394 break;
1395
1396 case PT_UCNC:
1397 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1398 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1399 c >= 0xe000;
1400 break;
1401
1402 /* Should never occur, but keep compilers from grumbling. */
1403
1404 default:
1405 OK = codevalue != OP_PROP;
1406 break;
1407 }
1408
1409 if (OK == (d == OP_PROP))
1410 {
1411 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1412 {
1413 active_count--; /* Remove non-match possibility */
1414 next_active_state--;
1415 }
1416 count++;
1417 ADD_NEW(state_offset, count);
1418 }
1419 }
1420 break;
1421
1422 /*-----------------------------------------------------------------*/
1423 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1424 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1425 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1426 count = current_state->count; /* Already matched */
1427 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1428 if (clen > 0)
1429 {
1430 int lgb, rgb;
1431 const pcre_uchar *nptr = ptr + clen;
1432 int ncount = 0;
1433 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1434 {
1435 active_count--; /* Remove non-match possibility */
1436 next_active_state--;
1437 }
1438 lgb = UCD_GRAPHBREAK(c);
1439 while (nptr < end_subject)
1440 {
1441 dlen = 1;
1442 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1443 rgb = UCD_GRAPHBREAK(d);
1444 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1445 ncount++;
1446 lgb = rgb;
1447 nptr += dlen;
1448 }
1449 count++;
1450 ADD_NEW_DATA(-state_offset, count, ncount);
1451 }
1452 break;
1453 #endif
1454
1455 /*-----------------------------------------------------------------*/
1456 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1457 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1458 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1459 count = current_state->count; /* Already matched */
1460 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1461 if (clen > 0)
1462 {
1463 int ncount = 0;
1464 switch (c)
1465 {
1466 case CHAR_VT:
1467 case CHAR_FF:
1468 case CHAR_NEL:
1469 #ifndef EBCDIC
1470 case 0x2028:
1471 case 0x2029:
1472 #endif /* Not EBCDIC */
1473 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1474 goto ANYNL01;
1475
1476 case CHAR_CR:
1477 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1478 /* Fall through */
1479
1480 ANYNL01:
1481 case CHAR_LF:
1482 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1483 {
1484 active_count--; /* Remove non-match possibility */
1485 next_active_state--;
1486 }
1487 count++;
1488 ADD_NEW_DATA(-state_offset, count, ncount);
1489 break;
1490
1491 default:
1492 break;
1493 }
1494 }
1495 break;
1496
1497 /*-----------------------------------------------------------------*/
1498 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1499 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1500 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1501 count = current_state->count; /* Already matched */
1502 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1503 if (clen > 0)
1504 {
1505 BOOL OK;
1506 switch (c)
1507 {
1508 VSPACE_CASES:
1509 OK = TRUE;
1510 break;
1511
1512 default:
1513 OK = FALSE;
1514 break;
1515 }
1516
1517 if (OK == (d == OP_VSPACE))
1518 {
1519 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1520 {
1521 active_count--; /* Remove non-match possibility */
1522 next_active_state--;
1523 }
1524 count++;
1525 ADD_NEW_DATA(-state_offset, count, 0);
1526 }
1527 }
1528 break;
1529
1530 /*-----------------------------------------------------------------*/
1531 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1532 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1533 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1534 count = current_state->count; /* Already matched */
1535 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1536 if (clen > 0)
1537 {
1538 BOOL OK;
1539 switch (c)
1540 {
1541 HSPACE_CASES:
1542 OK = TRUE;
1543 break;
1544
1545 default:
1546 OK = FALSE;
1547 break;
1548 }
1549
1550 if (OK == (d == OP_HSPACE))
1551 {
1552 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1553 {
1554 active_count--; /* Remove non-match possibility */
1555 next_active_state--;
1556 }
1557 count++;
1558 ADD_NEW_DATA(-state_offset, count, 0);
1559 }
1560 }
1561 break;
1562
1563 /*-----------------------------------------------------------------*/
1564 #ifdef SUPPORT_UCP
1565 case OP_PROP_EXTRA + OP_TYPEQUERY:
1566 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1567 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1568 count = 4;
1569 goto QS1;
1570
1571 case OP_PROP_EXTRA + OP_TYPESTAR:
1572 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1573 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1574 count = 0;
1575
1576 QS1:
1577
1578 ADD_ACTIVE(state_offset + 4, 0);
1579 if (clen > 0)
1580 {
1581 BOOL OK;
1582 const pcre_uint32 *cp;
1583 const ucd_record * prop = GET_UCD(c);
1584 switch(code[2])
1585 {
1586 case PT_ANY:
1587 OK = TRUE;
1588 break;
1589
1590 case PT_LAMP:
1591 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1592 prop->chartype == ucp_Lt;
1593 break;
1594
1595 case PT_GC:
1596 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1597 break;
1598
1599 case PT_PC:
1600 OK = prop->chartype == code[3];
1601 break;
1602
1603 case PT_SC:
1604 OK = prop->script == code[3];
1605 break;
1606
1607 /* These are specials for combination cases. */
1608
1609 case PT_ALNUM:
1610 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1611 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1612 break;
1613
1614 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1615 which means that Perl space and POSIX space are now identical. PCRE
1616 was changed at release 8.34. */
1617
1618 case PT_SPACE: /* Perl space */
1619 case PT_PXSPACE: /* POSIX space */
1620 switch(c)
1621 {
1622 HSPACE_CASES:
1623 VSPACE_CASES:
1624 OK = TRUE;
1625 break;
1626
1627 default:
1628 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1629 break;
1630 }
1631 break;
1632
1633 case PT_WORD:
1634 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1635 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1636 c == CHAR_UNDERSCORE;
1637 break;
1638
1639 case PT_CLIST:
1640 cp = PRIV(ucd_caseless_sets) + code[3];
1641 for (;;)
1642 {
1643 if (c < *cp) { OK = FALSE; break; }
1644 if (c == *cp++) { OK = TRUE; break; }
1645 }
1646 break;
1647
1648 case PT_UCNC:
1649 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1650 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1651 c >= 0xe000;
1652 break;
1653
1654 /* Should never occur, but keep compilers from grumbling. */
1655
1656 default:
1657 OK = codevalue != OP_PROP;
1658 break;
1659 }
1660
1661 if (OK == (d == OP_PROP))
1662 {
1663 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1664 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1665 {
1666 active_count--; /* Remove non-match possibility */
1667 next_active_state--;
1668 }
1669 ADD_NEW(state_offset + count, 0);
1670 }
1671 }
1672 break;
1673
1674 /*-----------------------------------------------------------------*/
1675 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1676 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1677 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1678 count = 2;
1679 goto QS2;
1680
1681 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1682 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1683 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1684 count = 0;
1685
1686 QS2:
1687
1688 ADD_ACTIVE(state_offset + 2, 0);
1689 if (clen > 0)
1690 {
1691 int lgb, rgb;
1692 const pcre_uchar *nptr = ptr + clen;
1693 int ncount = 0;
1694 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1695 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1696 {
1697 active_count--; /* Remove non-match possibility */
1698 next_active_state--;
1699 }
1700 lgb = UCD_GRAPHBREAK(c);
1701 while (nptr < end_subject)
1702 {
1703 dlen = 1;
1704 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1705 rgb = UCD_GRAPHBREAK(d);
1706 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1707 ncount++;
1708 lgb = rgb;
1709 nptr += dlen;
1710 }
1711 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1712 }
1713 break;
1714 #endif
1715
1716 /*-----------------------------------------------------------------*/
1717 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1718 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1719 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1720 count = 2;
1721 goto QS3;
1722
1723 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1724 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1725 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1726 count = 0;
1727
1728 QS3:
1729 ADD_ACTIVE(state_offset + 2, 0);
1730 if (clen > 0)
1731 {
1732 int ncount = 0;
1733 switch (c)
1734 {
1735 case CHAR_VT:
1736 case CHAR_FF:
1737 case CHAR_NEL:
1738 #ifndef EBCDIC
1739 case 0x2028:
1740 case 0x2029:
1741 #endif /* Not EBCDIC */
1742 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1743 goto ANYNL02;
1744
1745 case CHAR_CR:
1746 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1747 /* Fall through */
1748
1749 ANYNL02:
1750 case CHAR_LF:
1751 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1752 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1753 {
1754 active_count--; /* Remove non-match possibility */
1755 next_active_state--;
1756 }
1757 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1758 break;
1759
1760 default:
1761 break;
1762 }
1763 }
1764 break;
1765
1766 /*-----------------------------------------------------------------*/
1767 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1768 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1769 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1770 count = 2;
1771 goto QS4;
1772
1773 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1774 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1775 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1776 count = 0;
1777
1778 QS4:
1779 ADD_ACTIVE(state_offset + 2, 0);
1780 if (clen > 0)
1781 {
1782 BOOL OK;
1783 switch (c)
1784 {
1785 VSPACE_CASES:
1786 OK = TRUE;
1787 break;
1788
1789 default:
1790 OK = FALSE;
1791 break;
1792 }
1793 if (OK == (d == OP_VSPACE))
1794 {
1795 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1796 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1797 {
1798 active_count--; /* Remove non-match possibility */
1799 next_active_state--;
1800 }
1801 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1802 }
1803 }
1804 break;
1805
1806 /*-----------------------------------------------------------------*/
1807 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1808 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1809 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1810 count = 2;
1811 goto QS5;
1812
1813 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1814 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1815 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1816 count = 0;
1817
1818 QS5:
1819 ADD_ACTIVE(state_offset + 2, 0);
1820 if (clen > 0)
1821 {
1822 BOOL OK;
1823 switch (c)
1824 {
1825 HSPACE_CASES:
1826 OK = TRUE;
1827 break;
1828
1829 default:
1830 OK = FALSE;
1831 break;
1832 }
1833
1834 if (OK == (d == OP_HSPACE))
1835 {
1836 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1837 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1838 {
1839 active_count--; /* Remove non-match possibility */
1840 next_active_state--;
1841 }
1842 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1843 }
1844 }
1845 break;
1846
1847 /*-----------------------------------------------------------------*/
1848 #ifdef SUPPORT_UCP
1849 case OP_PROP_EXTRA + OP_TYPEEXACT:
1850 case OP_PROP_EXTRA + OP_TYPEUPTO:
1851 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1852 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1853 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1854 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1855 count = current_state->count; /* Number already matched */
1856 if (clen > 0)
1857 {
1858 BOOL OK;
1859 const pcre_uint32 *cp;
1860 const ucd_record * prop = GET_UCD(c);
1861 switch(code[1 + IMM2_SIZE + 1])
1862 {
1863 case PT_ANY:
1864 OK = TRUE;
1865 break;
1866
1867 case PT_LAMP:
1868 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1869 prop->chartype == ucp_Lt;
1870 break;
1871
1872 case PT_GC:
1873 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1874 break;
1875
1876 case PT_PC:
1877 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1878 break;
1879
1880 case PT_SC:
1881 OK = prop->script == code[1 + IMM2_SIZE + 2];
1882 break;
1883
1884 /* These are specials for combination cases. */
1885
1886 case PT_ALNUM:
1887 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1888 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1889 break;
1890
1891 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1892 which means that Perl space and POSIX space are now identical. PCRE
1893 was changed at release 8.34. */
1894
1895 case PT_SPACE: /* Perl space */
1896 case PT_PXSPACE: /* POSIX space */
1897 switch(c)
1898 {
1899 HSPACE_CASES:
1900 VSPACE_CASES:
1901 OK = TRUE;
1902 break;
1903
1904 default:
1905 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1906 break;
1907 }
1908 break;
1909
1910 case PT_WORD:
1911 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1912 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1913 c == CHAR_UNDERSCORE;
1914 break;
1915
1916 case PT_CLIST:
1917 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1918 for (;;)
1919 {
1920 if (c < *cp) { OK = FALSE; break; }
1921 if (c == *cp++) { OK = TRUE; break; }
1922 }
1923 break;
1924
1925 case PT_UCNC:
1926 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1927 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1928 c >= 0xe000;
1929 break;
1930
1931 /* Should never occur, but keep compilers from grumbling. */
1932
1933 default:
1934 OK = codevalue != OP_PROP;
1935 break;
1936 }
1937
1938 if (OK == (d == OP_PROP))
1939 {
1940 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1941 {
1942 active_count--; /* Remove non-match possibility */
1943 next_active_state--;
1944 }
1945 if (++count >= (int)GET2(code, 1))
1946 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1947 else
1948 { ADD_NEW(state_offset, count); }
1949 }
1950 }
1951 break;
1952
1953 /*-----------------------------------------------------------------*/
1954 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1955 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1956 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1957 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1958 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1959 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1960 count = current_state->count; /* Number already matched */
1961 if (clen > 0)
1962 {
1963 int lgb, rgb;
1964 const pcre_uchar *nptr = ptr + clen;
1965 int ncount = 0;
1966 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1967 {
1968 active_count--; /* Remove non-match possibility */
1969 next_active_state--;
1970 }
1971 lgb = UCD_GRAPHBREAK(c);
1972 while (nptr < end_subject)
1973 {
1974 dlen = 1;
1975 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1976 rgb = UCD_GRAPHBREAK(d);
1977 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1978 ncount++;
1979 lgb = rgb;
1980 nptr += dlen;
1981 }
1982 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1983 reset_could_continue = TRUE;
1984 if (++count >= (int)GET2(code, 1))
1985 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1986 else
1987 { ADD_NEW_DATA(-state_offset, count, ncount); }
1988 }
1989 break;
1990 #endif
1991
1992 /*-----------------------------------------------------------------*/
1993 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1994 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1995 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1996 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1997 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1998 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1999 count = current_state->count; /* Number already matched */
2000 if (clen > 0)
2001 {
2002 int ncount = 0;
2003 switch (c)
2004 {
2005 case CHAR_VT:
2006 case CHAR_FF:
2007 case CHAR_NEL:
2008 #ifndef EBCDIC
2009 case 0x2028:
2010 case 0x2029:
2011 #endif /* Not EBCDIC */
2012 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2013 goto ANYNL03;
2014
2015 case CHAR_CR:
2016 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2017 /* Fall through */
2018
2019 ANYNL03:
2020 case CHAR_LF:
2021 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2022 {
2023 active_count--; /* Remove non-match possibility */
2024 next_active_state--;
2025 }
2026 if (++count >= (int)GET2(code, 1))
2027 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2028 else
2029 { ADD_NEW_DATA(-state_offset, count, ncount); }
2030 break;
2031
2032 default:
2033 break;
2034 }
2035 }
2036 break;
2037
2038 /*-----------------------------------------------------------------*/
2039 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2040 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2041 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2042 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2043 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2044 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2045 count = current_state->count; /* Number already matched */
2046 if (clen > 0)
2047 {
2048 BOOL OK;
2049 switch (c)
2050 {
2051 VSPACE_CASES:
2052 OK = TRUE;
2053 break;
2054
2055 default:
2056 OK = FALSE;
2057 }
2058
2059 if (OK == (d == OP_VSPACE))
2060 {
2061 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2062 {
2063 active_count--; /* Remove non-match possibility */
2064 next_active_state--;
2065 }
2066 if (++count >= (int)GET2(code, 1))
2067 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2068 else
2069 { ADD_NEW_DATA(-state_offset, count, 0); }
2070 }
2071 }
2072 break;
2073
2074 /*-----------------------------------------------------------------*/
2075 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2076 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2077 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2078 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2079 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2080 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2081 count = current_state->count; /* Number already matched */
2082 if (clen > 0)
2083 {
2084 BOOL OK;
2085 switch (c)
2086 {
2087 HSPACE_CASES:
2088 OK = TRUE;
2089 break;
2090
2091 default:
2092 OK = FALSE;
2093 break;
2094 }
2095
2096 if (OK == (d == OP_HSPACE))
2097 {
2098 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2099 {
2100 active_count--; /* Remove non-match possibility */
2101 next_active_state--;
2102 }
2103 if (++count >= (int)GET2(code, 1))
2104 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2105 else
2106 { ADD_NEW_DATA(-state_offset, count, 0); }
2107 }
2108 }
2109 break;
2110
2111 /* ========================================================================== */
2112 /* These opcodes are followed by a character that is usually compared
2113 to the current subject character; it is loaded into d. We still get
2114 here even if there is no subject character, because in some cases zero
2115 repetitions are permitted. */
2116
2117 /*-----------------------------------------------------------------*/
2118 case OP_CHAR:
2119 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2120 break;
2121
2122 /*-----------------------------------------------------------------*/
2123 case OP_CHARI:
2124 if (clen == 0) break;
2125
2126 #ifdef SUPPORT_UTF
2127 if (utf)
2128 {
2129 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2130 {
2131 unsigned int othercase;
2132 if (c < 128)
2133 othercase = fcc[c];
2134 else
2135 /* If we have Unicode property support, we can use it to test the
2136 other case of the character. */
2137 #ifdef SUPPORT_UCP
2138 othercase = UCD_OTHERCASE(c);
2139 #else
2140 othercase = NOTACHAR;
2141 #endif
2142
2143 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2144 }
2145 }
2146 else
2147 #endif /* SUPPORT_UTF */
2148 /* Not UTF mode */
2149 {
2150 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2151 { ADD_NEW(state_offset + 2, 0); }
2152 }
2153 break;
2154
2155
2156 #ifdef SUPPORT_UCP
2157 /*-----------------------------------------------------------------*/
2158 /* This is a tricky one because it can match more than one character.
2159 Find out how many characters to skip, and then set up a negative state
2160 to wait for them to pass before continuing. */
2161
2162 case OP_EXTUNI:
2163 if (clen > 0)
2164 {
2165 int lgb, rgb;
2166 const pcre_uchar *nptr = ptr + clen;
2167 int ncount = 0;
2168 lgb = UCD_GRAPHBREAK(c);
2169 while (nptr < end_subject)
2170 {
2171 dlen = 1;
2172 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2173 rgb = UCD_GRAPHBREAK(d);
2174 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2175 ncount++;
2176 lgb = rgb;
2177 nptr += dlen;
2178 }
2179 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2180 reset_could_continue = TRUE;
2181 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2182 }
2183 break;
2184 #endif
2185
2186 /*-----------------------------------------------------------------*/
2187 /* This is tricky, like EXTUNI, because it too can match more than one
2188 character (when CR is followed by LF). In this case, set up a negative
2189 state to wait for one character to pass before continuing. */
2190
2191 case OP_ANYNL:
2192 if (clen > 0) switch(c)
2193 {
2194 case CHAR_VT:
2195 case CHAR_FF:
2196 case CHAR_NEL:
2197 #ifndef EBCDIC
2198 case 0x2028:
2199 case 0x2029:
2200 #endif /* Not EBCDIC */
2201 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2202
2203 case CHAR_LF:
2204 ADD_NEW(state_offset + 1, 0);
2205 break;
2206
2207 case CHAR_CR:
2208 if (ptr + 1 >= end_subject)
2209 {
2210 ADD_NEW(state_offset + 1, 0);
2211 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2212 reset_could_continue = TRUE;
2213 }
2214 else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2215 {
2216 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2217 }
2218 else
2219 {
2220 ADD_NEW(state_offset + 1, 0);
2221 }
2222 break;
2223 }
2224 break;
2225
2226 /*-----------------------------------------------------------------*/
2227 case OP_NOT_VSPACE:
2228 if (clen > 0) switch(c)
2229 {
2230 VSPACE_CASES:
2231 break;
2232
2233 default:
2234 ADD_NEW(state_offset + 1, 0);
2235 break;
2236 }
2237 break;
2238
2239 /*-----------------------------------------------------------------*/
2240 case OP_VSPACE:
2241 if (clen > 0) switch(c)
2242 {
2243 VSPACE_CASES:
2244 ADD_NEW(state_offset + 1, 0);
2245 break;
2246
2247 default:
2248 break;
2249 }
2250 break;
2251
2252 /*-----------------------------------------------------------------*/
2253 case OP_NOT_HSPACE:
2254 if (clen > 0) switch(c)
2255 {
2256 HSPACE_CASES:
2257 break;
2258
2259 default:
2260 ADD_NEW(state_offset + 1, 0);
2261 break;
2262 }
2263 break;
2264
2265 /*-----------------------------------------------------------------*/
2266 case OP_HSPACE:
2267 if (clen > 0) switch(c)
2268 {
2269 HSPACE_CASES:
2270 ADD_NEW(state_offset + 1, 0);
2271 break;
2272
2273 default:
2274 break;
2275 }
2276 break;
2277
2278 /*-----------------------------------------------------------------*/
2279 /* Match a negated single character casefully. */
2280
2281 case OP_NOT:
2282 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2283 break;
2284
2285 /*-----------------------------------------------------------------*/
2286 /* Match a negated single character caselessly. */
2287
2288 case OP_NOTI:
2289 if (clen > 0)
2290 {
2291 pcre_uint32 otherd;
2292 #ifdef SUPPORT_UTF
2293 if (utf && d >= 128)
2294 {
2295 #ifdef SUPPORT_UCP
2296 otherd = UCD_OTHERCASE(d);
2297 #else
2298 otherd = d;
2299 #endif /* SUPPORT_UCP */
2300 }
2301 else
2302 #endif /* SUPPORT_UTF */
2303 otherd = TABLE_GET(d, fcc, d);
2304 if (c != d && c != otherd)
2305 { ADD_NEW(state_offset + dlen + 1, 0); }
2306 }
2307 break;
2308
2309 /*-----------------------------------------------------------------*/
2310 case OP_PLUSI:
2311 case OP_MINPLUSI:
2312 case OP_POSPLUSI:
2313 case OP_NOTPLUSI:
2314 case OP_NOTMINPLUSI:
2315 case OP_NOTPOSPLUSI:
2316 caseless = TRUE;
2317 codevalue -= OP_STARI - OP_STAR;
2318
2319 /* Fall through */
2320 case OP_PLUS:
2321 case OP_MINPLUS:
2322 case OP_POSPLUS:
2323 case OP_NOTPLUS:
2324 case OP_NOTMINPLUS:
2325 case OP_NOTPOSPLUS:
2326 count = current_state->count; /* Already matched */
2327 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2328 if (clen > 0)
2329 {
2330 pcre_uint32 otherd = NOTACHAR;
2331 if (caseless)
2332 {
2333 #ifdef SUPPORT_UTF
2334 if (utf && d >= 128)
2335 {
2336 #ifdef SUPPORT_UCP
2337 otherd = UCD_OTHERCASE(d);
2338 #endif /* SUPPORT_UCP */
2339 }
2340 else
2341 #endif /* SUPPORT_UTF */
2342 otherd = TABLE_GET(d, fcc, d);
2343 }
2344 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2345 {
2346 if (count > 0 &&
2347 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2348 {
2349 active_count--; /* Remove non-match possibility */
2350 next_active_state--;
2351 }
2352 count++;
2353 ADD_NEW(state_offset, count);
2354 }
2355 }
2356 break;
2357
2358 /*-----------------------------------------------------------------*/
2359 case OP_QUERYI:
2360 case OP_MINQUERYI:
2361 case OP_POSQUERYI:
2362 case OP_NOTQUERYI:
2363 case OP_NOTMINQUERYI:
2364 case OP_NOTPOSQUERYI:
2365 caseless = TRUE;
2366 codevalue -= OP_STARI - OP_STAR;
2367 /* Fall through */
2368 case OP_QUERY:
2369 case OP_MINQUERY:
2370 case OP_POSQUERY:
2371 case OP_NOTQUERY:
2372 case OP_NOTMINQUERY:
2373 case OP_NOTPOSQUERY:
2374 ADD_ACTIVE(state_offset + dlen + 1, 0);
2375 if (clen > 0)
2376 {
2377 pcre_uint32 otherd = NOTACHAR;
2378 if (caseless)
2379 {
2380 #ifdef SUPPORT_UTF
2381 if (utf && d >= 128)
2382 {
2383 #ifdef SUPPORT_UCP
2384 otherd = UCD_OTHERCASE(d);
2385 #endif /* SUPPORT_UCP */
2386 }
2387 else
2388 #endif /* SUPPORT_UTF */
2389 otherd = TABLE_GET(d, fcc, d);
2390 }
2391 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2392 {
2393 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2394 {
2395 active_count--; /* Remove non-match possibility */
2396 next_active_state--;
2397 }
2398 ADD_NEW(state_offset + dlen + 1, 0);
2399 }
2400 }
2401 break;
2402
2403 /*-----------------------------------------------------------------*/
2404 case OP_STARI:
2405 case OP_MINSTARI:
2406 case OP_POSSTARI:
2407 case OP_NOTSTARI:
2408 case OP_NOTMINSTARI:
2409 case OP_NOTPOSSTARI:
2410 caseless = TRUE;
2411 codevalue -= OP_STARI - OP_STAR;
2412 /* Fall through */
2413 case OP_STAR:
2414 case OP_MINSTAR:
2415 case OP_POSSTAR:
2416 case OP_NOTSTAR:
2417 case OP_NOTMINSTAR:
2418 case OP_NOTPOSSTAR:
2419 ADD_ACTIVE(state_offset + dlen + 1, 0);
2420 if (clen > 0)
2421 {
2422 pcre_uint32 otherd = NOTACHAR;
2423 if (caseless)
2424 {
2425 #ifdef SUPPORT_UTF
2426 if (utf && d >= 128)
2427 {
2428 #ifdef SUPPORT_UCP
2429 otherd = UCD_OTHERCASE(d);
2430 #endif /* SUPPORT_UCP */
2431 }
2432 else
2433 #endif /* SUPPORT_UTF */
2434 otherd = TABLE_GET(d, fcc, d);
2435 }
2436 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2437 {
2438 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2439 {
2440 active_count--; /* Remove non-match possibility */
2441 next_active_state--;
2442 }
2443 ADD_NEW(state_offset, 0);
2444 }
2445 }
2446 break;
2447
2448 /*-----------------------------------------------------------------*/
2449 case OP_EXACTI:
2450 case OP_NOTEXACTI:
2451 caseless = TRUE;
2452 codevalue -= OP_STARI - OP_STAR;
2453 /* Fall through */
2454 case OP_EXACT:
2455 case OP_NOTEXACT:
2456 count = current_state->count; /* Number already matched */
2457 if (clen > 0)
2458 {
2459 pcre_uint32 otherd = NOTACHAR;
2460 if (caseless)
2461 {
2462 #ifdef SUPPORT_UTF
2463 if (utf && d >= 128)
2464 {
2465 #ifdef SUPPORT_UCP
2466 otherd = UCD_OTHERCASE(d);
2467 #endif /* SUPPORT_UCP */
2468 }
2469 else
2470 #endif /* SUPPORT_UTF */
2471 otherd = TABLE_GET(d, fcc, d);
2472 }
2473 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2474 {
2475 if (++count >= (int)GET2(code, 1))
2476 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2477 else
2478 { ADD_NEW(state_offset, count); }
2479 }
2480 }
2481 break;
2482
2483 /*-----------------------------------------------------------------*/
2484 case OP_UPTOI:
2485 case OP_MINUPTOI:
2486 case OP_POSUPTOI:
2487 case OP_NOTUPTOI:
2488 case OP_NOTMINUPTOI:
2489 case OP_NOTPOSUPTOI:
2490 caseless = TRUE;
2491 codevalue -= OP_STARI - OP_STAR;
2492 /* Fall through */
2493 case OP_UPTO:
2494 case OP_MINUPTO:
2495 case OP_POSUPTO:
2496 case OP_NOTUPTO:
2497 case OP_NOTMINUPTO:
2498 case OP_NOTPOSUPTO:
2499 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2500 count = current_state->count; /* Number already matched */
2501 if (clen > 0)
2502 {
2503 pcre_uint32 otherd = NOTACHAR;
2504 if (caseless)
2505 {
2506 #ifdef SUPPORT_UTF
2507 if (utf && d >= 128)
2508 {
2509 #ifdef SUPPORT_UCP
2510 otherd = UCD_OTHERCASE(d);
2511 #endif /* SUPPORT_UCP */
2512 }
2513 else
2514 #endif /* SUPPORT_UTF */
2515 otherd = TABLE_GET(d, fcc, d);
2516 }
2517 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2518 {
2519 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2520 {
2521 active_count--; /* Remove non-match possibility */
2522 next_active_state--;
2523 }
2524 if (++count >= (int)GET2(code, 1))
2525 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2526 else
2527 { ADD_NEW(state_offset, count); }
2528 }
2529 }
2530 break;
2531
2532
2533 /* ========================================================================== */
2534 /* These are the class-handling opcodes */
2535
2536 case OP_CLASS:
2537 case OP_NCLASS:
2538 case OP_XCLASS:
2539 {
2540 BOOL isinclass = FALSE;
2541 int next_state_offset;
2542 const pcre_uchar *ecode;
2543
2544 /* For a simple class, there is always just a 32-byte table, and we
2545 can set isinclass from it. */
2546
2547 if (codevalue != OP_XCLASS)
2548 {
2549 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2550 if (clen > 0)
2551 {
2552 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2553 ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2554 }
2555 }
2556
2557 /* An extended class may have a table or a list of single characters,
2558 ranges, or both, and it may be positive or negative. There's a
2559 function that sorts all this out. */
2560
2561 else
2562 {
2563 ecode = code + GET(code, 1);
2564 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2565 }
2566
2567 /* At this point, isinclass is set for all kinds of class, and ecode
2568 points to the byte after the end of the class. If there is a
2569 quantifier, this is where it will be. */
2570
2571 next_state_offset = (int)(ecode - start_code);
2572
2573 switch (*ecode)
2574 {
2575 case OP_CRSTAR:
2576 case OP_CRMINSTAR:
2577 case OP_CRPOSSTAR:
2578 ADD_ACTIVE(next_state_offset + 1, 0);
2579 if (isinclass)
2580 {
2581 if (*ecode == OP_CRPOSSTAR)
2582 {
2583 active_count--; /* Remove non-match possibility */
2584 next_active_state--;
2585 }
2586 ADD_NEW(state_offset, 0);
2587 }
2588 break;
2589
2590 case OP_CRPLUS:
2591 case OP_CRMINPLUS:
2592 case OP_CRPOSPLUS:
2593 count = current_state->count; /* Already matched */
2594 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2595 if (isinclass)
2596 {
2597 if (count > 0 && *ecode == OP_CRPOSPLUS)
2598 {
2599 active_count--; /* Remove non-match possibility */
2600 next_active_state--;
2601 }
2602 count++;
2603 ADD_NEW(state_offset, count);
2604 }
2605 break;
2606
2607 case OP_CRQUERY:
2608 case OP_CRMINQUERY:
2609 case OP_CRPOSQUERY:
2610 ADD_ACTIVE(next_state_offset + 1, 0);
2611 if (isinclass)
2612 {
2613 if (*ecode == OP_CRPOSQUERY)
2614 {
2615 active_count--; /* Remove non-match possibility */
2616 next_active_state--;
2617 }
2618 ADD_NEW(next_state_offset + 1, 0);
2619 }
2620 break;
2621
2622 case OP_CRRANGE:
2623 case OP_CRMINRANGE:
2624 case OP_CRPOSRANGE:
2625 count = current_state->count; /* Already matched */
2626 if (count >= (int)GET2(ecode, 1))
2627 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2628 if (isinclass)
2629 {
2630 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2631 if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2632 {
2633 active_count--; /* Remove non-match possibility */
2634 next_active_state--;
2635 }
2636 if (++count >= max && max != 0) /* Max 0 => no limit */
2637 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2638 else
2639 { ADD_NEW(state_offset, count); }
2640 }
2641 break;
2642
2643 default:
2644 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2645 break;
2646 }
2647 }
2648 break;
2649
2650 /* ========================================================================== */
2651 /* These are the opcodes for fancy brackets of various kinds. We have
2652 to use recursion in order to handle them. The "always failing" assertion
2653 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2654 though the other "backtracking verbs" are not supported. */
2655
2656 case OP_FAIL:
2657 forced_fail++; /* Count FAILs for multiple states */
2658 break;
2659
2660 case OP_ASSERT:
2661 case OP_ASSERT_NOT:
2662 case OP_ASSERTBACK:
2663 case OP_ASSERTBACK_NOT:
2664 {
2665 int rc;
2666 int local_offsets[2];
2667 int local_workspace[1000];
2668 const pcre_uchar *endasscode = code + GET(code, 1);
2669
2670 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2671
2672 rc = internal_dfa_exec(
2673 md, /* static match data */
2674 code, /* this subexpression's code */
2675 ptr, /* where we currently are */
2676 (int)(ptr - start_subject), /* start offset */
2677 local_offsets, /* offset vector */
2678 sizeof(local_offsets)/sizeof(int), /* size of same */
2679 local_workspace, /* workspace vector */
2680 sizeof(local_workspace)/sizeof(int), /* size of same */
2681 rlevel); /* function recursion level */
2682
2683 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2684 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2685 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2686 }
2687 break;
2688
2689 /*-----------------------------------------------------------------*/
2690 case OP_COND:
2691 case OP_SCOND:
2692 {
2693 int local_offsets[1000];
2694 int local_workspace[1000];
2695 int codelink = GET(code, 1);
2696 int condcode;
2697
2698 /* Because of the way auto-callout works during compile, a callout item
2699 is inserted between OP_COND and an assertion condition. This does not
2700 happen for the other conditions. */
2701
2702 if (code[LINK_SIZE+1] == OP_CALLOUT)
2703 {
2704 rrc = 0;
2705 if (PUBL(callout) != NULL)
2706 {
2707 PUBL(callout_block) cb;
2708 cb.version = 1; /* Version 1 of the callout block */
2709 cb.callout_number = code[LINK_SIZE+2];
2710 cb.offset_vector = offsets;
2711 #if defined COMPILE_PCRE8
2712 cb.subject = (PCRE_SPTR)start_subject;
2713 #elif defined COMPILE_PCRE16
2714 cb.subject = (PCRE_SPTR16)start_subject;
2715 #elif defined COMPILE_PCRE32
2716 cb.subject = (PCRE_SPTR32)start_subject;
2717 #endif
2718 cb.subject_length = (int)(end_subject - start_subject);
2719 cb.start_match = (int)(current_subject - start_subject);
2720 cb.current_position = (int)(ptr - start_subject);
2721 cb.pattern_position = GET(code, LINK_SIZE + 3);
2722 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2723 cb.capture_top = 1;
2724 cb.capture_last = -1;
2725 cb.callout_data = md->callout_data;
2726 cb.mark = NULL; /* No (*MARK) support */
2727 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2728 }
2729 if (rrc > 0) break; /* Fail this thread */
2730 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2731 }
2732
2733 condcode = code[LINK_SIZE+1];
2734
2735 /* Back reference conditions and duplicate named recursion conditions
2736 are not supported */
2737
2738 if (condcode == OP_CREF || condcode == OP_DNCREF ||
2739 condcode == OP_DNRREF)
2740 return PCRE_ERROR_DFA_UCOND;
2741
2742 /* The DEFINE condition is always false, and the assertion (?!) is
2743 converted to OP_FAIL. */
2744
2745 if (condcode == OP_DEF || condcode == OP_FAIL)
2746 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2747
2748 /* The only supported version of OP_RREF is for the value RREF_ANY,
2749 which means "test if in any recursion". We can't test for recursion
2750 into specific groups. */
2751
2752 else if (condcode == OP_RREF)
2753 {
2754 int value = GET2(code, LINK_SIZE + 2);
2755 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2756 if (md->recursive != NULL)
2757 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2758 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2759 }
2760
2761 /* Otherwise, the condition is an assertion */
2762
2763 else
2764 {
2765 int rc;
2766 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2767 const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2768
2769 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2770
2771 rc = internal_dfa_exec(
2772 md, /* fixed match data */
2773 asscode, /* this subexpression's code */
2774 ptr, /* where we currently are */
2775 (int)(ptr - start_subject), /* start offset */
2776 local_offsets, /* offset vector */
2777 sizeof(local_offsets)/sizeof(int), /* size of same */
2778 local_workspace, /* workspace vector */
2779 sizeof(local_workspace)/sizeof(int), /* size of same */
2780 rlevel); /* function recursion level */
2781
2782 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2783 if ((rc >= 0) ==
2784 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2785 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2786 else
2787 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2788 }
2789 }
2790 break;
2791
2792 /*-----------------------------------------------------------------*/
2793 case OP_RECURSE:
2794 {
2795 dfa_recursion_info *ri;
2796 int local_offsets[1000];
2797 int local_workspace[1000];
2798 const pcre_uchar *callpat = start_code + GET(code, 1);
2799 int recno = (callpat == md->start_code)? 0 :
2800 GET2(callpat, 1 + LINK_SIZE);
2801 int rc;
2802
2803 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2804
2805 /* Check for repeating a recursion without advancing the subject
2806 pointer. This should catch convoluted mutual recursions. (Some simple
2807 cases are caught at compile time.) */
2808
2809 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2810 if (recno == ri->group_num && ptr == ri->subject_position)
2811 return PCRE_ERROR_RECURSELOOP;
2812
2813 /* Remember this recursion and where we started it so as to
2814 catch infinite loops. */
2815
2816 new_recursive.group_num = recno;
2817 new_recursive.subject_position = ptr;
2818 new_recursive.prevrec = md->recursive;
2819 md->recursive = &new_recursive;
2820
2821 rc = internal_dfa_exec(
2822 md, /* fixed match data */
2823 callpat, /* this subexpression's code */
2824 ptr, /* where we currently are */
2825 (int)(ptr - start_subject), /* start offset */
2826 local_offsets, /* offset vector */
2827 sizeof(local_offsets)/sizeof(int), /* size of same */
2828 local_workspace, /* workspace vector */
2829 sizeof(local_workspace)/sizeof(int), /* size of same */
2830 rlevel); /* function recursion level */
2831
2832 md->recursive = new_recursive.prevrec; /* Done this recursion */
2833
2834 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2835 rc));
2836
2837 /* Ran out of internal offsets */
2838
2839 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2840
2841 /* For each successful matched substring, set up the next state with a
2842 count of characters to skip before trying it. Note that the count is in
2843 characters, not bytes. */
2844
2845 if (rc > 0)
2846 {
2847 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2848 {
2849 int charcount = local_offsets[rc+1] - local_offsets[rc];
2850 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2851 if (utf)
2852 {
2853 const pcre_uchar *p = start_subject + local_offsets[rc];
2854 const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2855 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2856 }
2857 #endif
2858 if (charcount > 0)
2859 {
2860 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2861 }
2862 else
2863 {
2864 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2865 }
2866 }
2867 }
2868 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2869 }
2870 break;
2871
2872 /*-----------------------------------------------------------------*/
2873 case OP_BRAPOS:
2874 case OP_SBRAPOS:
2875 case OP_CBRAPOS:
2876 case OP_SCBRAPOS:
2877 case OP_BRAPOSZERO:
2878 {
2879 int charcount, matched_count;
2880 const pcre_uchar *local_ptr = ptr;
2881 BOOL allow_zero;
2882
2883 if (codevalue == OP_BRAPOSZERO)
2884 {
2885 allow_zero = TRUE;
2886 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2887 }
2888 else allow_zero = FALSE;
2889
2890 /* Loop to match the subpattern as many times as possible as if it were
2891 a complete pattern. */
2892
2893 for (matched_count = 0;; matched_count++)
2894 {
2895 int local_offsets[2];
2896 int local_workspace[1000];
2897
2898 int rc = internal_dfa_exec(
2899 md, /* fixed match data */
2900 code, /* this subexpression's code */
2901 local_ptr, /* where we currently are */
2902 (int)(ptr - start_subject), /* start offset */
2903 local_offsets, /* offset vector */
2904 sizeof(local_offsets)/sizeof(int), /* size of same */
2905 local_workspace, /* workspace vector */
2906 sizeof(local_workspace)/sizeof(int), /* size of same */
2907 rlevel); /* function recursion level */
2908
2909 /* Failed to match */
2910
2911 if (rc < 0)
2912 {
2913 if (rc != PCRE_ERROR_NOMATCH) return rc;
2914 break;
2915 }
2916
2917 /* Matched: break the loop if zero characters matched. */
2918
2919 charcount = local_offsets[1] - local_offsets[0];
2920 if (charcount == 0) break;
2921 local_ptr += charcount; /* Advance temporary position ptr */
2922 }
2923
2924 /* At this point we have matched the subpattern matched_count
2925 times, and local_ptr is pointing to the character after the end of the
2926 last match. */
2927
2928 if (matched_count > 0 || allow_zero)
2929 {
2930 const pcre_uchar *end_subpattern = code;
2931 int next_state_offset;
2932
2933 do { end_subpattern += GET(end_subpattern, 1); }
2934 while (*end_subpattern == OP_ALT);
2935 next_state_offset =
2936 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2937
2938 /* Optimization: if there are no more active states, and there
2939 are no new states yet set up, then skip over the subject string
2940 right here, to save looping. Otherwise, set up the new state to swing
2941 into action when the end of the matched substring is reached. */
2942
2943 if (i + 1 >= active_count && new_count == 0)
2944 {
2945 ptr = local_ptr;
2946 clen = 0;
2947 ADD_NEW(next_state_offset, 0);
2948 }
2949 else
2950 {
2951 const pcre_uchar *p = ptr;
2952 const pcre_uchar *pp = local_ptr;
2953 charcount = (int)(pp - p);
2954 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2955 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2956 #endif
2957 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2958 }
2959 }
2960 }
2961 break;
2962
2963 /*-----------------------------------------------------------------*/
2964 case OP_ONCE:
2965 case OP_ONCE_NC:
2966 {
2967 int local_offsets[2];
2968 int local_workspace[1000];
2969
2970 int rc = internal_dfa_exec(
2971 md, /* fixed match data */
2972 code, /* this subexpression's code */
2973 ptr, /* where we currently are */
2974 (int)(ptr - start_subject), /* start offset */
2975 local_offsets, /* offset vector */
2976 sizeof(local_offsets)/sizeof(int), /* size of same */
2977 local_workspace, /* workspace vector */
2978 sizeof(local_workspace)/sizeof(int), /* size of same */
2979 rlevel); /* function recursion level */
2980
2981 if (rc >= 0)
2982 {
2983 const pcre_uchar *end_subpattern = code;
2984 int charcount = local_offsets[1] - local_offsets[0];
2985 int next_state_offset, repeat_state_offset;
2986
2987 do { end_subpattern += GET(end_subpattern, 1); }
2988 while (*end_subpattern == OP_ALT);
2989 next_state_offset =
2990 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2991
2992 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2993 arrange for the repeat state also to be added to the relevant list.
2994 Calculate the offset, or set -1 for no repeat. */
2995
2996 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2997 *end_subpattern == OP_KETRMIN)?
2998 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2999
3000 /* If we have matched an empty string, add the next state at the
3001 current character pointer. This is important so that the duplicate
3002 checking kicks in, which is what breaks infinite loops that match an
3003 empty string. */
3004
3005 if (charcount == 0)
3006 {
3007 ADD_ACTIVE(next_state_offset, 0);
3008 }
3009
3010 /* Optimization: if there are no more active states, and there
3011 are no new states yet set up, then skip over the subject string
3012 right here, to save looping. Otherwise, set up the new state to swing
3013 into action when the end of the matched substring is reached. */
3014
3015 else if (i + 1 >= active_count && new_count == 0)
3016 {
3017 ptr += charcount;
3018 clen = 0;
3019 ADD_NEW(next_state_offset, 0);
3020
3021 /* If we are adding a repeat state at the new character position,
3022 we must fudge things so that it is the only current state.
3023 Otherwise, it might be a duplicate of one we processed before, and
3024 that would cause it to be skipped. */
3025
3026 if (repeat_state_offset >= 0)
3027 {
3028 next_active_state = active_states;
3029 active_count = 0;
3030 i = -1;
3031 ADD_ACTIVE(repeat_state_offset, 0);
3032 }
3033 }
3034 else
3035 {
3036 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3037 if (utf)
3038 {
3039 const pcre_uchar *p = start_subject + local_offsets[0];
3040 const pcre_uchar *pp = start_subject + local_offsets[1];
3041 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
3042 }
3043 #endif
3044 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
3045 if (repeat_state_offset >= 0)
3046 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
3047 }
3048 }
3049 else if (rc != PCRE_ERROR_NOMATCH) return rc;
3050 }
3051 break;
3052
3053
3054 /* ========================================================================== */
3055 /* Handle callouts */
3056
3057 case OP_CALLOUT:
3058 rrc = 0;
3059 if (PUBL(callout) != NULL)
3060 {
3061 PUBL(callout_block) cb;
3062 cb.version = 1; /* Version 1 of the callout block */
3063 cb.callout_number = code[1];
3064 cb.offset_vector = offsets;
3065 #if defined COMPILE_PCRE8
3066 cb.subject = (PCRE_SPTR)start_subject;
3067 #elif defined COMPILE_PCRE16
3068 cb.subject = (PCRE_SPTR16)start_subject;
3069 #elif defined COMPILE_PCRE32
3070 cb.subject = (PCRE_SPTR32)start_subject;
3071 #endif
3072 cb.subject_length = (int)(end_subject - start_subject);
3073 cb.start_match = (int)(current_subject - start_subject);
3074 cb.current_position = (int)(ptr - start_subject);
3075 cb.pattern_position = GET(code, 2);
3076 cb.next_item_length = GET(code, 2 + LINK_SIZE);
3077 cb.capture_top = 1;
3078 cb.capture_last = -1;
3079 cb.callout_data = md->callout_data;
3080 cb.mark = NULL; /* No (*MARK) support */
3081 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
3082 }
3083 if (rrc == 0)
3084 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3085 break;
3086
3087
3088 /* ========================================================================== */
3089 default: /* Unsupported opcode */
3090 return PCRE_ERROR_DFA_UITEM;
3091 }
3092
3093 NEXT_ACTIVE_STATE: continue;
3094
3095 } /* End of loop scanning active states */
3096
3097 /* We have finished the processing at the current subject character. If no
3098 new states have been set for the next character, we have found all the
3099 matches that we are going to find. If we are at the top level and partial
3100 matching has been requested, check for appropriate conditions.
3101
3102 The "forced_fail" variable counts the number of (*F) encountered for the
3103 character. If it is equal to the original active_count (saved in
3104 workspace[1]) it means that (*F) was found on every active state. In this
3105 case we don't want to give a partial match.
3106
3107 The "could_continue" variable is true if a state could have continued but
3108 for the fact that the end of the subject was reached. */
3109
3110 if (new_count <= 0)
3111 {
3112 if (rlevel == 1 && /* Top level, and */
3113 could_continue && /* Some could go on, and */
3114 forced_fail != workspace[1] && /* Not all forced fail & */
3115 ( /* either... */
3116 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
3117 || /* or... */
3118 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
3119 match_count < 0) /* no matches */
3120 ) && /* And... */
3121 (
3122 partial_newline || /* Either partial NL */
3123 ( /* or ... */
3124 ptr >= end_subject && /* End of subject and */
3125 ptr > md->start_used_ptr) /* Inspected non-empty string */
3126 )
3127 )
3128 match_count = PCRE_ERROR_PARTIAL;
3129 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3130 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3131 rlevel*2-2, SP));
3132 break; /* In effect, "return", but see the comment below */
3133 }
3134
3135 /* One or more states are active for the next character. */
3136
3137 ptr += clen; /* Advance to next subject character */
3138 } /* Loop to move along the subject string */
3139
3140 /* Control gets here from "break" a few lines above. We do it this way because
3141 if we use "return" above, we have compiler trouble. Some compilers warn if
3142 there's nothing here because they think the function doesn't return a value. On
3143 the other hand, if we put a dummy statement here, some more clever compilers
3144 complain that it can't be reached. Sigh. */
3145
3146 return match_count;
3147 }
3148
3149
3150
3151
3152 /*************************************************
3153 * Execute a Regular Expression - DFA engine *
3154 *************************************************/
3155
3156 /* This external function applies a compiled re to a subject string using a DFA
3157 engine. This function calls the internal function multiple times if the pattern
3158 is not anchored.
3159
3160 Arguments:
3161 argument_re points to the compiled expression
3162 extra_data points to extra data or is NULL
3163 subject points to the subject string
3164 length length of subject string (may contain binary zeros)
3165 start_offset where to start in the subject string
3166 options option bits
3167 offsets vector of match offsets
3168 offsetcount size of same
3169 workspace workspace vector
3170 wscount size of same
3171
3172 Returns: > 0 => number of match offset pairs placed in offsets
3173 = 0 => offsets overflowed; longest matches are present
3174 -1 => failed to match
3175 < -1 => some kind of unexpected problem
3176 */
3177
3178 #if defined COMPILE_PCRE8
3179 #if defined(ERLANG_INTEGRATION)
3180 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
erts_pcre_dfa_exec(const pcre * argument_re,const erts_pcre_extra * extra_data,const char * subject,int length,int start_offset,int options,int * offsets,int offsetcount,int * workspace,int wscount)3181 erts_pcre_dfa_exec(const pcre *argument_re, const erts_pcre_extra *extra_data,
3182 const char *subject, int length, int start_offset, int options, int *offsets,
3183 int offsetcount, int *workspace, int wscount)
3184 #else
3185 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3186 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3187 const char *subject, int length, int start_offset, int options, int *offsets,
3188 int offsetcount, int *workspace, int wscount)
3189 #endif
3190 #elif defined COMPILE_PCRE16
3191 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3192 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3193 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3194 int offsetcount, int *workspace, int wscount)
3195 #elif defined COMPILE_PCRE32
3196 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3197 pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3198 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3199 int offsetcount, int *workspace, int wscount)
3200 #endif
3201 {
3202 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3203 dfa_match_data match_block;
3204 dfa_match_data *md = &match_block;
3205 BOOL utf, anchored, startline, firstline;
3206 const pcre_uchar *current_subject, *end_subject;
3207 const pcre_study_data *study = NULL;
3208
3209 const pcre_uchar *req_char_ptr;
3210 const pcre_uint8 *start_bits = NULL;
3211 BOOL has_first_char = FALSE;
3212 BOOL has_req_char = FALSE;
3213 pcre_uchar first_char = 0;
3214 pcre_uchar first_char2 = 0;
3215 pcre_uchar req_char = 0;
3216 pcre_uchar req_char2 = 0;
3217 int newline;
3218
3219 /* Plausibility checks */
3220
3221 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3222 if (re == NULL || subject == NULL || workspace == NULL ||
3223 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3224 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3225 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3226 if (length < 0) return PCRE_ERROR_BADLENGTH;
3227 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3228
3229 /* Check that the first field in the block is the magic number. If it is not,
3230 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3231 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3232 means that the pattern is likely compiled with different endianness. */
3233
3234 if (re->magic_number != MAGIC_NUMBER)
3235 return re->magic_number == REVERSED_MAGIC_NUMBER?
3236 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3237 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3238
3239 /* If restarting after a partial match, do some sanity checks on the contents
3240 of the workspace. */
3241
3242 if ((options & PCRE_DFA_RESTART) != 0)
3243 {
3244 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3245 workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3246 return PCRE_ERROR_DFA_BADRESTART;
3247 }
3248
3249 /* Set up study, callout, and table data */
3250
3251 md->tables = re->tables;
3252 md->callout_data = NULL;
3253
3254 if (extra_data != NULL)
3255 {
3256 unsigned long int flags = extra_data->flags;
3257 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3258 study = (const pcre_study_data *)extra_data->study_data;
3259 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3260 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3261 return PCRE_ERROR_DFA_UMLIMIT;
3262 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3263 md->callout_data = extra_data->callout_data;
3264 if ((flags & PCRE_EXTRA_TABLES) != 0)
3265 md->tables = extra_data->tables;
3266 }
3267
3268 /* Set some local values */
3269
3270 current_subject = (const pcre_uchar *)subject + start_offset;
3271 end_subject = (const pcre_uchar *)subject + length;
3272 req_char_ptr = current_subject - 1;
3273
3274 #ifdef SUPPORT_UTF
3275 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3276 utf = (re->options & PCRE_UTF8) != 0;
3277 #else
3278 utf = FALSE;
3279 #endif
3280
3281 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3282 (re->options & PCRE_ANCHORED) != 0;
3283
3284 /* The remaining fixed data for passing around. */
3285
3286 md->start_code = (const pcre_uchar *)argument_re +
3287 re->name_table_offset + re->name_count * re->name_entry_size;
3288 md->start_subject = (const pcre_uchar *)subject;
3289 md->end_subject = end_subject;
3290 md->start_offset = start_offset;
3291 md->moptions = options;
3292 md->poptions = re->options;
3293
3294 /* If the BSR option is not set at match time, copy what was set
3295 at compile time. */
3296
3297 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3298 {
3299 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3300 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3301 #ifdef BSR_ANYCRLF
3302 else md->moptions |= PCRE_BSR_ANYCRLF;
3303 #endif
3304 }
3305
3306 /* Handle different types of newline. The three bits give eight cases. If
3307 nothing is set at run time, whatever was used at compile time applies. */
3308
3309 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3310 PCRE_NEWLINE_BITS)
3311 {
3312 case 0: newline = NEWLINE; break; /* Compile-time default */
3313 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3314 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3315 case PCRE_NEWLINE_CR+
3316 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3317 case PCRE_NEWLINE_ANY: newline = -1; break;
3318 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3319 default: return PCRE_ERROR_BADNEWLINE;
3320 }
3321
3322 if (newline == -2)
3323 {
3324 md->nltype = NLTYPE_ANYCRLF;
3325 }
3326 else if (newline < 0)
3327 {
3328 md->nltype = NLTYPE_ANY;
3329 }
3330 else
3331 {
3332 md->nltype = NLTYPE_FIXED;
3333 if (newline > 255)
3334 {
3335 md->nllen = 2;
3336 md->nl[0] = (newline >> 8) & 255;
3337 md->nl[1] = newline & 255;
3338 }
3339 else
3340 {
3341 md->nllen = 1;
3342 md->nl[0] = newline;
3343 }
3344 }
3345
3346 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3347 back the character offset. */
3348
3349 #ifdef SUPPORT_UTF
3350 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3351 {
3352 int erroroffset;
3353 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3354 if (errorcode != 0)
3355 {
3356 if (offsetcount >= 2)
3357 {
3358 offsets[0] = erroroffset;
3359 offsets[1] = errorcode;
3360 }
3361 #if defined COMPILE_PCRE8
3362 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3363 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3364 #elif defined COMPILE_PCRE16
3365 return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3366 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3367 #elif defined COMPILE_PCRE32
3368 return PCRE_ERROR_BADUTF32;
3369 #endif
3370 }
3371 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3372 if (start_offset > 0 && start_offset < length &&
3373 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3374 return PCRE_ERROR_BADUTF8_OFFSET;
3375 #endif
3376 }
3377 #endif
3378
3379 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3380 is a feature that makes it possible to save compiled regex and re-use them
3381 in other programs later. */
3382
3383 if (md->tables == NULL) md->tables = PRIV(default_tables);
3384
3385 /* The "must be at the start of a line" flags are used in a loop when finding
3386 where to start. */
3387
3388 startline = (re->flags & PCRE_STARTLINE) != 0;
3389 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3390
3391 /* Set up the first character to match, if available. The first_byte value is
3392 never set for an anchored regular expression, but the anchoring may be forced
3393 at run time, so we have to test for anchoring. The first char may be unset for
3394 an unanchored pattern, of course. If there's no first char and the pattern was
3395 studied, there may be a bitmap of possible first characters. */
3396
3397 if (!anchored)
3398 {
3399 if ((re->flags & PCRE_FIRSTSET) != 0)
3400 {
3401 has_first_char = TRUE;
3402 first_char = first_char2 = (pcre_uchar)(re->first_char);
3403 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3404 {
3405 first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3406 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3407 if (utf && first_char > 127)
3408 first_char2 = UCD_OTHERCASE(first_char);
3409 #endif
3410 }
3411 }
3412 else
3413 {
3414 if (!startline && study != NULL &&
3415 (study->flags & PCRE_STUDY_MAPPED) != 0)
3416 start_bits = study->start_bits;
3417 }
3418 }
3419
3420 /* For anchored or unanchored matches, there may be a "last known required
3421 character" set. */
3422
3423 if ((re->flags & PCRE_REQCHSET) != 0)
3424 {
3425 has_req_char = TRUE;
3426 req_char = req_char2 = (pcre_uchar)(re->req_char);
3427 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3428 {
3429 req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3430 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3431 if (utf && req_char > 127)
3432 req_char2 = UCD_OTHERCASE(req_char);
3433 #endif
3434 }
3435 }
3436
3437 /* Call the main matching function, looping for a non-anchored regex after a
3438 failed match. If not restarting, perform certain optimizations at the start of
3439 a match. */
3440
3441 for (;;)
3442 {
3443 int rc;
3444
3445 if ((options & PCRE_DFA_RESTART) == 0)
3446 {
3447 const pcre_uchar *save_end_subject = end_subject;
3448
3449 /* If firstline is TRUE, the start of the match is constrained to the first
3450 line of a multiline string. Implement this by temporarily adjusting
3451 end_subject so that we stop scanning at a newline. If the match fails at
3452 the newline, later code breaks this loop. */
3453
3454 if (firstline)
3455 {
3456 PCRE_PUCHAR t = current_subject;
3457 #ifdef SUPPORT_UTF
3458 if (utf)
3459 {
3460 while (t < md->end_subject && !IS_NEWLINE(t))
3461 {
3462 t++;
3463 ACROSSCHAR(t < end_subject, *t, t++);
3464 }
3465 }
3466 else
3467 #endif
3468 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3469 end_subject = t;
3470 }
3471
3472 /* There are some optimizations that avoid running the match if a known
3473 starting point is not found. However, there is an option that disables
3474 these, for testing and for ensuring that all callouts do actually occur.
3475 The option can be set in the regex by (*NO_START_OPT) or passed in
3476 match-time options. */
3477
3478 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3479 {
3480 /* Advance to a known first pcre_uchar (i.e. data item) */
3481
3482 if (has_first_char)
3483 {
3484 if (first_char != first_char2)
3485 {
3486 pcre_uchar csc;
3487 while (current_subject < end_subject &&
3488 (csc = UCHAR21TEST(current_subject)) != first_char && csc != first_char2)
3489 current_subject++;
3490 }
3491 else
3492 while (current_subject < end_subject &&
3493 UCHAR21TEST(current_subject) != first_char)
3494 current_subject++;
3495 }
3496
3497 /* Or to just after a linebreak for a multiline match if possible */
3498
3499 else if (startline)
3500 {
3501 if (current_subject > md->start_subject + start_offset)
3502 {
3503 #ifdef SUPPORT_UTF
3504 if (utf)
3505 {
3506 while (current_subject < end_subject &&
3507 !WAS_NEWLINE(current_subject))
3508 {
3509 current_subject++;
3510 ACROSSCHAR(current_subject < end_subject, *current_subject,
3511 current_subject++);
3512 }
3513 }
3514 else
3515 #endif
3516 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3517 current_subject++;
3518
3519 /* If we have just passed a CR and the newline option is ANY or
3520 ANYCRLF, and we are now at a LF, advance the match position by one
3521 more character. */
3522
3523 if (UCHAR21TEST(current_subject - 1) == CHAR_CR &&
3524 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3525 current_subject < end_subject &&
3526 UCHAR21TEST(current_subject) == CHAR_NL)
3527 current_subject++;
3528 }
3529 }
3530
3531 /* Advance to a non-unique first pcre_uchar after study */
3532
3533 else if (start_bits != NULL)
3534 {
3535 while (current_subject < end_subject)
3536 {
3537 register pcre_uint32 c = UCHAR21TEST(current_subject);
3538 #ifndef COMPILE_PCRE8
3539 if (c > 255) c = 255;
3540 #endif
3541 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
3542 current_subject++;
3543 }
3544 }
3545 }
3546
3547 /* Restore fudged end_subject */
3548
3549 end_subject = save_end_subject;
3550
3551 /* The following two optimizations are disabled for partial matching or if
3552 disabling is explicitly requested (and of course, by the test above, this
3553 code is not obeyed when restarting after a partial match). */
3554
3555 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3556 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3557 {
3558 /* If the pattern was studied, a minimum subject length may be set. This
3559 is a lower bound; no actual string of that length may actually match the
3560 pattern. Although the value is, strictly, in characters, we treat it as
3561 in pcre_uchar units to avoid spending too much time in this optimization.
3562 */
3563
3564 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3565 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3566 return PCRE_ERROR_NOMATCH;
3567
3568 /* If req_char is set, we know that that pcre_uchar must appear in the
3569 subject for the match to succeed. If the first pcre_uchar is set,
3570 req_char must be later in the subject; otherwise the test starts at the
3571 match point. This optimization can save a huge amount of work in patterns
3572 with nested unlimited repeats that aren't going to match. Writing
3573 separate code for cased/caseless versions makes it go faster, as does
3574 using an autoincrement and backing off on a match.
3575
3576 HOWEVER: when the subject string is very, very long, searching to its end
3577 can take a long time, and give bad performance on quite ordinary
3578 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3579 string... so we don't do this when the string is sufficiently long. */
3580
3581 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3582 {
3583 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3584
3585 /* We don't need to repeat the search if we haven't yet reached the
3586 place we found it at last time. */
3587
3588 if (p > req_char_ptr)
3589 {
3590 if (req_char != req_char2)
3591 {
3592 while (p < end_subject)
3593 {
3594 register pcre_uint32 pp = UCHAR21INCTEST(p);
3595 if (pp == req_char || pp == req_char2) { p--; break; }
3596 }
3597 }
3598 else
3599 {
3600 while (p < end_subject)
3601 {
3602 if (UCHAR21INCTEST(p) == req_char) { p--; break; }
3603 }
3604 }
3605
3606 /* If we can't find the required pcre_uchar, break the matching loop,
3607 which will cause a return or PCRE_ERROR_NOMATCH. */
3608
3609 if (p >= end_subject) break;
3610
3611 /* If we have found the required pcre_uchar, save the point where we
3612 found it, so that we don't search again next time round the loop if
3613 the start hasn't passed this point yet. */
3614
3615 req_char_ptr = p;
3616 }
3617 }
3618 }
3619 } /* End of optimizations that are done when not restarting */
3620
3621 /* OK, now we can do the business */
3622
3623 md->start_used_ptr = current_subject;
3624 md->recursive = NULL;
3625
3626 rc = internal_dfa_exec(
3627 md, /* fixed match data */
3628 md->start_code, /* this subexpression's code */
3629 current_subject, /* where we currently are */
3630 start_offset, /* start offset in subject */
3631 offsets, /* offset vector */
3632 offsetcount, /* size of same */
3633 workspace, /* workspace vector */
3634 wscount, /* size of same */
3635 0); /* function recurse level */
3636
3637 /* Anything other than "no match" means we are done, always; otherwise, carry
3638 on only if not anchored. */
3639
3640 if (rc != PCRE_ERROR_NOMATCH || anchored)
3641 {
3642 if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2)
3643 {
3644 offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject);
3645 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
3646 if (offsetcount > 2)
3647 offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject);
3648 }
3649 return rc;
3650 }
3651
3652 /* Advance to the next subject character unless we are at the end of a line
3653 and firstline is set. */
3654
3655 if (firstline && IS_NEWLINE(current_subject)) break;
3656 current_subject++;
3657 #ifdef SUPPORT_UTF
3658 if (utf)
3659 {
3660 ACROSSCHAR(current_subject < end_subject, *current_subject,
3661 current_subject++);
3662 }
3663 #endif
3664 if (current_subject > end_subject) break;
3665
3666 /* If we have just passed a CR and we are now at a LF, and the pattern does
3667 not contain any explicit matches for \r or \n, and the newline option is CRLF
3668 or ANY or ANYCRLF, advance the match position by one more character. */
3669
3670 if (UCHAR21TEST(current_subject - 1) == CHAR_CR &&
3671 current_subject < end_subject &&
3672 UCHAR21TEST(current_subject) == CHAR_NL &&
3673 (re->flags & PCRE_HASCRORLF) == 0 &&
3674 (md->nltype == NLTYPE_ANY ||
3675 md->nltype == NLTYPE_ANYCRLF ||
3676 md->nllen == 2))
3677 current_subject++;
3678
3679 } /* "Bumpalong" loop */
3680
3681 return PCRE_ERROR_NOMATCH;
3682 }
3683
3684 /* End of pcre_dfa_exec.c */
3685