1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2018 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* #define ERLANG_DEBUG 1 */
41
42 /* This module contains pcre_exec(), the externally visible function that does
43 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
44 possible. There are also some static supporting functions. */
45
46 /* %ExternalCopyright% */
47
48 #ifdef HAVE_CONFIG_H
49 #include "config.h"
50 #endif
51
52 #define NLBLOCK md /* Block containing newline information */
53 #define PSSTART start_subject /* Field containing processed string start */
54 #define PSEND end_subject /* Field containing processed string end */
55
56 #include "pcre_internal.h"
57
58 /* Undefine some potentially clashing cpp symbols */
59
60 #undef min
61 #undef max
62
/* Layout of md->capture_last.

The low 16 bits hold the number of the last captured substring (which can
never exceed 65535); one bit in the upper half records "capture vector
overflowed". Keeping both in one value means that saving and restoring the
last capture number also saves and restores the overflow flag, which avoids
(a) an extra variable that would enlarge the stack frame and (b) a second set
of save/restore instructions. */

#define CAPLMASK    0x0000ffff    /* The bits used for last_capture */
#define OVFLMASK    0xffff0000    /* The bits used for the overflow flag */
#define OVFLBIT     0x00010000    /* The bit that is set for overflow */
76
/* Special call types for match(), stored in md->match_function_type before
the call. Passing them through the match data block rather than as an extra
argument avoids another stack variable. */

#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */
83
/* Results returned by match().

The two ordinary outcomes are non-negative. Errors are reported with the
externally defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Internal-only results. They are made sufficiently negative to stay clear of
the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_KETRPOS      (-998)
#define MATCH_ONCE         (-997)

/* Backtracking results. These five must remain contiguous and in this order,
so that "is it any one of them?" can be answered with a range test between
MATCH_BACKTRACK_MIN and MATCH_BACKTRACK_MAX. */

#define MATCH_COMMIT       (-996)
#define MATCH_PRUNE        (-995)
#define MATCH_SKIP         (-994)
#define MATCH_SKIP_ARG     (-993)
#define MATCH_THEN         (-992)
#define MATCH_BACKTRACK_MAX MATCH_THEN
#define MATCH_BACKTRACK_MIN MATCH_COMMIT
105
/* Largest number of offset ints that a recursive call saves in its own frame
(see the stacksave[] array); a bigger offset vector is saved in malloc'ed
memory instead. Keep this a multiple of 3, because the offset vector length is
always a multiple of 3. */

#define REC_STACK_SAVE_MAX 30
111
/* Minimum and maximum repeat counts for the common repeats. The two tables
are parallel (same length, same index). In rep_max, 0 stands for "no upper
limit". */

static const char rep_min[] =
  { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };

static const char rep_max[] =
  { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };
116
117 #ifdef PCRE_DEBUG
118 /*************************************************
119 * Debugging function to print chars *
120 *************************************************/
121
122 /* Print a sequence of chars in printable format, stopping at the end of the
123 subject if requested.
124
125 Arguments:
126 p points to characters
127 length number to print
128 is_subject TRUE if printing from within md->start_subject
129 md pointer to matching data block (must always be valid: md->utf is read even when is_subject is FALSE)
130
131 Returns: nothing
132 */
133
134 static void
pchars(const pcre_uchar * p,int length,BOOL is_subject,match_data * md)135 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
136 {
137 pcre_uint32 c;
138 BOOL utf = md->utf;
139 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
140 while (length-- > 0)
141 if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
142 }
143 #endif
144
145 #ifdef ERLANG_INTEGRATION
146 #ifdef ERLANG_DEBUG
147 #include <stdarg.h>
/* Erlang debug trace: write one printf-style message to stderr, prefixed with
"PCRE: " and terminated with CR LF. */

static void
edebug_printf(const char *format, ...)
{
  va_list ap;

  fputs("PCRE: ", stderr);

  va_start(ap, format);
  vfprintf(stderr, format, ap);
  va_end(ap);

  fputs("\r\n", stderr);
}
159 #endif
160 #endif
161
162
163 /*************************************************
164 * Match a back-reference *
165 *************************************************/
166
167 /* Normally, if a back reference hasn't been set, the length that is passed is
168 negative, so the match always fails. However, in JavaScript compatibility mode,
169 the length passed is zero. Note that in caseless UTF-8 mode, the number of
170 subject bytes matched may be different to the number of reference bytes.
171
172 Arguments:
173 offset index into the offset vector
174 eptr pointer into the subject
175 length length of reference to be matched (number of bytes)
176 md points to match data block
177 caseless TRUE if caseless
178
179 Returns: >= 0 the number of subject bytes matched
180 -1 no match
181 -2 partial match; always given if at end subject
182 */
183
184 static int
match_ref(int offset,register PCRE_PUCHAR eptr,int length,match_data * md,BOOL caseless)185 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
186 BOOL caseless)
187 {
188 PCRE_PUCHAR eptr_start = eptr;
189 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
190 #if defined SUPPORT_UTF && defined SUPPORT_UCP
191 BOOL utf = md->utf;
192 #endif
193
194 #ifdef PCRE_DEBUG
195 if (eptr >= md->end_subject)
196 printf("matching subject <null>");
197 else
198 {
199 printf("matching subject ");
200 pchars(eptr, length, TRUE, md);
201 }
202 printf(" against backref ");
203 pchars(p, length, FALSE, md);
204 printf("\n");
205 #endif
206
207 /* Always fail if reference not set (and not JavaScript compatible - in that
208 case the length is passed as zero). */
209
210 if (length < 0) return -1;
211
212 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
213 properly if Unicode properties are supported. Otherwise, we can check only
214 ASCII characters. */
215
216 if (caseless)
217 {
218 #if defined SUPPORT_UTF && defined SUPPORT_UCP
219 if (utf)
220 {
221 /* Match characters up to the end of the reference. NOTE: the number of
222 data units matched may differ, because in UTF-8 there are some characters
223 whose upper and lower case versions code have different numbers of bytes.
224 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
225 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
226 sequence of two of the latter. It is important, therefore, to check the
227 length along the reference, not along the subject (earlier code did this
228 wrong). */
229
230 PCRE_PUCHAR endptr = p + length;
231 while (p < endptr)
232 {
233 pcre_uint32 c, d;
234 const ucd_record *ur;
235 if (eptr >= md->end_subject) return -2; /* Partial match */
236 GETCHARINC(c, eptr);
237 GETCHARINC(d, p);
238 ur = GET_UCD(d);
239 if (c != d && c != d + ur->other_case)
240 {
241 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
242 for (;;)
243 {
244 if (c < *pp) return -1;
245 if (c == *pp++) break;
246 }
247 }
248 }
249 }
250 else
251 #endif
252
253 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
254 is no UCP support. */
255 {
256 while (length-- > 0)
257 {
258 pcre_uint32 cc, cp;
259 if (eptr >= md->end_subject) return -2; /* Partial match */
260 cc = UCHAR21TEST(eptr);
261 cp = UCHAR21TEST(p);
262 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
263 p++;
264 eptr++;
265 }
266 }
267 }
268
269 /* In the caseful case, we can just compare the bytes, whether or not we
270 are in UTF-8 mode. */
271
272 else
273 {
274 while (length-- > 0)
275 {
276 if (eptr >= md->end_subject) return -2; /* Partial match */
277 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
278 }
279 }
280
281 return (int)(eptr - eptr_start);
282 }
283
284
285
286 /***************************************************************************
287 ****************************************************************************
288 RECURSION IN THE match() FUNCTION
289
290 The match() function is highly recursive, though not every recursive call
291 increases the recursive depth. Nevertheless, some regular expressions can cause
292 it to recurse to a great depth. I was writing for Unix, so I just let it call
293 itself recursively. This uses the stack for saving everything that has to be
294 saved for a recursive call. On Unix, the stack can be large, and this works
295 fine.
296
297 It turns out that on some non-Unix-like systems there are problems with
298 programs that use a lot of stack. (This despite the fact that every last chip
299 has oodles of memory these days, and techniques for extending the stack have
300 been known for decades.) So....
301
302 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
303 calls by keeping local variables that need to be preserved in blocks of memory
304 obtained from malloc() instead of on the stack. Macros are used to
305 achieve this so that the actual code doesn't look very different to what it
306 always used to.
307
308 The original heap-recursive code used longjmp(). However, it seems that this
309 can be very slow on some operating systems. Following a suggestion from Stan
310 Switzer, the use of longjmp() has been abolished, at the cost of having to
311 provide a unique number for each call to RMATCH. There is no way of generating
312 a sequence of numbers at compile time in C. I have given them names, to make
313 them stand out more clearly.
314
315 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
316 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
317 tests. Furthermore, not using longjmp() means that local dynamic variables
318 don't have indeterminate values; this has meant that the frame size can be
319 reduced because the result can be "passed back" by straight setting of the
320 variable instead of being passed in the frame.
321 ****************************************************************************
322 ***************************************************************************/
323
/* Identifiers for the individual RMATCH call sites, numbered consecutively
from 1. Whenever this list changes, the code at HEAP_RETURN (further down) has
to be changed to match. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,
  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11, RM12, RM13, RM14, RM15,
  RM16, RM17, RM18, RM19, RM20,
  RM21, RM22, RM23, RM24, RM25,
  RM26, RM27, RM28, RM29, RM30,
  RM31, RM32, RM33, RM34, RM35,
  RM36, RM37, RM38, RM39, RM40,
  RM41, RM42, RM43, RM44, RM45,
  RM46, RM47, RM48, RM49, RM50,
  RM51, RM52, RM53, RM54, RM55,
  RM56, RM57, RM58, RM59, RM60,
  RM61, RM62, RM63, RM64, RM65,
  RM66, RM67
};
334
335 /* These versions of the macros use the stack, as normal. There are debugging
336 versions and production versions. Note that the "rw" argument of RMATCH isn't
337 actually used in this definition. */
338
339 #ifndef NO_RECURSE
340 #define REGISTER register
341
/* Stack-recursion forms of RMATCH and RRETURN.

RMATCH(ra,rb,rc,rd,re,rw) calls match() one level deeper and leaves the result
in rrc; mstart and rdepth are taken from the enclosing scope and "rw" is
ignored. RRETURN(ra) is a plain return. The PCRE_DEBUG variants additionally
trace each call and return on stdout. */

#ifndef PCRE_DEBUG

#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra, rb, mstart, rc, rd, re, rdepth + 1)

#define RRETURN(ra) return ra

#else  /* PCRE_DEBUG */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra, rb, mstart, rc, rd, re, rdepth + 1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d\n", ra, __LINE__); \
  return ra; \
  }

#endif  /* PCRE_DEBUG */
359
360 #else
361
362
363 /* These versions of the macros manage a private stack on the heap. Note that
364 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
365 argument of match(), which never changes. */
366
367 #define REGISTER
368
/* Heap-frame form of RMATCH: simulate a recursive call of match().

The next frame is taken from frame->Xnextframe; if there is none yet, one is
obtained from stack_malloc and linked in (a failed allocation makes the macro
RRETURN PCRE_ERROR_NOMEMORY). The call number rw is recorded in the current
frame's Xwhere, the new frame receives the "arguments" with its depth one
greater than the current one, it becomes the current frame, and control jumps
to HEAP_RECURSE. The label L_<rw> marks the point at which execution resumes
once the simulated call has returned. The rd argument is not used. */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  heapframe *newframe = frame->Xnextframe; \
  if (newframe == NULL) \
    { \
    newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe)); \
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY); \
    newframe->Xnextframe = NULL; \
    frame->Xnextframe = newframe; \
    } \
  frame->Xwhere = rw; \
  newframe->Xeptr = ra; \
  newframe->Xecode = rb; \
  newframe->Xmstart = mstart; \
  newframe->Xoffset_top = rc; \
  newframe->Xeptrb = re; \
  newframe->Xrdepth = frame->Xrdepth + 1; \
  newframe->Xprevframe = frame; \
  frame = newframe; \
  DPRINTF(("restarting from line %d\n", __LINE__)); \
  goto HEAP_RECURSE; \
  L_##rw: \
  DPRINTF(("jumped back to line %d\n", __LINE__)); \
  }
393
/* Heap-frame form of RRETURN: simulate a return from match().

The current frame is popped by stepping back to its Xprevframe. If a previous
frame exists, the result is handed over in rrc and control continues at
HEAP_RETURN; otherwise this was the outermost frame and the function really
returns. With ERLANG_INTEGRATION, the real return first subtracts the loop
count accumulated in this call from md->loop_limit, provided a limit is in
force (LOOP_LIMIT != 0). */

#ifdef ERLANG_INTEGRATION

#define RRETURN(ra) \
  { \
  frame = frame->Xprevframe; \
  if (frame != NULL) \
    { \
    rrc = ra; \
    goto HEAP_RETURN; \
    } \
  if (LOOP_LIMIT != 0) \
    { \
    md->loop_limit -= LOOP_COUNT; \
    } \
  return ra; \
  }

#else  /* !ERLANG_INTEGRATION */

#define RRETURN(ra) \
  { \
  frame = frame->Xprevframe; \
  if (frame != NULL) \
    { \
    rrc = ra; \
    goto HEAP_RETURN; \
    } \
  return ra; \
  }

#endif  /* ERLANG_INTEGRATION */
423
424 /* Structure for remembering the local variables in a private frame */
425
426 typedef struct heapframe {
427 struct heapframe *Xprevframe;
428 struct heapframe *Xnextframe;
429
430 /* Function arguments that may change */
431
432 PCRE_PUCHAR Xeptr;
433 const pcre_uchar *Xecode;
434 PCRE_PUCHAR Xmstart;
435 int Xoffset_top;
436 eptrblock *Xeptrb;
437 unsigned int Xrdepth;
438
439 /* Function local variables */
440
441 PCRE_PUCHAR Xcallpat;
442 #ifdef SUPPORT_UTF
443 PCRE_PUCHAR Xcharptr;
444 #endif
445 PCRE_PUCHAR Xdata;
446 PCRE_PUCHAR Xnext;
447 PCRE_PUCHAR Xpp;
448 PCRE_PUCHAR Xprev;
449 PCRE_PUCHAR Xsaved_eptr;
450
451 recursion_info Xnew_recursive;
452
453 BOOL Xcur_is_word;
454 BOOL Xcondition;
455 BOOL Xprev_is_word;
456
457 #ifdef SUPPORT_UCP
458 int Xprop_type;
459 unsigned int Xprop_value;
460 int Xprop_fail_result;
461 int Xoclength;
462 pcre_uchar Xocchars[6];
463 #endif
464
465 int Xcodelink;
466 int Xctype;
467 unsigned int Xfc;
468 int Xfi;
469 int Xlength;
470 int Xmax;
471 int Xmin;
472 unsigned int Xnumber;
473 int Xoffset;
474 unsigned int Xop;
475 pcre_int32 Xsave_capture_last;
476 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
477 int Xstacksave[REC_STACK_SAVE_MAX];
478
479 eptrblock Xnewptrb;
480
481 /* Where to jump back to */
482
483 int Xwhere;
484 #if defined(ERLANG_INTEGRATION)
485 int Xlgb;
486 int Xrgb;
487 #endif
488 } heapframe;
489
490 #endif
491
492
493 /***************************************************************************
494 ***************************************************************************/
495
496
497
498 /*************************************************
499 * Match from current position *
500 *************************************************/
501
502 /* This function is called recursively in many circumstances. Whenever it
503 returns a negative (error) response, the outer incarnation must also return the
504 same response. */
505
/* Partial-matching tests that recur throughout match().

Both macros do nothing unless partial matching is enabled (md->partial != 0)
and the subject pointer is beyond md->start_used_ptr. When they fire they set
md->hitend, and for hard partial matching (md->partial > 1) they return
PCRE_ERROR_PARTIAL at once.

CHECK_PARTIAL() also requires eptr to be at or past the end of the subject.
SCHECK_PARTIAL() omits that test, for use where the caller already knows the
end of the subject has been reached. */

#define CHECK_PARTIAL() \
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL() \
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
527
528
529 /* Performance note: It might be tempting to extract commonly used fields from
530 the md structure (e.g. utf, end_subject) into individual variables to improve
531 performance. Tests using gcc on a SPARC disproved this; in the first case, it
532 made performance worse.
533
534 Arguments:
535 eptr pointer to current character in subject
536 ecode pointer to current position in compiled code
537 mstart pointer to the current match start position (can be modified
538 by encountering \K)
539 offset_top current top pointer
540 md pointer to "static" info for the match
541 eptrb pointer to chain of blocks containing eptr at start of
542 brackets - for testing for empty matches
543 rdepth the recursion depth
544
545 Returns: MATCH_MATCH if matched ) these values are >= 0
546 MATCH_NOMATCH if failed to match )
547 a negative MATCH_xxx value for PRUNE, SKIP, etc
548 a negative PCRE_ERROR_xxx value if aborted by an error condition
549 (e.g. stopped by repeated call or recursion limit)
550 */
551
552 static int
match(REGISTER PCRE_PUCHAR eptr,REGISTER const pcre_uchar * ecode,PCRE_PUCHAR mstart,int offset_top,match_data * md,eptrblock * eptrb,unsigned int rdepth)553 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
554 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
555 unsigned int rdepth)
556 {
557 /* These variables do not need to be preserved over recursion in this function,
558 so they can be ordinary variables in all cases. Mark some of them with
559 "register" because they are used a lot in loops. */
560
561 register int rrc; /* Returns from recursive calls */
562 register int i; /* Used for loops not involving calls to RMATCH() */
563 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
564 register BOOL utf; /* Local copy of UTF flag for speed */
565
566 BOOL minimize, possessive; /* Quantifier options */
567 BOOL caseless;
568 int condcode;
569
570 /* When recursion is not being used, all "local" variables that have to be
571 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
572 frame on the stack here; subsequent instantiations are obtained from the heap
573 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
574 the top-level on the stack rather than malloc-ing them all gives a performance
575 boost in many cases where there is not much "recursion". */
576
577 #ifdef NO_RECURSE
578
579 #ifdef ERLANG_INTEGRATION
580 #define LOOP_COUNT loop_count
581 #define LOOP_LIMIT loop_limit
582 #ifdef ERLANG_DEBUG
583 #define EDEBUGF(X) edebug_printf X
584 #else
585 #define EDEBUGF(X)
586 #endif
587 #define COST(N) (LOOP_COUNT += (N))
588 #define LABEL_XCAT(A,B) A##B
589 #define LABEL_CAT(A,B) LABEL_XCAT(A,B)
590
591 #define COST_CHK(N) \
592 do { \
593 LOOP_COUNT += (N); \
594 if (LOOP_LIMIT != 0) { \
595 if (LOOP_COUNT > LOOP_LIMIT) { \
596 frame->Xwhere = __LINE__ + 100; \
597 goto LOOP_COUNT_BREAK; \
598 LABEL_CAT(L_LOOP_COUNT_,__LINE__): \
599 ; \
600 } \
601 } \
602 } while (0)
603
604 register int loop_count = 0;
605 register int loop_limit = md->loop_limit;
606 heapframe *frame;
607 if (md->state_save) {
608 frame = md->state_save;
609 EDEBUGF(("Break restore!"));
610 goto LOOP_COUNT_RETURN;
611 }
612 frame = (heapframe *)md->match_frames_base;
613 #else
614 #define COST(N)
615 #define COST_CHK(N)
616 heapframe *frame = (heapframe *)md->match_frames_base;
617 #endif
618
619
620 /* Copy in the original argument variables */
621
622 frame->Xeptr = eptr;
623 frame->Xecode = ecode;
624 frame->Xmstart = mstart;
625 frame->Xoffset_top = offset_top;
626 frame->Xeptrb = eptrb;
627 frame->Xrdepth = rdepth;
628
629 /* This is where control jumps back to in order to effect "recursion" */
630
631 HEAP_RECURSE:
632
633 /* Macros make the argument variables come from the current frame */
634
635 #define eptr frame->Xeptr
636 #define ecode frame->Xecode
637 #define mstart frame->Xmstart
638 #define offset_top frame->Xoffset_top
639 #define eptrb frame->Xeptrb
640 #define rdepth frame->Xrdepth
641
642 /* Ditto for the local variables */
643
644 #ifdef SUPPORT_UTF
645 #define charptr frame->Xcharptr
646 #endif
647 #define callpat frame->Xcallpat
648 #define codelink frame->Xcodelink
649 #define data frame->Xdata
650 #define next frame->Xnext
651 #define pp frame->Xpp
652 #define prev frame->Xprev
653 #define saved_eptr frame->Xsaved_eptr
654
655 #define new_recursive frame->Xnew_recursive
656
657 #define cur_is_word frame->Xcur_is_word
658 #define condition frame->Xcondition
659 #define prev_is_word frame->Xprev_is_word
660
661 #ifdef SUPPORT_UCP
662 #define prop_type frame->Xprop_type
663 #define prop_value frame->Xprop_value
664 #define prop_fail_result frame->Xprop_fail_result
665 #define oclength frame->Xoclength
666 #define occhars frame->Xocchars
667 #endif
668
669 #define ctype frame->Xctype
670 #define fc frame->Xfc
671 #define fi frame->Xfi
672 #define length frame->Xlength
673 #define max frame->Xmax
674 #define min frame->Xmin
675 #define number frame->Xnumber
676 #define offset frame->Xoffset
677 #define op frame->Xop
678 #define save_capture_last frame->Xsave_capture_last
679 #define save_offset1 frame->Xsave_offset1
680 #define save_offset2 frame->Xsave_offset2
681 #define save_offset3 frame->Xsave_offset3
682 #define stacksave frame->Xstacksave
683 #if defined(ERLANG_INTEGRATION)
684 #define lgb frame->Xlgb
685 #define rgb frame->Xrgb
686 #endif
687
688 #define newptrb frame->Xnewptrb
689
690 /* When recursion is being used, local variables are allocated on the stack and
691 get preserved during recursion in the normal way. In this environment, fi and
692 i, and fc and c, can be the same variables. */
693
694 #else /* NO_RECURSE not defined */
695 #define COST(N)
696 #define COST_CHK(N)
697 #define fi i
698 #define fc c
699
700 /* Many of the following variables are used only in small blocks of the code.
701 My normal style of coding would have declared them within each of those blocks.
702 However, in order to accommodate the version of this code that uses an external
703 "stack" implemented on the heap, it is easier to declare them all here, so the
704 declarations can be cut out in a block. The only declarations within blocks
705 below are for variables that do not have to be preserved over a recursive call
706 to RMATCH(). */
707
708 #ifdef SUPPORT_UTF
709 const pcre_uchar *charptr;
710 #endif
711 const pcre_uchar *callpat;
712 const pcre_uchar *data;
713 const pcre_uchar *next;
714 PCRE_PUCHAR pp;
715 const pcre_uchar *prev;
716 PCRE_PUCHAR saved_eptr;
717
718 recursion_info new_recursive;
719
720 BOOL cur_is_word;
721 BOOL condition;
722 BOOL prev_is_word;
723
724 #ifdef SUPPORT_UCP
725 int prop_type;
726 unsigned int prop_value;
727 int prop_fail_result;
728 int oclength;
729 pcre_uchar occhars[6];
730 #endif
731
732 int codelink;
733 int ctype;
734 int length;
735 int max;
736 int min;
737 unsigned int number;
738 int offset;
739 unsigned int op;
740 pcre_int32 save_capture_last;
741 int save_offset1, save_offset2, save_offset3;
742 int stacksave[REC_STACK_SAVE_MAX];
743
744 eptrblock newptrb;
745
746 /* There is a special fudge for calling match() in a way that causes it to
747 measure the size of its basic stack frame when the stack is being used for
748 recursion. The second argument (ecode) being NULL triggers this behaviour. It
749 cannot normally ever be NULL. The return is the negated value of the frame
750 size. */
751
752 if (ecode == NULL)
753 {
754 if (rdepth == 0)
755 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
756 else
757 {
758 int len = (int)((char *)&rdepth - (char *)eptr);
759 return (len > 0)? -len : len;
760 }
761 }
762 #endif /* NO_RECURSE */
763
764 /* To save space on the stack and in the heap frame, I have doubled up on some
765 of the local variables that are used only in localised parts of the code, but
766 still need to be preserved over recursive calls of match(). These macros define
767 the alternative names that are used. */
768
769 #define allow_zero cur_is_word
770 #define cbegroup condition
771 #define code_offset codelink
772 #define condassert condition
773 #define matched_once prev_is_word
774 #define foc number
775 #define save_mark data
776
777 /* These statements are here to stop the compiler complaining about uninitialized
778 variables. */
779
780 #ifdef SUPPORT_UCP
781 prop_value = 0;
782 prop_fail_result = 0;
783 #endif
784
785
786 /* This label is used for tail recursion, which is used in a few cases even
787 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
788 used. Thanks to Ian Taylor for noticing this possibility and sending the
789 original patch. */
790
791 TAIL_RECURSE:
792
793 /* OK, now we can get on with the real code of the function. Recursive calls
794 are specified by the macro RMATCH and RRETURN is used to return. When
795 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
796 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
797 defined). However, RMATCH isn't like a function call because it's quite a
798 complicated macro. It has to be used in one particular way. This shouldn't,
799 however, impact performance when true recursion is being used. */
800
801 #ifdef SUPPORT_UTF
802 utf = md->utf; /* Local copy of the flag */
803 #else
804 utf = FALSE;
805 #endif
806
807 /* First check that we haven't called match() too many times, or that we
808 haven't exceeded the recursive call limit. */
809
810 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
811 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
812
813 /* At the start of a group with an unlimited repeat that may match an empty
814 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
815 done this way to save having to use another function argument, which would take
816 up space on the stack. See also MATCH_CONDASSERT below.
817
818 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
819 such remembered pointers, to be checked when we hit the closing ket, in order
820 to break infinite loops that match no characters. When match() is called in
821 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
822 NOT be used with tail recursion, because the memory block that is used is on
823 the stack, so a new one may be required for each match(). */
824
825 if (md->match_function_type == MATCH_CBEGROUP)
826 {
827 newptrb.epb_saved_eptr = eptr;
828 newptrb.epb_prev = eptrb;
829 eptrb = &newptrb;
830 md->match_function_type = 0;
831 }
832
833 /* Now start processing the opcodes. */
834
835 for (;;)
836 {
837 COST_CHK(1);
838 minimize = possessive = FALSE;
839 op = *ecode;
840 EDEBUGF(("Op = %d",op));
841
842 switch(op)
843 {
844 case OP_MARK:
845 md->nomatch_mark = ecode + 2;
846 md->mark = NULL; /* In case previously set by assertion */
847 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
848 eptrb, RM55);
849 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
850 md->mark == NULL) md->mark = ecode + 2;
851
852 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
853 argument, and we must check whether that argument matches this MARK's
854 argument. It is passed back in md->start_match_ptr (an overloading of that
855 variable). If it does match, we reset that variable to the current subject
856 position and return MATCH_SKIP. Otherwise, pass back the return code
857 unaltered. */
858
859 else if (rrc == MATCH_SKIP_ARG &&
860 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
861 {
862 md->start_match_ptr = eptr;
863 RRETURN(MATCH_SKIP);
864 }
865 RRETURN(rrc);
866
867 case OP_FAIL:
868 RRETURN(MATCH_NOMATCH);
869
870 case OP_COMMIT:
871 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
872 eptrb, RM52);
873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
874 RRETURN(MATCH_COMMIT);
875
876 case OP_PRUNE:
877 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
878 eptrb, RM51);
879 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
880 RRETURN(MATCH_PRUNE);
881
882 case OP_PRUNE_ARG:
883 md->nomatch_mark = ecode + 2;
884 md->mark = NULL; /* In case previously set by assertion */
885 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
886 eptrb, RM56);
887 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
888 md->mark == NULL) md->mark = ecode + 2;
889 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
890 RRETURN(MATCH_PRUNE);
891
892 case OP_SKIP:
893 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
894 eptrb, RM53);
895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
896 md->start_match_ptr = eptr; /* Pass back current position */
897 RRETURN(MATCH_SKIP);
898
899 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
900 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
901 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
902 that failed and any that precede it (either they also failed, or were not
903 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
904 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
905 set to the count of the one that failed. */
906
907 case OP_SKIP_ARG:
908 md->skip_arg_count++;
909 if (md->skip_arg_count <= md->ignore_skip_arg)
910 {
911 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
912 break;
913 }
914 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
915 eptrb, RM57);
916 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
917
918 /* Pass back the current skip name by overloading md->start_match_ptr and
919 returning the special MATCH_SKIP_ARG return code. This will either be
920 caught by a matching MARK, or get to the top, where it causes a rematch
921 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
922
923 md->start_match_ptr = ecode + 2;
924 RRETURN(MATCH_SKIP_ARG);
925
926 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
927 the branch in which it occurs can be determined. Overload the start of
928 match pointer to do this. */
929
930 case OP_THEN:
931 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
932 eptrb, RM54);
933 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
934 md->start_match_ptr = ecode;
935 RRETURN(MATCH_THEN);
936
937 case OP_THEN_ARG:
938 md->nomatch_mark = ecode + 2;
939 md->mark = NULL; /* In case previously set by assertion */
940 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
941 md, eptrb, RM58);
942 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
943 md->mark == NULL) md->mark = ecode + 2;
944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
945 md->start_match_ptr = ecode;
946 RRETURN(MATCH_THEN);
947
948 /* Handle an atomic group that does not contain any capturing parentheses.
949 This can be handled like an assertion. Prior to 8.13, all atomic groups
950 were handled this way. In 8.13, the code was changed as below for ONCE, so
951 that backups pass through the group and thereby reset captured values.
952 However, this uses a lot more stack, so in 8.20, atomic groups that do not
953 contain any captures generate OP_ONCE_NC, which can be handled in the old,
954 less stack intensive way.
955
956 Check the alternative branches in turn - the matching won't pass the KET
957 for this kind of subpattern. If any one branch matches, we carry on as at
958 the end of a normal bracket, leaving the subject pointer, but resetting
959 the start-of-match value in case it was changed by \K. */
960
961 case OP_ONCE_NC:
962 prev = ecode;
963 saved_eptr = eptr;
964 save_mark = md->mark;
965 do /* LOOP_COUNT: Ok */
966 {
967 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
968 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
969 {
970 mstart = md->start_match_ptr;
971 break;
972 }
973 if (rrc == MATCH_THEN)
974 {
975 next = ecode + GET(ecode,1);
976 if (md->start_match_ptr < next &&
977 (*ecode == OP_ALT || *next == OP_ALT))
978 rrc = MATCH_NOMATCH;
979 }
980
981 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
982 ecode += GET(ecode,1);
983 md->mark = save_mark;
984 }
985 while (*ecode == OP_ALT);
986
987 /* If we hit the end of the group (which could be repeated), fail */
988
989 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
990
991 /* Continue as from after the group, updating the offsets high water
992 mark, since extracts may have been taken. */
993
994 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); /* LOOP_COUNT: Ok */
995
996 offset_top = md->end_offset_top;
997 eptr = md->end_match_ptr;
998
999 /* For a non-repeating ket, just continue at this level. This also
1000 happens for a repeating ket if no characters were matched in the group.
1001 This is the forcible breaking of infinite loops as implemented in Perl
1002 5.005. */
1003
1004 if (*ecode == OP_KET || eptr == saved_eptr)
1005 {
1006 ecode += 1+LINK_SIZE;
1007 break;
1008 }
1009
1010 /* The repeating kets try the rest of the pattern or restart from the
1011 preceding bracket, in the appropriate order. The second "call" of match()
1012 uses tail recursion, to avoid using another stack frame. */
1013
1014 if (*ecode == OP_KETRMIN)
1015 {
1016 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
1017 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1018 ecode = prev;
1019 goto TAIL_RECURSE;
1020 }
1021 else /* OP_KETRMAX */
1022 {
1023 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
1024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1025 ecode += 1 + LINK_SIZE;
1026 goto TAIL_RECURSE;
1027 }
1028 /* Control never gets here */
1029
1030 /* Handle a capturing bracket, other than those that are possessive with an
1031 unlimited repeat. If there is space in the offset vector, save the current
1032 subject position in the working slot at the top of the vector. We mustn't
1033 change the current values of the data slot, because they may be set from a
1034 previous iteration of this group, and be referred to by a reference inside
1035 the group. A failure to match might occur after the group has succeeded,
1036 if something later on doesn't match. For this reason, we need to restore
1037 the working value and also the values of the final offsets, in case they
1038 were set by a previous iteration of the same bracket.
1039
1040 If there isn't enough space in the offset vector, treat this as if it were
1041 a non-capturing bracket. Don't worry about setting the flag for the error
1042 case here; that is handled in the code for KET. */
1043
1044 case OP_CBRA:
1045 case OP_SCBRA:
1046 number = GET2(ecode, 1+LINK_SIZE);
1047 offset = number << 1;
1048
1049 #ifdef PCRE_DEBUG
1050 printf("start bracket %d\n", number);
1051 printf("subject=");
1052 pchars(eptr, 16, TRUE, md);
1053 printf("\n");
1054 #endif
1055
1056 if (offset < md->offset_max)
1057 {
1058 save_offset1 = md->offset_vector[offset];
1059 save_offset2 = md->offset_vector[offset+1];
1060 save_offset3 = md->offset_vector[md->offset_end - number];
1061 save_capture_last = md->capture_last;
1062 save_mark = md->mark;
1063
1064 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1065 md->offset_vector[md->offset_end - number] =
1066 (int)(eptr - md->start_subject);
1067
1068 for (;;) /* LOOP_COUNT: Ok */
1069 {
1070 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1071 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1072 eptrb, RM1);
1073 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
1074
1075 /* If we backed up to a THEN, check whether it is within the current
1076 branch by comparing the address of the THEN that is passed back with
1077 the end of the branch. If it is within the current branch, and the
1078 branch is one of two or more alternatives (it either starts or ends
1079 with OP_ALT), we have reached the limit of THEN's action, so convert
1080 the return code to NOMATCH, which will cause normal backtracking to
1081 happen from now on. Otherwise, THEN is passed back to an outer
1082 alternative. This implements Perl's treatment of parenthesized groups,
1083 where a group not containing | does not affect the current alternative,
1084 that is, (X) is NOT the same as (X|(*F)). */
1085
1086 if (rrc == MATCH_THEN)
1087 {
1088 next = ecode + GET(ecode,1);
1089 if (md->start_match_ptr < next &&
1090 (*ecode == OP_ALT || *next == OP_ALT))
1091 rrc = MATCH_NOMATCH;
1092 }
1093
1094 /* Anything other than NOMATCH is passed back. */
1095
1096 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1097 md->capture_last = save_capture_last;
1098 ecode += GET(ecode, 1);
1099 md->mark = save_mark;
1100 if (*ecode != OP_ALT) break;
1101 }
1102
1103 DPRINTF(("bracket %d failed\n", number));
1104 md->offset_vector[offset] = save_offset1;
1105 md->offset_vector[offset+1] = save_offset2;
1106 md->offset_vector[md->offset_end - number] = save_offset3;
1107
1108 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1109
1110 RRETURN(rrc);
1111 }
1112
1113 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1114 as a non-capturing bracket. */
1115
1116 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1117 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1118
1119 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1120
1121 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1122 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1123
1124 /* Non-capturing or atomic group, except for possessive with unlimited
1125 repeat and ONCE group with no captures. Loop for all the alternatives.
1126
1127 When we get to the final alternative within the brackets, we used to return
1128 the result of a recursive call to match() whatever happened so it was
1129 possible to reduce stack usage by turning this into a tail recursion,
1130 except in the case of a possibly empty group. However, now that there is
1131 the possibility of (*THEN) occurring in the final alternative, this
1132 optimization is no longer always possible.
1133
1134 We can optimize if we know there are no (*THEN)s in the pattern; at present
1135 this is the best that can be done.
1136
1137 MATCH_ONCE is returned when the end of an atomic group is successfully
1138 reached, but subsequent matching fails. It passes back up the tree (causing
1139 captured values to be reset) until the original atomic group level is
1140 reached. This is tested by comparing md->once_target with the start of the
1141 group. At this point, the return is converted into MATCH_NOMATCH so that
1142 previous backup points can be taken. */
1143
1144 case OP_ONCE:
1145 case OP_BRA:
1146 case OP_SBRA:
1147 DPRINTF(("start non-capturing bracket\n"));
1148
1149 for (;;) /* LOOP_COUNT: Ok */
1150 {
1151 if (op >= OP_SBRA || op == OP_ONCE)
1152 md->match_function_type = MATCH_CBEGROUP;
1153
1154 /* If this is not a possibly empty group, and there are no (*THEN)s in
1155 the pattern, and this is the final alternative, optimize as described
1156 above. */
1157
1158 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1159 {
1160 ecode += PRIV(OP_lengths)[*ecode];
1161 goto TAIL_RECURSE;
1162 }
1163
1164 /* In all other cases, we have to make another call to match(). */
1165
1166 save_mark = md->mark;
1167 save_capture_last = md->capture_last;
1168 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1169 RM2);
1170
1171 /* See comment in the code for capturing groups above about handling
1172 THEN. */
1173
1174 if (rrc == MATCH_THEN)
1175 {
1176 next = ecode + GET(ecode,1);
1177 if (md->start_match_ptr < next &&
1178 (*ecode == OP_ALT || *next == OP_ALT))
1179 rrc = MATCH_NOMATCH;
1180 }
1181
1182 if (rrc != MATCH_NOMATCH)
1183 {
1184 if (rrc == MATCH_ONCE)
1185 {
1186 const pcre_uchar *scode = ecode;
1187 if (*scode != OP_ONCE) /* If not at start, find it */
1188 {
1189 while (*scode == OP_ALT) scode += GET(scode, 1); /* LOOP_COUNT: Ok */
1190 scode -= GET(scode, 1);
1191 }
1192 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1193 }
1194 RRETURN(rrc);
1195 }
1196 ecode += GET(ecode, 1);
1197 md->mark = save_mark;
1198 if (*ecode != OP_ALT) break;
1199 md->capture_last = save_capture_last;
1200 }
1201
1202 RRETURN(MATCH_NOMATCH);
1203
1204 /* Handle possessive capturing brackets with an unlimited repeat. We come
1205 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1206 handled similarly to the normal case above. However, the matching is
1207 different. The end of these brackets will always be OP_KETRPOS, which
1208 returns MATCH_KETRPOS without going further in the pattern. By this means
1209 we can handle the group by iteration rather than recursion, thereby
1210 reducing the amount of stack needed. */
1211
1212 case OP_CBRAPOS:
1213 case OP_SCBRAPOS:
1214 allow_zero = FALSE;
1215
1216 POSSESSIVE_CAPTURE:
1217 number = GET2(ecode, 1+LINK_SIZE);
1218 offset = number << 1;
1219
1220 #ifdef PCRE_DEBUG
1221 printf("start possessive bracket %d\n", number);
1222 printf("subject=");
1223 pchars(eptr, 16, TRUE, md);
1224 printf("\n");
1225 #endif
1226
1227 if (offset >= md->offset_max) goto POSSESSIVE_NON_CAPTURE;
1228
1229 matched_once = FALSE;
1230 code_offset = (int)(ecode - md->start_code);
1231
1232 save_offset1 = md->offset_vector[offset];
1233 save_offset2 = md->offset_vector[offset+1];
1234 save_offset3 = md->offset_vector[md->offset_end - number];
1235 save_capture_last = md->capture_last;
1236
1237 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1238
1239 /* Each time round the loop, save the current subject position for use
1240 when the group matches. For MATCH_MATCH, the group has matched, so we
1241 restart it with a new subject starting position, remembering that we had
1242 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1243 usual. If we haven't matched any alternatives in any iteration, check to
1244 see if a previous iteration matched. If so, the group has matched;
1245 continue from afterwards. Otherwise it has failed; restore the previous
1246 capture values before returning NOMATCH. */
1247
1248 for (;;) /* LOOP_COUNT: Ok */
1249 {
1250 md->offset_vector[md->offset_end - number] =
1251 (int)(eptr - md->start_subject);
1252 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1253 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1254 eptrb, RM63);
1255 if (rrc == MATCH_KETRPOS)
1256 {
1257 offset_top = md->end_offset_top;
1258 ecode = md->start_code + code_offset;
1259 save_capture_last = md->capture_last;
1260 matched_once = TRUE;
1261 mstart = md->start_match_ptr; /* In case \K changed it */
1262 if (eptr == md->end_match_ptr) /* Matched an empty string */
1263 {
1264 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); /* LOOP_COUNT: Ok */
1265 break;
1266 }
1267 eptr = md->end_match_ptr;
1268 continue;
1269 }
1270
1271 /* See comment in the code for capturing groups above about handling
1272 THEN. */
1273
1274 if (rrc == MATCH_THEN)
1275 {
1276 next = ecode + GET(ecode,1);
1277 if (md->start_match_ptr < next &&
1278 (*ecode == OP_ALT || *next == OP_ALT))
1279 rrc = MATCH_NOMATCH;
1280 }
1281
1282 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1283 md->capture_last = save_capture_last;
1284 ecode += GET(ecode, 1);
1285 if (*ecode != OP_ALT) break;
1286 }
1287
1288 if (!matched_once)
1289 {
1290 md->offset_vector[offset] = save_offset1;
1291 md->offset_vector[offset+1] = save_offset2;
1292 md->offset_vector[md->offset_end - number] = save_offset3;
1293 }
1294
1295 if (allow_zero || matched_once)
1296 {
1297 ecode += 1 + LINK_SIZE;
1298 break;
1299 }
1300
1301 RRETURN(MATCH_NOMATCH);
1302
1303 /* Non-capturing possessive bracket with unlimited repeat. We come here
1304 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1305 without the capturing complication. It is written out separately for speed
1306 and cleanliness. */
1307
1308 case OP_BRAPOS:
1309 case OP_SBRAPOS:
1310 allow_zero = FALSE;
1311
1312 POSSESSIVE_NON_CAPTURE:
1313 matched_once = FALSE;
1314 code_offset = (int)(ecode - md->start_code);
1315 save_capture_last = md->capture_last;
1316
1317 for (;;) /* LOOP_COUNT: Ok */
1318 {
1319 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1320 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1321 eptrb, RM48);
1322 if (rrc == MATCH_KETRPOS)
1323 {
1324 offset_top = md->end_offset_top;
1325 ecode = md->start_code + code_offset;
1326 matched_once = TRUE;
1327 mstart = md->start_match_ptr; /* In case \K reset it */
1328 if (eptr == md->end_match_ptr) /* Matched an empty string */
1329 {
1330 do ecode += GET(ecode, 1); while (*ecode == OP_ALT); /* LOOP_COUNT: Ok */
1331 break;
1332 }
1333 eptr = md->end_match_ptr;
1334 continue;
1335 }
1336
1337 /* See comment in the code for capturing groups above about handling
1338 THEN. */
1339
1340 if (rrc == MATCH_THEN)
1341 {
1342 next = ecode + GET(ecode,1);
1343 if (md->start_match_ptr < next &&
1344 (*ecode == OP_ALT || *next == OP_ALT))
1345 rrc = MATCH_NOMATCH;
1346 }
1347
1348 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1349 ecode += GET(ecode, 1);
1350 if (*ecode != OP_ALT) break;
1351 md->capture_last = save_capture_last;
1352 }
1353
1354 if (matched_once || allow_zero)
1355 {
1356 ecode += 1 + LINK_SIZE;
1357 break;
1358 }
1359 RRETURN(MATCH_NOMATCH);
1360
1361 /* Control never reaches here. */
1362
1363 /* Conditional group: compilation checked that there are no more than two
1364 branches. If the condition is false, skipping the first branch takes us
1365 past the end of the item if there is only one branch, but that's exactly
1366 what we want. */
1367
1368 case OP_COND:
1369 case OP_SCOND:
1370
1371 /* The variable codelink will be added to ecode when the condition is
1372 false, to get to the second branch. Setting it to the offset to the ALT
1373 or KET, then incrementing ecode achieves this effect. We now have ecode
1374 pointing to the condition or callout. */
1375
1376 codelink = GET(ecode, 1); /* Offset to the second branch */
1377 ecode += 1 + LINK_SIZE; /* From this opcode */
1378
1379 /* Because of the way auto-callout works during compile, a callout item is
1380 inserted between OP_COND and an assertion condition. */
1381
1382 if (*ecode == OP_CALLOUT)
1383 {
1384 if (PUBL(callout) != NULL)
1385 {
1386 PUBL(callout_block) cb;
1387 cb.version = 2; /* Version 2 of the callout block */
1388 cb.callout_number = ecode[1];
1389 cb.offset_vector = md->offset_vector;
1390 #if defined COMPILE_PCRE8
1391 cb.subject = (PCRE_SPTR)md->start_subject;
1392 #elif defined COMPILE_PCRE16
1393 cb.subject = (PCRE_SPTR16)md->start_subject;
1394 #elif defined COMPILE_PCRE32
1395 cb.subject = (PCRE_SPTR32)md->start_subject;
1396 #endif
1397 cb.subject_length = (int)(md->end_subject - md->start_subject);
1398 cb.start_match = (int)(mstart - md->start_subject);
1399 cb.current_position = (int)(eptr - md->start_subject);
1400 cb.pattern_position = GET(ecode, 2);
1401 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1402 cb.capture_top = offset_top/2;
1403 cb.capture_last = md->capture_last & CAPLMASK;
1404 /* Internal change requires this for API compatibility. */
1405 if (cb.capture_last == 0) cb.capture_last = -1;
1406 cb.callout_data = md->callout_data;
1407 cb.mark = md->nomatch_mark;
1408 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1409 if (rrc < 0) RRETURN(rrc);
1410 }
1411
1412 /* Advance ecode past the callout, so it now points to the condition. We
1413 must adjust codelink so that the value of ecode+codelink is unchanged. */
1414
1415 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1416 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1417 }
1418
1419 /* Test the various possible conditions */
1420
1421 condition = FALSE;
1422 switch(condcode = *ecode)
1423 {
1424 case OP_RREF: /* Numbered group recursion test */
1425 if (md->recursive != NULL) /* Not recursing => FALSE */
1426 {
1427 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1428 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1429 }
1430 break;
1431
1432 case OP_DNRREF: /* Duplicate named group recursion test */
1433 if (md->recursive != NULL)
1434 {
1435 int count = GET2(ecode, 1 + IMM2_SIZE);
1436 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1437 while (count-- > 0) /* LOOP_COUNT: COST */
1438 {
1439 unsigned int recno = GET2(slot, 0);
1440 condition = recno == md->recursive->group_num;
1441 if (condition) break;
1442 slot += md->name_entry_size;
1443 COST(1);
1444 }
1445 }
1446 break;
1447
1448 case OP_CREF: /* Numbered group used test */
1449 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1450 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1451 break;
1452
1453 case OP_DNCREF: /* Duplicate named group used test */
1454 {
1455 int count = GET2(ecode, 1 + IMM2_SIZE);
1456 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1457 while (count-- > 0) /* LOOP_COUNT: COST */
1458 {
1459 offset = GET2(slot, 0) << 1;
1460 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1461 if (condition) break;
1462 slot += md->name_entry_size;
1463 COST(1);
1464 }
1465 }
1466 break;
1467
1468 case OP_DEF: /* DEFINE - always false */
1469 case OP_FAIL: /* From optimized (?!) condition */
1470 break;
1471
1472 /* The condition is an assertion. Call match() to evaluate it - setting
1473 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1474 of an assertion. */
1475
1476 default:
1477 md->match_function_type = MATCH_CONDASSERT;
1478 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1479 if (rrc == MATCH_MATCH)
1480 {
1481 if (md->end_offset_top > offset_top)
1482 offset_top = md->end_offset_top; /* Captures may have happened */
1483 condition = TRUE;
1484
1485 /* Advance ecode past the assertion to the start of the first branch,
1486 but adjust it so that the general choosing code below works. If the
1487 assertion has a quantifier that allows zero repeats we must skip over
1488 the BRAZERO. This is a lunatic thing to do, but somebody did! */
1489
1490 if (*ecode == OP_BRAZERO) ecode++;
1491 ecode += GET(ecode, 1);
1492 while (*ecode == OP_ALT) ecode += GET(ecode, 1); /* LOOP_COUNT: Ok */
1493 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1494 }
1495
1496 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1497 assertion; it is therefore treated as NOMATCH. Any other return is an
1498 error. */
1499
1500 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1501 {
1502 RRETURN(rrc); /* Need braces because of following else */
1503 }
1504 break;
1505 }
1506
1507 /* Choose branch according to the condition */
1508
1509 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1510
1511 /* We are now at the branch that is to be obeyed. As there is only one, we
1512 can use tail recursion to avoid using another stack frame, except when
1513 there is unlimited repeat of a possibly empty group. In the latter case, a
1514 recursive call to match() is always required, unless the second alternative
1515 doesn't exist, in which case we can just plough on. Note that, for
1516 compatibility with Perl, the | in a conditional group is NOT treated as
1517 creating two alternatives. If a THEN is encountered in the branch, it
1518 propagates out to the enclosing alternative (unless nested in a deeper set
1519 of alternatives, of course). */
1520
1521 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1522 {
1523 if (op != OP_SCOND)
1524 {
1525 goto TAIL_RECURSE;
1526 }
1527
1528 md->match_function_type = MATCH_CBEGROUP;
1529 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1530 RRETURN(rrc);
1531 }
1532
1533 /* Condition false & no alternative; continue after the group. */
1534
1535 else
1536 {
1537 }
1538 break;
1539
1540
1541 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1542 to close any currently open capturing brackets. */
1543
1544 case OP_CLOSE:
1545 number = GET2(ecode, 1); /* Must be less than 65536 */
1546 offset = number << 1;
1547
1548 #ifdef PCRE_DEBUG
1549 printf("end bracket %d at *ACCEPT", number);
1550 printf("\n");
1551 #endif
1552
1553 md->capture_last = (md->capture_last & OVFLMASK) | number;
1554 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1555 {
1556 md->offset_vector[offset] =
1557 md->offset_vector[md->offset_end - number];
1558 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1559
1560 /* If this group is at or above the current highwater mark, ensure that
1561 any groups between the current high water mark and this group are marked
1562 unset and then update the high water mark. */
1563
1564 if (offset >= offset_top)
1565 {
1566 register int *iptr = md->offset_vector + offset_top;
1567 register int *iend = md->offset_vector + offset;
1568 if (iptr < iend)
1569 {
1570 COST(iend - iptr);
1571 while (iptr < iend) *iptr++ = -1; /* LOOP_COUNT: COST */
1572 }
1573 offset_top = offset + 2;
1574 }
1575 }
1576 ecode += 1 + IMM2_SIZE;
1577 break;
1578
1579
1580 /* End of the pattern, either real or forced. */
1581
1582 case OP_END:
1583 case OP_ACCEPT:
1584 case OP_ASSERT_ACCEPT:
1585
1586 /* If we have matched an empty string, fail if not in an assertion and not
1587 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1588 is set and we have matched at the start of the subject. In both cases,
1589 backtracking will then try other alternatives, if any. */
1590
1591 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1592 md->recursive == NULL &&
1593 (md->notempty ||
1594 (md->notempty_atstart &&
1595 mstart == md->start_subject + md->start_offset)))
1596 RRETURN(MATCH_NOMATCH);
1597
1598 /* Otherwise, we have a match. */
1599
1600 md->end_match_ptr = eptr; /* Record where we ended */
1601 md->end_offset_top = offset_top; /* and how many extracts were taken */
1602 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1603
1604 /* For some reason, the macros don't work properly if an expression is
1605 given as the argument to RRETURN when the heap is in use. */
1606
1607 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1608 RRETURN(rrc);
1609
1610 /* Assertion brackets. Check the alternative branches in turn - the
1611 matching won't pass the KET for an assertion. If any one branch matches,
1612 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1613 start of each branch to move the current point backwards, so the code at
1614 this level is identical to the lookahead case. When the assertion is part
1615 of a condition, we want to return immediately afterwards. The caller of
1616 this incarnation of the match() function will have set MATCH_CONDASSERT in
1617 md->match_function_type, and one of these opcodes will be the first opcode
1618 that is processed. We use a local variable that is preserved over calls to
1619 match() to remember this case. */
1620
1621 case OP_ASSERT:
1622 case OP_ASSERTBACK:
1623 save_mark = md->mark;
1624 if (md->match_function_type == MATCH_CONDASSERT)
1625 {
1626 condassert = TRUE;
1627 md->match_function_type = 0;
1628 }
1629 else condassert = FALSE;
1630
1631 /* Loop for each branch */
1632
1633 do /* LOOP_COUNT: Ok */
1634 {
1635 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1636
1637 /* A match means that the assertion is true; break out of the loop
1638 that matches its alternatives. */
1639
1640 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1641 {
1642 mstart = md->start_match_ptr; /* In case \K reset it */
1643 break;
1644 }
1645
1646 /* If not matched, restore the previous mark setting. */
1647
1648 md->mark = save_mark;
1649
1650 /* See comment in the code for capturing groups above about handling
1651 THEN. */
1652
1653 if (rrc == MATCH_THEN)
1654 {
1655 next = ecode + GET(ecode,1);
1656 if (md->start_match_ptr < next &&
1657 (*ecode == OP_ALT || *next == OP_ALT))
1658 rrc = MATCH_NOMATCH;
1659 }
1660
1661 /* Anything other than NOMATCH causes the entire assertion to fail,
1662 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1663 uncaptured THEN, which means they take their normal effect. This
1664 consistent approach does not always have exactly the same effect as in
1665 Perl. */
1666
1667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1668 ecode += GET(ecode, 1);
1669 }
1670 while (*ecode == OP_ALT); /* Continue for next alternative */ /* LOOP_COUNT: Ok */
1671
1672 /* If we have tried all the alternative branches, the assertion has
1673 failed. If not, we broke out after a match. */
1674
1675 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1676
1677 /* If checking an assertion for a condition, return MATCH_MATCH. */
1678
1679 if (condassert) RRETURN(MATCH_MATCH);
1680
1681 /* Continue from after a successful assertion, updating the offsets high
1682 water mark, since extracts may have been taken during the assertion. */
1683
1684 do ecode += GET(ecode,1); while (*ecode == OP_ALT); /* LOOP_COUNT: Ok */
1685 ecode += 1 + LINK_SIZE;
1686 offset_top = md->end_offset_top;
1687 continue;
1688
1689 /* Negative assertion: all branches must fail to match for the assertion to
1690 succeed. */
1691
1692 case OP_ASSERT_NOT:
1693 case OP_ASSERTBACK_NOT:
1694 save_mark = md->mark;
1695 if (md->match_function_type == MATCH_CONDASSERT)
1696 {
1697 condassert = TRUE;
1698 md->match_function_type = 0;
1699 }
1700 else condassert = FALSE;
1701
1702 /* Loop for each alternative branch. */
1703
1704 do /* LOOP_COUNT: Ok */
1705 {
1706 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1707 md->mark = save_mark; /* Always restore the mark setting */
1708
1709 switch(rrc)
1710 {
1711 case MATCH_MATCH: /* A successful match means */
1712 case MATCH_ACCEPT: /* the assertion has failed. */
1713 RRETURN(MATCH_NOMATCH);
1714
1715 case MATCH_NOMATCH: /* Carry on with next branch */
1716 break;
1717
1718 /* See comment in the code for capturing groups above about handling
1719 THEN. */
1720
1721 case MATCH_THEN:
1722 next = ecode + GET(ecode,1);
1723 if (md->start_match_ptr < next &&
1724 (*ecode == OP_ALT || *next == OP_ALT))
1725 {
1726 rrc = MATCH_NOMATCH;
1727 break;
1728 }
1729 /* Otherwise fall through. */
1730
1731 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1732 assertion to fail to match, without considering any more alternatives.
1733 Failing to match means the assertion is true. This is a consistent
1734 approach, but does not always have the same effect as in Perl. */
1735
1736 case MATCH_COMMIT:
1737 case MATCH_SKIP:
1738 case MATCH_SKIP_ARG:
1739 case MATCH_PRUNE:
1740 do ecode += GET(ecode,1); while (*ecode == OP_ALT); /* LOOP_COUNT: Ok */
1741 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1742
1743 /* Anything else is an error */
1744
1745 default:
1746 RRETURN(rrc);
1747 }
1748
1749 /* Continue with next branch */
1750
1751 ecode += GET(ecode,1);
1752 }
1753 while (*ecode == OP_ALT);
1754
1755 /* All branches in the assertion failed to match. */
1756
1757 NEG_ASSERT_TRUE:
1758 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1759 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1760 continue;
1761
1762 /* Move the subject pointer back. This occurs only at the start of
1763 each branch of a lookbehind assertion. If we are too close to the start to
1764 move back, this match function fails. When working with UTF-8 we move
1765 back a number of characters, not bytes. */
1766
1767 case OP_REVERSE:
1768 #ifdef SUPPORT_UTF
1769 if (utf)
1770 {
1771 i = GET(ecode, 1);
1772 COST(i);
1773 while (i-- > 0) /* LOOP_COUNT: COST */
1774 {
1775 eptr--;
1776 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1777 BACKCHAR(eptr);
1778 }
1779 }
1780 else
1781 #endif
1782
1783 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1784
1785 {
1786 eptr -= GET(ecode, 1);
1787 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1788 }
1789
1790 /* Save the earliest consulted character, then skip to next op code */
1791
1792 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1793 ecode += 1 + LINK_SIZE;
1794 break;
1795
1796 /* The callout item calls an external function, if one is provided, passing
1797 details of the match so far. This is mainly for debugging, though the
1798 function is able to force a failure. */
1799
1800 case OP_CALLOUT:
1801 if (PUBL(callout) != NULL)
1802 {
1803 PUBL(callout_block) cb;
1804 cb.version = 2; /* Version 2 of the callout block */
1805 cb.callout_number = ecode[1];
1806 cb.offset_vector = md->offset_vector;
1807 #if defined COMPILE_PCRE8
1808 cb.subject = (PCRE_SPTR)md->start_subject;
1809 #elif defined COMPILE_PCRE16
1810 cb.subject = (PCRE_SPTR16)md->start_subject;
1811 #elif defined COMPILE_PCRE32
1812 cb.subject = (PCRE_SPTR32)md->start_subject;
1813 #endif
1814 cb.subject_length = (int)(md->end_subject - md->start_subject);
1815 cb.start_match = (int)(mstart - md->start_subject);
1816 cb.current_position = (int)(eptr - md->start_subject);
1817 cb.pattern_position = GET(ecode, 2);
1818 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1819 cb.capture_top = offset_top/2;
1820 cb.capture_last = md->capture_last & CAPLMASK;
1821 /* Internal change requires this for API compatibility. */
1822 if (cb.capture_last == 0) cb.capture_last = -1;
1823 cb.callout_data = md->callout_data;
1824 cb.mark = md->nomatch_mark;
1825 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1826 if (rrc < 0) RRETURN(rrc);
1827 }
1828 ecode += 2 + 2*LINK_SIZE;
1829 break;
1830
1831 /* Recursion either matches the current regex, or some subexpression. The
1832 offset data is the offset to the starting bracket from the start of the
1833 whole pattern. (This is so that it works from duplicated subpatterns.)
1834
1835 The state of the capturing groups is preserved over recursion, and
1836 re-instated afterwards. We don't know how many are started and not yet
1837 finished (offset_top records the completed total) so we just have to save
1838 all the potential data. There may be up to 65535 such values, which is too
1839 large to put on the stack, but using malloc for small numbers seems
1840 expensive. As a compromise, the stack is used when there are no more than
1841 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1842
1843 There are also other values that have to be saved. We use a chained
1844 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1845 for the original version of this logic. It has, however, been hacked around
1846 a lot, so he is not to blame for the current way it works. */
1847
1848 case OP_RECURSE:
1849 {
1850 recursion_info *ri;
1851 unsigned int recno; /* LOOP_COUNT: Warning, no CHK until after Marker1 */
1852
1853 callpat = md->start_code + GET(ecode, 1);
1854 recno = (callpat == md->start_code)? 0 :
1855 GET2(callpat, 1 + LINK_SIZE);
1856
1857 /* Check for repeating a recursion without advancing the subject pointer.
1858 This should catch convoluted mutual recursions. (Some simple cases are
1859 caught at compile time.) */
1860
1861 for (ri = md->recursive; ri != NULL; ri = ri->prevrec) /* LOOP_COUNT: COST */
1862 {
1863 if (recno == ri->group_num && eptr == ri->subject_position)
1864 RRETURN(PCRE_ERROR_RECURSELOOP);
1865 COST(1);
1866 }
1867
1868 /* Add to "recursing stack" */
1869
1870 new_recursive.group_num = recno; /* LOOP_COUNT: Marker1 */
1871 new_recursive.saved_capture_last = md->capture_last;
1872 new_recursive.subject_position = eptr;
1873 new_recursive.prevrec = md->recursive;
1874 md->recursive = &new_recursive;
1875
1876 /* Where to continue from afterwards */
1877
1878 ecode += 1 + LINK_SIZE;
1879
1880 /* Now save the offset data */
1881
1882 new_recursive.saved_max = md->offset_end;
1883 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1884 new_recursive.offset_save = stacksave;
1885 else
1886 {
1887 new_recursive.offset_save =
1888 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1889 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1890 }
1891 memcpy(new_recursive.offset_save, md->offset_vector,
1892 new_recursive.saved_max * sizeof(int));
1893
1894 /* OK, now we can do the recursion. After processing each alternative,
1895 restore the offset data and the last captured value. If there were nested
1896 recursions, md->recursive might be changed, so reset it before looping.
1897 */
1898
1899 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1900 cbegroup = (*callpat >= OP_SBRA);
1901 do /* LOOP_COUNT: Ok */
1902 {
1903 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1904 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1905 md, eptrb, RM6);
1906 memcpy(md->offset_vector, new_recursive.offset_save,
1907 new_recursive.saved_max * sizeof(int));
1908 md->capture_last = new_recursive.saved_capture_last;
1909 md->recursive = new_recursive.prevrec;
1910 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1911 {
1912 DPRINTF(("Recursion matched\n"));
1913 if (new_recursive.offset_save != stacksave)
1914 (PUBL(free))(new_recursive.offset_save);
1915
1916 /* Set where we got to in the subject, and reset the start in case
1917 it was changed by \K. This *is* propagated back out of a recursion,
1918 for Perl compatibility. */
1919
1920 eptr = md->end_match_ptr;
1921 mstart = md->start_match_ptr;
1922 goto RECURSION_MATCHED; /* Exit loop; end processing */
1923 }
1924
1925 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1926 recursion; they cause a NOMATCH for the entire recursion. These codes
1927 are defined in a range that can be tested for. */
1928
1929 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1930 {
1931 if (new_recursive.offset_save != stacksave)
1932 (PUBL(free))(new_recursive.offset_save);
1933 RRETURN(MATCH_NOMATCH);
1934 }
1935
1936 /* Any return code other than NOMATCH is an error. */
1937
1938 if (rrc != MATCH_NOMATCH)
1939 {
1940 DPRINTF(("Recursion gave error %d\n", rrc));
1941 if (new_recursive.offset_save != stacksave)
1942 (PUBL(free))(new_recursive.offset_save);
1943 RRETURN(rrc);
1944 }
1945
1946 md->recursive = &new_recursive;
1947 callpat += GET(callpat, 1);
1948 }
1949 while (*callpat == OP_ALT);
1950
1951 DPRINTF(("Recursion didn't match\n"));
1952 md->recursive = new_recursive.prevrec;
1953 if (new_recursive.offset_save != stacksave)
1954 (PUBL(free))(new_recursive.offset_save);
1955 RRETURN(MATCH_NOMATCH);
1956 }
1957
1958 RECURSION_MATCHED:
1959 break;
1960
1961 /* An alternation is the end of a branch; scan along to find the end of the
1962 bracketed group and go to there. */
1963
1964 case OP_ALT:
1965 do ecode += GET(ecode,1); while (*ecode == OP_ALT); /* LOOP_COUNT: Ok */
1966 break;
1967
1968 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1969 indicating that it may occur zero times. It may repeat infinitely, or not
1970 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1971 with fixed upper repeat limits are compiled as a number of copies, with the
1972 optional ones preceded by BRAZERO or BRAMINZERO. */
1973
1974 case OP_BRAZERO:
1975 next = ecode + 1;
1976 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1978 do next += GET(next, 1); while (*next == OP_ALT); /* LOOP_COUNT: Ok */
1979 ecode = next + 1 + LINK_SIZE;
1980 break;
1981
1982 case OP_BRAMINZERO:
1983 next = ecode + 1;
1984 do next += GET(next, 1); while (*next == OP_ALT); /* LOOP_COUNT: Ok */
1985 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1986 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1987 ecode++;
1988 break;
1989
1990 case OP_SKIPZERO:
1991 next = ecode+1;
1992 do next += GET(next,1); while (*next == OP_ALT); /* LOOP_COUNT: Ok */
1993 ecode = next + 1 + LINK_SIZE;
1994 break;
1995
1996 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1997 here; just jump to the group, with allow_zero set TRUE. */
1998
1999 case OP_BRAPOSZERO:
2000 op = *(++ecode);
2001 allow_zero = TRUE;
2002 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
2003 goto POSSESSIVE_NON_CAPTURE;
2004
2005 /* End of a group, repeated or non-repeating. */
2006
2007 case OP_KET:
2008 case OP_KETRMIN:
2009 case OP_KETRMAX:
2010 case OP_KETRPOS:
2011 prev = ecode - GET(ecode, 1);
2012
2013 /* If this was a group that remembered the subject start, in order to break
2014 infinite repeats of empty string matches, retrieve the subject start from
2015 the chain. Otherwise, set it NULL. */
2016
2017 if (*prev >= OP_SBRA || *prev == OP_ONCE)
2018 {
2019 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
2020 eptrb = eptrb->epb_prev; /* Backup to previous group */
2021 }
2022 else saved_eptr = NULL;
2023
2024 /* If we are at the end of an assertion group or a non-capturing atomic
2025 group, stop matching and return MATCH_MATCH, but record the current high
2026 water mark for use by positive assertions. We also need to record the match
2027 start in case it was changed by \K. */
2028
2029 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
2030 *prev == OP_ONCE_NC)
2031 {
2032 md->end_match_ptr = eptr; /* For ONCE_NC */
2033 md->end_offset_top = offset_top;
2034 md->start_match_ptr = mstart;
2035 RRETURN(MATCH_MATCH); /* Sets md->mark */
2036 }
2037
2038 /* For capturing groups we have to check the group number back at the start
2039 and if necessary complete handling an extraction by setting the offsets and
2040 bumping the high water mark. Whole-pattern recursion is coded as a recurse
2041 into group 0, so it won't be picked up here. Instead, we catch it when the
2042 OP_END is reached. Other recursion is handled here. We just have to record
2043 the current subject position and start match pointer and give a MATCH
2044 return. */
2045
2046 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
2047 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
2048 {
2049 number = GET2(prev, 1+LINK_SIZE);
2050 offset = number << 1;
2051
2052 #ifdef PCRE_DEBUG
2053 printf("end bracket %d", number);
2054 printf("\n");
2055 #endif
2056
2057 /* Handle a recursively called group. */
2058
2059 if (md->recursive != NULL && md->recursive->group_num == number)
2060 {
2061 md->end_match_ptr = eptr;
2062 md->start_match_ptr = mstart;
2063 RRETURN(MATCH_MATCH);
2064 }
2065
2066 /* Deal with capturing */
2067
2068 md->capture_last = (md->capture_last & OVFLMASK) | number;
2069 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
2070 {
2071 /* If offset is greater than offset_top, it means that we are
2072 "skipping" a capturing group, and that group's offsets must be marked
2073 unset. In earlier versions of PCRE, all the offsets were unset at the
2074 start of matching, but this doesn't work because atomic groups and
2075 assertions can cause a value to be set that should later be unset.
2076 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
2077 part of the atomic group, but this is not on the final matching path,
2078 so must be unset when 2 is set. (If there is no group 2, there is no
2079 problem, because offset_top will then be 2, indicating no capture.) */
2080
2081 if (offset > offset_top)
2082 {
2083 register int *iptr = md->offset_vector + offset_top;
2084 register int *iend = md->offset_vector + offset;
2085 if (iptr < iend)
2086 {
2087 COST(iend - iptr);
2088 while (iptr < iend) *iptr++ = -1; /* LOOP_COUNT: COST */
2089 }
2090 }
2091
2092 /* Now make the extraction */
2093
2094 md->offset_vector[offset] =
2095 md->offset_vector[md->offset_end - number];
2096 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
2097 if (offset_top <= offset) offset_top = offset + 2;
2098 }
2099 }
2100
2101 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2102 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2103 at a time from the outer level, thus saving stack. This must precede the
2104 empty string test - in this case that test is done at the outer level. */
2105
2106 if (*ecode == OP_KETRPOS)
2107 {
2108 md->start_match_ptr = mstart; /* In case \K reset it */
2109 md->end_match_ptr = eptr;
2110 md->end_offset_top = offset_top;
2111 RRETURN(MATCH_KETRPOS);
2112 }
2113
2114 /* For an ordinary non-repeating ket, just continue at this level. This
2115 also happens for a repeating ket if no characters were matched in the
2116 group. This is the forcible breaking of infinite loops as implemented in
2117 Perl 5.005. For a non-repeating atomic group that includes captures,
2118 establish a backup point by processing the rest of the pattern at a lower
2119 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2120 original OP_ONCE level, thereby bypassing intermediate backup points, but
2121 resetting any captures that happened along the way. */
2122
2123 if (*ecode == OP_KET || eptr == saved_eptr)
2124 {
2125 if (*prev == OP_ONCE)
2126 {
2127 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2128 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2129 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2130 RRETURN(MATCH_ONCE);
2131 }
2132 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2133 break;
2134 }
2135
2136 /* The normal repeating kets try the rest of the pattern or restart from
2137 the preceding bracket, in the appropriate order. In the second case, we can
2138 use tail recursion to avoid using another stack frame, unless we have an
2139 atomic group or an unlimited repeat of a group that can match an empty
2140 string. */
2141
2142 if (*ecode == OP_KETRMIN)
2143 {
2144 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2145 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2146 if (*prev == OP_ONCE)
2147 {
2148 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2149 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2150 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2151 RRETURN(MATCH_ONCE);
2152 }
2153 if (*prev >= OP_SBRA) /* Could match an empty string */
2154 {
2155 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2156 RRETURN(rrc);
2157 }
2158 ecode = prev;
2159 goto TAIL_RECURSE;
2160 }
2161 else /* OP_KETRMAX */
2162 {
2163 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2164 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2165 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2166 if (*prev == OP_ONCE)
2167 {
2168 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2169 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2170 md->once_target = prev;
2171 RRETURN(MATCH_ONCE);
2172 }
2173 ecode += 1 + LINK_SIZE;
2174 goto TAIL_RECURSE;
2175 }
2176 /* Control never gets here */
2177
2178 /* Not multiline mode: start of subject assertion, unless notbol. */
2179
2180 case OP_CIRC:
2181 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2182
2183 /* Start of subject assertion */
2184
2185 case OP_SOD:
2186 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2187 ecode++;
2188 break;
2189
2190 /* Multiline mode: start of subject unless notbol, or after any newline. */
2191
2192 case OP_CIRCM:
2193 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2194 if (eptr != md->start_subject &&
2195 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2196 RRETURN(MATCH_NOMATCH);
2197 ecode++;
2198 break;
2199
2200 /* Start of match assertion */
2201
2202 case OP_SOM:
2203 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2204 ecode++;
2205 break;
2206
2207 /* Reset the start of match point */
2208
2209 case OP_SET_SOM:
2210 mstart = eptr;
2211 ecode++;
2212 break;
2213
2214 /* Multiline mode: assert before any newline, or before end of subject
2215 unless noteol is set. */
2216
2217 case OP_DOLLM:
2218 if (eptr < md->end_subject)
2219 {
2220 if (!IS_NEWLINE(eptr))
2221 {
2222 if (md->partial != 0 &&
2223 eptr + 1 >= md->end_subject &&
2224 NLBLOCK->nltype == NLTYPE_FIXED &&
2225 NLBLOCK->nllen == 2 &&
2226 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2227 {
2228 md->hitend = TRUE;
2229 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2230 }
2231 RRETURN(MATCH_NOMATCH);
2232 }
2233 }
2234 else
2235 {
2236 if (md->noteol) RRETURN(MATCH_NOMATCH);
2237 SCHECK_PARTIAL();
2238 }
2239 ecode++;
2240 break;
2241
2242 /* Not multiline mode: assert before a terminating newline or before end of
2243 subject unless noteol is set. */
2244
2245 case OP_DOLL:
2246 if (md->noteol) RRETURN(MATCH_NOMATCH);
2247 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2248
2249 /* ... else fall through for endonly */
2250
2251 /* End of subject assertion (\z) */
2252
2253 case OP_EOD:
2254 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2255 SCHECK_PARTIAL();
2256 ecode++;
2257 break;
2258
2259 /* End of subject or ending \n assertion (\Z) */
2260
2261 case OP_EODN:
2262 ASSERT_NL_OR_EOS:
2263 if (eptr < md->end_subject &&
2264 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2265 {
2266 if (md->partial != 0 &&
2267 eptr + 1 >= md->end_subject &&
2268 NLBLOCK->nltype == NLTYPE_FIXED &&
2269 NLBLOCK->nllen == 2 &&
2270 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2271 {
2272 md->hitend = TRUE;
2273 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2274 }
2275 RRETURN(MATCH_NOMATCH);
2276 }
2277
2278 /* Either at end of string or \n before end. */
2279
2280 SCHECK_PARTIAL();
2281 ecode++;
2282 break;
2283
2284 /* Word boundary assertions */
2285
2286 case OP_NOT_WORD_BOUNDARY:
2287 case OP_WORD_BOUNDARY:
2288 {
2289
2290 /* Find out if the previous and current characters are "word" characters.
2291 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2292 be "non-word" characters. Remember the earliest consulted character for
2293 partial matching. */
2294
2295 #ifdef SUPPORT_UTF
2296 if (utf)
2297 {
2298 /* Get status of previous character */
2299
2300 if (eptr == md->start_subject) prev_is_word = FALSE; else
2301 {
2302 PCRE_PUCHAR lastptr = eptr - 1;
2303 BACKCHAR(lastptr);
2304 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2305 GETCHAR(c, lastptr);
2306 #ifdef SUPPORT_UCP
2307 if (md->use_ucp)
2308 {
2309 if (c == '_') prev_is_word = TRUE; else
2310 {
2311 int cat = UCD_CATEGORY(c);
2312 prev_is_word = (cat == ucp_L || cat == ucp_N);
2313 }
2314 }
2315 else
2316 #endif
2317 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2318 }
2319
2320 /* Get status of next character */
2321
2322 if (eptr >= md->end_subject)
2323 {
2324 SCHECK_PARTIAL();
2325 cur_is_word = FALSE;
2326 }
2327 else
2328 {
2329 GETCHAR(c, eptr);
2330 #ifdef SUPPORT_UCP
2331 if (md->use_ucp)
2332 {
2333 if (c == '_') cur_is_word = TRUE; else
2334 {
2335 int cat = UCD_CATEGORY(c);
2336 cur_is_word = (cat == ucp_L || cat == ucp_N);
2337 }
2338 }
2339 else
2340 #endif
2341 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2342 }
2343 }
2344 else
2345 #endif
2346
2347 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2348 consistency with the behaviour of \w we do use it in this case. */
2349
2350 {
2351 /* Get status of previous character */
2352
2353 if (eptr == md->start_subject) prev_is_word = FALSE; else
2354 {
2355 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2356 #ifdef SUPPORT_UCP
2357 if (md->use_ucp)
2358 {
2359 c = eptr[-1];
2360 if (c == '_') prev_is_word = TRUE; else
2361 {
2362 int cat = UCD_CATEGORY(c);
2363 prev_is_word = (cat == ucp_L || cat == ucp_N);
2364 }
2365 }
2366 else
2367 #endif
2368 prev_is_word = MAX_255(eptr[-1])
2369 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2370 }
2371
2372 /* Get status of next character */
2373
2374 if (eptr >= md->end_subject)
2375 {
2376 SCHECK_PARTIAL();
2377 cur_is_word = FALSE;
2378 }
2379 else
2380 #ifdef SUPPORT_UCP
2381 if (md->use_ucp)
2382 {
2383 c = *eptr;
2384 if (c == '_') cur_is_word = TRUE; else
2385 {
2386 int cat = UCD_CATEGORY(c);
2387 cur_is_word = (cat == ucp_L || cat == ucp_N);
2388 }
2389 }
2390 else
2391 #endif
2392 cur_is_word = MAX_255(*eptr)
2393 && ((md->ctypes[*eptr] & ctype_word) != 0);
2394 }
2395
2396 /* Now see if the situation is what we want */
2397
2398 if ((*ecode++ == OP_WORD_BOUNDARY)?
2399 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2400 RRETURN(MATCH_NOMATCH);
2401 }
2402 break;
2403
2404 /* Match any single character type except newline; have to take care with
2405 CRLF newlines and partial matching. */
2406
2407 case OP_ANY:
2408 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2409 if (md->partial != 0 &&
2410 eptr == md->end_subject - 1 &&
2411 NLBLOCK->nltype == NLTYPE_FIXED &&
2412 NLBLOCK->nllen == 2 &&
2413 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2414 {
2415 md->hitend = TRUE;
2416 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2417 }
2418
2419 /* Fall through */
2420
2421 /* Match any single character whatsoever. */
2422
2423 case OP_ALLANY:
2424 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2425 { /* not be updated before SCHECK_PARTIAL. */
2426 SCHECK_PARTIAL();
2427 RRETURN(MATCH_NOMATCH);
2428 }
2429 eptr++;
2430 #ifdef SUPPORT_UTF
2431 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2432 #endif
2433 ecode++;
2434 break;
2435
2436 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2437 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2438
2439 case OP_ANYBYTE:
2440 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2441 { /* not be updated before SCHECK_PARTIAL. */
2442 SCHECK_PARTIAL();
2443 RRETURN(MATCH_NOMATCH);
2444 }
2445 eptr++;
2446 ecode++;
2447 break;
2448
2449 case OP_NOT_DIGIT:
2450 if (eptr >= md->end_subject)
2451 {
2452 SCHECK_PARTIAL();
2453 RRETURN(MATCH_NOMATCH);
2454 }
2455 GETCHARINCTEST(c, eptr);
2456 if (
2457 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2458 c < 256 &&
2459 #endif
2460 (md->ctypes[c] & ctype_digit) != 0
2461 )
2462 RRETURN(MATCH_NOMATCH);
2463 ecode++;
2464 break;
2465
2466 case OP_DIGIT:
2467 if (eptr >= md->end_subject)
2468 {
2469 SCHECK_PARTIAL();
2470 RRETURN(MATCH_NOMATCH);
2471 }
2472 GETCHARINCTEST(c, eptr);
2473 if (
2474 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2475 c > 255 ||
2476 #endif
2477 (md->ctypes[c] & ctype_digit) == 0
2478 )
2479 RRETURN(MATCH_NOMATCH);
2480 ecode++;
2481 break;
2482
2483 case OP_NOT_WHITESPACE:
2484 if (eptr >= md->end_subject)
2485 {
2486 SCHECK_PARTIAL();
2487 RRETURN(MATCH_NOMATCH);
2488 }
2489 GETCHARINCTEST(c, eptr);
2490 if (
2491 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2492 c < 256 &&
2493 #endif
2494 (md->ctypes[c] & ctype_space) != 0
2495 )
2496 RRETURN(MATCH_NOMATCH);
2497 ecode++;
2498 break;
2499
2500 case OP_WHITESPACE:
2501 if (eptr >= md->end_subject)
2502 {
2503 SCHECK_PARTIAL();
2504 RRETURN(MATCH_NOMATCH);
2505 }
2506 GETCHARINCTEST(c, eptr);
2507 if (
2508 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2509 c > 255 ||
2510 #endif
2511 (md->ctypes[c] & ctype_space) == 0
2512 )
2513 RRETURN(MATCH_NOMATCH);
2514 ecode++;
2515 break;
2516
2517 case OP_NOT_WORDCHAR:
2518 if (eptr >= md->end_subject)
2519 {
2520 SCHECK_PARTIAL();
2521 RRETURN(MATCH_NOMATCH);
2522 }
2523 GETCHARINCTEST(c, eptr);
2524 if (
2525 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2526 c < 256 &&
2527 #endif
2528 (md->ctypes[c] & ctype_word) != 0
2529 )
2530 RRETURN(MATCH_NOMATCH);
2531 ecode++;
2532 break;
2533
2534 case OP_WORDCHAR:
2535 if (eptr >= md->end_subject)
2536 {
2537 SCHECK_PARTIAL();
2538 RRETURN(MATCH_NOMATCH);
2539 }
2540 GETCHARINCTEST(c, eptr);
2541 if (
2542 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2543 c > 255 ||
2544 #endif
2545 (md->ctypes[c] & ctype_word) == 0
2546 )
2547 RRETURN(MATCH_NOMATCH);
2548 ecode++;
2549 break;
2550
2551 case OP_ANYNL:
2552 if (eptr >= md->end_subject)
2553 {
2554 SCHECK_PARTIAL();
2555 RRETURN(MATCH_NOMATCH);
2556 }
2557 GETCHARINCTEST(c, eptr);
2558 switch(c)
2559 {
2560 default: RRETURN(MATCH_NOMATCH);
2561
2562 case CHAR_CR:
2563 if (eptr >= md->end_subject)
2564 {
2565 SCHECK_PARTIAL();
2566 }
2567 else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
2568 break;
2569
2570 case CHAR_LF:
2571 break;
2572
2573 case CHAR_VT:
2574 case CHAR_FF:
2575 case CHAR_NEL:
2576 #ifndef EBCDIC
2577 case 0x2028:
2578 case 0x2029:
2579 #endif /* Not EBCDIC */
2580 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2581 break;
2582 }
2583 ecode++;
2584 break;
2585
2586 case OP_NOT_HSPACE:
2587 if (eptr >= md->end_subject)
2588 {
2589 SCHECK_PARTIAL();
2590 RRETURN(MATCH_NOMATCH);
2591 }
2592 GETCHARINCTEST(c, eptr);
2593 switch(c)
2594 {
2595 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2596 default: break;
2597 }
2598 ecode++;
2599 break;
2600
2601 case OP_HSPACE:
2602 if (eptr >= md->end_subject)
2603 {
2604 SCHECK_PARTIAL();
2605 RRETURN(MATCH_NOMATCH);
2606 }
2607 GETCHARINCTEST(c, eptr);
2608 switch(c)
2609 {
2610 HSPACE_CASES: break; /* Byte and multibyte cases */
2611 default: RRETURN(MATCH_NOMATCH);
2612 }
2613 ecode++;
2614 break;
2615
2616 case OP_NOT_VSPACE:
2617 if (eptr >= md->end_subject)
2618 {
2619 SCHECK_PARTIAL();
2620 RRETURN(MATCH_NOMATCH);
2621 }
2622 GETCHARINCTEST(c, eptr);
2623 switch(c)
2624 {
2625 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2626 default: break;
2627 }
2628 ecode++;
2629 break;
2630
2631 case OP_VSPACE:
2632 if (eptr >= md->end_subject)
2633 {
2634 SCHECK_PARTIAL();
2635 RRETURN(MATCH_NOMATCH);
2636 }
2637 GETCHARINCTEST(c, eptr);
2638 switch(c)
2639 {
2640 VSPACE_CASES: break;
2641 default: RRETURN(MATCH_NOMATCH);
2642 }
2643 ecode++;
2644 break;
2645
2646 #ifdef SUPPORT_UCP
2647 /* Check the next character by Unicode property. We will get here only
2648 if the support is in the binary; otherwise a compile-time error occurs. */
2649
2650 case OP_PROP:
2651 case OP_NOTPROP:
2652 if (eptr >= md->end_subject)
2653 {
2654 SCHECK_PARTIAL();
2655 RRETURN(MATCH_NOMATCH);
2656 }
2657 GETCHARINCTEST(c, eptr);
2658 {
2659 const pcre_uint32 *cp;
2660 const ucd_record *prop = GET_UCD(c); /* LOOP_COUNT: Warning, no CHK in this block! */
2661
2662 switch(ecode[1])
2663 {
2664 case PT_ANY:
2665 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2666 break;
2667
2668 case PT_LAMP:
2669 if ((prop->chartype == ucp_Lu ||
2670 prop->chartype == ucp_Ll ||
2671 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2672 RRETURN(MATCH_NOMATCH);
2673 break;
2674
2675 case PT_GC:
2676 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2677 RRETURN(MATCH_NOMATCH);
2678 break;
2679
2680 case PT_PC:
2681 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2682 RRETURN(MATCH_NOMATCH);
2683 break;
2684
2685 case PT_SC:
2686 if ((ecode[2] != prop->script) == (op == OP_PROP))
2687 RRETURN(MATCH_NOMATCH);
2688 break;
2689
2690 /* These are specials */
2691
2692 case PT_ALNUM:
2693 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2694 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2695 RRETURN(MATCH_NOMATCH);
2696 break;
2697
2698 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2699 which means that Perl space and POSIX space are now identical. PCRE
2700 was changed at release 8.34. */
2701
2702 case PT_SPACE: /* Perl space */
2703 case PT_PXSPACE: /* POSIX space */
2704 switch(c)
2705 {
2706 HSPACE_CASES:
2707 VSPACE_CASES:
2708 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2709 break;
2710
2711 default:
2712 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2713 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2714 break;
2715 }
2716 break;
2717
2718 case PT_WORD:
2719 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2720 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2721 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2722 RRETURN(MATCH_NOMATCH);
2723 break;
2724
2725 case PT_CLIST:
2726 cp = PRIV(ucd_caseless_sets) + ecode[2];
2727 for (;;) /* LOOP_COUNT: COST */
2728 {
2729 if (c < *cp)
2730 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2731 if (c == *cp++)
2732 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2733 COST(1);
2734 }
2735 break;
2736
2737 case PT_UCNC:
2738 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2739 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2740 c >= 0xe000) == (op == OP_NOTPROP))
2741 RRETURN(MATCH_NOMATCH);
2742 break;
2743
2744 /* This should never occur */
2745
2746 default:
2747 RRETURN(PCRE_ERROR_INTERNAL);
2748 }
2749
2750 ecode += 3;
2751 }
2752 break;
2753
2754 /* Match an extended Unicode sequence. We will get here only if the support
2755 is in the binary; otherwise a compile-time error occurs. */
2756
2757 case OP_EXTUNI:
2758 if (eptr >= md->end_subject)
2759 {
2760 SCHECK_PARTIAL();
2761 RRETURN(MATCH_NOMATCH);
2762 }
2763 else
2764 {
2765 #ifndef ERLANG_INTEGRATION
2766 int lgb, rgb;
2767 #endif
2768 GETCHARINCTEST(c, eptr);
2769 lgb = UCD_GRAPHBREAK(c);
2770 while (eptr < md->end_subject) /* LOOP_COUNT: CHK */
2771 {
2772 int len = 1;
2773 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2774 rgb = UCD_GRAPHBREAK(c);
2775 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2776 lgb = rgb;
2777 eptr += len;
2778 COST_CHK(1);
2779 }
2780 }
2781 CHECK_PARTIAL();
2782 ecode++;
2783 break;
2784 #endif /* SUPPORT_UCP */
2785
2786
2787 /* Match a back reference, possibly repeatedly. Look past the end of the
2788 item to see if there is repeat information following. The code is similar
2789 to that for character classes, but repeated for efficiency. Then obey
2790 similar code to character type repeats - written out again for speed.
2791 However, if the referenced string is the empty string, always treat
2792 it as matched, any number of times (otherwise there could be infinite
2793 loops). If the reference is unset, there are two possibilities:
2794
2795 (a) In the default, Perl-compatible state, set the length negative;
2796 this ensures that every attempt at a match fails. We can't just fail
2797 here, because of the possibility of quantifiers with zero minima.
2798
2799 (b) If the JavaScript compatibility flag is set, set the length to zero
2800 so that the back reference matches an empty string.
2801
2802 Otherwise, set the length to the length of what was matched by the
2803 referenced subpattern.
2804
2805 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2806 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2807 and OP_DNREFI are used. In this case we must scan the list of groups to
2808 which the name refers, and use the first one that is set. */
2809
2810 case OP_DNREF:
2811 case OP_DNREFI:
2812 caseless = op == OP_DNREFI;
2813 {
2814 int count = GET2(ecode, 1+IMM2_SIZE);
2815 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2816 ecode += 1 + 2*IMM2_SIZE;
2817
2818 /* Setting the default length first and initializing 'offset' avoids
2819 compiler warnings in the REF_REPEAT code. */
2820
2821 length = (md->jscript_compat)? 0 : -1;
2822 offset = 0;
2823
2824 while (count-- > 0) /* LOOP_COUNT: COST */
2825 {
2826 offset = GET2(slot, 0) << 1;
2827 if (offset < offset_top && md->offset_vector[offset] >= 0)
2828 {
2829 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2830 break;
2831 }
2832 slot += md->name_entry_size;
2833 }
2834 COST(1);
2835 }
2836 goto REF_REPEAT;
2837
2838 case OP_REF:
2839 case OP_REFI:
2840 caseless = op == OP_REFI;
2841 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2842 ecode += 1 + IMM2_SIZE;
2843 if (offset >= offset_top || md->offset_vector[offset] < 0)
2844 length = (md->jscript_compat)? 0 : -1;
2845 else
2846 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2847
2848 /* Set up for repetition, or handle the non-repeated case */
2849
2850 REF_REPEAT:
2851 switch (*ecode)
2852 {
2853 case OP_CRSTAR:
2854 case OP_CRMINSTAR:
2855 case OP_CRPLUS:
2856 case OP_CRMINPLUS:
2857 case OP_CRQUERY:
2858 case OP_CRMINQUERY:
2859 c = *ecode++ - OP_CRSTAR;
2860 minimize = (c & 1) != 0;
2861 min = rep_min[c]; /* Pick up values from tables; */
2862 max = rep_max[c]; /* zero for max => infinity */
2863 if (max == 0) max = INT_MAX;
2864 break;
2865
2866 case OP_CRRANGE:
2867 case OP_CRMINRANGE:
2868 minimize = (*ecode == OP_CRMINRANGE);
2869 min = GET2(ecode, 1);
2870 max = GET2(ecode, 1 + IMM2_SIZE);
2871 if (max == 0) max = INT_MAX;
2872 ecode += 1 + 2 * IMM2_SIZE;
2873 break;
2874
2875 default: /* No repeat follows */
2876 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2877 {
2878 if (length == -2) eptr = md->end_subject; /* Partial match */
2879 CHECK_PARTIAL();
2880 RRETURN(MATCH_NOMATCH);
2881 }
2882 eptr += length;
2883 continue; /* With the main loop */
2884 }
2885
2886 /* Handle repeated back references. If the length of the reference is
2887 zero, just continue with the main loop. If the length is negative, it
2888 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2889 zero, we can continue at the same level without recursion. For any other
2890 minimum, carrying on will result in NOMATCH. */
2891
2892 if (length == 0) continue;
2893 if (length < 0 && min == 0) continue;
2894
2895 /* First, ensure the minimum number of matches are present. We get back
2896 the length of the reference string explicitly rather than passing the
2897 address of eptr, so that eptr can be a register variable. */
2898 COST(min);
2899 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
2900 {
2901 int slength;
2902 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2903 {
2904 if (slength == -2) eptr = md->end_subject; /* Partial match */
2905 CHECK_PARTIAL();
2906 RRETURN(MATCH_NOMATCH);
2907 }
2908 eptr += slength;
2909 }
2910
2911 /* If min = max, continue at the same level without recursion.
2912 They are not both allowed to be zero. */
2913
2914 if (min == max) continue;
2915
2916 /* If minimizing, keep trying and advancing the pointer */
2917
2918 if (minimize)
2919 {
2920 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
2921 {
2922 int slength;
2923 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2924 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2925 if (fi >= max) RRETURN(MATCH_NOMATCH);
2926 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2927 {
2928 if (slength == -2) eptr = md->end_subject; /* Partial match */
2929 CHECK_PARTIAL();
2930 RRETURN(MATCH_NOMATCH);
2931 }
2932 eptr += slength;
2933 }
2934 /* Control never gets here */
2935 }
2936
2937 /* If maximizing, find the longest string and work backwards */
2938
2939 else
2940 {
2941 pp = eptr;
2942 for (i = min; i < max; i++)
2943 {
2944 int slength;
2945 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0) /* LOOP_COUNT: CHK */
2946 {
2947 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2948 the soft partial matching case. */
2949
2950 if (slength == -2 && md->partial != 0 &&
2951 md->end_subject > md->start_used_ptr)
2952 {
2953 md->hitend = TRUE;
2954 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2955 }
2956 break;
2957 }
2958 eptr += slength;
2959 COST_CHK(1);
2960 }
2961
2962 while (eptr >= pp) /* LOOP_COUNT: Ok */
2963 {
2964 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2965 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2966 eptr -= length;
2967 }
2968 RRETURN(MATCH_NOMATCH);
2969 }
2970 /* Control never gets here */
2971
2972 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2973 used when all the characters in the class have values in the range 0-255,
2974 and either the matching is caseful, or the characters are in the range
2975 0-127 when UTF-8 processing is enabled. The only difference between
2976 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2977 encountered.
2978
2979 First, look past the end of the item to see if there is repeat information
2980 following. Then obey similar code to character type repeats - written out
2981 again for speed. */
2982
2983 case OP_NCLASS:
2984 case OP_CLASS:
2985 {
2986 /* The data variable is saved across frames, so the byte map needs to
2987 be stored there. */
2988 #define BYTE_MAP ((pcre_uint8 *)data)
2989 data = ecode + 1; /* Save for matching */
2990 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2991 #ifdef ERLANG_INTEGRATION
2992 EDEBUGF(("OP_(N)CLASS (%d)...",*ecode));
2993 #endif
2994
2995 switch (*ecode)
2996 {
2997 case OP_CRSTAR:
2998 case OP_CRMINSTAR:
2999 case OP_CRPLUS:
3000 case OP_CRMINPLUS:
3001 case OP_CRQUERY:
3002 case OP_CRMINQUERY:
3003 case OP_CRPOSSTAR:
3004 case OP_CRPOSPLUS:
3005 case OP_CRPOSQUERY:
3006 c = *ecode++ - OP_CRSTAR;
3007 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3008 else possessive = TRUE;
3009 min = rep_min[c]; /* Pick up values from tables; */
3010 max = rep_max[c]; /* zero for max => infinity */
3011 if (max == 0) max = INT_MAX;
3012 break;
3013
3014 case OP_CRRANGE:
3015 case OP_CRMINRANGE:
3016 case OP_CRPOSRANGE:
3017 minimize = (*ecode == OP_CRMINRANGE);
3018 possessive = (*ecode == OP_CRPOSRANGE);
3019 min = GET2(ecode, 1);
3020 max = GET2(ecode, 1 + IMM2_SIZE);
3021 if (max == 0) max = INT_MAX;
3022 ecode += 1 + 2 * IMM2_SIZE;
3023 break;
3024
3025 default: /* No repeat follows */
3026 min = max = 1;
3027 break;
3028 }
3029
3030 /* First, ensure the minimum number of matches are present. */
3031
3032 #ifdef SUPPORT_UTF
3033 if (utf)
3034 {
3035 COST(min);
3036 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
3037 {
3038 if (eptr >= md->end_subject)
3039 {
3040 SCHECK_PARTIAL();
3041 RRETURN(MATCH_NOMATCH);
3042 }
3043 GETCHARINC(c, eptr);
3044 if (c > 255)
3045 {
3046 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3047 }
3048 else
3049 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3050 }
3051 }
3052 else
3053 #endif
3054 /* Not UTF mode */
3055 {
3056 COST(min);
3057 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
3058 {
3059 if (eptr >= md->end_subject)
3060 {
3061 SCHECK_PARTIAL();
3062 RRETURN(MATCH_NOMATCH);
3063 }
3064 c = *eptr++;
3065 #ifndef COMPILE_PCRE8
3066 if (c > 255)
3067 {
3068 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3069 }
3070 else
3071 #endif
3072 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3073 }
3074 }
3075
3076 /* If max == min we can continue with the main loop without the
3077 need to recurse. */
3078
3079 if (min == max) continue;
3080
3081 /* If minimizing, keep testing the rest of the expression and advancing
3082 the pointer while it matches the class. */
3083
3084 if (minimize)
3085 {
3086 #ifdef SUPPORT_UTF
3087 if (utf)
3088 {
3089 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
3090 {
3091 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
3092 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3093 if (fi >= max) RRETURN(MATCH_NOMATCH);
3094 if (eptr >= md->end_subject)
3095 {
3096 SCHECK_PARTIAL();
3097 RRETURN(MATCH_NOMATCH);
3098 }
3099 GETCHARINC(c, eptr);
3100 if (c > 255)
3101 {
3102 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3103 }
3104 else
3105 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3106 }
3107 }
3108 else
3109 #endif
3110 /* Not UTF mode */
3111 {
3112 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
3113 {
3114 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
3115 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3116 if (fi >= max) RRETURN(MATCH_NOMATCH);
3117 if (eptr >= md->end_subject)
3118 {
3119 SCHECK_PARTIAL();
3120 RRETURN(MATCH_NOMATCH);
3121 }
3122 c = *eptr++;
3123 #ifndef COMPILE_PCRE8
3124 if (c > 255)
3125 {
3126 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3127 }
3128 else
3129 #endif
3130 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3131 }
3132 }
3133 /* Control never gets here */
3134 }
3135
3136 /* If maximizing, find the longest possible run, then work backwards. */
3137
3138 else
3139 {
3140 pp = eptr;
3141
3142 #ifdef SUPPORT_UTF
3143 if (utf)
3144 {
3145 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
3146 {
3147 int len = 1;
3148 if (eptr >= md->end_subject)
3149 {
3150 SCHECK_PARTIAL();
3151 break;
3152 }
3153 GETCHARLEN(c, eptr, len);
3154 if (c > 255)
3155 {
3156 if (op == OP_CLASS) break;
3157 }
3158 else
3159 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3160 eptr += len;
3161 COST_CHK(1);
3162 }
3163
3164 if (possessive) continue; /* No backtracking */
3165
3166 for (;;) /* LOOP_COUNT: Ok */
3167 {
3168 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3169 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3170 if (eptr-- <= pp) break; /* Stop if tried at original pos */
3171 BACKCHAR(eptr);
3172 }
3173 }
3174 else
3175 #endif
3176 /* Not UTF mode */
3177 {
3178 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
3179 {
3180 if (eptr >= md->end_subject)
3181 {
3182 SCHECK_PARTIAL();
3183 break;
3184 }
3185 c = *eptr;
3186 #ifndef COMPILE_PCRE8
3187 if (c > 255)
3188 {
3189 if (op == OP_CLASS) break;
3190 }
3191 else
3192 #endif
3193 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3194 COST_CHK(1);
3195 eptr++;
3196 }
3197
3198 if (possessive) continue; /* No backtracking */
3199
3200 while (eptr >= pp) /* LOOP_COUNT: Ok */
3201 {
3202 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3203 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3204 eptr--;
3205 }
3206 }
3207
3208 RRETURN(MATCH_NOMATCH);
3209 }
3210 #undef BYTE_MAP
3211 }
3212 /* Control never gets here */
3213
3214
3215 /* Match an extended character class. In the 8-bit library, this opcode is
3216 encountered only when UTF-8 mode is supported. In the 16-bit and
3217 32-bit libraries, codepoints greater than 255 may be encountered even when
3218 UTF is not supported. */
3219
3220 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3221 case OP_XCLASS:
3222 {
3223 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3224 ecode += GET(ecode, 1); /* Advance past the item */
3225
3226 switch (*ecode)
3227 {
3228 case OP_CRSTAR:
3229 case OP_CRMINSTAR:
3230 case OP_CRPLUS:
3231 case OP_CRMINPLUS:
3232 case OP_CRQUERY:
3233 case OP_CRMINQUERY:
3234 case OP_CRPOSSTAR:
3235 case OP_CRPOSPLUS:
3236 case OP_CRPOSQUERY:
3237 c = *ecode++ - OP_CRSTAR;
3238 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3239 else possessive = TRUE;
3240 min = rep_min[c]; /* Pick up values from tables; */
3241 max = rep_max[c]; /* zero for max => infinity */
3242 if (max == 0) max = INT_MAX;
3243 break;
3244
3245 case OP_CRRANGE:
3246 case OP_CRMINRANGE:
3247 case OP_CRPOSRANGE:
3248 minimize = (*ecode == OP_CRMINRANGE);
3249 possessive = (*ecode == OP_CRPOSRANGE);
3250 min = GET2(ecode, 1);
3251 max = GET2(ecode, 1 + IMM2_SIZE);
3252 if (max == 0) max = INT_MAX;
3253 ecode += 1 + 2 * IMM2_SIZE;
3254 break;
3255
3256 default: /* No repeat follows */
3257 min = max = 1;
3258 break;
3259 }
3260
3261 /* First, ensure the minimum number of matches are present. */
3262 COST(min);
3263 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
3264 {
3265 if (eptr >= md->end_subject)
3266 {
3267 SCHECK_PARTIAL();
3268 RRETURN(MATCH_NOMATCH);
3269 }
3270 GETCHARINCTEST(c, eptr);
3271 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3272 }
3273
3274 /* If max == min we can continue with the main loop without the
3275 need to recurse. */
3276
3277 if (min == max) continue;
3278
3279 /* If minimizing, keep testing the rest of the expression and advancing
3280 the pointer while it matches the class. */
3281
3282 if (minimize)
3283 {
3284 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
3285 {
3286 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3287 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3288 if (fi >= max) RRETURN(MATCH_NOMATCH);
3289 if (eptr >= md->end_subject)
3290 {
3291 SCHECK_PARTIAL();
3292 RRETURN(MATCH_NOMATCH);
3293 }
3294 GETCHARINCTEST(c, eptr);
3295 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3296 }
3297 /* Control never gets here */
3298 }
3299
3300 /* If maximizing, find the longest possible run, then work backwards. */
3301
3302 else
3303 {
3304 pp = eptr;
3305 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
3306 {
3307 int len = 1;
3308 if (eptr >= md->end_subject)
3309 {
3310 SCHECK_PARTIAL();
3311 break;
3312 }
3313 #ifdef SUPPORT_UTF
3314 GETCHARLENTEST(c, eptr, len);
3315 #else
3316 c = *eptr;
3317 #endif
3318 if (!PRIV(xclass)(c, data, utf)) break;
3319 eptr += len;
3320 COST_CHK(1);
3321 }
3322
3323 if (possessive) continue; /* No backtracking */
3324
3325 for(;;) /* LOOP_COUNT: Ok */
3326 {
3327 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3328 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3329 if (eptr-- <= pp) break; /* Stop if tried at original pos */
3330 #ifdef SUPPORT_UTF
3331 if (utf) BACKCHAR(eptr);
3332 #endif
3333 }
3334 RRETURN(MATCH_NOMATCH);
3335 }
3336
3337 /* Control never gets here */
3338 }
3339 #endif /* End of XCLASS */
3340
3341 /* Match a single character, casefully */
3342
3343 case OP_CHAR:
3344 #ifdef SUPPORT_UTF
3345 if (utf)
3346 {
3347 length = 1;
3348 ecode++;
3349 GETCHARLEN(fc, ecode, length);
3350 if (length > md->end_subject - eptr)
3351 {
3352 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3353 RRETURN(MATCH_NOMATCH);
3354 }
3355 while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH); /* LOOP_COUNT: Ok */
3356 }
3357 else
3358 #endif
3359 /* Not UTF mode */
3360 {
3361 if (md->end_subject - eptr < 1)
3362 {
3363 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3364 RRETURN(MATCH_NOMATCH);
3365 }
3366 EDEBUGF(("code to match:%d, code is:%d",ecode[1],*eptr));
3367 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3368 ecode += 2;
3369 }
3370 break;
3371
3372 /* Match a single character, caselessly. If we are at the end of the
3373 subject, give up immediately. */
3374
3375 case OP_CHARI:
3376 if (eptr >= md->end_subject)
3377 {
3378 SCHECK_PARTIAL();
3379 RRETURN(MATCH_NOMATCH);
3380 }
3381
3382 #ifdef SUPPORT_UTF
3383 if (utf)
3384 {
3385 length = 1;
3386 ecode++;
3387 GETCHARLEN(fc, ecode, length);
3388
3389 /* If the pattern character's value is < 128, we have only one byte, and
3390 we know that its other case must also be one byte long, so we can use the
3391 fast lookup table. We know that there is at least one byte left in the
3392 subject. */
3393
3394 if (fc < 128)
3395 {
3396 pcre_uint32 cc = UCHAR21(eptr);
3397 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3398 ecode++;
3399 eptr++;
3400 }
3401
3402 /* Otherwise we must pick up the subject character. Note that we cannot
3403 use the value of "length" to check for sufficient bytes left, because the
3404 other case of the character may have more or fewer bytes. */
3405
3406 else
3407 {
3408 pcre_uint32 dc;
3409 GETCHARINC(dc, eptr);
3410 ecode += length;
3411
3412 /* If we have Unicode property support, we can use it to test the other
3413 case of the character, if there is one. */
3414
3415 if (fc != dc)
3416 {
3417 #ifdef SUPPORT_UCP
3418 if (dc != UCD_OTHERCASE(fc))
3419 #endif
3420 RRETURN(MATCH_NOMATCH);
3421 }
3422 }
3423 }
3424 else
3425 #endif /* SUPPORT_UTF */
3426
3427 /* Not UTF mode */
3428 {
3429 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3430 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3431 eptr++;
3432 ecode += 2;
3433 }
3434 break;
3435
3436 /* Match a single character repeatedly. */
3437
3438 case OP_EXACT:
3439 case OP_EXACTI:
3440 min = max = GET2(ecode, 1);
3441 ecode += 1 + IMM2_SIZE;
3442 goto REPEATCHAR;
3443
3444 case OP_POSUPTO:
3445 case OP_POSUPTOI:
3446 possessive = TRUE;
3447 /* Fall through */
3448
3449 case OP_UPTO:
3450 case OP_UPTOI:
3451 case OP_MINUPTO:
3452 case OP_MINUPTOI:
3453 min = 0;
3454 max = GET2(ecode, 1);
3455 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3456 ecode += 1 + IMM2_SIZE;
3457 goto REPEATCHAR;
3458
3459 case OP_POSSTAR:
3460 case OP_POSSTARI:
3461 possessive = TRUE;
3462 min = 0;
3463 max = INT_MAX;
3464 ecode++;
3465 goto REPEATCHAR;
3466
3467 case OP_POSPLUS:
3468 case OP_POSPLUSI:
3469 possessive = TRUE;
3470 min = 1;
3471 max = INT_MAX;
3472 ecode++;
3473 goto REPEATCHAR;
3474
3475 case OP_POSQUERY:
3476 case OP_POSQUERYI:
3477 possessive = TRUE;
3478 min = 0;
3479 max = 1;
3480 ecode++;
3481 goto REPEATCHAR;
3482
3483 case OP_STAR:
3484 case OP_STARI:
3485 case OP_MINSTAR:
3486 case OP_MINSTARI:
3487 case OP_PLUS:
3488 case OP_PLUSI:
3489 case OP_MINPLUS:
3490 case OP_MINPLUSI:
3491 case OP_QUERY:
3492 case OP_QUERYI:
3493 case OP_MINQUERY:
3494 case OP_MINQUERYI:
3495 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3496 minimize = (c & 1) != 0;
3497 min = rep_min[c]; /* Pick up values from tables; */
3498 max = rep_max[c]; /* zero for max => infinity */
3499 if (max == 0) max = INT_MAX;
3500
3501 /* Common code for all repeated single-character matches. We first check
3502 for the minimum number of characters. If the minimum equals the maximum, we
3503 are done. Otherwise, if minimizing, check the rest of the pattern for a
3504 match; if there isn't one, advance up to the maximum, one character at a
3505 time.
3506
3507 If maximizing, advance up to the maximum number of matching characters,
3508 until eptr is past the end of the maximum run. If possessive, we are
3509 then done (no backing up). Otherwise, match at this position; anything
3510 other than no match is immediately returned. For nomatch, back up one
3511 character, unless we are matching \R and the last thing matched was
3512 \r\n, in which case, back up two bytes. When we reach the first optional
3513 character position, we can save stack by doing a tail recurse.
3514
3515 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3516 for speed. */
3517
3518 REPEATCHAR:
3519 #ifdef SUPPORT_UTF
3520 if (utf)
3521 {
3522 length = 1;
3523 charptr = ecode;
3524 GETCHARLEN(fc, ecode, length);
3525 ecode += length;
3526
3527 /* Handle multibyte character matching specially here. There is
3528 support for caseless matching if UCP support is present. */
3529
3530 if (length > 1)
3531 {
3532 #ifdef SUPPORT_UCP
3533 pcre_uint32 othercase;
3534 if (op >= OP_STARI && /* Caseless */
3535 (othercase = UCD_OTHERCASE(fc)) != fc)
3536 oclength = PRIV(ord2utf)(othercase, occhars);
3537 else oclength = 0;
3538 #endif /* SUPPORT_UCP */
3539 COST(min);
3540 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
3541 {
3542 if (eptr <= md->end_subject - length &&
3543 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3544 #ifdef SUPPORT_UCP
3545 else if (oclength > 0 &&
3546 eptr <= md->end_subject - oclength &&
3547 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3548 #endif /* SUPPORT_UCP */
3549 else
3550 {
3551 CHECK_PARTIAL();
3552 RRETURN(MATCH_NOMATCH);
3553 }
3554 }
3555
3556 if (min == max) continue;
3557
3558 if (minimize)
3559 {
3560 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
3561 {
3562 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3563 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3564 if (fi >= max) RRETURN(MATCH_NOMATCH);
3565 if (eptr <= md->end_subject - length &&
3566 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3567 #ifdef SUPPORT_UCP
3568 else if (oclength > 0 &&
3569 eptr <= md->end_subject - oclength &&
3570 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3571 #endif /* SUPPORT_UCP */
3572 else
3573 {
3574 CHECK_PARTIAL();
3575 RRETURN(MATCH_NOMATCH);
3576 }
3577 }
3578 /* Control never gets here */
3579 }
3580
3581 else /* Maximize */
3582 {
3583 pp = eptr;
3584 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
3585 {
3586 if (eptr <= md->end_subject - length &&
3587 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3588 #ifdef SUPPORT_UCP
3589 else if (oclength > 0 &&
3590 eptr <= md->end_subject - oclength &&
3591 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3592 #endif /* SUPPORT_UCP */
3593 else
3594 {
3595 CHECK_PARTIAL();
3596 break;
3597 }
3598 COST_CHK(1);
3599 }
3600
3601 if (possessive) continue; /* No backtracking */
3602 for(;;) /* LOOP_COUNT: Ok */
3603 {
3604 if (eptr <= pp) goto TAIL_RECURSE;
3605 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3606 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3607 #ifdef SUPPORT_UCP
3608 eptr--;
3609 BACKCHAR(eptr);
3610 #else /* without SUPPORT_UCP */
3611 eptr -= length;
3612 #endif /* SUPPORT_UCP */
3613 }
3614 }
3615 /* Control never gets here */
3616 }
3617
3618 /* If the length of a UTF-8 character is 1, we fall through here, and
3619 obey the code as for non-UTF-8 characters below, though in this case the
3620 value of fc will always be < 128. */
3621 }
3622 else
3623 #endif /* SUPPORT_UTF */
3624 /* When not in UTF-8 mode, load a single-byte character. */
3625 fc = *ecode++;
3626
3627 /* The value of fc at this point is always one character, though we may
3628 or may not be in UTF mode. The code is duplicated for the caseless and
3629 caseful cases, for speed, since matching characters is likely to be quite
3630 common. First, ensure the minimum number of matches are present. If min =
3631 max, continue at the same level without recursing. Otherwise, if
3632 minimizing, keep trying the rest of the expression and advancing one
3633 matching character if failing, up to the maximum. Alternatively, if
3634 maximizing, find the maximum number of characters and work backwards. */
3635
3636 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3637 max, (char *)eptr));
3638
3639 if (op >= OP_STARI) /* Caseless */
3640 {
3641 #ifdef COMPILE_PCRE8
3642 /* fc must be < 128 if UTF is enabled. */
3643 foc = md->fcc[fc];
3644 #else
3645 #ifdef SUPPORT_UTF
3646 #ifdef SUPPORT_UCP
3647 if (utf && fc > 127)
3648 foc = UCD_OTHERCASE(fc);
3649 #else
3650 if (utf && fc > 127)
3651 foc = fc;
3652 #endif /* SUPPORT_UCP */
3653 else
3654 #endif /* SUPPORT_UTF */
3655 foc = TABLE_GET(fc, md->fcc, fc);
3656 #endif /* COMPILE_PCRE8 */
3657
3658 for (i = 1; i <= min; i++) /* LOOP_COUNT: CHK */
3659 {
3660 pcre_uint32 cc; /* Faster than pcre_uchar */
3661 if (eptr >= md->end_subject)
3662 {
3663 SCHECK_PARTIAL();
3664 RRETURN(MATCH_NOMATCH);
3665 }
3666 cc = UCHAR21TEST(eptr);
3667 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3668 eptr++;
3669 COST_CHK(1);
3670 }
3671 if (min == max) continue;
3672 if (minimize)
3673 {
3674 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
3675 {
3676 pcre_uint32 cc; /* Faster than pcre_uchar */
3677 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3678 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3679 if (fi >= max) RRETURN(MATCH_NOMATCH);
3680 if (eptr >= md->end_subject)
3681 {
3682 SCHECK_PARTIAL();
3683 RRETURN(MATCH_NOMATCH);
3684 }
3685 cc = UCHAR21TEST(eptr);
3686 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3687 eptr++;
3688 }
3689 /* Control never gets here */
3690 }
3691 else /* Maximize */
3692 {
3693 pp = eptr;
3694 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
3695 {
3696 pcre_uint32 cc; /* Faster than pcre_uchar */
3697 if (eptr >= md->end_subject)
3698 {
3699 SCHECK_PARTIAL();
3700 break;
3701 }
3702 cc = UCHAR21TEST(eptr);
3703 if (fc != cc && foc != cc) break;
3704 eptr++;
3705 COST_CHK(1);
3706 }
3707 if (possessive) continue; /* No backtracking */
3708 for (;;) /* LOOP_COUNT: Ok */
3709 {
3710 if (eptr == pp) goto TAIL_RECURSE;
3711 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3712 eptr--;
3713 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3714 }
3715 /* Control never gets here */
3716 }
3717 }
3718
3719 /* Caseful comparisons (includes all multi-byte characters) */
3720
3721 else
3722 {
3723 COST(min);
3724 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
3725 {
3726 if (eptr >= md->end_subject)
3727 {
3728 SCHECK_PARTIAL();
3729 RRETURN(MATCH_NOMATCH);
3730 }
3731 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3732 }
3733
3734 if (min == max) continue;
3735
3736 if (minimize)
3737 {
3738 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
3739 {
3740 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3741 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3742 if (fi >= max) RRETURN(MATCH_NOMATCH);
3743 if (eptr >= md->end_subject)
3744 {
3745 SCHECK_PARTIAL();
3746 RRETURN(MATCH_NOMATCH);
3747 }
3748 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3749 }
3750 /* Control never gets here */
3751 }
3752 else /* Maximize */
3753 {
3754 pp = eptr;
3755 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
3756 {
3757 if (eptr >= md->end_subject)
3758 {
3759 SCHECK_PARTIAL();
3760 break;
3761 }
3762 if (fc != UCHAR21TEST(eptr)) break;
3763 eptr++;
3764 COST_CHK(1);
3765 }
3766 if (possessive) continue; /* No backtracking */
3767 for (;;) /* LOOP_COUNT: Ok */
3768 {
3769 if (eptr == pp) goto TAIL_RECURSE;
3770 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3771 eptr--;
3772 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3773 }
3774 /* Control never gets here */
3775 }
3776 }
3777 /* Control never gets here */
3778
3779 /* Match a negated single character. In UTF mode both the pattern character
3780 and the subject character we are checking can be multibyte. */
3781
3782 case OP_NOT:
3783 case OP_NOTI:
3784 if (eptr >= md->end_subject)
3785 {
3786 SCHECK_PARTIAL();
3787 RRETURN(MATCH_NOMATCH);
3788 }
3789 #ifdef SUPPORT_UTF
3790 if (utf)
3791 {
3792 register pcre_uint32 ch, och;
3793
3794 ecode++;
3795 GETCHARINC(ch, ecode);
3796 GETCHARINC(c, eptr);
3797
3798 if (op == OP_NOT)
3799 {
3800 if (ch == c) RRETURN(MATCH_NOMATCH);
3801 }
3802 else
3803 {
3804 #ifdef SUPPORT_UCP
3805 if (ch > 127)
3806 och = UCD_OTHERCASE(ch);
3807 #else
3808 if (ch > 127)
3809 och = ch;
3810 #endif /* SUPPORT_UCP */
3811 else
3812 och = TABLE_GET(ch, md->fcc, ch);
3813 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3814 }
3815 }
3816 else
3817 #endif
3818 {
3819 register pcre_uint32 ch = ecode[1];
3820 c = *eptr++;
3821 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3822 RRETURN(MATCH_NOMATCH);
3823 ecode += 2;
3824 }
3825 break;
3826
3827 /* Match a negated single one-byte character repeatedly. This is almost a
3828 repeat of the code for a repeated single character, but I haven't found a
3829 nice way of commoning these up that doesn't require a test of the
3830 positive/negative option for each character match. Maybe that wouldn't add
3831 very much to the time taken, but character matching *is* what this is all
3832 about... */
3833
3834 case OP_NOTEXACT:
3835 case OP_NOTEXACTI:
3836 min = max = GET2(ecode, 1);
3837 ecode += 1 + IMM2_SIZE;
3838 goto REPEATNOTCHAR;
3839
3840 case OP_NOTUPTO:
3841 case OP_NOTUPTOI:
3842 case OP_NOTMINUPTO:
3843 case OP_NOTMINUPTOI:
3844 min = 0;
3845 max = GET2(ecode, 1);
3846 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3847 ecode += 1 + IMM2_SIZE;
3848 goto REPEATNOTCHAR;
3849
3850 case OP_NOTPOSSTAR:
3851 case OP_NOTPOSSTARI:
3852 possessive = TRUE;
3853 min = 0;
3854 max = INT_MAX;
3855 ecode++;
3856 goto REPEATNOTCHAR;
3857
3858 case OP_NOTPOSPLUS:
3859 case OP_NOTPOSPLUSI:
3860 possessive = TRUE;
3861 min = 1;
3862 max = INT_MAX;
3863 ecode++;
3864 goto REPEATNOTCHAR;
3865
3866 case OP_NOTPOSQUERY:
3867 case OP_NOTPOSQUERYI:
3868 possessive = TRUE;
3869 min = 0;
3870 max = 1;
3871 ecode++;
3872 goto REPEATNOTCHAR;
3873
3874 case OP_NOTPOSUPTO:
3875 case OP_NOTPOSUPTOI:
3876 possessive = TRUE;
3877 min = 0;
3878 max = GET2(ecode, 1);
3879 ecode += 1 + IMM2_SIZE;
3880 goto REPEATNOTCHAR;
3881
3882 case OP_NOTSTAR:
3883 case OP_NOTSTARI:
3884 case OP_NOTMINSTAR:
3885 case OP_NOTMINSTARI:
3886 case OP_NOTPLUS:
3887 case OP_NOTPLUSI:
3888 case OP_NOTMINPLUS:
3889 case OP_NOTMINPLUSI:
3890 case OP_NOTQUERY:
3891 case OP_NOTQUERYI:
3892 case OP_NOTMINQUERY:
3893 case OP_NOTMINQUERYI:
3894 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3895 minimize = (c & 1) != 0;
3896 min = rep_min[c]; /* Pick up values from tables; */
3897 max = rep_max[c]; /* zero for max => infinity */
3898 if (max == 0) max = INT_MAX;
3899
3900 /* Common code for all repeated single-byte matches. */
3901
3902 REPEATNOTCHAR:
3903 GETCHARINCTEST(fc, ecode);
3904
3905 /* The code is duplicated for the caseless and caseful cases, for speed,
3906 since matching characters is likely to be quite common. First, ensure the
3907 minimum number of matches are present. If min = max, continue at the same
3908 level without recursing. Otherwise, if minimizing, keep trying the rest of
3909 the expression and advancing one matching character if failing, up to the
3910 maximum. Alternatively, if maximizing, find the maximum number of
3911 characters and work backwards. */
3912
3913 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3914 max, (char *)eptr));
3915
3916 if (op >= OP_NOTSTARI) /* Caseless */
3917 {
3918 #ifdef SUPPORT_UTF
3919 #ifdef SUPPORT_UCP
3920 if (utf && fc > 127)
3921 foc = UCD_OTHERCASE(fc);
3922 #else
3923 if (utf && fc > 127)
3924 foc = fc;
3925 #endif /* SUPPORT_UCP */
3926 else
3927 #endif /* SUPPORT_UTF */
3928 foc = TABLE_GET(fc, md->fcc, fc);
3929
3930 #ifdef SUPPORT_UTF
3931 if (utf)
3932 {
3933 register pcre_uint32 d;
3934 COST(min);
3935 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
3936 {
3937 if (eptr >= md->end_subject)
3938 {
3939 SCHECK_PARTIAL();
3940 RRETURN(MATCH_NOMATCH);
3941 }
3942 GETCHARINC(d, eptr);
3943 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3944 }
3945 }
3946 else
3947 #endif /* SUPPORT_UTF */
3948 /* Not UTF mode */
3949 {
3950 COST(min);
3951 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
3952 {
3953 if (eptr >= md->end_subject)
3954 {
3955 SCHECK_PARTIAL();
3956 RRETURN(MATCH_NOMATCH);
3957 }
3958 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3959 eptr++;
3960 }
3961 }
3962
3963 if (min == max) continue;
3964
3965 if (minimize)
3966 {
3967 #ifdef SUPPORT_UTF
3968 if (utf)
3969 {
3970 register pcre_uint32 d;
3971 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
3972 {
3973 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3974 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3975 if (fi >= max) RRETURN(MATCH_NOMATCH);
3976 if (eptr >= md->end_subject)
3977 {
3978 SCHECK_PARTIAL();
3979 RRETURN(MATCH_NOMATCH);
3980 }
3981 GETCHARINC(d, eptr);
3982 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3983 }
3984 }
3985 else
3986 #endif /*SUPPORT_UTF */
3987 /* Not UTF mode */
3988 {
3989 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
3990 {
3991 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3992 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3993 if (fi >= max) RRETURN(MATCH_NOMATCH);
3994 if (eptr >= md->end_subject)
3995 {
3996 SCHECK_PARTIAL();
3997 RRETURN(MATCH_NOMATCH);
3998 }
3999 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
4000 eptr++;
4001 }
4002 }
4003 /* Control never gets here */
4004 }
4005
4006 /* Maximize case */
4007
4008 else
4009 {
4010 pp = eptr;
4011
4012 #ifdef SUPPORT_UTF
4013 if (utf)
4014 {
4015 register pcre_uint32 d;
4016 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
4017 {
4018 int len = 1;
4019 if (eptr >= md->end_subject)
4020 {
4021 SCHECK_PARTIAL();
4022 break;
4023 }
4024 GETCHARLEN(d, eptr, len);
4025 if (fc == d || (unsigned int)foc == d) break;
4026 eptr += len;
4027 COST_CHK(1); /* 'd' is not alive */
4028 }
4029 if (possessive) continue; /* No backtracking */
4030 for(;;) /* LOOP_COUNT: Ok */
4031 {
4032 if (eptr <= pp) goto TAIL_RECURSE;
4033 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
4034 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4035 eptr--;
4036 BACKCHAR(eptr);
4037 }
4038 }
4039 else
4040 #endif /* SUPPORT_UTF */
4041 /* Not UTF mode */
4042 {
4043 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
4044 {
4045 if (eptr >= md->end_subject)
4046 {
4047 SCHECK_PARTIAL();
4048 break;
4049 }
4050 if (fc == *eptr || foc == *eptr) break;
4051 eptr++;
4052 COST_CHK(1);
4053 }
4054 if (possessive) continue; /* No backtracking */
4055 for (;;) /* LOOP_COUNT: Ok */
4056 {
4057 if (eptr == pp) goto TAIL_RECURSE;
4058 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
4059 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4060 eptr--;
4061 }
4062 }
4063 /* Control never gets here */
4064 }
4065 }
4066
4067 /* Caseful comparisons */
4068
4069 else
4070 {
4071 #ifdef SUPPORT_UTF
4072 if (utf)
4073 {
4074 register pcre_uint32 d;
4075 for (i = 1; i <= min; i++) /* LOOP_COUNT: CHK */
4076 {
4077 if (eptr >= md->end_subject)
4078 {
4079 SCHECK_PARTIAL();
4080 RRETURN(MATCH_NOMATCH);
4081 }
4082 GETCHARINC(d, eptr);
4083 if (fc == d) RRETURN(MATCH_NOMATCH);
4084 COST_CHK(1);
4085 }
4086 }
4087 else
4088 #endif
4089 /* Not UTF mode */
4090 {
4091 COST(min);
4092 for (i = 1; i <= min; i++) /* LOOP_COUNT: Cost */
4093 {
4094 if (eptr >= md->end_subject)
4095 {
4096 SCHECK_PARTIAL();
4097 RRETURN(MATCH_NOMATCH);
4098 }
4099 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4100 }
4101 }
4102
4103 if (min == max) continue;
4104
4105 if (minimize)
4106 {
4107 #ifdef SUPPORT_UTF
4108 if (utf)
4109 {
4110 register pcre_uint32 d;
4111 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
4112 {
4113 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
4114 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4115 if (fi >= max) RRETURN(MATCH_NOMATCH);
4116 if (eptr >= md->end_subject)
4117 {
4118 SCHECK_PARTIAL();
4119 RRETURN(MATCH_NOMATCH);
4120 }
4121 GETCHARINC(d, eptr);
4122 if (fc == d) RRETURN(MATCH_NOMATCH);
4123 }
4124 }
4125 else
4126 #endif
4127 /* Not UTF mode */
4128 {
4129 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
4130 {
4131 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
4132 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4133 if (fi >= max) RRETURN(MATCH_NOMATCH);
4134 if (eptr >= md->end_subject)
4135 {
4136 SCHECK_PARTIAL();
4137 RRETURN(MATCH_NOMATCH);
4138 }
4139 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4140 }
4141 }
4142 /* Control never gets here */
4143 }
4144
4145 /* Maximize case */
4146
4147 else
4148 {
4149 pp = eptr;
4150
4151 #ifdef SUPPORT_UTF
4152 if (utf)
4153 {
4154 register pcre_uint32 d;
4155 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
4156 {
4157 int len = 1;
4158 if (eptr >= md->end_subject)
4159 {
4160 SCHECK_PARTIAL();
4161 break;
4162 }
4163 GETCHARLEN(d, eptr, len);
4164 if (fc == d) break;
4165 eptr += len;
4166 COST_CHK(1);
4167 }
4168 if (possessive) continue; /* No backtracking */
4169 for(;;) /* LOOP_COUNT: Ok */
4170 {
4171 if (eptr <= pp) goto TAIL_RECURSE;
4172 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4173 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4174 eptr--;
4175 BACKCHAR(eptr);
4176 }
4177 }
4178 else
4179 #endif
4180 /* Not UTF mode */
4181 {
4182 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
4183 {
4184 if (eptr >= md->end_subject)
4185 {
4186 SCHECK_PARTIAL();
4187 break;
4188 }
4189 if (fc == *eptr) break;
4190 eptr++;
4191 COST_CHK(1);
4192 }
4193 if (possessive) continue; /* No backtracking */
4194 for (;;) /* LOOP_COUNT: Ok */
4195 {
4196 if (eptr == pp) goto TAIL_RECURSE;
4197 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4198 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4199 eptr--;
4200 }
4201 }
4202 /* Control never gets here */
4203 }
4204 }
4205 /* Control never gets here */
4206
4207 /* Match a single character type repeatedly; several different opcodes
4208 share code. This is very similar to the code for single characters, but we
4209 repeat it in the interests of efficiency. */
4210
4211 case OP_TYPEEXACT:
4212 min = max = GET2(ecode, 1);
4213 minimize = TRUE;
4214 ecode += 1 + IMM2_SIZE;
4215 goto REPEATTYPE;
4216
4217 case OP_TYPEUPTO:
4218 case OP_TYPEMINUPTO:
4219 min = 0;
4220 max = GET2(ecode, 1);
4221 minimize = *ecode == OP_TYPEMINUPTO;
4222 ecode += 1 + IMM2_SIZE;
4223 goto REPEATTYPE;
4224
4225 case OP_TYPEPOSSTAR:
4226 possessive = TRUE;
4227 min = 0;
4228 max = INT_MAX;
4229 ecode++;
4230 goto REPEATTYPE;
4231
4232 case OP_TYPEPOSPLUS:
4233 possessive = TRUE;
4234 min = 1;
4235 max = INT_MAX;
4236 ecode++;
4237 goto REPEATTYPE;
4238
4239 case OP_TYPEPOSQUERY:
4240 possessive = TRUE;
4241 min = 0;
4242 max = 1;
4243 ecode++;
4244 goto REPEATTYPE;
4245
4246 case OP_TYPEPOSUPTO:
4247 possessive = TRUE;
4248 min = 0;
4249 max = GET2(ecode, 1);
4250 ecode += 1 + IMM2_SIZE;
4251 goto REPEATTYPE;
4252
4253 case OP_TYPESTAR:
4254 case OP_TYPEMINSTAR:
4255 case OP_TYPEPLUS:
4256 case OP_TYPEMINPLUS:
4257 case OP_TYPEQUERY:
4258 case OP_TYPEMINQUERY:
4259 c = *ecode++ - OP_TYPESTAR;
4260 minimize = (c & 1) != 0;
4261 min = rep_min[c]; /* Pick up values from tables; */
4262 max = rep_max[c]; /* zero for max => infinity */
4263 if (max == 0) max = INT_MAX;
4264
4265 /* Common code for all repeated single character type matches. Note that in
4266 UTF mode '.' and the negated, space and newline types can match a multi-unit
4267 character; the OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR minimum loops cannot. */
4268
4269 REPEATTYPE:
4270 ctype = *ecode++; /* Code for the character type */
4271
4272 #ifdef SUPPORT_UCP
4273 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4274 {
4275 prop_fail_result = ctype == OP_NOTPROP;
4276 prop_type = *ecode++;
4277 prop_value = *ecode++;
4278 }
4279 else prop_type = -1;
4280 #endif
4281
4282 /* First, ensure the minimum number of matches are present. Use inline
4283 code for maximizing the speed, and do the type test once at the start
4284 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4285 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4286 and single-bytes. */
4287
4288 if (min > 0)
4289 {
4290 #ifdef SUPPORT_UCP
4291 if (prop_type >= 0)
4292 {
4293 COST(min);
4294 switch(prop_type)
4295 {
4296 case PT_ANY:
4297 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4298 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST (above) */
4299 {
4300 if (eptr >= md->end_subject)
4301 {
4302 SCHECK_PARTIAL();
4303 RRETURN(MATCH_NOMATCH);
4304 }
4305 GETCHARINCTEST(c, eptr);
4306 }
4307 break;
4308
4309 case PT_LAMP:
4310 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST (above) */
4311 {
4312 int chartype;
4313 if (eptr >= md->end_subject)
4314 {
4315 SCHECK_PARTIAL();
4316 RRETURN(MATCH_NOMATCH);
4317 }
4318 GETCHARINCTEST(c, eptr);
4319 chartype = UCD_CHARTYPE(c);
4320 if ((chartype == ucp_Lu ||
4321 chartype == ucp_Ll ||
4322 chartype == ucp_Lt) == prop_fail_result)
4323 RRETURN(MATCH_NOMATCH);
4324 }
4325 break;
4326
4327 case PT_GC:
4328 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST (above) */
4329 {
4330 if (eptr >= md->end_subject)
4331 {
4332 SCHECK_PARTIAL();
4333 RRETURN(MATCH_NOMATCH);
4334 }
4335 GETCHARINCTEST(c, eptr);
4336 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4337 RRETURN(MATCH_NOMATCH);
4338 }
4339 break;
4340
4341 case PT_PC:
4342 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST (above) */
4343 {
4344 if (eptr >= md->end_subject)
4345 {
4346 SCHECK_PARTIAL();
4347 RRETURN(MATCH_NOMATCH);
4348 }
4349 GETCHARINCTEST(c, eptr);
4350 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4351 RRETURN(MATCH_NOMATCH);
4352 }
4353 break;
4354
4355 case PT_SC:
4356 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST (above) */
4357 {
4358 if (eptr >= md->end_subject)
4359 {
4360 SCHECK_PARTIAL();
4361 RRETURN(MATCH_NOMATCH);
4362 }
4363 GETCHARINCTEST(c, eptr);
4364 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4365 RRETURN(MATCH_NOMATCH);
4366 }
4367 break;
4368
4369 case PT_ALNUM: /* LOOP_COUNT: COST (above) */
4370 for (i = 1; i <= min; i++)
4371 {
4372 int category;
4373 if (eptr >= md->end_subject)
4374 {
4375 SCHECK_PARTIAL();
4376 RRETURN(MATCH_NOMATCH);
4377 }
4378 GETCHARINCTEST(c, eptr);
4379 category = UCD_CATEGORY(c);
4380 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4381 RRETURN(MATCH_NOMATCH);
4382 }
4383 break;
4384
4385 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4386 which means that Perl space and POSIX space are now identical. PCRE
4387 was changed at release 8.34. */
4388
4389 case PT_SPACE: /* Perl space */
4390 case PT_PXSPACE: /* POSIX space */
4391 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST (above) */
4392 {
4393 if (eptr >= md->end_subject)
4394 {
4395 SCHECK_PARTIAL();
4396 RRETURN(MATCH_NOMATCH);
4397 }
4398 GETCHARINCTEST(c, eptr);
4399 switch(c)
4400 {
4401 HSPACE_CASES:
4402 VSPACE_CASES:
4403 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4404 break;
4405
4406 default:
4407 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4408 RRETURN(MATCH_NOMATCH);
4409 break;
4410 }
4411 }
4412 break;
4413
4414 case PT_WORD:
4415 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST (above) */
4416 {
4417 int category;
4418 if (eptr >= md->end_subject)
4419 {
4420 SCHECK_PARTIAL();
4421 RRETURN(MATCH_NOMATCH);
4422 }
4423 GETCHARINCTEST(c, eptr);
4424 category = UCD_CATEGORY(c);
4425 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4426 == prop_fail_result)
4427 RRETURN(MATCH_NOMATCH);
4428 }
4429 break;
4430
4431 case PT_CLIST:
4432 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST (above) */
4433 {
4434 const pcre_uint32 *cp;
4435 if (eptr >= md->end_subject)
4436 {
4437 SCHECK_PARTIAL();
4438 RRETURN(MATCH_NOMATCH);
4439 }
4440 GETCHARINCTEST(c, eptr);
4441 cp = PRIV(ucd_caseless_sets) + prop_value;
4442 for (;;) /* LOOP_COUNT: COST */
4443 {
4444 if (c < *cp)
4445 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4446 if (c == *cp++)
4447 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4448 COST(1);
4449 }
4450 }
4451 break;
4452
4453 case PT_UCNC:
4454 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST (above) */
4455 {
4456 if (eptr >= md->end_subject)
4457 {
4458 SCHECK_PARTIAL();
4459 RRETURN(MATCH_NOMATCH);
4460 }
4461 GETCHARINCTEST(c, eptr);
4462 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4463 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4464 c >= 0xe000) == prop_fail_result)
4465 RRETURN(MATCH_NOMATCH);
4466 }
4467 break;
4468
4469 /* This should not occur */
4470
4471 default:
4472 RRETURN(PCRE_ERROR_INTERNAL);
4473 }
4474 }
4475
4476 /* Match extended Unicode sequences. We will get here only if the
4477 support is in the binary; otherwise a compile-time error occurs. */
4478
4479 else if (ctype == OP_EXTUNI)
4480 {
4481 COST(min);
4482 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4483 {
4484 if (eptr >= md->end_subject)
4485 {
4486 SCHECK_PARTIAL();
4487 RRETURN(MATCH_NOMATCH);
4488 }
4489 else
4490 {
4491 #ifndef ERLANG_INTEGRATION
4492 int lgb, rgb;
4493 #endif
4494 GETCHARINCTEST(c, eptr);
4495 lgb = UCD_GRAPHBREAK(c);
4496 while (eptr < md->end_subject) /* LOOP_COUNT: CHK */
4497 {
4498 int len = 1;
4499 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4500 rgb = UCD_GRAPHBREAK(c);
4501 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4502 lgb = rgb;
4503 eptr += len;
4504 COST_CHK(1);
4505 }
4506 }
4507 CHECK_PARTIAL();
4508 }
4509 }
4510
4511 else
4512 #endif /* SUPPORT_UCP */
4513
4514 /* Handle all other cases when the coding is UTF-8 */
4515
4516 #ifdef SUPPORT_UTF
4517 if (utf) switch(ctype)
4518 {
4519 case OP_ANY:
4520 COST(min);
4521 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4522 {
4523 if (eptr >= md->end_subject)
4524 {
4525 SCHECK_PARTIAL();
4526 RRETURN(MATCH_NOMATCH);
4527 }
4528 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4529 if (md->partial != 0 &&
4530 eptr + 1 >= md->end_subject &&
4531 NLBLOCK->nltype == NLTYPE_FIXED &&
4532 NLBLOCK->nllen == 2 &&
4533 UCHAR21(eptr) == NLBLOCK->nl[0])
4534 {
4535 md->hitend = TRUE;
4536 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4537 }
4538 eptr++;
4539 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4540 }
4541 break;
4542
4543 case OP_ALLANY:
4544 COST(min);
4545 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4546 {
4547 if (eptr >= md->end_subject)
4548 {
4549 SCHECK_PARTIAL();
4550 RRETURN(MATCH_NOMATCH);
4551 }
4552 eptr++;
4553 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4554 }
4555 break;
4556
4557 case OP_ANYBYTE:
4558 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4559 eptr += min;
4560 break;
4561
4562 case OP_ANYNL:
4563 COST(min);
4564 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4565 {
4566 if (eptr >= md->end_subject)
4567 {
4568 SCHECK_PARTIAL();
4569 RRETURN(MATCH_NOMATCH);
4570 }
4571 GETCHARINC(c, eptr);
4572 switch(c)
4573 {
4574 default: RRETURN(MATCH_NOMATCH);
4575
4576 case CHAR_CR:
4577 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
4578 break;
4579
4580 case CHAR_LF:
4581 break;
4582
4583 case CHAR_VT:
4584 case CHAR_FF:
4585 case CHAR_NEL:
4586 #ifndef EBCDIC
4587 case 0x2028:
4588 case 0x2029:
4589 #endif /* Not EBCDIC */
4590 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4591 break;
4592 }
4593 }
4594 break;
4595
4596 case OP_NOT_HSPACE:
4597 COST(min);
4598 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4599 {
4600 if (eptr >= md->end_subject)
4601 {
4602 SCHECK_PARTIAL();
4603 RRETURN(MATCH_NOMATCH);
4604 }
4605 GETCHARINC(c, eptr);
4606 switch(c)
4607 {
4608 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4609 default: break;
4610 }
4611 }
4612 break;
4613
4614 case OP_HSPACE:
4615 COST(min);
4616 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4617 {
4618 if (eptr >= md->end_subject)
4619 {
4620 SCHECK_PARTIAL();
4621 RRETURN(MATCH_NOMATCH);
4622 }
4623 GETCHARINC(c, eptr);
4624 switch(c)
4625 {
4626 HSPACE_CASES: break; /* Byte and multibyte cases */
4627 default: RRETURN(MATCH_NOMATCH);
4628 }
4629 }
4630 break;
4631
4632 case OP_NOT_VSPACE:
4633 COST(min);
4634 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4635 {
4636 if (eptr >= md->end_subject)
4637 {
4638 SCHECK_PARTIAL();
4639 RRETURN(MATCH_NOMATCH);
4640 }
4641 GETCHARINC(c, eptr);
4642 switch(c)
4643 {
4644 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4645 default: break;
4646 }
4647 }
4648 break;
4649
4650 case OP_VSPACE:
4651 COST(min);
4652 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4653 {
4654 if (eptr >= md->end_subject)
4655 {
4656 SCHECK_PARTIAL();
4657 RRETURN(MATCH_NOMATCH);
4658 }
4659 GETCHARINC(c, eptr);
4660 switch(c)
4661 {
4662 VSPACE_CASES: break;
4663 default: RRETURN(MATCH_NOMATCH);
4664 }
4665 }
4666 break;
4667
4668 case OP_NOT_DIGIT:
4669 COST(min);
4670 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4671 {
4672 if (eptr >= md->end_subject)
4673 {
4674 SCHECK_PARTIAL();
4675 RRETURN(MATCH_NOMATCH);
4676 }
4677 GETCHARINC(c, eptr);
4678 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4679 RRETURN(MATCH_NOMATCH);
4680 }
4681 break;
4682
4683 case OP_DIGIT:
4684 COST(min);
4685 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4686 {
4687 pcre_uint32 cc;
4688 if (eptr >= md->end_subject)
4689 {
4690 SCHECK_PARTIAL();
4691 RRETURN(MATCH_NOMATCH);
4692 }
4693 cc = UCHAR21(eptr);
4694 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4695 RRETURN(MATCH_NOMATCH);
4696 eptr++;
4697 /* No need to skip more bytes - we know it's a 1-byte character */
4698 }
4699 break;
4700
4701 case OP_NOT_WHITESPACE:
4702 COST(min);
4703 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4704 {
4705 pcre_uint32 cc;
4706 if (eptr >= md->end_subject)
4707 {
4708 SCHECK_PARTIAL();
4709 RRETURN(MATCH_NOMATCH);
4710 }
4711 cc = UCHAR21(eptr);
4712 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4713 RRETURN(MATCH_NOMATCH);
4714 eptr++;
4715 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4716 }
4717 break;
4718
4719 case OP_WHITESPACE:
4720 COST(min);
4721 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4722 {
4723 pcre_uint32 cc;
4724 if (eptr >= md->end_subject)
4725 {
4726 SCHECK_PARTIAL();
4727 RRETURN(MATCH_NOMATCH);
4728 }
4729 cc = UCHAR21(eptr);
4730 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4731 RRETURN(MATCH_NOMATCH);
4732 eptr++;
4733 /* No need to skip more bytes - we know it's a 1-byte character */
4734 }
4735 break;
4736
4737 case OP_NOT_WORDCHAR:
4738 COST(min);
4739 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4740 {
4741 pcre_uint32 cc;
4742 if (eptr >= md->end_subject)
4743 {
4744 SCHECK_PARTIAL();
4745 RRETURN(MATCH_NOMATCH);
4746 }
4747 cc = UCHAR21(eptr);
4748 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4749 RRETURN(MATCH_NOMATCH);
4750 eptr++;
4751 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4752 }
4753 break;
4754
4755 case OP_WORDCHAR:
4756 COST(min);
4757 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4758 {
4759 pcre_uint32 cc;
4760 if (eptr >= md->end_subject)
4761 {
4762 SCHECK_PARTIAL();
4763 RRETURN(MATCH_NOMATCH);
4764 }
4765 cc = UCHAR21(eptr);
4766 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4767 RRETURN(MATCH_NOMATCH);
4768 eptr++;
4769 /* No need to skip more bytes - we know it's a 1-byte character */
4770 }
4771 break;
4772
4773 default:
4774 RRETURN(PCRE_ERROR_INTERNAL);
4775 } /* End switch(ctype) */
4776
4777 else
4778 #endif /* SUPPORT_UTF */
4779
4780 /* Code for the non-UTF-8 case for minimum matching of operators other
4781 than OP_PROP and OP_NOTPROP. */
4782
4783 switch(ctype)
4784 {
4785 case OP_ANY:
4786 COST(min);
4787 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4788 {
4789 if (eptr >= md->end_subject)
4790 {
4791 SCHECK_PARTIAL();
4792 RRETURN(MATCH_NOMATCH);
4793 }
4794 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4795 if (md->partial != 0 &&
4796 eptr + 1 >= md->end_subject &&
4797 NLBLOCK->nltype == NLTYPE_FIXED &&
4798 NLBLOCK->nllen == 2 &&
4799 *eptr == NLBLOCK->nl[0])
4800 {
4801 md->hitend = TRUE;
4802 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4803 }
4804 eptr++;
4805 }
4806 break;
4807
4808 case OP_ALLANY:
4809 if (eptr > md->end_subject - min)
4810 {
4811 SCHECK_PARTIAL();
4812 RRETURN(MATCH_NOMATCH);
4813 }
4814 eptr += min;
4815 break;
4816
4817 case OP_ANYBYTE:
4818 if (eptr > md->end_subject - min)
4819 {
4820 SCHECK_PARTIAL();
4821 RRETURN(MATCH_NOMATCH);
4822 }
4823 eptr += min;
4824 break;
4825
4826 case OP_ANYNL:
4827 COST(min);
4828 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4829 {
4830 if (eptr >= md->end_subject)
4831 {
4832 SCHECK_PARTIAL();
4833 RRETURN(MATCH_NOMATCH);
4834 }
4835 switch(*eptr++)
4836 {
4837 default: RRETURN(MATCH_NOMATCH);
4838
4839 case CHAR_CR:
4840 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4841 break;
4842
4843 case CHAR_LF:
4844 break;
4845
4846 case CHAR_VT:
4847 case CHAR_FF:
4848 case CHAR_NEL:
4849 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4850 case 0x2028:
4851 case 0x2029:
4852 #endif
4853 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4854 break;
4855 }
4856 }
4857 break;
4858
4859 case OP_NOT_HSPACE:
4860 COST(min);
4861 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4862 {
4863 if (eptr >= md->end_subject)
4864 {
4865 SCHECK_PARTIAL();
4866 RRETURN(MATCH_NOMATCH);
4867 }
4868 switch(*eptr++)
4869 {
4870 default: break;
4871 HSPACE_BYTE_CASES:
4872 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4873 HSPACE_MULTIBYTE_CASES:
4874 #endif
4875 RRETURN(MATCH_NOMATCH);
4876 }
4877 }
4878 break;
4879
4880 case OP_HSPACE:
4881 COST(min);
4882 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4883 {
4884 if (eptr >= md->end_subject)
4885 {
4886 SCHECK_PARTIAL();
4887 RRETURN(MATCH_NOMATCH);
4888 }
4889 switch(*eptr++)
4890 {
4891 default: RRETURN(MATCH_NOMATCH);
4892 HSPACE_BYTE_CASES:
4893 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4894 HSPACE_MULTIBYTE_CASES:
4895 #endif
4896 break;
4897 }
4898 }
4899 break;
4900
4901 case OP_NOT_VSPACE:
4902 COST(min);
4903 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4904 {
4905 if (eptr >= md->end_subject)
4906 {
4907 SCHECK_PARTIAL();
4908 RRETURN(MATCH_NOMATCH);
4909 }
4910 switch(*eptr++)
4911 {
4912 VSPACE_BYTE_CASES:
4913 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4914 VSPACE_MULTIBYTE_CASES:
4915 #endif
4916 RRETURN(MATCH_NOMATCH);
4917 default: break;
4918 }
4919 }
4920 break;
4921
4922 case OP_VSPACE:
4923 COST(min);
4924 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4925 {
4926 if (eptr >= md->end_subject)
4927 {
4928 SCHECK_PARTIAL();
4929 RRETURN(MATCH_NOMATCH);
4930 }
4931 switch(*eptr++)
4932 {
4933 default: RRETURN(MATCH_NOMATCH);
4934 VSPACE_BYTE_CASES:
4935 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4936 VSPACE_MULTIBYTE_CASES:
4937 #endif
4938 break;
4939 }
4940 }
4941 break;
4942
4943 case OP_NOT_DIGIT:
4944 COST(min);
4945 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4946 {
4947 if (eptr >= md->end_subject)
4948 {
4949 SCHECK_PARTIAL();
4950 RRETURN(MATCH_NOMATCH);
4951 }
4952 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4953 RRETURN(MATCH_NOMATCH);
4954 eptr++;
4955 }
4956 break;
4957
4958 case OP_DIGIT:
4959 COST(min);
4960 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4961 {
4962 if (eptr >= md->end_subject)
4963 {
4964 SCHECK_PARTIAL();
4965 RRETURN(MATCH_NOMATCH);
4966 }
4967 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4968 RRETURN(MATCH_NOMATCH);
4969 eptr++;
4970 }
4971 break;
4972
4973 case OP_NOT_WHITESPACE:
4974 COST(min);
4975 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4976 {
4977 if (eptr >= md->end_subject)
4978 {
4979 SCHECK_PARTIAL();
4980 RRETURN(MATCH_NOMATCH);
4981 }
4982 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4983 RRETURN(MATCH_NOMATCH);
4984 eptr++;
4985 }
4986 break;
4987
4988 case OP_WHITESPACE:
4989 COST(min);
4990 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
4991 {
4992 if (eptr >= md->end_subject)
4993 {
4994 SCHECK_PARTIAL();
4995 RRETURN(MATCH_NOMATCH);
4996 }
4997 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4998 RRETURN(MATCH_NOMATCH);
4999 eptr++;
5000 }
5001 break;
5002
5003 case OP_NOT_WORDCHAR:
5004 COST(min);
5005 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
5006 {
5007 if (eptr >= md->end_subject)
5008 {
5009 SCHECK_PARTIAL();
5010 RRETURN(MATCH_NOMATCH);
5011 }
5012 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
5013 RRETURN(MATCH_NOMATCH);
5014 eptr++;
5015 }
5016 break;
5017
5018 case OP_WORDCHAR:
5019 COST(min);
5020 for (i = 1; i <= min; i++) /* LOOP_COUNT: COST */
5021 {
5022 if (eptr >= md->end_subject)
5023 {
5024 SCHECK_PARTIAL();
5025 RRETURN(MATCH_NOMATCH);
5026 }
5027 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
5028 RRETURN(MATCH_NOMATCH);
5029 eptr++;
5030 }
5031 break;
5032
5033 default:
5034 RRETURN(PCRE_ERROR_INTERNAL);
5035 }
5036 }
5037
5038 /* If min = max, continue at the same level without recursing */
5039
5040 if (min == max) continue;
5041
5042 /* If minimizing, we have to test the rest of the pattern before each
5043 subsequent match. Again, separate the UTF-8 case for speed, and also
5044 separate the UCP cases. */
5045
5046 if (minimize)
5047 {
5048 #ifdef SUPPORT_UCP
5049 if (prop_type >= 0)
5050 {
5051 switch(prop_type)
5052 {
5053 case PT_ANY:
5054 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
5055 {
5056 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
5057 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5058 if (fi >= max) RRETURN(MATCH_NOMATCH);
5059 if (eptr >= md->end_subject)
5060 {
5061 SCHECK_PARTIAL();
5062 RRETURN(MATCH_NOMATCH);
5063 }
5064 GETCHARINCTEST(c, eptr);
5065 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
5066 }
5067 /* Control never gets here */
5068
5069 case PT_LAMP:
5070 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
5071 {
5072 int chartype;
5073 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
5074 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5075 if (fi >= max) RRETURN(MATCH_NOMATCH);
5076 if (eptr >= md->end_subject)
5077 {
5078 SCHECK_PARTIAL();
5079 RRETURN(MATCH_NOMATCH);
5080 }
5081 GETCHARINCTEST(c, eptr);
5082 chartype = UCD_CHARTYPE(c);
5083 if ((chartype == ucp_Lu ||
5084 chartype == ucp_Ll ||
5085 chartype == ucp_Lt) == prop_fail_result)
5086 RRETURN(MATCH_NOMATCH);
5087 }
5088 /* Control never gets here */
5089
5090 case PT_GC:
5091 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
5092 {
5093 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
5094 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5095 if (fi >= max) RRETURN(MATCH_NOMATCH);
5096 if (eptr >= md->end_subject)
5097 {
5098 SCHECK_PARTIAL();
5099 RRETURN(MATCH_NOMATCH);
5100 }
5101 GETCHARINCTEST(c, eptr);
5102 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
5103 RRETURN(MATCH_NOMATCH);
5104 }
5105 /* Control never gets here */
5106
5107 case PT_PC:
5108 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
5109 {
5110 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
5111 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5112 if (fi >= max) RRETURN(MATCH_NOMATCH);
5113 if (eptr >= md->end_subject)
5114 {
5115 SCHECK_PARTIAL();
5116 RRETURN(MATCH_NOMATCH);
5117 }
5118 GETCHARINCTEST(c, eptr);
5119 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
5120 RRETURN(MATCH_NOMATCH);
5121 }
5122 /* Control never gets here */
5123
5124 case PT_SC:
5125 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
5126 {
5127 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
5128 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5129 if (fi >= max) RRETURN(MATCH_NOMATCH);
5130 if (eptr >= md->end_subject)
5131 {
5132 SCHECK_PARTIAL();
5133 RRETURN(MATCH_NOMATCH);
5134 }
5135 GETCHARINCTEST(c, eptr);
5136 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
5137 RRETURN(MATCH_NOMATCH);
5138 }
5139 /* Control never gets here */
5140
5141 case PT_ALNUM:
5142 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
5143 {
5144 int category;
5145 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
5146 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5147 if (fi >= max) RRETURN(MATCH_NOMATCH);
5148 if (eptr >= md->end_subject)
5149 {
5150 SCHECK_PARTIAL();
5151 RRETURN(MATCH_NOMATCH);
5152 }
5153 GETCHARINCTEST(c, eptr);
5154 category = UCD_CATEGORY(c);
5155 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5156 RRETURN(MATCH_NOMATCH);
5157 }
5158 /* Control never gets here */
5159
5160 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5161 which means that Perl space and POSIX space are now identical. PCRE
5162 was changed at release 8.34. */
5163
5164 case PT_SPACE: /* Perl space */
5165 case PT_PXSPACE: /* POSIX space */
5166 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
5167 {
5168 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
5169 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5170 if (fi >= max) RRETURN(MATCH_NOMATCH);
5171 if (eptr >= md->end_subject)
5172 {
5173 SCHECK_PARTIAL();
5174 RRETURN(MATCH_NOMATCH);
5175 }
5176 GETCHARINCTEST(c, eptr);
5177 switch(c)
5178 {
5179 HSPACE_CASES:
5180 VSPACE_CASES:
5181 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
5182 break;
5183
5184 default:
5185 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5186 RRETURN(MATCH_NOMATCH);
5187 break;
5188 }
5189 }
5190 /* Control never gets here */
5191
5192 case PT_WORD:
5193 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
5194 {
5195 int category;
5196 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5197 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5198 if (fi >= max) RRETURN(MATCH_NOMATCH);
5199 if (eptr >= md->end_subject)
5200 {
5201 SCHECK_PARTIAL();
5202 RRETURN(MATCH_NOMATCH);
5203 }
5204 GETCHARINCTEST(c, eptr);
5205 category = UCD_CATEGORY(c);
5206 if ((category == ucp_L ||
5207 category == ucp_N ||
5208 c == CHAR_UNDERSCORE)
5209 == prop_fail_result)
5210 RRETURN(MATCH_NOMATCH);
5211 }
5212 /* Control never gets here */
5213
5214 case PT_CLIST:
5215 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
5216 {
5217 const pcre_uint32 *cp;
5218 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5219 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5220 if (fi >= max) RRETURN(MATCH_NOMATCH);
5221 if (eptr >= md->end_subject)
5222 {
5223 SCHECK_PARTIAL();
5224 RRETURN(MATCH_NOMATCH);
5225 }
5226 GETCHARINCTEST(c, eptr);
5227 cp = PRIV(ucd_caseless_sets) + prop_value;
5228 for (;;) /* LOOP_COUNT: COST */
5229 {
5230 if (c < *cp)
5231 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5232 if (c == *cp++)
5233 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5234 COST(1);
5235 }
5236 }
5237 /* Control never gets here */
5238
5239 case PT_UCNC:
5240 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
5241 {
5242 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
5243 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5244 if (fi >= max) RRETURN(MATCH_NOMATCH);
5245 if (eptr >= md->end_subject)
5246 {
5247 SCHECK_PARTIAL();
5248 RRETURN(MATCH_NOMATCH);
5249 }
5250 GETCHARINCTEST(c, eptr);
5251 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5252 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5253 c >= 0xe000) == prop_fail_result)
5254 RRETURN(MATCH_NOMATCH);
5255 }
5256 /* Control never gets here */
5257
5258 /* This should never occur */
5259 default:
5260 RRETURN(PCRE_ERROR_INTERNAL);
5261 }
5262 }
5263
5264 /* Match extended Unicode sequences. We will get here only if the
5265 support is in the binary; otherwise a compile-time error occurs. */
5266
5267 else if (ctype == OP_EXTUNI)
5268 {
5269 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
5270 {
5271 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5272 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5273 if (fi >= max) RRETURN(MATCH_NOMATCH);
5274 if (eptr >= md->end_subject)
5275 {
5276 SCHECK_PARTIAL();
5277 RRETURN(MATCH_NOMATCH);
5278 }
5279 else
5280 {
5281 #ifndef ERLANG_INTEGRATION
5282 int lgb, rgb;
5283 #endif
5284 GETCHARINCTEST(c, eptr);
5285 lgb = UCD_GRAPHBREAK(c);
5286 while (eptr < md->end_subject) /* LOOP_COUNT: CHK */
5287 {
5288 int len = 1;
5289 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5290 rgb = UCD_GRAPHBREAK(c);
5291 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5292 lgb = rgb;
5293 eptr += len;
5294 COST_CHK(1);
5295 }
5296 }
5297 CHECK_PARTIAL();
5298 }
5299 }
5300 else
5301 #endif /* SUPPORT_UCP */
5302
5303 #ifdef SUPPORT_UTF
5304 if (utf)
5305 {
5306 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
5307 {
5308 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5309 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5310 if (fi >= max) RRETURN(MATCH_NOMATCH);
5311 if (eptr >= md->end_subject)
5312 {
5313 SCHECK_PARTIAL();
5314 RRETURN(MATCH_NOMATCH);
5315 }
5316 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5317 RRETURN(MATCH_NOMATCH);
5318 GETCHARINC(c, eptr);
5319 switch(ctype)
5320 {
5321 case OP_ANY: /* This is the non-NL case */
5322 if (md->partial != 0 && /* Take care with CRLF partial */
5323 eptr >= md->end_subject &&
5324 NLBLOCK->nltype == NLTYPE_FIXED &&
5325 NLBLOCK->nllen == 2 &&
5326 c == NLBLOCK->nl[0])
5327 {
5328 md->hitend = TRUE;
5329 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5330 }
5331 break;
5332
5333 case OP_ALLANY:
5334 case OP_ANYBYTE:
5335 break;
5336
5337 case OP_ANYNL:
5338 switch(c)
5339 {
5340 default: RRETURN(MATCH_NOMATCH);
5341 case CHAR_CR:
5342 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
5343 break;
5344
5345 case CHAR_LF:
5346 break;
5347
5348 case CHAR_VT:
5349 case CHAR_FF:
5350 case CHAR_NEL:
5351 #ifndef EBCDIC
5352 case 0x2028:
5353 case 0x2029:
5354 #endif /* Not EBCDIC */
5355 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5356 break;
5357 }
5358 break;
5359
5360 case OP_NOT_HSPACE:
5361 switch(c)
5362 {
5363 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5364 default: break;
5365 }
5366 break;
5367
5368 case OP_HSPACE:
5369 switch(c)
5370 {
5371 HSPACE_CASES: break;
5372 default: RRETURN(MATCH_NOMATCH);
5373 }
5374 break;
5375
5376 case OP_NOT_VSPACE:
5377 switch(c)
5378 {
5379 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5380 default: break;
5381 }
5382 break;
5383
5384 case OP_VSPACE:
5385 switch(c)
5386 {
5387 VSPACE_CASES: break;
5388 default: RRETURN(MATCH_NOMATCH);
5389 }
5390 break;
5391
5392 case OP_NOT_DIGIT:
5393 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5394 RRETURN(MATCH_NOMATCH);
5395 break;
5396
5397 case OP_DIGIT:
5398 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5399 RRETURN(MATCH_NOMATCH);
5400 break;
5401
5402 case OP_NOT_WHITESPACE:
5403 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5404 RRETURN(MATCH_NOMATCH);
5405 break;
5406
5407 case OP_WHITESPACE:
5408 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5409 RRETURN(MATCH_NOMATCH);
5410 break;
5411
5412 case OP_NOT_WORDCHAR:
5413 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5414 RRETURN(MATCH_NOMATCH);
5415 break;
5416
5417 case OP_WORDCHAR:
5418 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5419 RRETURN(MATCH_NOMATCH);
5420 break;
5421
5422 default:
5423 RRETURN(PCRE_ERROR_INTERNAL);
5424 }
5425 }
5426 }
5427 else
5428 #endif
5429 /* Not UTF mode */
5430 {
5431 for (fi = min;; fi++) /* LOOP_COUNT: Ok */
5432 {
5433 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5434 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5435 if (fi >= max) RRETURN(MATCH_NOMATCH);
5436 if (eptr >= md->end_subject)
5437 {
5438 SCHECK_PARTIAL();
5439 RRETURN(MATCH_NOMATCH);
5440 }
5441 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5442 RRETURN(MATCH_NOMATCH);
5443 c = *eptr++;
5444 switch(ctype)
5445 {
5446 case OP_ANY: /* This is the non-NL case */
5447 if (md->partial != 0 && /* Take care with CRLF partial */
5448 eptr >= md->end_subject &&
5449 NLBLOCK->nltype == NLTYPE_FIXED &&
5450 NLBLOCK->nllen == 2 &&
5451 c == NLBLOCK->nl[0])
5452 {
5453 md->hitend = TRUE;
5454 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5455 }
5456 break;
5457
5458 case OP_ALLANY:
5459 case OP_ANYBYTE:
5460 break;
5461
5462 case OP_ANYNL:
5463 switch(c)
5464 {
5465 default: RRETURN(MATCH_NOMATCH);
5466 case CHAR_CR:
5467 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5468 break;
5469
5470 case CHAR_LF:
5471 break;
5472
5473 case CHAR_VT:
5474 case CHAR_FF:
5475 case CHAR_NEL:
5476 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5477 case 0x2028:
5478 case 0x2029:
5479 #endif
5480 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5481 break;
5482 }
5483 break;
5484
5485 case OP_NOT_HSPACE:
5486 switch(c)
5487 {
5488 default: break;
5489 HSPACE_BYTE_CASES:
5490 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5491 HSPACE_MULTIBYTE_CASES:
5492 #endif
5493 RRETURN(MATCH_NOMATCH);
5494 }
5495 break;
5496
5497 case OP_HSPACE:
5498 switch(c)
5499 {
5500 default: RRETURN(MATCH_NOMATCH);
5501 HSPACE_BYTE_CASES:
5502 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5503 HSPACE_MULTIBYTE_CASES:
5504 #endif
5505 break;
5506 }
5507 break;
5508
5509 case OP_NOT_VSPACE:
5510 switch(c)
5511 {
5512 default: break;
5513 VSPACE_BYTE_CASES:
5514 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5515 VSPACE_MULTIBYTE_CASES:
5516 #endif
5517 RRETURN(MATCH_NOMATCH);
5518 }
5519 break;
5520
5521 case OP_VSPACE:
5522 switch(c)
5523 {
5524 default: RRETURN(MATCH_NOMATCH);
5525 VSPACE_BYTE_CASES:
5526 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5527 VSPACE_MULTIBYTE_CASES:
5528 #endif
5529 break;
5530 }
5531 break;
5532
5533 case OP_NOT_DIGIT:
5534 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5535 break;
5536
5537 case OP_DIGIT:
5538 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5539 break;
5540
5541 case OP_NOT_WHITESPACE:
5542 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5543 break;
5544
5545 case OP_WHITESPACE:
5546 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5547 break;
5548
5549 case OP_NOT_WORDCHAR:
5550 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5551 break;
5552
5553 case OP_WORDCHAR:
5554 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5555 break;
5556
5557 default:
5558 RRETURN(PCRE_ERROR_INTERNAL);
5559 }
5560 }
5561 }
5562 /* Control never gets here */
5563 }
5564
5565 /* If maximizing, it is worth using inline code for speed, doing the type
5566 test once at the start (i.e. keep it out of the loop). Again, keep the
5567 UTF-8 and UCP stuff separate. */
5568
5569 else
5570 {
5571 pp = eptr; /* Remember where we started */
5572
5573 #ifdef SUPPORT_UCP
5574 if (prop_type >= 0)
5575 {
5576 switch(prop_type)
5577 {
5578 case PT_ANY:
5579 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5580 {
5581 int len = 1;
5582 if (eptr >= md->end_subject)
5583 {
5584 SCHECK_PARTIAL();
5585 break;
5586 }
5587 GETCHARLENTEST(c, eptr, len);
5588 if (prop_fail_result) break;
5589 eptr+= len;
5590 COST_CHK(1);
5591 }
5592 break;
5593
5594 case PT_LAMP:
5595 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5596 {
5597 int chartype;
5598 int len = 1;
5599 if (eptr >= md->end_subject)
5600 {
5601 SCHECK_PARTIAL();
5602 break;
5603 }
5604 GETCHARLENTEST(c, eptr, len);
5605 chartype = UCD_CHARTYPE(c);
5606 if ((chartype == ucp_Lu ||
5607 chartype == ucp_Ll ||
5608 chartype == ucp_Lt) == prop_fail_result)
5609 break;
5610 eptr+= len;
5611 COST_CHK(1);
5612 }
5613 break;
5614
5615 case PT_GC:
5616 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5617 {
5618 int len = 1;
5619 if (eptr >= md->end_subject)
5620 {
5621 SCHECK_PARTIAL();
5622 break;
5623 }
5624 GETCHARLENTEST(c, eptr, len);
5625 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5626 eptr+= len;
5627 COST_CHK(1);
5628 }
5629 break;
5630
5631 case PT_PC:
5632 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5633 {
5634 int len = 1;
5635 if (eptr >= md->end_subject)
5636 {
5637 SCHECK_PARTIAL();
5638 break;
5639 }
5640 GETCHARLENTEST(c, eptr, len);
5641 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5642 eptr+= len;
5643 COST_CHK(1);
5644 }
5645 break;
5646
5647 case PT_SC:
5648 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5649 {
5650 int len = 1;
5651 if (eptr >= md->end_subject)
5652 {
5653 SCHECK_PARTIAL();
5654 break;
5655 }
5656 GETCHARLENTEST(c, eptr, len);
5657 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5658 eptr+= len;
5659 COST_CHK(1);
5660 }
5661 break;
5662
5663 case PT_ALNUM:
5664 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5665 {
5666 int category;
5667 int len = 1;
5668 if (eptr >= md->end_subject)
5669 {
5670 SCHECK_PARTIAL();
5671 break;
5672 }
5673 GETCHARLENTEST(c, eptr, len);
5674 category = UCD_CATEGORY(c);
5675 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5676 break;
5677 eptr+= len;
5678 COST_CHK(1);
5679 }
5680 break;
5681
5682 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5683 which means that Perl space and POSIX space are now identical. PCRE
5684 was changed at release 8.34. */
5685
5686 case PT_SPACE: /* Perl space */
5687 case PT_PXSPACE: /* POSIX space */
5688 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5689 {
5690 int len = 1;
5691 if (eptr >= md->end_subject)
5692 {
5693 SCHECK_PARTIAL();
5694 break;
5695 }
5696 GETCHARLENTEST(c, eptr, len);
5697 switch(c)
5698 {
5699 HSPACE_CASES:
5700 VSPACE_CASES:
5701 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5702 break;
5703
5704 default:
5705 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5706 goto ENDLOOP99; /* Break the loop */
5707 break;
5708 }
5709 eptr+= len;
5710 COST_CHK(1);
5711 }
5712 ENDLOOP99:
5713 break;
5714
5715 case PT_WORD:
5716 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5717 {
5718 int category;
5719 int len = 1;
5720 if (eptr >= md->end_subject)
5721 {
5722 SCHECK_PARTIAL();
5723 break;
5724 }
5725 GETCHARLENTEST(c, eptr, len);
5726 category = UCD_CATEGORY(c);
5727 if ((category == ucp_L || category == ucp_N ||
5728 c == CHAR_UNDERSCORE) == prop_fail_result)
5729 break;
5730 eptr+= len;
5731 COST_CHK(1);
5732 }
5733 break;
5734
5735 case PT_CLIST:
5736 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5737 {
5738 const pcre_uint32 *cp;
5739 int len = 1;
5740 if (eptr >= md->end_subject)
5741 {
5742 SCHECK_PARTIAL();
5743 break;
5744 }
5745 GETCHARLENTEST(c, eptr, len);
5746 cp = PRIV(ucd_caseless_sets) + prop_value;
5747 for (;;) /* LOOP_COUNT: COST */
5748 {
5749 if (c < *cp)
5750 { if (prop_fail_result) break; else goto GOT_MAX; }
5751 if (c == *cp++)
5752 { if (prop_fail_result) goto GOT_MAX; else break; }
5753 COST(1);
5754 }
5755 eptr += len;
5756 COST_CHK(1);
5757 }
5758 GOT_MAX:
5759 break;
5760
5761 case PT_UCNC:
5762 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5763 {
5764 int len = 1;
5765 if (eptr >= md->end_subject)
5766 {
5767 SCHECK_PARTIAL();
5768 break;
5769 }
5770 GETCHARLENTEST(c, eptr, len);
5771 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5772 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5773 c >= 0xe000) == prop_fail_result)
5774 break;
5775 eptr += len;
5776 COST_CHK(1);
5777 }
5778 break;
5779
5780 default:
5781 RRETURN(PCRE_ERROR_INTERNAL);
5782 }
5783
5784 /* eptr is now past the end of the maximum run */
5785
5786 if (possessive) continue; /* No backtracking */
5787 for(;;) /* LOOP_COUNT: Ok */
5788 {
5789 if (eptr <= pp) goto TAIL_RECURSE;
5790 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5792 eptr--;
5793 if (utf) BACKCHAR(eptr);
5794 }
5795 }
5796
5797 /* Match extended Unicode grapheme clusters. We will get here only if the
5798 support is in the binary; otherwise a compile-time error occurs. */
5799
5800 else if (ctype == OP_EXTUNI)
5801 {
5802 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5803 {
5804 if (eptr >= md->end_subject)
5805 {
5806 SCHECK_PARTIAL();
5807 break;
5808 }
5809 else
5810 {
5811 #ifndef ERLANG_INTEGRATION
5812 int lgb, rgb;
5813 #endif
5814 GETCHARINCTEST(c, eptr);
5815 lgb = UCD_GRAPHBREAK(c);
5816 while (eptr < md->end_subject) /* LOOP_COUNT: CHK */
5817 {
5818 int len = 1;
5819 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5820 rgb = UCD_GRAPHBREAK(c);
5821 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5822 lgb = rgb;
5823 eptr += len;
5824 COST_CHK(1);
5825 }
5826 COST_CHK(1);
5827 }
5828 CHECK_PARTIAL();
5829 }
5830
5831 /* eptr is now past the end of the maximum run */
5832
5833 if (possessive) continue; /* No backtracking */
5834
5835 /* We use <= pp rather than == pp to detect the start of the run while
5836 backtracking because the use of \C in UTF mode can cause BACKCHAR to
5837 move back past pp. This is just palliative; the use of \C in UTF mode
5838 is fraught with danger. */
5839
5840 for(;;) /* LOOP_COUNT: Ok */
5841 {
5842 #ifndef ERLANG_INTEGRATION
5843 int lgb, rgb;
5844 #endif
5845 PCRE_PUCHAR fptr;
5846
5847 if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
5848 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5849 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5850
5851 /* Backtracking over an extended grapheme cluster involves inspecting
5852 the previous two characters (if present) to see if a break is
5853 permitted between them. */
5854
5855 eptr--;
5856 if (!utf) c = *eptr; else
5857 {
5858 BACKCHAR(eptr);
5859 GETCHAR(c, eptr);
5860 }
5861 rgb = UCD_GRAPHBREAK(c);
5862
5863 for (;;) /* LOOP_COUNT: COST */
5864 {
5865 if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
5866 fptr = eptr - 1;
5867 if (!utf) c = *fptr; else
5868 {
5869 BACKCHAR(fptr);
5870 GETCHAR(c, fptr);
5871 }
5872 lgb = UCD_GRAPHBREAK(c);
5873 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5874 eptr = fptr;
5875 rgb = lgb;
5876 COST(1);
5877 }
5878 }
5879 }
5880
5881 else
5882 #endif /* SUPPORT_UCP */
5883
5884 #ifdef SUPPORT_UTF
5885 if (utf)
5886 {
5887 switch(ctype)
5888 {
5889 case OP_ANY:
5890 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5891 {
5892 if (eptr >= md->end_subject)
5893 {
5894 SCHECK_PARTIAL();
5895 break;
5896 }
5897 if (IS_NEWLINE(eptr)) break;
5898 if (md->partial != 0 && /* Take care with CRLF partial */
5899 eptr + 1 >= md->end_subject &&
5900 NLBLOCK->nltype == NLTYPE_FIXED &&
5901 NLBLOCK->nllen == 2 &&
5902 UCHAR21(eptr) == NLBLOCK->nl[0])
5903 {
5904 md->hitend = TRUE;
5905 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5906 }
5907 eptr++;
5908 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5909 COST_CHK(1);
5910 }
5911 break;
5912
5913 case OP_ALLANY:
5914 if (max < INT_MAX)
5915 {
5916 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5917 {
5918 if (eptr >= md->end_subject)
5919 {
5920 SCHECK_PARTIAL();
5921 break;
5922 }
5923 eptr++;
5924 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5925 COST_CHK(1);
5926 }
5927 }
5928 else
5929 {
5930 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5931 SCHECK_PARTIAL();
5932 }
5933 break;
5934
5935 /* The byte case is the same as non-UTF8 */
5936
5937 case OP_ANYBYTE:
5938 c = max - min;
5939 if (c > (unsigned int)(md->end_subject - eptr))
5940 {
5941 eptr = md->end_subject;
5942 SCHECK_PARTIAL();
5943 }
5944 else eptr += c;
5945 break;
5946
5947 case OP_ANYNL:
5948 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5949 {
5950 int len = 1;
5951 if (eptr >= md->end_subject)
5952 {
5953 SCHECK_PARTIAL();
5954 break;
5955 }
5956 GETCHARLEN(c, eptr, len);
5957 if (c == CHAR_CR)
5958 {
5959 if (++eptr >= md->end_subject) break;
5960 if (UCHAR21(eptr) == CHAR_LF) eptr++;
5961 }
5962 else
5963 {
5964 if (c != CHAR_LF &&
5965 (md->bsr_anycrlf ||
5966 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5967 #ifndef EBCDIC
5968 && c != 0x2028 && c != 0x2029
5969 #endif /* Not EBCDIC */
5970 )))
5971 break;
5972 eptr += len;
5973 }
5974 COST_CHK(1);
5975 }
5976 break;
5977
5978 case OP_NOT_HSPACE:
5979 case OP_HSPACE:
5980 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
5981 {
5982 BOOL gotspace;
5983 int len = 1;
5984 if (eptr >= md->end_subject)
5985 {
5986 SCHECK_PARTIAL();
5987 break;
5988 }
5989 GETCHARLEN(c, eptr, len);
5990 switch(c)
5991 {
5992 HSPACE_CASES: gotspace = TRUE; break;
5993 default: gotspace = FALSE; break;
5994 }
5995 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5996 eptr += len;
5997 COST_CHK(1);
5998 }
5999 break;
6000
6001 case OP_NOT_VSPACE:
6002 case OP_VSPACE:
6003 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6004 {
6005 BOOL gotspace;
6006 int len = 1;
6007 if (eptr >= md->end_subject)
6008 {
6009 SCHECK_PARTIAL();
6010 break;
6011 }
6012 GETCHARLEN(c, eptr, len);
6013 switch(c)
6014 {
6015 VSPACE_CASES: gotspace = TRUE; break;
6016 default: gotspace = FALSE; break;
6017 }
6018 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
6019 eptr += len;
6020 COST_CHK(1);
6021 }
6022 break;
6023
6024 case OP_NOT_DIGIT:
6025 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6026 {
6027 int len = 1;
6028 if (eptr >= md->end_subject)
6029 {
6030 SCHECK_PARTIAL();
6031 break;
6032 }
6033 GETCHARLEN(c, eptr, len);
6034 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
6035 eptr+= len;
6036 COST_CHK(1);
6037 }
6038 break;
6039
6040 case OP_DIGIT:
6041 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6042 {
6043 int len = 1;
6044 if (eptr >= md->end_subject)
6045 {
6046 SCHECK_PARTIAL();
6047 break;
6048 }
6049 GETCHARLEN(c, eptr, len);
6050 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
6051 eptr+= len;
6052 COST_CHK(1);
6053 }
6054 break;
6055
6056 case OP_NOT_WHITESPACE:
6057 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6058 {
6059 int len = 1;
6060 if (eptr >= md->end_subject)
6061 {
6062 SCHECK_PARTIAL();
6063 break;
6064 }
6065 GETCHARLEN(c, eptr, len);
6066 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
6067 eptr+= len;
6068 COST_CHK(1);
6069 }
6070 break;
6071
6072 case OP_WHITESPACE:
6073 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6074 {
6075 int len = 1;
6076 if (eptr >= md->end_subject)
6077 {
6078 SCHECK_PARTIAL();
6079 break;
6080 }
6081 GETCHARLEN(c, eptr, len);
6082 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
6083 eptr+= len;
6084 COST_CHK(1);
6085 }
6086 break;
6087
6088 case OP_NOT_WORDCHAR:
6089 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6090 {
6091 int len = 1;
6092 if (eptr >= md->end_subject)
6093 {
6094 SCHECK_PARTIAL();
6095 break;
6096 }
6097 GETCHARLEN(c, eptr, len);
6098 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
6099 eptr+= len;
6100 COST_CHK(1);
6101 }
6102 break;
6103
6104 case OP_WORDCHAR:
6105 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6106 {
6107 int len = 1;
6108 if (eptr >= md->end_subject)
6109 {
6110 SCHECK_PARTIAL();
6111 break;
6112 }
6113 GETCHARLEN(c, eptr, len);
6114 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
6115 eptr+= len;
6116 COST_CHK(1);
6117 }
6118 break;
6119
6120 default:
6121 RRETURN(PCRE_ERROR_INTERNAL);
6122 }
6123
6124 if (possessive) continue; /* No backtracking */
6125 for(;;) /* LOOP_COUNT: Ok */
6126 {
6127 if (eptr <= pp) goto TAIL_RECURSE;
6128 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
6129 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6130 eptr--;
6131 BACKCHAR(eptr);
6132 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
6133 UCHAR21(eptr - 1) == CHAR_CR) eptr--;
6134 }
6135 }
6136 else
6137 #endif /* SUPPORT_UTF */
6138 /* Not UTF mode */
6139 {
6140 switch(ctype)
6141 {
6142 case OP_ANY:
6143 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6144 {
6145 if (eptr >= md->end_subject)
6146 {
6147 SCHECK_PARTIAL();
6148 break;
6149 }
6150 if (IS_NEWLINE(eptr)) break;
6151 if (md->partial != 0 && /* Take care with CRLF partial */
6152 eptr + 1 >= md->end_subject &&
6153 NLBLOCK->nltype == NLTYPE_FIXED &&
6154 NLBLOCK->nllen == 2 &&
6155 *eptr == NLBLOCK->nl[0])
6156 {
6157 md->hitend = TRUE;
6158 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
6159 }
6160 eptr++;
6161 COST_CHK(1);
6162 }
6163 break;
6164
6165 case OP_ALLANY:
6166 case OP_ANYBYTE:
6167 c = max - min;
6168 if (c > (unsigned int)(md->end_subject - eptr))
6169 {
6170 eptr = md->end_subject;
6171 SCHECK_PARTIAL();
6172 }
6173 else eptr += c;
6174 break;
6175
6176 case OP_ANYNL:
6177 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6178 {
6179 if (eptr >= md->end_subject)
6180 {
6181 SCHECK_PARTIAL();
6182 break;
6183 }
6184 c = *eptr;
6185 if (c == CHAR_CR)
6186 {
6187 if (++eptr >= md->end_subject) break;
6188 if (*eptr == CHAR_LF) eptr++;
6189 }
6190 else
6191 {
6192 if (c != CHAR_LF && (md->bsr_anycrlf ||
6193 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
6194 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6195 && c != 0x2028 && c != 0x2029
6196 #endif
6197 ))) break;
6198 eptr++;
6199 }
6200 COST_CHK(1);
6201 }
6202 break;
6203
6204 case OP_NOT_HSPACE:
6205 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6206 {
6207 if (eptr >= md->end_subject)
6208 {
6209 SCHECK_PARTIAL();
6210 break;
6211 }
6212 switch(*eptr)
6213 {
6214 default: eptr++; break;
6215 HSPACE_BYTE_CASES:
6216 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6217 HSPACE_MULTIBYTE_CASES:
6218 #endif
6219 goto ENDLOOP00;
6220 }
6221 COST_CHK(1);
6222 }
6223 ENDLOOP00:
6224 break;
6225
6226 case OP_HSPACE:
6227 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6228 {
6229 if (eptr >= md->end_subject)
6230 {
6231 SCHECK_PARTIAL();
6232 break;
6233 }
6234 switch(*eptr)
6235 {
6236 default: goto ENDLOOP01;
6237 HSPACE_BYTE_CASES:
6238 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6239 HSPACE_MULTIBYTE_CASES:
6240 #endif
6241 eptr++; break;
6242 }
6243 COST_CHK(1);
6244 }
6245 ENDLOOP01:
6246 break;
6247
6248 case OP_NOT_VSPACE:
6249 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6250 {
6251 if (eptr >= md->end_subject)
6252 {
6253 SCHECK_PARTIAL();
6254 break;
6255 }
6256 switch(*eptr)
6257 {
6258 default: eptr++; break;
6259 VSPACE_BYTE_CASES:
6260 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6261 VSPACE_MULTIBYTE_CASES:
6262 #endif
6263 goto ENDLOOP02;
6264 }
6265 COST_CHK(1);
6266 }
6267 ENDLOOP02:
6268 break;
6269
6270 case OP_VSPACE:
6271 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6272 {
6273 if (eptr >= md->end_subject)
6274 {
6275 SCHECK_PARTIAL();
6276 break;
6277 }
6278 switch(*eptr)
6279 {
6280 default: goto ENDLOOP03;
6281 VSPACE_BYTE_CASES:
6282 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6283 VSPACE_MULTIBYTE_CASES:
6284 #endif
6285 eptr++; break;
6286 }
6287 COST_CHK(1);
6288 }
6289 ENDLOOP03:
6290 break;
6291
6292 case OP_NOT_DIGIT:
6293 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6294 {
6295 if (eptr >= md->end_subject)
6296 {
6297 SCHECK_PARTIAL();
6298 break;
6299 }
6300 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6301 eptr++;
6302 COST_CHK(1);
6303 }
6304 break;
6305
6306 case OP_DIGIT:
6307 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6308 {
6309 if (eptr >= md->end_subject)
6310 {
6311 SCHECK_PARTIAL();
6312 break;
6313 }
6314 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6315 eptr++;
6316 COST_CHK(1);
6317 }
6318 break;
6319
6320 case OP_NOT_WHITESPACE:
6321 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6322 {
6323 if (eptr >= md->end_subject)
6324 {
6325 SCHECK_PARTIAL();
6326 break;
6327 }
6328 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6329 eptr++;
6330 COST_CHK(1);
6331 }
6332 break;
6333
6334 case OP_WHITESPACE:
6335 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6336 {
6337 if (eptr >= md->end_subject)
6338 {
6339 SCHECK_PARTIAL();
6340 break;
6341 }
6342 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6343 eptr++;
6344 COST_CHK(1);
6345 }
6346 break;
6347
6348 case OP_NOT_WORDCHAR:
6349 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6350 {
6351 if (eptr >= md->end_subject)
6352 {
6353 SCHECK_PARTIAL();
6354 break;
6355 }
6356 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6357 eptr++;
6358 COST_CHK(1);
6359 }
6360 break;
6361
6362 case OP_WORDCHAR:
6363 for (i = min; i < max; i++) /* LOOP_COUNT: CHK */
6364 {
6365 if (eptr >= md->end_subject)
6366 {
6367 SCHECK_PARTIAL();
6368 break;
6369 }
6370 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6371 eptr++;
6372 COST_CHK(1);
6373 }
6374 break;
6375
6376 default:
6377 RRETURN(PCRE_ERROR_INTERNAL);
6378 }
6379
6380 if (possessive) continue; /* No backtracking */
6381 for (;;) /* LOOP_COUNT: Ok */
6382 {
6383 if (eptr == pp) goto TAIL_RECURSE;
6384 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6385 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6386 eptr--;
6387 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6388 eptr[-1] == CHAR_CR) eptr--;
6389 }
6390 }
6391
6392 /* Control never gets here */
6393 }
6394
6395 /* There's been some horrible disaster. Arrival here can only mean there is
6396 something seriously wrong in the code above or the OP_xxx definitions. */
6397
6398 default:
6399 DPRINTF(("Unknown opcode %d\n", *ecode));
6400 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6401 }
6402
6403 /* Do not stick any code in here without much thought; it is assumed
6404 that "continue" in the code above comes out to here to repeat the main
6405 loop. */
6406
6407 } /* End of main loop */
6408 /* Control never reaches here */
6409
6410
6411 /* When compiling to use the heap rather than the stack for recursive calls to
6412 match(), the RRETURN() macro jumps here. The number that is saved in
6413 frame->Xwhere indicates which label we actually want to return to. */
6414
6415 #ifdef NO_RECURSE
6416 #define LBL(val) case val: goto L_RM##val;
6417 HEAP_RETURN:
6418 switch (frame->Xwhere)
6419 {
6420 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6421 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6422 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6423 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6424 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6425 LBL(65) LBL(66)
6426 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6427 LBL(20) LBL(21)
6428 #endif
6429 #ifdef SUPPORT_UTF
6430 LBL(16) LBL(18)
6431 LBL(22) LBL(23) LBL(28) LBL(30)
6432 LBL(32) LBL(34) LBL(42) LBL(46)
6433 #ifdef SUPPORT_UCP
6434 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6435 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6436 #endif /* SUPPORT_UCP */
6437 #endif /* SUPPORT_UTF */
6438 default:
6439 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6440 return PCRE_ERROR_INTERNAL;
6441 }
6442 #undef LBL
6443 #ifdef ERLANG_INTEGRATION
6444 LOOP_COUNT_RETURN:
6445 /* Restore the register variables that LOOP_COUNT_BREAK saved in the dummy frame on top of the chain (mapping listed there) */
6446 {
6447 heapframe *newframe = frame;
6448 frame = newframe->Xprevframe;
6449 rrc = newframe->Xop;
6450 i = newframe->Xfi;
6451 c = (pcre_uint32) newframe->Xfc;
6452 utf = newframe->Xcur_is_word;
6453 minimize = newframe->Xcondition;
6454 possessive = newframe->Xprev_is_word;
6455 caseless = (BOOL) newframe->Xcodelink;
6456 condcode = newframe->Xctype;
6457 /* Note, the frame is not freed until the whole match is done,
6458 the function release_match_heapframes takes care of that */
6459 EDEBUGF(("LOOP_COUNT_RETURN: %d",frame->Xwhere));
6460 switch (frame->Xwhere)
6461 {
6462 #include "pcre_exec_loop_break_cases.inc"
6463 default:
6464 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6465 return PCRE_ERROR_INTERNAL;
6466 }
6467 }
6468
6469 LOOP_COUNT_BREAK:
6470 /* Save the local register variables in a dummy frame, so that the rule that
6471 * every frame has the same size is kept */
6472 /*
6473 * Store Local in
6474 * ------------------------------ --------------
6475 * rrc Xop
6476 * i Xfi
6477 * c Xfc (cast)
6478 * utf Xcur_is_word
6479 * minimize Xcondition
6480 * possessive Xprev_is_word
6481 * caseless Xcodelink (cast)
6482 * condcode Xctype
6483 */
6484 {
6485 heapframe *newframe = frame->Xnextframe;
6486 if (newframe == NULL)
6487 {
6488 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));
6489 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
6490 newframe->Xnextframe = NULL;
6491 frame->Xnextframe = newframe;
6492 }
6493 newframe->Xprevframe = frame;
6494 newframe->Xop = rrc;
6495 newframe->Xfi = i;
6496 newframe->Xfc = (unsigned int) c;
6497 newframe->Xcur_is_word = utf;
6498 newframe->Xcondition = minimize;
6499 newframe->Xprev_is_word = possessive;
6500 newframe->Xcodelink = (int) caseless;
6501 newframe->Xctype = condcode;
6502 md->state_save = newframe;
6503 md->loop_limit = 0;
6504 EDEBUGF(("Break loop!"));
6505 return PCRE_ERROR_LOOP_LIMIT;
6506 }
6507 #endif
6508 #endif /* NO_RECURSE */
6509 }
6510
6511
6512 /***************************************************************************
6513 ****************************************************************************
6514 RECURSION IN THE match() FUNCTION
6515
6516 Undefine all the macros that were defined above to handle this. */
6517
/* Every name inside this #ifdef was a macro while match() was being compiled in
a NO_RECURSE build (presumably each one expanded to the X-prefixed member of the
current heapframe, e.g. ctype to frame->Xctype -- confirm against the #define
list earlier in the file). The names must be released here because later code
uses some of them as plain identifiers: the exec function further down has a
parameter called length and reads extra_data->flags. Keep this list in step
with the #define list. */
6518 #ifdef NO_RECURSE
6519 #undef eptr
6520 #undef ecode
6521 #undef mstart
6522 #undef offset_top
6523 #undef eptrb
6524 #undef flags
6525
6526 #undef callpat
6527 #undef charptr
6528 #undef data
6529 #undef next
6530 #undef pp
6531 #undef prev
6532 #undef saved_eptr
6533
6534 #undef new_recursive
6535
6536 #undef cur_is_word
6537 #undef condition
6538 #undef prev_is_word
6539
6540 #undef ctype
6541 #undef length
6542 #undef max
6543 #undef min
6544 #undef number
6545 #undef offset
6546 #undef op
6547 #undef save_capture_last
6548 #undef save_offset1
6549 #undef save_offset2
6550 #undef save_offset3
6551 #undef stacksave
6552
6553 #undef newptrb
6554
6555 #endif
6556
6557 /* fc and fi are macros whether or not NO_RECURSE is defined, so they are removed unconditionally */
6558
6559 #undef fc
6560 #undef fi
6561
6562 /***************************************************************************
6563 ***************************************************************************/
6564
6565
6566 #ifdef NO_RECURSE
6567 /*************************************************
6568 * Release allocated heap frames *
6569 *************************************************/
6570
6571 /* This function releases all the allocated frames. The base frame is on the
6572 machine stack, and so must not be freed.
6573
6574 Argument: the address of the base frame
6575 Returns: nothing
6576 */
6577
6578 static void
release_match_heapframes(heapframe * frame_base)6579 release_match_heapframes (heapframe *frame_base)
6580 {
6581 heapframe *nextframe = frame_base->Xnextframe;
6582 #ifdef ERLANG_INTEGRATION
6583 frame_base->Xnextframe = NULL; /* Protect against multiple free */
6584 #endif
6585 while (nextframe != NULL)
6586 {
6587 heapframe *oldframe = nextframe;
6588 nextframe = nextframe->Xnextframe;
6589 (PUBL(stack_free))(oldframe);
6590 }
6591 }
6592 #endif
6593
6594
6595 /*************************************************
6596 * Execute a Regular Expression *
6597 *************************************************/
6598
6599 /* This function applies a compiled re to a subject string and picks out
6600 portions of the string if it matches. Two elements in the vector are set for
6601 each substring: the offsets to the start and end of the substring.
6602
6603 Arguments:
6604 argument_re points to the compiled expression
6605 extra_data points to extra data or is NULL
6606 subject points to the subject string
6607 length length of subject string (may contain binary zeros)
6608 start_offset where to start in the subject string
6609 options option bits
6610 offsets points to a vector of ints to be filled in with offsets
6611 offsetcount the number of elements in the vector
6612
6613 Returns: > 0 => success; value is the number of elements filled in
6614 = 0 => success, but offsets is not big enough
6615 -1 => failed to match
6616 < -1 => some kind of unexpected problem
6617 */
6618 #ifdef ERLANG_INTEGRATION
/* State of one exec call in an ERLANG_INTEGRATION build. When the caller sets
PCRE_EXTRA_LOOP_LIMIT, the exec function allocates one of these, publishes it
through *extra_data->restart_data, and fetches it from there again when it is
re-entered to restart. The struct stands in for the function's local variables.
Members are reached in one of two ways:

  [macro]  used in place through a macro named like the member without the
           leading X (for example, md expands to exec_context->Xmd);
  [swap]   copied into a real local by SWAPIN() and written back by SWAPOUT().
*/
6619 typedef struct {
6620 int Xarg_offset_max; /* [macro] */
6621 BOOL Xusing_temporary_offsets; /* [macro] */
6622 BOOL Xanchored; /* [macro] */
6623 BOOL Xstartline; /* [macro] */
6624 BOOL Xfirstline; /* [macro] */
6625 BOOL Xutf; /* [swap] */
6626 BOOL Xhas_first_char; /* [macro] */
6627 BOOL Xhas_req_char; /* [macro] */
6628 pcre_uchar Xfirst_char; /* [swap] */
6629 pcre_uchar Xfirst_char2; /* [macro] */
6630 pcre_uchar Xreq_char; /* [swap] */
6631 pcre_uchar Xreq_char2; /* [macro] */
6632 match_data Xmatch_block; /* [macro] */
6633 match_data *Xmd; /* [macro] */
6634 const pcre_uint8 *Xtables; /* [swap] */
6635 const pcre_uint8 *Xstart_bits; /* [swap] */
6636 PCRE_PUCHAR Xstart_match; /* [macro] */
6637 PCRE_PUCHAR Xend_subject; /* [swap] */
6638 PCRE_PUCHAR Xstart_partial; /* [macro] */
6639 PCRE_PUCHAR Xmatch_partial; /* [macro] */
6640 PCRE_PUCHAR Xreq_char_ptr; /* [swap] */
6641 const pcre_study_data *Xstudy; /* [macro] */
6642 REAL_PCRE *Xre; /* [macro] */
6643 heapframe Xframe_zero; /* [macro] Always NO_RECURSE: declared unconditionally here, whereas the non-Erlang frame_zero local exists only under NO_RECURSE */
6644
6645 /* State for a yielding valid_utf(): its yielded flag is tested when the exec
   function is re-entered, and its cnt member is read and reset to zero by
   ERTS_UPDATE_CONSUMED() */
6646
6647 struct PRIV(valid_utf_ystate) valid_utf_ystate;
6648
6649 /* Original function parameters that need to be saved; all are [swap] */
6650 int Xstart_offset;
6651 int Xoffsetcount;
6652 int *Xoffsets;
6653 int Xlength;
6654 PCRE_SPTR Xsubject;
6655 } PcreExecContext;
6656 #endif
6657
6658
6659 #if defined COMPILE_PCRE8
6660 #if defined(ERLANG_INTEGRATION)
6661 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
erts_pcre_exec(const pcre * argument_re,const erts_pcre_extra * extra_data,PCRE_SPTR subject,int length,int start_offset,int options,int * offsets,int offsetcount)6662 erts_pcre_exec(const pcre *argument_re, const erts_pcre_extra *extra_data,
6663 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6664 int offsetcount)
6665 #else
6666 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6667 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6668 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6669 int offsetcount)
6670 #endif
6671 #elif defined COMPILE_PCRE16
6672 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6673 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6674 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6675 int offsetcount)
6676 #elif defined COMPILE_PCRE32
6677 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6678 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6679 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6680 int offsetcount)
6681 #endif
6682 {
6683 #ifndef ERLANG_INTEGRATION
6684 #define ERTS_UPDATE_CONSUMED(X, MD)
6685 int rc, ocount, arg_offset_max;
6686 int newline;
6687 BOOL using_temporary_offsets = FALSE;
6688 BOOL anchored;
6689 BOOL startline;
6690 BOOL firstline;
6691 BOOL utf;
6692 BOOL has_first_char = FALSE;
6693 BOOL has_req_char = FALSE;
6694 pcre_uchar first_char = 0;
6695 pcre_uchar first_char2 = 0;
6696 pcre_uchar req_char = 0;
6697 pcre_uchar req_char2 = 0;
6698 match_data match_block;
6699 match_data *md = &match_block;
6700 const pcre_uint8 *tables;
6701 const pcre_uint8 *start_bits = NULL;
6702 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6703 PCRE_PUCHAR end_subject;
6704 PCRE_PUCHAR start_partial = NULL;
6705 PCRE_PUCHAR match_partial = NULL;
6706 PCRE_PUCHAR req_char_ptr = start_match - 1;
6707
6708 const pcre_study_data *study;
6709 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6710 #ifdef NO_RECURSE
6711 heapframe frame_zero;
6712 #endif
6713 #else
6714
6715 /* "local" variables in faked stackframe instead */
6716 #define arg_offset_max (exec_context->Xarg_offset_max)
6717 #define using_temporary_offsets (exec_context->Xusing_temporary_offsets)
6718 #define anchored (exec_context->Xanchored)
6719 #define startline (exec_context->Xstartline)
6720 #define firstline (exec_context->Xfirstline)
6721 #define has_first_char (exec_context->Xhas_first_char)
6722 #define has_req_char (exec_context->Xhas_req_char)
6723 #define first_char2 (exec_context->Xfirst_char2)
6724 #define req_char2 (exec_context->Xreq_char2)
6725 #define match_block (exec_context->Xmatch_block)
6726 #define md (exec_context->Xmd)
6727 #define start_match (exec_context->Xstart_match)
6728 #define start_partial (exec_context->Xstart_partial)
6729 #define match_partial (exec_context->Xmatch_partial)
6730 #define study (exec_context->Xstudy)
6731 #define re (exec_context->Xre)
6732 #define frame_zero (exec_context->Xframe_zero)
6733
6734 #define SWAPIN() do { \
6735 utf = exec_context->Xutf; \
6736 first_char = exec_context->Xfirst_char; \
6737 tables = exec_context->Xtables; \
6738 start_bits = exec_context->Xstart_bits; \
6739 end_subject = exec_context->Xend_subject; \
6740 req_char_ptr = exec_context->Xreq_char_ptr; \
6741 req_char = exec_context->Xreq_char; \
6742 /* Parameters */ \
6743 start_offset = exec_context->Xstart_offset; \
6744 offsetcount = exec_context->Xoffsetcount; \
6745 offsets = exec_context->Xoffsets; \
6746 length = exec_context->Xlength; \
6747 subject = exec_context->Xsubject; \
6748 } while (0)
6749
6750 #define SWAPOUT() do { \
6751 exec_context->Xutf = utf; \
6752 exec_context->Xfirst_char = first_char; \
6753 exec_context->Xtables = tables; \
6754 exec_context->Xstart_bits = start_bits; \
6755 exec_context->Xend_subject = end_subject; \
6756 exec_context->Xreq_char_ptr = req_char_ptr; \
6757 exec_context->Xreq_char = req_char; \
6758 /* Parameters */ \
6759 exec_context->Xstart_offset = start_offset; \
6760 exec_context->Xoffsetcount = offsetcount; \
6761 exec_context->Xoffsets = offsets; \
6762 exec_context->Xlength = length; \
6763 exec_context->Xsubject = subject; \
6764 } while (0)
6765
6766 #define ERTS_UPDATE_CONSUMED(X, MD) \
6767 do { \
6768 if (((X)->flags & PCRE_EXTRA_LOOP_LIMIT) != 0) { \
6769 unsigned long consumed__; \
6770 if (!(X)->restart_data) { \
6771 consumed__ = 0; \
6772 } \
6773 else { \
6774 PcreExecContext *ctx__ = (PcreExecContext *) \
6775 (*(X)->restart_data); \
6776 consumed__ = ctx__->valid_utf_ystate.cnt; \
6777 ctx__->valid_utf_ystate.cnt = 0; \
6778 } \
6779 if ((MD)) { \
6780 match_data *md__ = (MD); \
6781 consumed__ += (X)->loop_limit - md__->loop_limit; \
6782 } \
6783 *((X)->loop_counter_return) = consumed__; \
6784 } \
6785 } while (0)
6786 PcreExecContext *exec_context;
6787 PcreExecContext internal_context;
6788
6789 /* Locals that need never be saved */
6790 int rc, ocount;
6791 int newline;
6792
6793 /* Variables that we swap in and out */
6794 BOOL utf;
6795 pcre_uchar first_char;
6796 const pcre_uint8 *tables = NULL;
6797 const pcre_uint8 *start_bits;
6798 PCRE_PUCHAR end_subject = NULL;
6799 PCRE_PUCHAR req_char_ptr;
6800 pcre_uchar req_char;
6801
6802 /* End special swapped variables */
6803
6804 if (extra_data != NULL &&
6805 (extra_data->flags & PCRE_EXTRA_LOOP_LIMIT) &&
6806 *(extra_data->restart_data) != NULL) {
6807 /* we are restarting, every initialization is skipped and we jump directly into the loop */
6808 exec_context = (PcreExecContext *) *(extra_data->restart_data);
6809 SWAPIN();
6810 if (exec_context->valid_utf_ystate.yielded)
6811 goto restart_valid_utf;
6812 goto RESTART_INTERRUPTED;
6813 } else {
6814 if (extra_data != NULL &&
6815 (extra_data->flags & PCRE_EXTRA_LOOP_LIMIT)) {
6816 exec_context = (PcreExecContext *) (erts_pcre_malloc)(sizeof(PcreExecContext));
6817 *(extra_data->restart_data) = (void *) exec_context;
6818 exec_context->valid_utf_ystate.yielded = 0;
6819 /* need freeing by special routine from client */
6820 } else {
6821 #if defined(ERLANG_INTEGRATION)
6822 fprintf(stderr, "Unexpected execution path\n");
6823 abort();
6824 #endif
6825 exec_context = &internal_context;
6826 }
6827
6828 /* OK, no restart here, initialize variables instead */
6829 using_temporary_offsets = FALSE;
6830 has_first_char = FALSE;
6831 has_req_char = FALSE;
6832 first_char = 0;
6833 first_char2 = 0;
6834 req_char = 0;
6835 req_char2 = 0;
6836 md = &match_block;
6837 start_bits = NULL;
6838 start_match = (PCRE_PUCHAR)subject + start_offset;
6839 start_partial = NULL;
6840 match_partial = NULL;
6841 req_char_ptr = start_match - 1;
6842 re = (REAL_PCRE *)argument_re;
6843
6844 md->state_save = NULL;
6845 }
6846
6847 #endif /* ERLANG_INTEGRATION */
6848
6849
6850 #ifdef NO_RECURSE
6851 frame_zero.Xprevframe = NULL; /* Marks the top level */
6852 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6853 md->match_frames_base = &frame_zero;
6854 #endif
6855
6856 /* Check for the special magic call that measures the size of the stack used
6857 per recursive call of match(). Without the funny casting for sizeof, a Windows
6858 compiler gave this error: "unary minus operator applied to unsigned type,
6859 result still unsigned". Hopefully the cast fixes that. */
6860
6861 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6862 start_offset == -999)
6863 #ifdef NO_RECURSE
6864 return -((int)sizeof(heapframe));
6865 #else
6866 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6867 #endif
6868
6869 /* Plausibility checks */
6870
6871 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6872 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6873 return PCRE_ERROR_NULL;
6874 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6875 if (length < 0) return PCRE_ERROR_BADLENGTH;
6876 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6877
6878 /* Check that the first field in the block is the magic number. If it is not,
6879 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6880 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6881 means that the pattern is likely compiled with different endianness. */
6882
6883 if (re->magic_number != MAGIC_NUMBER)
6884 return re->magic_number == REVERSED_MAGIC_NUMBER?
6885 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6886 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6887
6888 /* These two settings are used in the code for checking a UTF-8 string that
6889 follows immediately afterwards. Other values in the md block are used only
6890 during "normal" pcre_exec() processing, not when the JIT support is in use,
6891 so they are set up later. */
6892
6893 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6894 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6895 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6896 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6897
6898 /* Check a UTF-8 string if required. Pass back the character offset and error
6899 code for an invalid string if a results vector is available. */
6900
6901 #ifdef SUPPORT_UTF
6902 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6903 {
6904 int erroroffset;
6905 int errorcode;
6906
6907 #if !defined(ERLANG_INTEGRATION)
6908 errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length);
6909 #else
6910 struct PRIV(valid_utf_ystate) *ystate;
6911
6912 if (!extra_data || !extra_data->restart_data) {
6913 ystate = NULL;
6914 }
6915 else if (!(extra_data->flags & PCRE_EXTRA_LOOP_LIMIT)) {
6916 exec_context->valid_utf_ystate.cnt = 10;
6917 ystate = NULL;
6918 }
6919 else {
6920 exec_context->valid_utf_ystate.yielded = 0;
6921 restart_valid_utf:
6922 ystate = &exec_context->valid_utf_ystate;
6923 ystate->cnt = (int) extra_data->loop_limit;
6924 }
6925 errorcode = PRIV(yielding_valid_utf)((PCRE_PUCHAR)subject, length,
6926 &erroroffset, ystate);
6927 #endif
6928 if (errorcode != 0)
6929 {
6930 #if defined(ERLANG_INTEGRATION)
6931 if (ystate && ystate->yielded) {
6932 ERTS_UPDATE_CONSUMED(extra_data, NULL);
6933 SWAPOUT();
6934 return PCRE_ERROR_LOOP_LIMIT;
6935 }
6936 #endif
6937 if (offsetcount >= 2)
6938 {
6939 offsets[0] = erroroffset;
6940 offsets[1] = errorcode;
6941 }
6942 #if defined COMPILE_PCRE8
6943 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6944 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6945 #elif defined COMPILE_PCRE16
6946 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6947 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6948 #elif defined COMPILE_PCRE32
6949 return PCRE_ERROR_BADUTF32;
6950 #endif
6951 }
6952 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6953 /* Check that a start_offset points to the start of a UTF character. */
6954 if (start_offset > 0 && start_offset < length &&
6955 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6956 return PCRE_ERROR_BADUTF8_OFFSET;
6957 #endif
6958 }
6959 #if defined(ERLANG_INTEGRATION)
6960 else {
6961 exec_context->valid_utf_ystate.cnt = 0;
6962 }
6963 #endif
6964 #endif
6965
6966 /* If the pattern was successfully studied with JIT support, run the JIT
6967 executable instead of the rest of this function. Most options must be set at
6968 compile time for the JIT code to be usable. Fallback to the normal code path if
6969 an unsupported flag is set. */
6970
6971 #ifdef SUPPORT_JIT
6972 if (extra_data != NULL
6973 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6974 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6975 && extra_data->executable_jit != NULL
6976 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6977 {
6978 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6979 start_offset, options, offsets, offsetcount);
6980
6981 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6982 mode is not compiled. In this case we simply fall back to the interpreter. */
6983
6984 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6985 }
6986 #endif
6987
6988 /* Carry on with non-JIT matching. This information is for finding all the
6989 numbers associated with a given name, for condition testing. */
6990
6991 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6992 md->name_count = re->name_count;
6993 md->name_entry_size = re->name_entry_size;
6994
6995 /* Fish out the optional data from the extra_data structure, first setting
6996 the default values. */
6997
6998 study = NULL;
6999 md->match_limit = MATCH_LIMIT;
7000 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
7001 md->callout_data = NULL;
7002
7003 /* The table pointer is always in native byte order. */
7004
7005 tables = re->tables;
7006
7007 /* The two limit values override the defaults, whatever their value. */
7008
7009 if (extra_data != NULL)
7010 {
7011 unsigned long int flags = extra_data->flags;
7012 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
7013 study = (const pcre_study_data *)extra_data->study_data;
7014 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
7015 md->match_limit = extra_data->match_limit;
7016 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
7017 md->match_limit_recursion = extra_data->match_limit_recursion;
7018 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
7019 md->callout_data = extra_data->callout_data;
7020 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
7021 #ifdef ERLANG_INTEGRATION
7022 if ((flags & PCRE_EXTRA_LOOP_LIMIT) != 0)
7023 {
7024 md->loop_limit = extra_data->loop_limit;
7025 if (extra_data->restart_data)
7026 md->loop_limit -= extra_data->loop_limit - exec_context->valid_utf_ystate.cnt;
7027 if (md->loop_limit < 10)
7028 md->loop_limit = 10; /* At least do something if we've come this far... */
7029 }
7030 #endif
7031 }
7032
7033 /* Limits in the regex override only if they are smaller. */
7034
7035 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
7036 md->match_limit = re->limit_match;
7037
7038 if ((re->flags & PCRE_RLSET) != 0 &&
7039 re->limit_recursion < md->match_limit_recursion)
7040 md->match_limit_recursion = re->limit_recursion;
7041
7042 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
7043 is a feature that makes it possible to save compiled regex and re-use them
7044 in other programs later. */
7045
7046 if (tables == NULL) tables = PRIV(default_tables);
7047
7048 /* Set up other data */
7049
7050 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
7051 startline = (re->flags & PCRE_STARTLINE) != 0;
7052 firstline = (re->options & PCRE_FIRSTLINE) != 0;
7053
7054 /* The code starts after the real_pcre block and the capture name table. */
7055
7056 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
7057 re->name_count * re->name_entry_size;
7058
7059 md->start_subject = (PCRE_PUCHAR)subject;
7060 md->start_offset = start_offset;
7061 md->end_subject = md->start_subject + length;
7062 end_subject = md->end_subject;
7063
7064 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
7065 md->use_ucp = (re->options & PCRE_UCP) != 0;
7066 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
7067 md->ignore_skip_arg = 0;
7068
7069 /* Some options are unpacked into BOOL variables in the hope that testing
7070 them will be faster than individual option bits. */
7071
7072 md->notbol = (options & PCRE_NOTBOL) != 0;
7073 md->noteol = (options & PCRE_NOTEOL) != 0;
7074 md->notempty = (options & PCRE_NOTEMPTY) != 0;
7075 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
7076
7077 md->hitend = FALSE;
7078 md->mark = md->nomatch_mark = NULL; /* In case never set */
7079
7080 md->recursive = NULL; /* No recursion at top level */
7081 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
7082
7083 md->lcc = tables + lcc_offset;
7084 md->fcc = tables + fcc_offset;
7085 md->ctypes = tables + ctypes_offset;
7086
7087 /* Handle different \R options. */
7088
7089 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
7090 {
7091 case 0:
7092 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
7093 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
7094 else
7095 #ifdef BSR_ANYCRLF
7096 md->bsr_anycrlf = TRUE;
7097 #else
7098 md->bsr_anycrlf = FALSE;
7099 #endif
7100 break;
7101
7102 case PCRE_BSR_ANYCRLF:
7103 md->bsr_anycrlf = TRUE;
7104 break;
7105
7106 case PCRE_BSR_UNICODE:
7107 md->bsr_anycrlf = FALSE;
7108 break;
7109
7110 default: return PCRE_ERROR_BADNEWLINE;
7111 }
7112
7113 /* Handle different types of newline. The three bits give eight cases. If
7114 nothing is set at run time, whatever was used at compile time applies. */
7115
7116 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
7117 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
7118 {
7119 case 0: newline = NEWLINE; break; /* Compile-time default */
7120 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
7121 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
7122 case PCRE_NEWLINE_CR+
7123 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
7124 case PCRE_NEWLINE_ANY: newline = -1; break;
7125 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
7126 default: return PCRE_ERROR_BADNEWLINE;
7127 }
7128
7129 if (newline == -2)
7130 {
7131 md->nltype = NLTYPE_ANYCRLF;
7132 }
7133 else if (newline < 0)
7134 {
7135 md->nltype = NLTYPE_ANY;
7136 }
7137 else
7138 {
7139 md->nltype = NLTYPE_FIXED;
7140 if (newline > 255)
7141 {
7142 md->nllen = 2;
7143 md->nl[0] = (newline >> 8) & 255;
7144 md->nl[1] = newline & 255;
7145 }
7146 else
7147 {
7148 md->nllen = 1;
7149 md->nl[0] = newline;
7150 }
7151 }
7152
7153 /* Partial matching was originally supported only for a restricted set of
7154 regexes; from release 8.00 there are no restrictions, but the bits are still
7155 defined (though never set). So there's no harm in leaving this code. */
7156
7157 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
7158 return PCRE_ERROR_BADPARTIAL;
7159
7160 /* If the expression has got more back references than the offsets supplied can
7161 hold, we get a temporary chunk of working store to use during the matching.
7162 Otherwise, we can use the vector supplied, rounding down its size to a multiple
7163 of 3. */
7164
7165 ocount = offsetcount - (offsetcount % 3);
7166 arg_offset_max = (2*ocount)/3;
7167
7168 if (re->top_backref > 0 && re->top_backref >= ocount/3)
7169 {
7170 ocount = re->top_backref * 3 + 3;
7171 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
7172 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
7173 using_temporary_offsets = TRUE;
7174 DPRINTF(("Got memory to hold back references\n"));
7175 }
7176 else md->offset_vector = offsets;
7177 md->offset_end = ocount;
7178 md->offset_max = (2*ocount)/3;
7179 md->capture_last = 0;
7180
7181 /* Reset the working variable associated with each extraction. These should
7182 never be used unless previously set, but they get saved and restored, and so we
7183 initialize them to avoid reading uninitialized locations. Also, unset the
7184 offsets for the matched string. This is really just for tidiness with callouts,
7185 in case they inspect these fields. */
7186
7187 if (md->offset_vector != NULL)
7188 {
7189 register int *iptr = md->offset_vector + ocount;
7190 register int *iend = iptr - re->top_bracket;
7191 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
7192 while (--iptr >= iend) *iptr = -1;
7193 if (offsetcount > 0) md->offset_vector[0] = -1;
7194 if (offsetcount > 1) md->offset_vector[1] = -1;
7195 }
7196
7197 /* Set up the first character to match, if available. The first_char value is
7198 never set for an anchored regular expression, but the anchoring may be forced
7199 at run time, so we have to test for anchoring. The first char may be unset for
7200 an unanchored pattern, of course. If there's no first char and the pattern was
7201 studied, there may be a bitmap of possible first characters. */
7202
7203 if (!anchored)
7204 {
7205 if ((re->flags & PCRE_FIRSTSET) != 0)
7206 {
7207 has_first_char = TRUE;
7208 first_char = first_char2 = (pcre_uchar)(re->first_char);
7209 if ((re->flags & PCRE_FCH_CASELESS) != 0)
7210 {
7211 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
7212 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
7213 if (utf && first_char > 127)
7214 first_char2 = UCD_OTHERCASE(first_char);
7215 #endif
7216 }
7217 }
7218 else
7219 if (!startline && study != NULL &&
7220 (study->flags & PCRE_STUDY_MAPPED) != 0)
7221 start_bits = study->start_bits;
7222 }
7223
7224 /* For anchored or unanchored matches, there may be a "last known required
7225 character" set. */
7226
7227 if ((re->flags & PCRE_REQCHSET) != 0)
7228 {
7229 has_req_char = TRUE;
7230 req_char = req_char2 = (pcre_uchar)(re->req_char);
7231 if ((re->flags & PCRE_RCH_CASELESS) != 0)
7232 {
7233 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
7234 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
7235 if (utf && req_char > 127)
7236 req_char2 = UCD_OTHERCASE(req_char);
7237 #endif
7238 }
7239 }
7240
7241
7242 /* ==========================================================================*/
7243
7244 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
7245 the loop runs just once. */
7246
7247 for(;;)
7248 {
7249 PCRE_PUCHAR save_end_subject = end_subject;
7250 PCRE_PUCHAR new_start_match;
7251
7252 /* If firstline is TRUE, the start of the match is constrained to the first
7253 line of a multiline string. That is, the match must be before or at the first
7254 newline. Implement this by temporarily adjusting end_subject so that we stop
7255 scanning at a newline. If the match fails at the newline, later code breaks
7256 this loop. */
7257
7258 if (firstline)
7259 {
7260 PCRE_PUCHAR t = start_match;
7261 #ifdef SUPPORT_UTF
7262 if (utf)
7263 {
7264 while (t < md->end_subject && !IS_NEWLINE(t))
7265 {
7266 t++;
7267 ACROSSCHAR(t < end_subject, *t, t++);
7268 }
7269 }
7270 else
7271 #endif
7272 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
7273 end_subject = t;
7274 }
7275
7276 /* There are some optimizations that avoid running the match if a known
7277 starting point is not found, or if a known later character is not present.
7278 However, there is an option that disables these, for testing and for ensuring
7279 that all callouts do actually occur. The option can be set in the regex by
7280 (*NO_START_OPT) or passed in match-time options. */
7281
7282 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
7283 {
7284 /* Advance to a unique first char if there is one. */
7285
7286 if (has_first_char)
7287 {
7288 pcre_uchar smc;
7289
7290 if (first_char != first_char2)
7291 while (start_match < end_subject &&
7292 (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2)
7293 start_match++;
7294 else
7295 while (start_match < end_subject && UCHAR21TEST(start_match) != first_char)
7296 start_match++;
7297 }
7298
7299 /* Or to just after a linebreak for a multiline match */
7300
7301 else if (startline)
7302 {
7303 if (start_match > md->start_subject + start_offset)
7304 {
7305 #ifdef SUPPORT_UTF
7306 if (utf)
7307 {
7308 while (start_match < end_subject && !WAS_NEWLINE(start_match))
7309 {
7310 start_match++;
7311 ACROSSCHAR(start_match < end_subject, *start_match,
7312 start_match++);
7313 }
7314 }
7315 else
7316 #endif
7317 while (start_match < end_subject && !WAS_NEWLINE(start_match))
7318 start_match++;
7319
7320 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
7321 and we are now at a LF, advance the match position by one more character.
7322 */
7323
7324 if (start_match[-1] == CHAR_CR &&
7325 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
7326 start_match < end_subject &&
7327 UCHAR21TEST(start_match) == CHAR_NL)
7328 start_match++;
7329 }
7330 }
7331
7332 /* Or to a non-unique first byte after study */
7333
7334 else if (start_bits != NULL)
7335 {
7336 while (start_match < end_subject)
7337 {
7338 register pcre_uint32 c = UCHAR21TEST(start_match);
7339 #ifndef COMPILE_PCRE8
7340 if (c > 255) c = 255;
7341 #endif
7342 if ((start_bits[c/8] & (1 << (c&7))) != 0)
7343 {
7344 ERTS_UPDATE_CONSUMED(extra_data, md);
7345 break;
7346 }
7347 start_match++;
7348 }
7349 }
7350 } /* Starting optimizations */
7351
7352 /* Restore fudged end_subject */
7353
7354 end_subject = save_end_subject;
7355
7356 /* The following two optimizations are disabled for partial matching or if
7357 disabling is explicitly requested. */
7358
7359 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
7360 {
7361 /* If the pattern was studied, a minimum subject length may be set. This is
7362 a lower bound; no actual string of that length may actually match the
7363 pattern. Although the value is, strictly, in characters, we treat it as
7364 bytes to avoid spending too much time in this optimization. */
7365
7366 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
7367 (pcre_uint32)(end_subject - start_match) < study->minlength)
7368 {
7369 rc = MATCH_NOMATCH;
7370 ERTS_UPDATE_CONSUMED(extra_data, md);
7371 break;
7372 }
7373
7374 /* If req_char is set, we know that that character must appear in the
7375 subject for the match to succeed. If the first character is set, req_char
7376 must be later in the subject; otherwise the test starts at the match point.
7377 This optimization can save a huge amount of backtracking in patterns with
7378 nested unlimited repeats that aren't going to match. Writing separate code
7379 for cased/caseless versions makes it go faster, as does using an
7380 autoincrement and backing off on a match.
7381
7382 HOWEVER: when the subject string is very, very long, searching to its end
7383 can take a long time, and give bad performance on quite ordinary patterns.
7384 This showed up when somebody was matching something like /^\d+C/ on a
7385 32-megabyte string... so we don't do this when the string is sufficiently
7386 long. */
7387
7388 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
7389 {
7390 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
7391
7392 /* We don't need to repeat the search if we haven't yet reached the
7393 place we found it at last time. */
7394
7395 if (p > req_char_ptr)
7396 {
7397 if (req_char != req_char2)
7398 {
7399 while (p < end_subject)
7400 {
7401 register pcre_uint32 pp = UCHAR21INCTEST(p);
7402 if (pp == req_char || pp == req_char2) { p--; break; }
7403 }
7404 }
7405 else
7406 {
7407 while (p < end_subject)
7408 {
7409 if (UCHAR21INCTEST(p) == req_char) { p--; break; }
7410 }
7411 }
7412
7413 /* If we can't find the required character, break the matching loop,
7414 forcing a match failure. */
7415
7416 if (p >= end_subject)
7417 {
7418 rc = MATCH_NOMATCH;
7419 ERTS_UPDATE_CONSUMED(extra_data, md);
7420 break;
7421 }
7422
7423 /* If we have found the required character, save the point where we
7424 found it, so that we don't search again next time round the loop if
7425 the start hasn't passed this character yet. */
7426
7427 req_char_ptr = p;
7428 }
7429 }
7430 }
7431
7432 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
7433 printf(">>>> Match against: ");
7434 pchars(start_match, end_subject - start_match, TRUE, md);
7435 printf("\n");
7436 #endif
7437
7438 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
7439 first starting point for which a partial match was found. */
7440
7441 md->start_match_ptr = start_match;
7442 md->start_used_ptr = start_match;
7443 md->match_call_count = 0;
7444 md->match_function_type = 0;
7445 md->end_offset_top = 0;
7446 md->skip_arg_count = 0;
7447 EDEBUGF(("Calling match..."));
7448 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
7449 #ifdef ERLANG_INTEGRATION
7450 ERTS_UPDATE_CONSUMED(extra_data, md);
7451 SWAPOUT();
7452 while(rc == PCRE_ERROR_LOOP_LIMIT) {
7453 EDEBUGF(("Loop limit break detected"));
7454 return PCRE_ERROR_LOOP_LIMIT;
7455 RESTART_INTERRUPTED:
7456 md->loop_limit = extra_data->loop_limit;
7457 rc = match(NULL,NULL,NULL,0,md,NULL,0);
7458 *extra_data->loop_counter_return =
7459 (extra_data->loop_limit - md->loop_limit);
7460 }
7461 md->state_save = NULL; /* So that next call to free_saved... does not crash */
7462 #endif
7463 if (md->hitend && start_partial == NULL)
7464 {
7465 start_partial = md->start_used_ptr;
7466 match_partial = start_match;
7467 }
7468
7469 switch(rc)
7470 {
7471 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
7472 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
7473 entirely. The only way we can do that is to re-do the match at the same
7474 point, with a flag to force SKIP with an argument to be ignored. Just
7475 treating this case as NOMATCH does not work because it does not check other
7476 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
7477
7478 case MATCH_SKIP_ARG:
7479 new_start_match = start_match;
7480 md->ignore_skip_arg = md->skip_arg_count;
7481 break;
7482
7483 /* SKIP passes back the next starting point explicitly, but if it is no
7484 greater than the match we have just done, treat it as NOMATCH. */
7485
7486 case MATCH_SKIP:
7487 if (md->start_match_ptr > start_match)
7488 {
7489 new_start_match = md->start_match_ptr;
7490 break;
7491 }
7492 /* Fall through */
7493
7494 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
7495 exactly like PRUNE. Unset ignore SKIP-with-argument. */
7496
7497 case MATCH_NOMATCH:
7498 case MATCH_PRUNE:
7499 case MATCH_THEN:
7500 md->ignore_skip_arg = 0;
7501 new_start_match = start_match + 1;
7502 #ifdef SUPPORT_UTF
7503 if (utf)
7504 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
7505 new_start_match++);
7506 #endif
7507 break;
7508
7509 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
7510
7511 case MATCH_COMMIT:
7512 rc = MATCH_NOMATCH;
7513 goto ENDLOOP;
7514
7515 /* Any other return is either a match, or some kind of error. */
7516
7517 default:
7518 goto ENDLOOP;
7519 }
7520
7521 /* Control reaches here for the various types of "no match at this point"
7522 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
7523
7524 rc = MATCH_NOMATCH;
7525
7526 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
7527 newline in the subject (though it may continue over the newline). Therefore,
7528 if we have just failed to match, starting at a newline, do not continue. */
7529
7530 if (firstline && IS_NEWLINE(start_match)) break;
7531
7532 /* Advance to new matching position */
7533
7534 start_match = new_start_match;
7535
7536 /* Break the loop if the pattern is anchored or if we have passed the end of
7537 the subject. */
7538
7539 if (anchored || start_match > end_subject) break;
7540
7541 /* If we have just passed a CR and we are now at a LF, and the pattern does
7542 not contain any explicit matches for \r or \n, and the newline option is CRLF
7543 or ANY or ANYCRLF, advance the match position by one more character. In
7544 normal matching start_match will always be greater than the first position at
7545 this stage, but a failed *SKIP can cause a return at the same point, which is
7546 why the first test exists. */
7547
7548 if (start_match > (PCRE_PUCHAR)subject + start_offset &&
7549 start_match[-1] == CHAR_CR &&
7550 start_match < end_subject &&
7551 *start_match == CHAR_NL &&
7552 (re->flags & PCRE_HASCRORLF) == 0 &&
7553 (md->nltype == NLTYPE_ANY ||
7554 md->nltype == NLTYPE_ANYCRLF ||
7555 md->nllen == 2))
7556 start_match++;
7557
7558 md->mark = NULL; /* Reset for start of next match attempt */
7559 } /* End of for(;;) "bumpalong" loop */
7560
7561 /* ==========================================================================*/
7562
7563 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
7564 conditions is true:
7565
7566 (1) The pattern is anchored or the match was failed by (*COMMIT);
7567
7568 (2) We are past the end of the subject;
7569
7570 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
7571 this option requests that a match occur at or before the first newline in
7572 the subject.
7573
7574 When we have a match and the offset vector is big enough to deal with any
7575 backreferences, captured substring offsets will already be set up. In the case
7576 where we had to get some local store to hold offsets for backreference
7577 processing, copy those that we can. In this case there need not be overflow if
7578 certain parts of the pattern were not used, even though there are more
7579 capturing parentheses than vector slots. */
7580
7581 ENDLOOP:
7582
7583 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
7584 {
7585 if (using_temporary_offsets)
7586 {
7587 if (arg_offset_max >= 4)
7588 {
7589 memcpy(offsets + 2, md->offset_vector + 2,
7590 (arg_offset_max - 2) * sizeof(int));
7591 DPRINTF(("Copied offsets from temporary memory\n"));
7592 }
7593 if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT;
7594 DPRINTF(("Freeing temporary memory\n"));
7595 (PUBL(free))(md->offset_vector);
7596 #ifdef ERLANG_INTEGRATION
7597 md->offset_vector = NULL;
7598 #endif
7599 }
7600
7601 /* Set the return code to the number of captured strings, or 0 if there were
7602 too many to fit into the vector. */
7603
7604 rc = ((md->capture_last & OVFLBIT) != 0 &&
7605 md->end_offset_top >= arg_offset_max)?
7606 0 : md->end_offset_top/2;
7607
7608 /* If there is space in the offset vector, set any unused pairs at the end of
7609 the pattern to -1 for backwards compatibility. It is documented that this
7610 happens. In earlier versions, the whole set of potential capturing offsets
7611 was set to -1 each time round the loop, but this is handled differently now.
7612 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
7613 those at the end that need unsetting here. We can't just unset them all at
7614 the start of the whole thing because they may get set in one branch that is
7615 not the final matching branch. */
7616
7617 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
7618 {
7619 register int *iptr, *iend;
7620 int resetcount = 2 + re->top_bracket * 2;
7621 if (resetcount > offsetcount) resetcount = offsetcount;
7622 iptr = offsets + md->end_offset_top;
7623 iend = offsets + resetcount;
7624 while (iptr < iend) *iptr++ = -1;
7625 }
7626
7627 /* If there is space, set up the whole thing as substring 0. The value of
7628 md->start_match_ptr might be modified if \K was encountered on the success
7629 matching path. */
7630
7631 if (offsetcount < 2) rc = 0; else
7632 {
7633 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
7634 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
7635 }
7636
7637 /* Return MARK data if requested */
7638
7639 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7640 *(extra_data->mark) = (pcre_uchar *)md->mark;
7641 DPRINTF((">>>> returning %d\n", rc));
7642 #ifdef NO_RECURSE
7643 release_match_heapframes(&frame_zero);
7644 #endif
7645 return rc;
7646 }
7647
7648 /* Control gets here if there has been an error, or if the overall match
7649 attempt has failed at all permitted starting positions. */
7650
7651 if (using_temporary_offsets)
7652 {
7653 DPRINTF(("Freeing temporary memory\n"));
7654 #ifdef ERLANG_INTEGRATION
7655 if (extra_data == NULL ||
7656 !(extra_data->flags & PCRE_EXTRA_LOOP_LIMIT))
7657 {
7658 (PUBL(free))(md->offset_vector);
7659 md->offset_vector = NULL;
7660 }
7661 #else
7662 (PUBL(free))(md->offset_vector);
7663 #endif
7664 }
7665
7666 /* For anything other than nomatch or partial match, just return the code. */
7667
7668 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
7669 {
7670 DPRINTF((">>>> error: returning %d\n", rc));
7671 #ifdef NO_RECURSE
7672 release_match_heapframes(&frame_zero);
7673 #endif
7674 return rc;
7675 }
7676
7677 /* Handle partial matches - disable any mark data */
7678
7679 if (match_partial != NULL)
7680 {
7681 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
7682 md->mark = NULL;
7683 if (offsetcount > 1)
7684 {
7685 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7686 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
7687 if (offsetcount > 2)
7688 offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject);
7689 }
7690 rc = PCRE_ERROR_PARTIAL;
7691 }
7692
7693 /* This is the classic nomatch case */
7694
7695 else
7696 {
7697 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7698 rc = PCRE_ERROR_NOMATCH;
7699 }
7700
7701 /* Return the MARK data if it has been requested. */
7702
7703 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7704 *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
7705 #ifdef NO_RECURSE
7706 release_match_heapframes(&frame_zero);
7707 #endif
7708 return rc;
7709 }
7710
#if defined(ERLANG_INTEGRATION)

/* Drop the macros that redirected the exec function's "local" variables into
the PcreExecContext (exec_context->X...), so that the names are ordinary
identifiers again for the code below.

NOTE(review): req_char has no matching #define in the macro list inside the
exec function (it is a swapped-in local there). #undef of a name that is not a
macro is a no-op, so the line is kept unchanged. */

#undef arg_offset_max
#undef using_temporary_offsets
#undef anchored
#undef startline
#undef firstline
#undef has_first_char
#undef has_req_char
#undef first_char2
#undef req_char
#undef req_char2
#undef match_block
#undef md
#undef start_match
#undef start_partial
#undef match_partial
#undef study
#undef re
#undef frame_zero

/*************************************************
*     Free the context of a restartable match    *
*************************************************/

/* Releases the PcreExecContext that the exec function allocates and stores in
*(extra_data->restart_data) when PCRE_EXTRA_LOOP_LIMIT is set. The match may or
may not have run to completion, so saved state can still be attached to it.

When the match fails or is interrupted with PCRE_EXTRA_LOOP_LIMIT set, the exec
function leaves a temporary offset vector allocated; it is freed here. On the
successful path the exec function frees it and resets the pointer to NULL,
which is why the pointer is tested before freeing.

Argument:
  restart_data   the context pointer handed out by the exec function; may be
                 NULL, in which case nothing is done

Returns:         nothing
*/

void erts_pcre_free_restart_data(void *restart_data)
{
  PcreExecContext *ctx = (PcreExecContext *)restart_data;
  match_data *mdata;

  if (ctx == NULL)
    return;

  /* Xmd is read only after the NULL check; offset_vector is looked at only
  when the context says a temporary vector was in use. */

  mdata = ctx->Xmd;
  if (ctx->Xusing_temporary_offsets && mdata->offset_vector != NULL)
    (PUBL(free))(mdata->offset_vector);

  /* NOTE(review): the exec function calls release_match_heapframes() only
  under NO_RECURSE; here it is unconditional - presumably ERLANG_INTEGRATION
  builds always define NO_RECURSE; confirm. */

  release_match_heapframes(&(ctx->Xframe_zero));

  /* The context itself goes last; mdata must not be used after this point. */

  (PUBL(free))(ctx);
}
#endif
7744 /* End of pcre_exec.c */
7745