1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2014 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains functions for scanning a compiled pattern and
42 collecting data (e.g. minimum matching length). */
43
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49
50 #include "pcre2_internal.h"
51
52
53 /* Set a bit in the starting code unit bit map. */
54
55 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7))
56
57 /* Returns from set_start_bits() */
58
59 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
60
61
62
63 /*************************************************
64 * Find the minimum subject length for a group *
65 *************************************************/
66
67 /* Scan a parenthesized group and compute the minimum length of subject that
68 is needed to match it. This is a lower bound; it does not mean there is a
69 string of that length that matches. In UTF8 mode, the result is in characters
70 rather than bytes.
71
72 Arguments:
73 re compiled pattern block
74 code pointer to start of group (the bracket)
75 startcode pointer to start of the whole pattern's code
76 recurse_depth RECURSE depth
77 utf UTF flag
78
79 Returns: the minimum length
80 -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
81 -2 internal error (missing capturing bracket)
82 -3 internal error (opcode not listed)
83 */
84
85 static int
find_minlength(const pcre2_real_code * re,PCRE2_SPTR code,PCRE2_SPTR startcode,int recurse_depth,BOOL utf)86 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
87 PCRE2_SPTR startcode, int recurse_depth, BOOL utf)
88 {
89 int length = -1;
90 BOOL had_recurse = FALSE;
91 register int branchlength = 0;
92 register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
93
94 if (*code == OP_CBRA || *code == OP_SCBRA ||
95 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
96
97 /* Scan along the opcodes for this branch. If we get to the end of the
98 branch, check the length against that of the other branches. */
99
100 for (;;)
101 {
102 int d, min;
103 PCRE2_UCHAR *cs, *ce;
104 register PCRE2_UCHAR op = *cc;
105
106 switch (op)
107 {
108 case OP_COND:
109 case OP_SCOND:
110
111 /* If there is only one branch in a condition, the implied branch has zero
112 length, so we don't add anything. This covers the DEFINE "condition"
113 automatically. */
114
115 cs = cc + GET(cc, 1);
116 if (*cs != OP_ALT)
117 {
118 cc = cs + 1 + LINK_SIZE;
119 break;
120 }
121
122 /* Otherwise we can fall through and treat it the same as any other
123 subpattern. */
124
125 case OP_CBRA:
126 case OP_SCBRA:
127 case OP_BRA:
128 case OP_SBRA:
129 case OP_CBRAPOS:
130 case OP_SCBRAPOS:
131 case OP_BRAPOS:
132 case OP_SBRAPOS:
133 case OP_ONCE:
134 case OP_ONCE_NC:
135 d = find_minlength(re, cc, startcode, recurse_depth, utf);
136 if (d < 0) return d;
137 branchlength += d;
138 do cc += GET(cc, 1); while (*cc == OP_ALT);
139 cc += 1 + LINK_SIZE;
140 break;
141
142 /* ACCEPT makes things far too complicated; we have to give up. */
143
144 case OP_ACCEPT:
145 case OP_ASSERT_ACCEPT:
146 return -1;
147
148 /* Reached end of a branch; if it's a ket it is the end of a nested
149 call. If it's ALT it is an alternation in a nested call. If it is END it's
150 the end of the outer call. All can be handled by the same code. If an
151 ACCEPT was previously encountered, use the length that was in force at that
152 time, and pass back the shortest ACCEPT length. */
153
154 case OP_ALT:
155 case OP_KET:
156 case OP_KETRMAX:
157 case OP_KETRMIN:
158 case OP_KETRPOS:
159 case OP_END:
160 if (length < 0 || (!had_recurse && branchlength < length))
161 length = branchlength;
162 if (op != OP_ALT) return length;
163 cc += 1 + LINK_SIZE;
164 branchlength = 0;
165 had_recurse = FALSE;
166 break;
167
168 /* Skip over assertive subpatterns */
169
170 case OP_ASSERT:
171 case OP_ASSERT_NOT:
172 case OP_ASSERTBACK:
173 case OP_ASSERTBACK_NOT:
174 do cc += GET(cc, 1); while (*cc == OP_ALT);
175 /* Fall through */
176
177 /* Skip over things that don't match chars */
178
179 case OP_REVERSE:
180 case OP_CREF:
181 case OP_DNCREF:
182 case OP_RREF:
183 case OP_DNRREF:
184 case OP_FALSE:
185 case OP_TRUE:
186 case OP_CALLOUT:
187 case OP_SOD:
188 case OP_SOM:
189 case OP_EOD:
190 case OP_EODN:
191 case OP_CIRC:
192 case OP_CIRCM:
193 case OP_DOLL:
194 case OP_DOLLM:
195 case OP_NOT_WORD_BOUNDARY:
196 case OP_WORD_BOUNDARY:
197 cc += PRIV(OP_lengths)[*cc];
198 break;
199
200 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
201
202 case OP_BRAZERO:
203 case OP_BRAMINZERO:
204 case OP_BRAPOSZERO:
205 case OP_SKIPZERO:
206 cc += PRIV(OP_lengths)[*cc];
207 do cc += GET(cc, 1); while (*cc == OP_ALT);
208 cc += 1 + LINK_SIZE;
209 break;
210
211 /* Handle literal characters and + repetitions */
212
213 case OP_CHAR:
214 case OP_CHARI:
215 case OP_NOT:
216 case OP_NOTI:
217 case OP_PLUS:
218 case OP_PLUSI:
219 case OP_MINPLUS:
220 case OP_MINPLUSI:
221 case OP_POSPLUS:
222 case OP_POSPLUSI:
223 case OP_NOTPLUS:
224 case OP_NOTPLUSI:
225 case OP_NOTMINPLUS:
226 case OP_NOTMINPLUSI:
227 case OP_NOTPOSPLUS:
228 case OP_NOTPOSPLUSI:
229 branchlength++;
230 cc += 2;
231 #ifdef SUPPORT_UNICODE
232 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
233 #endif
234 break;
235
236 case OP_TYPEPLUS:
237 case OP_TYPEMINPLUS:
238 case OP_TYPEPOSPLUS:
239 branchlength++;
240 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
241 break;
242
243 /* Handle exact repetitions. The count is already in characters, but we
244 may need to skip over a multibyte character in UTF mode. */
245
246 case OP_EXACT:
247 case OP_EXACTI:
248 case OP_NOTEXACT:
249 case OP_NOTEXACTI:
250 branchlength += GET2(cc,1);
251 cc += 2 + IMM2_SIZE;
252 #ifdef SUPPORT_UNICODE
253 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
254 #endif
255 break;
256
257 case OP_TYPEEXACT:
258 branchlength += GET2(cc,1);
259 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
260 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
261 break;
262
263 /* Handle single-char non-literal matchers */
264
265 case OP_PROP:
266 case OP_NOTPROP:
267 cc += 2;
268 /* Fall through */
269
270 case OP_NOT_DIGIT:
271 case OP_DIGIT:
272 case OP_NOT_WHITESPACE:
273 case OP_WHITESPACE:
274 case OP_NOT_WORDCHAR:
275 case OP_WORDCHAR:
276 case OP_ANY:
277 case OP_ALLANY:
278 case OP_EXTUNI:
279 case OP_HSPACE:
280 case OP_NOT_HSPACE:
281 case OP_VSPACE:
282 case OP_NOT_VSPACE:
283 branchlength++;
284 cc++;
285 break;
286
287 /* "Any newline" might match two characters, but it also might match just
288 one. */
289
290 case OP_ANYNL:
291 branchlength += 1;
292 cc++;
293 break;
294
295 /* The single-byte matcher means we can't proceed in UTF mode. (In
296 non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
297 appear, but leave the code, just in case.) */
298
299 case OP_ANYBYTE:
300 #ifdef SUPPORT_UNICODE
301 if (utf) return -1;
302 #endif
303 branchlength++;
304 cc++;
305 break;
306
307 /* For repeated character types, we have to test for \p and \P, which have
308 an extra two bytes of parameters. */
309
310 case OP_TYPESTAR:
311 case OP_TYPEMINSTAR:
312 case OP_TYPEQUERY:
313 case OP_TYPEMINQUERY:
314 case OP_TYPEPOSSTAR:
315 case OP_TYPEPOSQUERY:
316 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
317 cc += PRIV(OP_lengths)[op];
318 break;
319
320 case OP_TYPEUPTO:
321 case OP_TYPEMINUPTO:
322 case OP_TYPEPOSUPTO:
323 if (cc[1 + IMM2_SIZE] == OP_PROP
324 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
325 cc += PRIV(OP_lengths)[op];
326 break;
327
328 /* Check a class for variable quantification */
329
330 case OP_CLASS:
331 case OP_NCLASS:
332 #ifdef SUPPORT_WIDE_CHARS
333 case OP_XCLASS:
334 /* The original code caused an unsigned overflow in 64 bit systems,
335 so now we use a conditional statement. */
336 if (op == OP_XCLASS)
337 cc += GET(cc, 1);
338 else
339 cc += PRIV(OP_lengths)[OP_CLASS];
340 #else
341 cc += PRIV(OP_lengths)[OP_CLASS];
342 #endif
343
344 switch (*cc)
345 {
346 case OP_CRPLUS:
347 case OP_CRMINPLUS:
348 case OP_CRPOSPLUS:
349 branchlength++;
350 /* Fall through */
351
352 case OP_CRSTAR:
353 case OP_CRMINSTAR:
354 case OP_CRQUERY:
355 case OP_CRMINQUERY:
356 case OP_CRPOSSTAR:
357 case OP_CRPOSQUERY:
358 cc++;
359 break;
360
361 case OP_CRRANGE:
362 case OP_CRMINRANGE:
363 case OP_CRPOSRANGE:
364 branchlength += GET2(cc,1);
365 cc += 1 + 2 * IMM2_SIZE;
366 break;
367
368 default:
369 branchlength++;
370 break;
371 }
372 break;
373
374 /* Backreferences and subroutine calls are treated in the same way: we find
375 the minimum length for the subpattern. A recursion, however, causes an
376 a flag to be set that causes the length of this branch to be ignored. The
377 logic is that a recursion can only make sense if there is another
378 alternation that stops the recursing. That will provide the minimum length
379 (when no recursion happens). A backreference within the group that it is
380 referencing behaves in the same way.
381
382 If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
383 matches an empty string (by default it causes a matching failure), so in
384 that case we must set the minimum length to zero. */
385
386 case OP_DNREF: /* Duplicate named pattern back reference */
387 case OP_DNREFI:
388 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
389 {
390 int count = GET2(cc, 1+IMM2_SIZE);
391 PCRE2_UCHAR *slot =
392 (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
393 GET2(cc, 1) * re->name_entry_size;
394
395 d = INT_MAX;
396 while (count-- > 0)
397 {
398 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
399 if (cs == NULL) return -2;
400 do ce += GET(ce, 1); while (*ce == OP_ALT);
401 if (cc > cs && cc < ce)
402 {
403 d = 0;
404 had_recurse = TRUE;
405 break;
406 }
407 else
408 {
409 int dd = find_minlength(re, cs, startcode, recurse_depth, utf);
410 if (dd < d) d = dd;
411 }
412 slot += re->name_entry_size;
413 }
414 }
415 else d = 0;
416 cc += 1 + 2*IMM2_SIZE;
417 goto REPEAT_BACK_REFERENCE;
418
419 case OP_REF: /* Single back reference */
420 case OP_REFI:
421 if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
422 {
423 ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
424 if (cs == NULL) return -2;
425 do ce += GET(ce, 1); while (*ce == OP_ALT);
426 if (cc > cs && cc < ce)
427 {
428 d = 0;
429 had_recurse = TRUE;
430 }
431 else
432 {
433 d = find_minlength(re, cs, startcode, recurse_depth, utf);
434 }
435 }
436 else d = 0;
437 cc += 1 + IMM2_SIZE;
438
439 /* Handle repeated back references */
440
441 REPEAT_BACK_REFERENCE:
442 switch (*cc)
443 {
444 case OP_CRSTAR:
445 case OP_CRMINSTAR:
446 case OP_CRQUERY:
447 case OP_CRMINQUERY:
448 case OP_CRPOSSTAR:
449 case OP_CRPOSQUERY:
450 min = 0;
451 cc++;
452 break;
453
454 case OP_CRPLUS:
455 case OP_CRMINPLUS:
456 case OP_CRPOSPLUS:
457 min = 1;
458 cc++;
459 break;
460
461 case OP_CRRANGE:
462 case OP_CRMINRANGE:
463 case OP_CRPOSRANGE:
464 min = GET2(cc, 1);
465 cc += 1 + 2 * IMM2_SIZE;
466 break;
467
468 default:
469 min = 1;
470 break;
471 }
472
473 branchlength += min * d;
474 break;
475
476 /* We can easily detect direct recursion, but not mutual recursion. This is
477 caught by a recursion depth count. */
478
479 case OP_RECURSE:
480 cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
481 do ce += GET(ce, 1); while (*ce == OP_ALT);
482 if ((cc > cs && cc < ce) || recurse_depth > 10)
483 had_recurse = TRUE;
484 else
485 {
486 branchlength += find_minlength(re, cs, startcode, recurse_depth + 1, utf);
487 }
488 cc += 1 + LINK_SIZE;
489 break;
490
491 /* Anything else does not or need not match a character. We can get the
492 item's length from the table, but for those that can match zero occurrences
493 of a character, we must take special action for UTF-8 characters. As it
494 happens, the "NOT" versions of these opcodes are used at present only for
495 ASCII characters, so they could be omitted from this list. However, in
496 future that may change, so we include them here so as not to leave a
497 gotcha for a future maintainer. */
498
499 case OP_UPTO:
500 case OP_UPTOI:
501 case OP_NOTUPTO:
502 case OP_NOTUPTOI:
503 case OP_MINUPTO:
504 case OP_MINUPTOI:
505 case OP_NOTMINUPTO:
506 case OP_NOTMINUPTOI:
507 case OP_POSUPTO:
508 case OP_POSUPTOI:
509 case OP_NOTPOSUPTO:
510 case OP_NOTPOSUPTOI:
511
512 case OP_STAR:
513 case OP_STARI:
514 case OP_NOTSTAR:
515 case OP_NOTSTARI:
516 case OP_MINSTAR:
517 case OP_MINSTARI:
518 case OP_NOTMINSTAR:
519 case OP_NOTMINSTARI:
520 case OP_POSSTAR:
521 case OP_POSSTARI:
522 case OP_NOTPOSSTAR:
523 case OP_NOTPOSSTARI:
524
525 case OP_QUERY:
526 case OP_QUERYI:
527 case OP_NOTQUERY:
528 case OP_NOTQUERYI:
529 case OP_MINQUERY:
530 case OP_MINQUERYI:
531 case OP_NOTMINQUERY:
532 case OP_NOTMINQUERYI:
533 case OP_POSQUERY:
534 case OP_POSQUERYI:
535 case OP_NOTPOSQUERY:
536 case OP_NOTPOSQUERYI:
537
538 cc += PRIV(OP_lengths)[op];
539 #ifdef SUPPORT_UNICODE
540 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
541 #endif
542 break;
543
544 /* Skip these, but we need to add in the name length. */
545
546 case OP_MARK:
547 case OP_PRUNE_ARG:
548 case OP_SKIP_ARG:
549 case OP_THEN_ARG:
550 cc += PRIV(OP_lengths)[op] + cc[1];
551 break;
552
553 /* The remaining opcodes are just skipped over. */
554
555 case OP_CLOSE:
556 case OP_COMMIT:
557 case OP_FAIL:
558 case OP_PRUNE:
559 case OP_SET_SOM:
560 case OP_SKIP:
561 case OP_THEN:
562 cc += PRIV(OP_lengths)[op];
563 break;
564
565 /* This should not occur: we list all opcodes explicitly so that when
566 new ones get added they are properly considered. */
567
568 default:
569 return -3;
570 }
571 }
572 /* Control never gets here */
573 }
574
575
576
577 /*************************************************
578 * Set a bit and maybe its alternate case *
579 *************************************************/
580
581 /* Given a character, set its first code unit's bit in the table, and also the
582 corresponding bit for the other version of a letter if we are caseless.
583
584 Arguments:
585 re points to the regex block
586 p points to the first code unit of the character
587 caseless TRUE if caseless
588 utf TRUE for UTF mode
589
590 Returns: pointer after the character
591 */
592
593 static PCRE2_SPTR
set_table_bit(pcre2_real_code * re,PCRE2_SPTR p,BOOL caseless,BOOL utf)594 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
595 {
596 uint32_t c = *p++; /* First code unit */
597 (void)utf; /* Stop compiler warning when UTF not supported */
598
599 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
600 0xff. */
601
602 #if PCRE2_CODE_UNIT_WIDTH != 8
603 if (c > 0xff) SET_BIT(0xff); else
604 #endif
605
606 SET_BIT(c);
607
608 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
609 the end of the character, even when caseless. */
610
611 #ifdef SUPPORT_UNICODE
612 if (utf)
613 {
614 #if PCRE2_CODE_UNIT_WIDTH == 8
615 if (c >= 0xc0) GETUTF8INC(c, p);
616 #elif PCRE2_CODE_UNIT_WIDTH == 16
617 if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
618 #endif
619 }
620 #endif /* SUPPORT_UNICODE */
621
622 /* If caseless, handle the other case of the character. */
623
624 if (caseless)
625 {
626 if (utf)
627 {
628 #if PCRE2_CODE_UNIT_WIDTH == 8
629 PCRE2_UCHAR buff[6];
630 c = UCD_OTHERCASE(c);
631 (void)PRIV(ord2utf)(c, buff);
632 SET_BIT(buff[0]);
633 #else /* 16-bit or 32-bit mode */
634 c = UCD_OTHERCASE(c);
635 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
636 #endif
637 }
638
639 /* Not UTF */
640
641 else if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
642 }
643
644 return p;
645 }
646
647
648
649 /*************************************************
650 * Set bits for a positive character type *
651 *************************************************/
652
653 /* This function sets starting bits for a character type. In UTF-8 mode, we can
654 only do a direct setting for bytes less than 128, as otherwise there can be
655 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
656 environment, the tables will only recognize ASCII characters anyway, but in at
657 least one Windows environment, some higher bytes bits were set in the tables.
658 So we deal with that case by considering the UTF-8 encoding.
659
660 Arguments:
661 re the regex block
662 cbit type the type of character wanted
663 table_limit 32 for non-UTF-8; 16 for UTF-8
664
665 Returns: nothing
666 */
667
668 static void
set_type_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)669 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
670 {
671 register uint32_t c;
672 for (c = 0; c < table_limit; c++)
673 re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
674 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
675 if (table_limit == 32) return;
676 for (c = 128; c < 256; c++)
677 {
678 if ((re->tables[cbits_offset + c/8] & (1 << (c&7))) != 0)
679 {
680 PCRE2_UCHAR buff[6];
681 (void)PRIV(ord2utf)(c, buff);
682 SET_BIT(buff[0]);
683 }
684 }
685 #endif /* UTF-8 */
686 }
687
688
689 /*************************************************
690 * Set bits for a negative character type *
691 *************************************************/
692
693 /* This function sets starting bits for a negative character type such as \D.
694 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
695 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
696 Unlike in the positive case, where we can set appropriate starting bits for
697 specific high-valued UTF-8 characters, in this case we have to set the bits for
698 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
699 0xc0 (192) for simplicity.
700
701 Arguments:
702 re the regex block
703 cbit type the type of character wanted
704 table_limit 32 for non-UTF-8; 16 for UTF-8
705
706 Returns: nothing
707 */
708
709 static void
set_nottype_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)710 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
711 {
712 register uint32_t c;
713 for (c = 0; c < table_limit; c++)
714 re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
715 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
716 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
717 #endif
718 }
719
720
721
722 /*************************************************
723 * Create bitmap of starting bytes *
724 *************************************************/
725
726 /* This function scans a compiled unanchored expression recursively and
727 attempts to build a bitmap of the set of possible starting code units whose
728 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
729 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
730 we pass a value of 16 rather than 32 as the final argument. (See comments in
731 those functions for the reason.)
732
733 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
734 (a*)b where the group provides some optional starting code units but scanning
735 must continue at the outer level to find at least one mandatory code unit. At
736 the outermost level, this function fails unless the result is SSB_DONE.
737
738 Arguments:
739 re points to the compiled regex block
740 code points to an expression
741 utf TRUE if in UTF mode
742
743 Returns: SSB_FAIL => Failed to find any starting code units
744 SSB_DONE => Found mandatory starting code units
745 SSB_CONTINUE => Found optional starting code units
746 SSB_UNKNOWN => Hit an unrecognized opcode
747 */
748
749 static int
set_start_bits(pcre2_real_code * re,PCRE2_SPTR code,BOOL utf)750 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
751 {
752 register uint32_t c;
753 int yield = SSB_DONE;
754
755 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
756 int table_limit = utf? 16:32;
757 #else
758 int table_limit = 32;
759 #endif
760
761 do
762 {
763 BOOL try_next = TRUE;
764 PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
765
766 if (*code == OP_CBRA || *code == OP_SCBRA ||
767 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
768
769 while (try_next) /* Loop for items in this branch */
770 {
771 int rc;
772 uint8_t *classmap = NULL;
773
774 switch(*tcode)
775 {
776 /* If we reach something we don't understand, it means a new opcode has
777 been created that hasn't been added to this function. Hopefully this
778 problem will be discovered during testing. */
779
780 default:
781 return SSB_UNKNOWN;
782
783 /* Fail for a valid opcode that implies no starting bits. */
784
785 case OP_ACCEPT:
786 case OP_ASSERT_ACCEPT:
787 case OP_ALLANY:
788 case OP_ANY:
789 case OP_ANYBYTE:
790 case OP_CIRC:
791 case OP_CIRCM:
792 case OP_CLOSE:
793 case OP_COMMIT:
794 case OP_COND:
795 case OP_CREF:
796 case OP_FALSE:
797 case OP_TRUE:
798 case OP_DNCREF:
799 case OP_DNREF:
800 case OP_DNREFI:
801 case OP_DNRREF:
802 case OP_DOLL:
803 case OP_DOLLM:
804 case OP_END:
805 case OP_EOD:
806 case OP_EODN:
807 case OP_EXTUNI:
808 case OP_FAIL:
809 case OP_MARK:
810 case OP_NOT:
811 case OP_NOTEXACT:
812 case OP_NOTEXACTI:
813 case OP_NOTI:
814 case OP_NOTMINPLUS:
815 case OP_NOTMINPLUSI:
816 case OP_NOTMINQUERY:
817 case OP_NOTMINQUERYI:
818 case OP_NOTMINSTAR:
819 case OP_NOTMINSTARI:
820 case OP_NOTMINUPTO:
821 case OP_NOTMINUPTOI:
822 case OP_NOTPLUS:
823 case OP_NOTPLUSI:
824 case OP_NOTPOSPLUS:
825 case OP_NOTPOSPLUSI:
826 case OP_NOTPOSQUERY:
827 case OP_NOTPOSQUERYI:
828 case OP_NOTPOSSTAR:
829 case OP_NOTPOSSTARI:
830 case OP_NOTPOSUPTO:
831 case OP_NOTPOSUPTOI:
832 case OP_NOTPROP:
833 case OP_NOTQUERY:
834 case OP_NOTQUERYI:
835 case OP_NOTSTAR:
836 case OP_NOTSTARI:
837 case OP_NOTUPTO:
838 case OP_NOTUPTOI:
839 case OP_NOT_HSPACE:
840 case OP_NOT_VSPACE:
841 case OP_PRUNE:
842 case OP_PRUNE_ARG:
843 case OP_RECURSE:
844 case OP_REF:
845 case OP_REFI:
846 case OP_REVERSE:
847 case OP_RREF:
848 case OP_SCOND:
849 case OP_SET_SOM:
850 case OP_SKIP:
851 case OP_SKIP_ARG:
852 case OP_SOD:
853 case OP_SOM:
854 case OP_THEN:
855 case OP_THEN_ARG:
856 return SSB_FAIL;
857
858 /* A "real" property test implies no starting bits, but the fake property
859 PT_CLIST identifies a list of characters. These lists are short, as they
860 are used for characters with more than one "other case", so there is no
861 point in recognizing them for OP_NOTPROP. */
862
863 case OP_PROP:
864 if (tcode[1] != PT_CLIST) return SSB_FAIL;
865 {
866 const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
867 while ((c = *p++) < NOTACHAR)
868 {
869 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
870 if (utf)
871 {
872 PCRE2_UCHAR buff[6];
873 (void)PRIV(ord2utf)(c, buff);
874 c = buff[0];
875 }
876 #endif
877 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
878 }
879 }
880 try_next = FALSE;
881 break;
882
883 /* We can ignore word boundary tests. */
884
885 case OP_WORD_BOUNDARY:
886 case OP_NOT_WORD_BOUNDARY:
887 tcode++;
888 break;
889
890 /* If we hit a bracket or a positive lookahead assertion, recurse to set
891 bits from within the subpattern. If it can't find anything, we have to
892 give up. If it finds some mandatory character(s), we are done for this
893 branch. Otherwise, carry on scanning after the subpattern. */
894
895 case OP_BRA:
896 case OP_SBRA:
897 case OP_CBRA:
898 case OP_SCBRA:
899 case OP_BRAPOS:
900 case OP_SBRAPOS:
901 case OP_CBRAPOS:
902 case OP_SCBRAPOS:
903 case OP_ONCE:
904 case OP_ONCE_NC:
905 case OP_ASSERT:
906 rc = set_start_bits(re, tcode, utf);
907 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
908 if (rc == SSB_DONE) try_next = FALSE; else
909 {
910 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
911 tcode += 1 + LINK_SIZE;
912 }
913 break;
914
915 /* If we hit ALT or KET, it means we haven't found anything mandatory in
916 this branch, though we might have found something optional. For ALT, we
917 continue with the next alternative, but we have to arrange that the final
918 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
919 return SSB_CONTINUE: if this is the top level, that indicates failure,
920 but after a nested subpattern, it causes scanning to continue. */
921
922 case OP_ALT:
923 yield = SSB_CONTINUE;
924 try_next = FALSE;
925 break;
926
927 case OP_KET:
928 case OP_KETRMAX:
929 case OP_KETRMIN:
930 case OP_KETRPOS:
931 return SSB_CONTINUE;
932
933 /* Skip over callout */
934
935 case OP_CALLOUT:
936 tcode += 2 + 2*LINK_SIZE;
937 break;
938
939 /* Skip over lookbehind and negative lookahead assertions */
940
941 case OP_ASSERT_NOT:
942 case OP_ASSERTBACK:
943 case OP_ASSERTBACK_NOT:
944 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
945 tcode += 1 + LINK_SIZE;
946 break;
947
948 /* BRAZERO does the bracket, but carries on. */
949
950 case OP_BRAZERO:
951 case OP_BRAMINZERO:
952 case OP_BRAPOSZERO:
953 rc = set_start_bits(re, ++tcode, utf);
954 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
955 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
956 tcode += 1 + LINK_SIZE;
957 break;
958
959 /* SKIPZERO skips the bracket. */
960
961 case OP_SKIPZERO:
962 tcode++;
963 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
964 tcode += 1 + LINK_SIZE;
965 break;
966
967 /* Single-char * or ? sets the bit and tries the next item */
968
969 case OP_STAR:
970 case OP_MINSTAR:
971 case OP_POSSTAR:
972 case OP_QUERY:
973 case OP_MINQUERY:
974 case OP_POSQUERY:
975 tcode = set_table_bit(re, tcode + 1, FALSE, utf);
976 break;
977
978 case OP_STARI:
979 case OP_MINSTARI:
980 case OP_POSSTARI:
981 case OP_QUERYI:
982 case OP_MINQUERYI:
983 case OP_POSQUERYI:
984 tcode = set_table_bit(re, tcode + 1, TRUE, utf);
985 break;
986
987 /* Single-char upto sets the bit and tries the next */
988
989 case OP_UPTO:
990 case OP_MINUPTO:
991 case OP_POSUPTO:
992 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
993 break;
994
995 case OP_UPTOI:
996 case OP_MINUPTOI:
997 case OP_POSUPTOI:
998 tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
999 break;
1000
1001 /* At least one single char sets the bit and stops */
1002
1003 case OP_EXACT:
1004 tcode += IMM2_SIZE;
1005 /* Fall through */
1006 case OP_CHAR:
1007 case OP_PLUS:
1008 case OP_MINPLUS:
1009 case OP_POSPLUS:
1010 (void)set_table_bit(re, tcode + 1, FALSE, utf);
1011 try_next = FALSE;
1012 break;
1013
1014 case OP_EXACTI:
1015 tcode += IMM2_SIZE;
1016 /* Fall through */
1017 case OP_CHARI:
1018 case OP_PLUSI:
1019 case OP_MINPLUSI:
1020 case OP_POSPLUSI:
1021 (void)set_table_bit(re, tcode + 1, TRUE, utf);
1022 try_next = FALSE;
1023 break;
1024
1025 /* Special spacing and line-terminating items. These recognize specific
1026 lists of characters. The difference between VSPACE and ANYNL is that the
1027 latter can match the two-character CRLF sequence, but that is not
1028 relevant for finding the first character, so their code here is
1029 identical. */
1030
1031 case OP_HSPACE:
1032 SET_BIT(CHAR_HT);
1033 SET_BIT(CHAR_SPACE);
1034
1035 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1036 the bits for 0xA0 and for code units >= 255, independently of UTF. */
1037
1038 #if PCRE2_CODE_UNIT_WIDTH != 8
1039 SET_BIT(0xA0);
1040 SET_BIT(0xFF);
1041 #else
1042 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1043 units of horizontal space characters. */
1044
1045 #ifdef SUPPORT_UNICODE
1046 if (utf)
1047 {
1048 SET_BIT(0xC2); /* For U+00A0 */
1049 SET_BIT(0xE1); /* For U+1680, U+180E */
1050 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1051 SET_BIT(0xE3); /* For U+3000 */
1052 }
1053 else
1054 #endif
1055 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1056 the code is EBCDIC. */
1057 {
1058 #ifndef EBCDIC
1059 SET_BIT(0xA0);
1060 #endif /* Not EBCDIC */
1061 }
1062 #endif /* 8-bit support */
1063
1064 try_next = FALSE;
1065 break;
1066
1067 case OP_ANYNL:
1068 case OP_VSPACE:
1069 SET_BIT(CHAR_LF);
1070 SET_BIT(CHAR_VT);
1071 SET_BIT(CHAR_FF);
1072 SET_BIT(CHAR_CR);
1073
1074 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1075 the bits for NEL and for code units >= 255, independently of UTF. */
1076
1077 #if PCRE2_CODE_UNIT_WIDTH != 8
1078 SET_BIT(CHAR_NEL);
1079 SET_BIT(0xFF);
1080 #else
1081 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1082 units of vertical space characters. */
1083
1084 #ifdef SUPPORT_UNICODE
1085 if (utf)
1086 {
1087 SET_BIT(0xC2); /* For U+0085 (NEL) */
1088 SET_BIT(0xE2); /* For U+2028, U+2029 */
1089 }
1090 else
1091 #endif
1092 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1093 {
1094 SET_BIT(CHAR_NEL);
1095 }
1096 #endif /* 8-bit support */
1097
1098 try_next = FALSE;
1099 break;
1100
1101 /* Single character types set the bits and stop. Note that if PCRE2_UCP
1102 is set, we do not see these op codes because \d etc are converted to
1103 properties. Therefore, these apply in the case when only characters less
1104 than 256 are recognized to match the types. */
1105
1106 case OP_NOT_DIGIT:
1107 set_nottype_bits(re, cbit_digit, table_limit);
1108 try_next = FALSE;
1109 break;
1110
1111 case OP_DIGIT:
1112 set_type_bits(re, cbit_digit, table_limit);
1113 try_next = FALSE;
1114 break;
1115
1116 case OP_NOT_WHITESPACE:
1117 set_nottype_bits(re, cbit_space, table_limit);
1118 try_next = FALSE;
1119 break;
1120
1121 case OP_WHITESPACE:
1122 set_type_bits(re, cbit_space, table_limit);
1123 try_next = FALSE;
1124 break;
1125
1126 case OP_NOT_WORDCHAR:
1127 set_nottype_bits(re, cbit_word, table_limit);
1128 try_next = FALSE;
1129 break;
1130
1131 case OP_WORDCHAR:
1132 set_type_bits(re, cbit_word, table_limit);
1133 try_next = FALSE;
1134 break;
1135
1136 /* One or more character type fudges the pointer and restarts, knowing
1137 it will hit a single character type and stop there. */
1138
1139 case OP_TYPEPLUS:
1140 case OP_TYPEMINPLUS:
1141 case OP_TYPEPOSPLUS:
1142 tcode++;
1143 break;
1144
1145 case OP_TYPEEXACT:
1146 tcode += 1 + IMM2_SIZE;
1147 break;
1148
1149 /* Zero or more repeats of character types set the bits and then
1150 try again. */
1151
1152 case OP_TYPEUPTO:
1153 case OP_TYPEMINUPTO:
1154 case OP_TYPEPOSUPTO:
1155 tcode += IMM2_SIZE; /* Fall through */
1156
1157 case OP_TYPESTAR:
1158 case OP_TYPEMINSTAR:
1159 case OP_TYPEPOSSTAR:
1160 case OP_TYPEQUERY:
1161 case OP_TYPEMINQUERY:
1162 case OP_TYPEPOSQUERY:
1163 switch(tcode[1])
1164 {
1165 default:
1166 case OP_ANY:
1167 case OP_ALLANY:
1168 return SSB_FAIL;
1169
1170 case OP_HSPACE:
1171 SET_BIT(CHAR_HT);
1172 SET_BIT(CHAR_SPACE);
1173
1174 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1175 the bits for 0xA0 and for code units >= 255, independently of UTF. */
1176
1177 #if PCRE2_CODE_UNIT_WIDTH != 8
1178 SET_BIT(0xA0);
1179 SET_BIT(0xFF);
1180 #else
1181 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1182 units of horizontal space characters. */
1183
1184 #ifdef SUPPORT_UNICODE
1185 if (utf)
1186 {
1187 SET_BIT(0xC2); /* For U+00A0 */
1188 SET_BIT(0xE1); /* For U+1680, U+180E */
1189 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1190 SET_BIT(0xE3); /* For U+3000 */
1191 }
1192 else
1193 #endif
1194 /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1195 the code is EBCDIC. */
1196 {
1197 #ifndef EBCDIC
1198 SET_BIT(0xA0);
1199 #endif /* Not EBCDIC */
1200 }
1201 #endif /* 8-bit support */
1202 break;
1203
1204 case OP_ANYNL:
1205 case OP_VSPACE:
1206 SET_BIT(CHAR_LF);
1207 SET_BIT(CHAR_VT);
1208 SET_BIT(CHAR_FF);
1209 SET_BIT(CHAR_CR);
1210
1211 /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1212 the bits for NEL and for code units >= 255, independently of UTF. */
1213
1214 #if PCRE2_CODE_UNIT_WIDTH != 8
1215 SET_BIT(CHAR_NEL);
1216 SET_BIT(0xFF);
1217 #else
1218 /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1219 units of vertical space characters. */
1220
1221 #ifdef SUPPORT_UNICODE
1222 if (utf)
1223 {
1224 SET_BIT(0xC2); /* For U+0085 (NEL) */
1225 SET_BIT(0xE2); /* For U+2028, U+2029 */
1226 }
1227 else
1228 #endif
1229 /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1230 {
1231 SET_BIT(CHAR_NEL);
1232 }
1233 #endif /* 8-bit support */
1234 break;
1235
1236 case OP_NOT_DIGIT:
1237 set_nottype_bits(re, cbit_digit, table_limit);
1238 break;
1239
1240 case OP_DIGIT:
1241 set_type_bits(re, cbit_digit, table_limit);
1242 break;
1243
1244 case OP_NOT_WHITESPACE:
1245 set_nottype_bits(re, cbit_space, table_limit);
1246 break;
1247
1248 case OP_WHITESPACE:
1249 set_type_bits(re, cbit_space, table_limit);
1250 break;
1251
1252 case OP_NOT_WORDCHAR:
1253 set_nottype_bits(re, cbit_word, table_limit);
1254 break;
1255
1256 case OP_WORDCHAR:
1257 set_type_bits(re, cbit_word, table_limit);
1258 break;
1259 }
1260
1261 tcode += 2;
1262 break;
1263
1264 /* Extended class: if there are any property checks, or if this is a
1265 negative XCLASS without a map, give up. If there are no property checks,
1266 there must be wide characters on the XCLASS list, because otherwise an
1267 XCLASS would not have been created. This means that code points >= 255
1268 are always potential starters. */
1269
1270 #ifdef SUPPORT_WIDE_CHARS
1271 case OP_XCLASS:
1272 if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
1273 (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1274 return SSB_FAIL;
1275
1276 /* We have a positive XCLASS or a negative one without a map. Set up the
1277 map pointer if there is one, and fall through. */
1278
1279 classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
1280 (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1281 #endif
1282
1283 /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1284 in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1285 because it starts a character with a value > 255. In 8-bit non-UTF mode,
1286 there is no difference between CLASS and NCLASS. In all other wide
1287 character modes, set the 0xFF bit to indicate code units >= 255. */
1288
1289 case OP_NCLASS:
1290 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1291 if (utf)
1292 {
1293 re->start_bitmap[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1294 memset(re->start_bitmap+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1295 }
1296 #elif PCRE2_CODE_UNIT_WIDTH != 8
1297 SET_BIT(0xFF); /* For characters >= 255 */
1298 #endif
1299 /* Fall through */
1300
1301 /* Enter here for a positive non-XCLASS. If we have fallen through from
1302 an XCLASS, classmap will already be set; just advance the code pointer.
1303 Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1304
1305 case OP_CLASS:
1306 if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1307 {
1308 classmap = (uint8_t *)(++tcode);
1309 tcode += 32 / sizeof(PCRE2_UCHAR);
1310 }
1311
1312 /* When wide characters are supported, classmap may be NULL. In UTF-8
1313 (sic) mode, the bits in a class bit map correspond to character values,
1314 not to byte values. However, the bit map we are constructing is for byte
1315 values. So we have to do a conversion for characters whose code point is
1316 greater than 127. In fact, there are only two possible starting bytes for
1317 characters in the range 128 - 255. */
1318
1319 if (classmap != NULL)
1320 {
1321 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1322 if (utf)
1323 {
1324 for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1325 for (c = 128; c < 256; c++)
1326 {
1327 if ((classmap[c/8] && (1 << (c&7))) != 0)
1328 {
1329 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1330 re->start_bitmap[d/8] |= (1 << (d&7)); /* and then skip on to the */
1331 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1332 }
1333 }
1334 }
1335 else
1336 #endif
1337 /* In all modes except UTF-8, the two bit maps are compatible. */
1338
1339 {
1340 for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1341 }
1342 }
1343
1344 /* Act on what follows the class. For a zero minimum repeat, continue;
1345 otherwise stop processing. */
1346
1347 switch (*tcode)
1348 {
1349 case OP_CRSTAR:
1350 case OP_CRMINSTAR:
1351 case OP_CRQUERY:
1352 case OP_CRMINQUERY:
1353 case OP_CRPOSSTAR:
1354 case OP_CRPOSQUERY:
1355 tcode++;
1356 break;
1357
1358 case OP_CRRANGE:
1359 case OP_CRMINRANGE:
1360 case OP_CRPOSRANGE:
1361 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1362 else try_next = FALSE;
1363 break;
1364
1365 default:
1366 try_next = FALSE;
1367 break;
1368 }
1369 break; /* End of class handling case */
1370 } /* End of switch for opcodes */
1371 } /* End of try_next loop */
1372
1373 code += GET(code, 1); /* Advance to next branch */
1374 }
1375 while (*code == OP_ALT);
1376
1377 return yield;
1378 }
1379
1380
1381
1382 /*************************************************
1383 * Study a compiled expression *
1384 *************************************************/
1385
1386 /* This function is handed a compiled expression that it must study to produce
1387 information that will speed up the matching.
1388
1389 Argument: points to the compiled expression
1390 Returns: 0 normally; non-zero should never normally occur
1391 1 unknown opcode in set_start_bits
1392 2 missing capturing bracket
1393 3 unknown opcode in find_minlength
1394 */
1395
1396 int
PRIV(study)1397 PRIV(study)(pcre2_real_code *re)
1398 {
1399 int min;
1400 PCRE2_UCHAR *code;
1401 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1402
1403 /* Find start of compiled code */
1404
1405 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1406 re->name_entry_size * re->name_count;
1407
1408 /* For an anchored pattern, or an unanchored pattern that has a first code
1409 unit, or a multiline pattern that matches only at "line start", there is no
1410 point in seeking a list of starting code units. */
1411
1412 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
1413 (re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1414 {
1415 int rc = set_start_bits(re, code, utf);
1416 if (rc == SSB_UNKNOWN) return 1;
1417 if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
1418 }
1419
1420 /* Find the minimum length of subject string. */
1421
1422 switch(min = find_minlength(re, code, code, 0, utf))
1423 {
1424 case -1: /* \C in UTF mode or (*ACCEPT) was encountered */
1425 break;
1426
1427 case -2:
1428 return 2; /* missing capturing bracket */
1429
1430 case -3:
1431 return 3; /* unrecognized opcode */
1432
1433 default:
1434 re->minlength = min;
1435 break;
1436 }
1437
1438 return 0;
1439 }
1440
1441 /* End of pcre2_study.c */
1442