1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10          New API code Copyright (c) 2014 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 /* This module contains functions for scanning a compiled pattern and
42 collecting data (e.g. minimum matching length). */
43 
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 
50 #include "pcre2_internal.h"
51 
52 
53 /* Set a bit in the starting code unit bit map. */
54 
55 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7))
56 
57 /* Returns from set_start_bits() */
58 
59 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
60 
61 
62 
63 /*************************************************
64 *   Find the minimum subject length for a group  *
65 *************************************************/
66 
67 /* Scan a parenthesized group and compute the minimum length of subject that
68 is needed to match it. This is a lower bound; it does not mean there is a
69 string of that length that matches. In UTF8 mode, the result is in characters
70 rather than bytes.
71 
72 Arguments:
73   re              compiled pattern block
74   code            pointer to start of group (the bracket)
75   startcode       pointer to start of the whole pattern's code
76   recurse_depth   RECURSE depth
77   utf             UTF flag
78 
79 Returns:   the minimum length
80            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
81            -2 internal error (missing capturing bracket)
82            -3 internal error (opcode not listed)
83 */
84 
85 static int
find_minlength(const pcre2_real_code * re,PCRE2_SPTR code,PCRE2_SPTR startcode,int recurse_depth,BOOL utf)86 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
87   PCRE2_SPTR startcode, int recurse_depth, BOOL utf)
88 {
89 int length = -1;
90 BOOL had_recurse = FALSE;
91 register int branchlength = 0;
92 register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
93 
94 if (*code == OP_CBRA || *code == OP_SCBRA ||
95     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
96 
97 /* Scan along the opcodes for this branch. If we get to the end of the
98 branch, check the length against that of the other branches. */
99 
100 for (;;)
101   {
102   int d, min;
103   PCRE2_UCHAR *cs, *ce;
104   register PCRE2_UCHAR op = *cc;
105 
106   switch (op)
107     {
108     case OP_COND:
109     case OP_SCOND:
110 
111     /* If there is only one branch in a condition, the implied branch has zero
112     length, so we don't add anything. This covers the DEFINE "condition"
113     automatically. */
114 
115     cs = cc + GET(cc, 1);
116     if (*cs != OP_ALT)
117       {
118       cc = cs + 1 + LINK_SIZE;
119       break;
120       }
121 
122     /* Otherwise we can fall through and treat it the same as any other
123     subpattern. */
124 
125     case OP_CBRA:
126     case OP_SCBRA:
127     case OP_BRA:
128     case OP_SBRA:
129     case OP_CBRAPOS:
130     case OP_SCBRAPOS:
131     case OP_BRAPOS:
132     case OP_SBRAPOS:
133     case OP_ONCE:
134     case OP_ONCE_NC:
135     d = find_minlength(re, cc, startcode, recurse_depth, utf);
136     if (d < 0) return d;
137     branchlength += d;
138     do cc += GET(cc, 1); while (*cc == OP_ALT);
139     cc += 1 + LINK_SIZE;
140     break;
141 
142     /* ACCEPT makes things far too complicated; we have to give up. */
143 
144     case OP_ACCEPT:
145     case OP_ASSERT_ACCEPT:
146     return -1;
147 
148     /* Reached end of a branch; if it's a ket it is the end of a nested
149     call. If it's ALT it is an alternation in a nested call. If it is END it's
150     the end of the outer call. All can be handled by the same code. If an
151     ACCEPT was previously encountered, use the length that was in force at that
152     time, and pass back the shortest ACCEPT length. */
153 
154     case OP_ALT:
155     case OP_KET:
156     case OP_KETRMAX:
157     case OP_KETRMIN:
158     case OP_KETRPOS:
159     case OP_END:
160     if (length < 0 || (!had_recurse && branchlength < length))
161       length = branchlength;
162     if (op != OP_ALT) return length;
163     cc += 1 + LINK_SIZE;
164     branchlength = 0;
165     had_recurse = FALSE;
166     break;
167 
168     /* Skip over assertive subpatterns */
169 
170     case OP_ASSERT:
171     case OP_ASSERT_NOT:
172     case OP_ASSERTBACK:
173     case OP_ASSERTBACK_NOT:
174     do cc += GET(cc, 1); while (*cc == OP_ALT);
175     /* Fall through */
176 
177     /* Skip over things that don't match chars */
178 
179     case OP_REVERSE:
180     case OP_CREF:
181     case OP_DNCREF:
182     case OP_RREF:
183     case OP_DNRREF:
184     case OP_FALSE:
185     case OP_TRUE:
186     case OP_CALLOUT:
187     case OP_SOD:
188     case OP_SOM:
189     case OP_EOD:
190     case OP_EODN:
191     case OP_CIRC:
192     case OP_CIRCM:
193     case OP_DOLL:
194     case OP_DOLLM:
195     case OP_NOT_WORD_BOUNDARY:
196     case OP_WORD_BOUNDARY:
197     cc += PRIV(OP_lengths)[*cc];
198     break;
199 
200     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
201 
202     case OP_BRAZERO:
203     case OP_BRAMINZERO:
204     case OP_BRAPOSZERO:
205     case OP_SKIPZERO:
206     cc += PRIV(OP_lengths)[*cc];
207     do cc += GET(cc, 1); while (*cc == OP_ALT);
208     cc += 1 + LINK_SIZE;
209     break;
210 
211     /* Handle literal characters and + repetitions */
212 
213     case OP_CHAR:
214     case OP_CHARI:
215     case OP_NOT:
216     case OP_NOTI:
217     case OP_PLUS:
218     case OP_PLUSI:
219     case OP_MINPLUS:
220     case OP_MINPLUSI:
221     case OP_POSPLUS:
222     case OP_POSPLUSI:
223     case OP_NOTPLUS:
224     case OP_NOTPLUSI:
225     case OP_NOTMINPLUS:
226     case OP_NOTMINPLUSI:
227     case OP_NOTPOSPLUS:
228     case OP_NOTPOSPLUSI:
229     branchlength++;
230     cc += 2;
231 #ifdef SUPPORT_UNICODE
232     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
233 #endif
234     break;
235 
236     case OP_TYPEPLUS:
237     case OP_TYPEMINPLUS:
238     case OP_TYPEPOSPLUS:
239     branchlength++;
240     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
241     break;
242 
243     /* Handle exact repetitions. The count is already in characters, but we
244     may need to skip over a multibyte character in UTF mode.  */
245 
246     case OP_EXACT:
247     case OP_EXACTI:
248     case OP_NOTEXACT:
249     case OP_NOTEXACTI:
250     branchlength += GET2(cc,1);
251     cc += 2 + IMM2_SIZE;
252 #ifdef SUPPORT_UNICODE
253     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
254 #endif
255     break;
256 
257     case OP_TYPEEXACT:
258     branchlength += GET2(cc,1);
259     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
260       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
261     break;
262 
263     /* Handle single-char non-literal matchers */
264 
265     case OP_PROP:
266     case OP_NOTPROP:
267     cc += 2;
268     /* Fall through */
269 
270     case OP_NOT_DIGIT:
271     case OP_DIGIT:
272     case OP_NOT_WHITESPACE:
273     case OP_WHITESPACE:
274     case OP_NOT_WORDCHAR:
275     case OP_WORDCHAR:
276     case OP_ANY:
277     case OP_ALLANY:
278     case OP_EXTUNI:
279     case OP_HSPACE:
280     case OP_NOT_HSPACE:
281     case OP_VSPACE:
282     case OP_NOT_VSPACE:
283     branchlength++;
284     cc++;
285     break;
286 
287     /* "Any newline" might match two characters, but it also might match just
288     one. */
289 
290     case OP_ANYNL:
291     branchlength += 1;
292     cc++;
293     break;
294 
295     /* The single-byte matcher means we can't proceed in UTF mode. (In
296     non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
297     appear, but leave the code, just in case.) */
298 
299     case OP_ANYBYTE:
300 #ifdef SUPPORT_UNICODE
301     if (utf) return -1;
302 #endif
303     branchlength++;
304     cc++;
305     break;
306 
307     /* For repeated character types, we have to test for \p and \P, which have
308     an extra two bytes of parameters. */
309 
310     case OP_TYPESTAR:
311     case OP_TYPEMINSTAR:
312     case OP_TYPEQUERY:
313     case OP_TYPEMINQUERY:
314     case OP_TYPEPOSSTAR:
315     case OP_TYPEPOSQUERY:
316     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
317     cc += PRIV(OP_lengths)[op];
318     break;
319 
320     case OP_TYPEUPTO:
321     case OP_TYPEMINUPTO:
322     case OP_TYPEPOSUPTO:
323     if (cc[1 + IMM2_SIZE] == OP_PROP
324       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
325     cc += PRIV(OP_lengths)[op];
326     break;
327 
328     /* Check a class for variable quantification */
329 
330     case OP_CLASS:
331     case OP_NCLASS:
332 #ifdef SUPPORT_WIDE_CHARS
333     case OP_XCLASS:
334     /* The original code caused an unsigned overflow in 64 bit systems,
335     so now we use a conditional statement. */
336     if (op == OP_XCLASS)
337       cc += GET(cc, 1);
338     else
339       cc += PRIV(OP_lengths)[OP_CLASS];
340 #else
341     cc += PRIV(OP_lengths)[OP_CLASS];
342 #endif
343 
344     switch (*cc)
345       {
346       case OP_CRPLUS:
347       case OP_CRMINPLUS:
348       case OP_CRPOSPLUS:
349       branchlength++;
350       /* Fall through */
351 
352       case OP_CRSTAR:
353       case OP_CRMINSTAR:
354       case OP_CRQUERY:
355       case OP_CRMINQUERY:
356       case OP_CRPOSSTAR:
357       case OP_CRPOSQUERY:
358       cc++;
359       break;
360 
361       case OP_CRRANGE:
362       case OP_CRMINRANGE:
363       case OP_CRPOSRANGE:
364       branchlength += GET2(cc,1);
365       cc += 1 + 2 * IMM2_SIZE;
366       break;
367 
368       default:
369       branchlength++;
370       break;
371       }
372     break;
373 
374     /* Backreferences and subroutine calls are treated in the same way: we find
375     the minimum length for the subpattern. A recursion, however, causes an
376     a flag to be set that causes the length of this branch to be ignored. The
377     logic is that a recursion can only make sense if there is another
378     alternation that stops the recursing. That will provide the minimum length
379     (when no recursion happens). A backreference within the group that it is
380     referencing behaves in the same way.
381 
382     If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
383     matches an empty string (by default it causes a matching failure), so in
384     that case we must set the minimum length to zero. */
385 
386     case OP_DNREF:     /* Duplicate named pattern back reference */
387     case OP_DNREFI:
388     if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
389       {
390       int count = GET2(cc, 1+IMM2_SIZE);
391       PCRE2_UCHAR *slot =
392         (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
393           GET2(cc, 1) * re->name_entry_size;
394 
395       d = INT_MAX;
396       while (count-- > 0)
397         {
398         ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
399         if (cs == NULL) return -2;
400         do ce += GET(ce, 1); while (*ce == OP_ALT);
401         if (cc > cs && cc < ce)
402           {
403           d = 0;
404           had_recurse = TRUE;
405           break;
406           }
407         else
408           {
409           int dd = find_minlength(re, cs, startcode, recurse_depth, utf);
410           if (dd < d) d = dd;
411           }
412         slot += re->name_entry_size;
413         }
414       }
415     else d = 0;
416     cc += 1 + 2*IMM2_SIZE;
417     goto REPEAT_BACK_REFERENCE;
418 
419     case OP_REF:      /* Single back reference */
420     case OP_REFI:
421     if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
422       {
423       ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
424       if (cs == NULL) return -2;
425       do ce += GET(ce, 1); while (*ce == OP_ALT);
426       if (cc > cs && cc < ce)
427         {
428         d = 0;
429         had_recurse = TRUE;
430         }
431       else
432         {
433         d = find_minlength(re, cs, startcode, recurse_depth, utf);
434         }
435       }
436     else d = 0;
437     cc += 1 + IMM2_SIZE;
438 
439     /* Handle repeated back references */
440 
441     REPEAT_BACK_REFERENCE:
442     switch (*cc)
443       {
444       case OP_CRSTAR:
445       case OP_CRMINSTAR:
446       case OP_CRQUERY:
447       case OP_CRMINQUERY:
448       case OP_CRPOSSTAR:
449       case OP_CRPOSQUERY:
450       min = 0;
451       cc++;
452       break;
453 
454       case OP_CRPLUS:
455       case OP_CRMINPLUS:
456       case OP_CRPOSPLUS:
457       min = 1;
458       cc++;
459       break;
460 
461       case OP_CRRANGE:
462       case OP_CRMINRANGE:
463       case OP_CRPOSRANGE:
464       min = GET2(cc, 1);
465       cc += 1 + 2 * IMM2_SIZE;
466       break;
467 
468       default:
469       min = 1;
470       break;
471       }
472 
473     branchlength += min * d;
474     break;
475 
476     /* We can easily detect direct recursion, but not mutual recursion. This is
477     caught by a recursion depth count. */
478 
479     case OP_RECURSE:
480     cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
481     do ce += GET(ce, 1); while (*ce == OP_ALT);
482     if ((cc > cs && cc < ce) || recurse_depth > 10)
483       had_recurse = TRUE;
484     else
485       {
486       branchlength += find_minlength(re, cs, startcode, recurse_depth + 1, utf);
487       }
488     cc += 1 + LINK_SIZE;
489     break;
490 
491     /* Anything else does not or need not match a character. We can get the
492     item's length from the table, but for those that can match zero occurrences
493     of a character, we must take special action for UTF-8 characters. As it
494     happens, the "NOT" versions of these opcodes are used at present only for
495     ASCII characters, so they could be omitted from this list. However, in
496     future that may change, so we include them here so as not to leave a
497     gotcha for a future maintainer. */
498 
499     case OP_UPTO:
500     case OP_UPTOI:
501     case OP_NOTUPTO:
502     case OP_NOTUPTOI:
503     case OP_MINUPTO:
504     case OP_MINUPTOI:
505     case OP_NOTMINUPTO:
506     case OP_NOTMINUPTOI:
507     case OP_POSUPTO:
508     case OP_POSUPTOI:
509     case OP_NOTPOSUPTO:
510     case OP_NOTPOSUPTOI:
511 
512     case OP_STAR:
513     case OP_STARI:
514     case OP_NOTSTAR:
515     case OP_NOTSTARI:
516     case OP_MINSTAR:
517     case OP_MINSTARI:
518     case OP_NOTMINSTAR:
519     case OP_NOTMINSTARI:
520     case OP_POSSTAR:
521     case OP_POSSTARI:
522     case OP_NOTPOSSTAR:
523     case OP_NOTPOSSTARI:
524 
525     case OP_QUERY:
526     case OP_QUERYI:
527     case OP_NOTQUERY:
528     case OP_NOTQUERYI:
529     case OP_MINQUERY:
530     case OP_MINQUERYI:
531     case OP_NOTMINQUERY:
532     case OP_NOTMINQUERYI:
533     case OP_POSQUERY:
534     case OP_POSQUERYI:
535     case OP_NOTPOSQUERY:
536     case OP_NOTPOSQUERYI:
537 
538     cc += PRIV(OP_lengths)[op];
539 #ifdef SUPPORT_UNICODE
540     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
541 #endif
542     break;
543 
544     /* Skip these, but we need to add in the name length. */
545 
546     case OP_MARK:
547     case OP_PRUNE_ARG:
548     case OP_SKIP_ARG:
549     case OP_THEN_ARG:
550     cc += PRIV(OP_lengths)[op] + cc[1];
551     break;
552 
553     /* The remaining opcodes are just skipped over. */
554 
555     case OP_CLOSE:
556     case OP_COMMIT:
557     case OP_FAIL:
558     case OP_PRUNE:
559     case OP_SET_SOM:
560     case OP_SKIP:
561     case OP_THEN:
562     cc += PRIV(OP_lengths)[op];
563     break;
564 
565     /* This should not occur: we list all opcodes explicitly so that when
566     new ones get added they are properly considered. */
567 
568     default:
569     return -3;
570     }
571   }
572 /* Control never gets here */
573 }
574 
575 
576 
577 /*************************************************
578 *      Set a bit and maybe its alternate case    *
579 *************************************************/
580 
581 /* Given a character, set its first code unit's bit in the table, and also the
582 corresponding bit for the other version of a letter if we are caseless.
583 
584 Arguments:
585   re            points to the regex block
586   p             points to the first code unit of the character
587   caseless      TRUE if caseless
588   utf           TRUE for UTF mode
589 
590 Returns:        pointer after the character
591 */
592 
593 static PCRE2_SPTR
set_table_bit(pcre2_real_code * re,PCRE2_SPTR p,BOOL caseless,BOOL utf)594 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
595 {
596 uint32_t c = *p++;   /* First code unit */
597 (void)utf;           /* Stop compiler warning when UTF not supported */
598 
599 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
600 0xff. */
601 
602 #if PCRE2_CODE_UNIT_WIDTH != 8
603 if (c > 0xff) SET_BIT(0xff); else
604 #endif
605 
606 SET_BIT(c);
607 
608 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
609 the end of the character, even when caseless. */
610 
611 #ifdef SUPPORT_UNICODE
612 if (utf)
613   {
614 #if PCRE2_CODE_UNIT_WIDTH == 8
615   if (c >= 0xc0) GETUTF8INC(c, p);
616 #elif PCRE2_CODE_UNIT_WIDTH == 16
617   if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
618 #endif
619   }
620 #endif  /* SUPPORT_UNICODE */
621 
622 /* If caseless, handle the other case of the character. */
623 
624 if (caseless)
625   {
626   if (utf)
627     {
628 #if PCRE2_CODE_UNIT_WIDTH == 8
629     PCRE2_UCHAR buff[6];
630     c = UCD_OTHERCASE(c);
631     (void)PRIV(ord2utf)(c, buff);
632     SET_BIT(buff[0]);
633 #else  /* 16-bit or 32-bit mode */
634     c = UCD_OTHERCASE(c);
635     if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
636 #endif
637     }
638 
639   /* Not UTF */
640 
641   else if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
642   }
643 
644 return p;
645 }
646 
647 
648 
649 /*************************************************
650 *     Set bits for a positive character type     *
651 *************************************************/
652 
653 /* This function sets starting bits for a character type. In UTF-8 mode, we can
654 only do a direct setting for bytes less than 128, as otherwise there can be
655 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
656 environment, the tables will only recognize ASCII characters anyway, but in at
657 least one Windows environment, some higher bytes bits were set in the tables.
658 So we deal with that case by considering the UTF-8 encoding.
659 
660 Arguments:
661   re             the regex block
662   cbit type      the type of character wanted
663   table_limit    32 for non-UTF-8; 16 for UTF-8
664 
665 Returns:         nothing
666 */
667 
668 static void
set_type_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)669 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
670 {
671 register uint32_t c;
672 for (c = 0; c < table_limit; c++)
673   re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
674 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
675 if (table_limit == 32) return;
676 for (c = 128; c < 256; c++)
677   {
678   if ((re->tables[cbits_offset + c/8] & (1 << (c&7))) != 0)
679     {
680     PCRE2_UCHAR buff[6];
681     (void)PRIV(ord2utf)(c, buff);
682     SET_BIT(buff[0]);
683     }
684   }
685 #endif  /* UTF-8 */
686 }
687 
688 
689 /*************************************************
690 *     Set bits for a negative character type     *
691 *************************************************/
692 
693 /* This function sets starting bits for a negative character type such as \D.
694 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
695 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
696 Unlike in the positive case, where we can set appropriate starting bits for
697 specific high-valued UTF-8 characters, in this case we have to set the bits for
698 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
699 0xc0 (192) for simplicity.
700 
701 Arguments:
702   re             the regex block
703   cbit type      the type of character wanted
704   table_limit    32 for non-UTF-8; 16 for UTF-8
705 
706 Returns:         nothing
707 */
708 
709 static void
set_nottype_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)710 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
711 {
712 register uint32_t c;
713 for (c = 0; c < table_limit; c++)
714   re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
715 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
716 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
717 #endif
718 }
719 
720 
721 
722 /*************************************************
723 *          Create bitmap of starting bytes       *
724 *************************************************/
725 
726 /* This function scans a compiled unanchored expression recursively and
727 attempts to build a bitmap of the set of possible starting code units whose
728 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
729 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
730 we pass a value of 16 rather than 32 as the final argument. (See comments in
731 those functions for the reason.)
732 
733 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
734 (a*)b where the group provides some optional starting code units but scanning
735 must continue at the outer level to find at least one mandatory code unit. At
736 the outermost level, this function fails unless the result is SSB_DONE.
737 
738 Arguments:
739   re           points to the compiled regex block
740   code         points to an expression
741   utf          TRUE if in UTF mode
742 
743 Returns:       SSB_FAIL     => Failed to find any starting code units
744                SSB_DONE     => Found mandatory starting code units
745                SSB_CONTINUE => Found optional starting code units
746                SSB_UNKNOWN  => Hit an unrecognized opcode
747 */
748 
749 static int
set_start_bits(pcre2_real_code * re,PCRE2_SPTR code,BOOL utf)750 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
751 {
752 register uint32_t c;
753 int yield = SSB_DONE;
754 
755 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
756 int table_limit = utf? 16:32;
757 #else
758 int table_limit = 32;
759 #endif
760 
761 do
762   {
763   BOOL try_next = TRUE;
764   PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
765 
766   if (*code == OP_CBRA || *code == OP_SCBRA ||
767       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
768 
769   while (try_next)    /* Loop for items in this branch */
770     {
771     int rc;
772     uint8_t *classmap = NULL;
773 
774     switch(*tcode)
775       {
776       /* If we reach something we don't understand, it means a new opcode has
777       been created that hasn't been added to this function. Hopefully this
778       problem will be discovered during testing. */
779 
780       default:
781       return SSB_UNKNOWN;
782 
783       /* Fail for a valid opcode that implies no starting bits. */
784 
785       case OP_ACCEPT:
786       case OP_ASSERT_ACCEPT:
787       case OP_ALLANY:
788       case OP_ANY:
789       case OP_ANYBYTE:
790       case OP_CIRC:
791       case OP_CIRCM:
792       case OP_CLOSE:
793       case OP_COMMIT:
794       case OP_COND:
795       case OP_CREF:
796       case OP_FALSE:
797       case OP_TRUE:
798       case OP_DNCREF:
799       case OP_DNREF:
800       case OP_DNREFI:
801       case OP_DNRREF:
802       case OP_DOLL:
803       case OP_DOLLM:
804       case OP_END:
805       case OP_EOD:
806       case OP_EODN:
807       case OP_EXTUNI:
808       case OP_FAIL:
809       case OP_MARK:
810       case OP_NOT:
811       case OP_NOTEXACT:
812       case OP_NOTEXACTI:
813       case OP_NOTI:
814       case OP_NOTMINPLUS:
815       case OP_NOTMINPLUSI:
816       case OP_NOTMINQUERY:
817       case OP_NOTMINQUERYI:
818       case OP_NOTMINSTAR:
819       case OP_NOTMINSTARI:
820       case OP_NOTMINUPTO:
821       case OP_NOTMINUPTOI:
822       case OP_NOTPLUS:
823       case OP_NOTPLUSI:
824       case OP_NOTPOSPLUS:
825       case OP_NOTPOSPLUSI:
826       case OP_NOTPOSQUERY:
827       case OP_NOTPOSQUERYI:
828       case OP_NOTPOSSTAR:
829       case OP_NOTPOSSTARI:
830       case OP_NOTPOSUPTO:
831       case OP_NOTPOSUPTOI:
832       case OP_NOTPROP:
833       case OP_NOTQUERY:
834       case OP_NOTQUERYI:
835       case OP_NOTSTAR:
836       case OP_NOTSTARI:
837       case OP_NOTUPTO:
838       case OP_NOTUPTOI:
839       case OP_NOT_HSPACE:
840       case OP_NOT_VSPACE:
841       case OP_PRUNE:
842       case OP_PRUNE_ARG:
843       case OP_RECURSE:
844       case OP_REF:
845       case OP_REFI:
846       case OP_REVERSE:
847       case OP_RREF:
848       case OP_SCOND:
849       case OP_SET_SOM:
850       case OP_SKIP:
851       case OP_SKIP_ARG:
852       case OP_SOD:
853       case OP_SOM:
854       case OP_THEN:
855       case OP_THEN_ARG:
856       return SSB_FAIL;
857 
858       /* A "real" property test implies no starting bits, but the fake property
859       PT_CLIST identifies a list of characters. These lists are short, as they
860       are used for characters with more than one "other case", so there is no
861       point in recognizing them for OP_NOTPROP. */
862 
863       case OP_PROP:
864       if (tcode[1] != PT_CLIST) return SSB_FAIL;
865         {
866         const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
867         while ((c = *p++) < NOTACHAR)
868           {
869 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
870           if (utf)
871             {
872             PCRE2_UCHAR buff[6];
873             (void)PRIV(ord2utf)(c, buff);
874             c = buff[0];
875             }
876 #endif
877           if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
878           }
879         }
880       try_next = FALSE;
881       break;
882 
883       /* We can ignore word boundary tests. */
884 
885       case OP_WORD_BOUNDARY:
886       case OP_NOT_WORD_BOUNDARY:
887       tcode++;
888       break;
889 
890       /* If we hit a bracket or a positive lookahead assertion, recurse to set
891       bits from within the subpattern. If it can't find anything, we have to
892       give up. If it finds some mandatory character(s), we are done for this
893       branch. Otherwise, carry on scanning after the subpattern. */
894 
895       case OP_BRA:
896       case OP_SBRA:
897       case OP_CBRA:
898       case OP_SCBRA:
899       case OP_BRAPOS:
900       case OP_SBRAPOS:
901       case OP_CBRAPOS:
902       case OP_SCBRAPOS:
903       case OP_ONCE:
904       case OP_ONCE_NC:
905       case OP_ASSERT:
906       rc = set_start_bits(re, tcode, utf);
907       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
908       if (rc == SSB_DONE) try_next = FALSE; else
909         {
910         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
911         tcode += 1 + LINK_SIZE;
912         }
913       break;
914 
915       /* If we hit ALT or KET, it means we haven't found anything mandatory in
916       this branch, though we might have found something optional. For ALT, we
917       continue with the next alternative, but we have to arrange that the final
918       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
919       return SSB_CONTINUE: if this is the top level, that indicates failure,
920       but after a nested subpattern, it causes scanning to continue. */
921 
922       case OP_ALT:
923       yield = SSB_CONTINUE;
924       try_next = FALSE;
925       break;
926 
927       case OP_KET:
928       case OP_KETRMAX:
929       case OP_KETRMIN:
930       case OP_KETRPOS:
931       return SSB_CONTINUE;
932 
933       /* Skip over callout */
934 
935       case OP_CALLOUT:
936       tcode += 2 + 2*LINK_SIZE;
937       break;
938 
939       /* Skip over lookbehind and negative lookahead assertions */
940 
941       case OP_ASSERT_NOT:
942       case OP_ASSERTBACK:
943       case OP_ASSERTBACK_NOT:
944       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
945       tcode += 1 + LINK_SIZE;
946       break;
947 
948       /* BRAZERO does the bracket, but carries on. */
949 
950       case OP_BRAZERO:
951       case OP_BRAMINZERO:
952       case OP_BRAPOSZERO:
953       rc = set_start_bits(re, ++tcode, utf);
954       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
955       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
956       tcode += 1 + LINK_SIZE;
957       break;
958 
959       /* SKIPZERO skips the bracket. */
960 
961       case OP_SKIPZERO:
962       tcode++;
963       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
964       tcode += 1 + LINK_SIZE;
965       break;
966 
967       /* Single-char * or ? sets the bit and tries the next item */
968 
969       case OP_STAR:
970       case OP_MINSTAR:
971       case OP_POSSTAR:
972       case OP_QUERY:
973       case OP_MINQUERY:
974       case OP_POSQUERY:
975       tcode = set_table_bit(re, tcode + 1, FALSE, utf);
976       break;
977 
978       case OP_STARI:
979       case OP_MINSTARI:
980       case OP_POSSTARI:
981       case OP_QUERYI:
982       case OP_MINQUERYI:
983       case OP_POSQUERYI:
984       tcode = set_table_bit(re, tcode + 1, TRUE, utf);
985       break;
986 
987       /* Single-char upto sets the bit and tries the next */
988 
989       case OP_UPTO:
990       case OP_MINUPTO:
991       case OP_POSUPTO:
992       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
993       break;
994 
995       case OP_UPTOI:
996       case OP_MINUPTOI:
997       case OP_POSUPTOI:
998       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
999       break;
1000 
1001       /* At least one single char sets the bit and stops */
1002 
1003       case OP_EXACT:
1004       tcode += IMM2_SIZE;
1005       /* Fall through */
1006       case OP_CHAR:
1007       case OP_PLUS:
1008       case OP_MINPLUS:
1009       case OP_POSPLUS:
1010       (void)set_table_bit(re, tcode + 1, FALSE, utf);
1011       try_next = FALSE;
1012       break;
1013 
1014       case OP_EXACTI:
1015       tcode += IMM2_SIZE;
1016       /* Fall through */
1017       case OP_CHARI:
1018       case OP_PLUSI:
1019       case OP_MINPLUSI:
1020       case OP_POSPLUSI:
1021       (void)set_table_bit(re, tcode + 1, TRUE, utf);
1022       try_next = FALSE;
1023       break;
1024 
1025       /* Special spacing and line-terminating items. These recognize specific
1026       lists of characters. The difference between VSPACE and ANYNL is that the
1027       latter can match the two-character CRLF sequence, but that is not
1028       relevant for finding the first character, so their code here is
1029       identical. */
1030 
1031       case OP_HSPACE:
1032       SET_BIT(CHAR_HT);
1033       SET_BIT(CHAR_SPACE);
1034 
1035       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1036       the bits for 0xA0 and for code units >= 255, independently of UTF. */
1037 
1038 #if PCRE2_CODE_UNIT_WIDTH != 8
1039       SET_BIT(0xA0);
1040       SET_BIT(0xFF);
1041 #else
1042       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1043       units of horizontal space characters. */
1044 
1045 #ifdef SUPPORT_UNICODE
1046       if (utf)
1047         {
1048         SET_BIT(0xC2);  /* For U+00A0 */
1049         SET_BIT(0xE1);  /* For U+1680, U+180E */
1050         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1051         SET_BIT(0xE3);  /* For U+3000 */
1052         }
1053       else
1054 #endif
1055       /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1056       the code is EBCDIC. */
1057         {
1058 #ifndef EBCDIC
1059         SET_BIT(0xA0);
1060 #endif  /* Not EBCDIC */
1061         }
1062 #endif  /* 8-bit support */
1063 
1064       try_next = FALSE;
1065       break;
1066 
1067       case OP_ANYNL:
1068       case OP_VSPACE:
1069       SET_BIT(CHAR_LF);
1070       SET_BIT(CHAR_VT);
1071       SET_BIT(CHAR_FF);
1072       SET_BIT(CHAR_CR);
1073 
1074       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1075       the bits for NEL and for code units >= 255, independently of UTF. */
1076 
1077 #if PCRE2_CODE_UNIT_WIDTH != 8
1078       SET_BIT(CHAR_NEL);
1079       SET_BIT(0xFF);
1080 #else
1081       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1082       units of vertical space characters. */
1083 
1084 #ifdef SUPPORT_UNICODE
1085       if (utf)
1086         {
1087         SET_BIT(0xC2);  /* For U+0085 (NEL) */
1088         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1089         }
1090       else
1091 #endif
1092       /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1093         {
1094         SET_BIT(CHAR_NEL);
1095         }
1096 #endif  /* 8-bit support */
1097 
1098       try_next = FALSE;
1099       break;
1100 
1101       /* Single character types set the bits and stop. Note that if PCRE2_UCP
1102       is set, we do not see these op codes because \d etc are converted to
1103       properties. Therefore, these apply in the case when only characters less
1104       than 256 are recognized to match the types. */
1105 
1106       case OP_NOT_DIGIT:
1107       set_nottype_bits(re, cbit_digit, table_limit);
1108       try_next = FALSE;
1109       break;
1110 
1111       case OP_DIGIT:
1112       set_type_bits(re, cbit_digit, table_limit);
1113       try_next = FALSE;
1114       break;
1115 
1116       case OP_NOT_WHITESPACE:
1117       set_nottype_bits(re, cbit_space, table_limit);
1118       try_next = FALSE;
1119       break;
1120 
1121       case OP_WHITESPACE:
1122       set_type_bits(re, cbit_space, table_limit);
1123       try_next = FALSE;
1124       break;
1125 
1126       case OP_NOT_WORDCHAR:
1127       set_nottype_bits(re, cbit_word, table_limit);
1128       try_next = FALSE;
1129       break;
1130 
1131       case OP_WORDCHAR:
1132       set_type_bits(re, cbit_word, table_limit);
1133       try_next = FALSE;
1134       break;
1135 
1136       /* One or more character type fudges the pointer and restarts, knowing
1137       it will hit a single character type and stop there. */
1138 
1139       case OP_TYPEPLUS:
1140       case OP_TYPEMINPLUS:
1141       case OP_TYPEPOSPLUS:
1142       tcode++;
1143       break;
1144 
1145       case OP_TYPEEXACT:
1146       tcode += 1 + IMM2_SIZE;
1147       break;
1148 
1149       /* Zero or more repeats of character types set the bits and then
1150       try again. */
1151 
1152       case OP_TYPEUPTO:
1153       case OP_TYPEMINUPTO:
1154       case OP_TYPEPOSUPTO:
1155       tcode += IMM2_SIZE;  /* Fall through */
1156 
1157       case OP_TYPESTAR:
1158       case OP_TYPEMINSTAR:
1159       case OP_TYPEPOSSTAR:
1160       case OP_TYPEQUERY:
1161       case OP_TYPEMINQUERY:
1162       case OP_TYPEPOSQUERY:
1163       switch(tcode[1])
1164         {
1165         default:
1166         case OP_ANY:
1167         case OP_ALLANY:
1168         return SSB_FAIL;
1169 
1170         case OP_HSPACE:
1171         SET_BIT(CHAR_HT);
1172         SET_BIT(CHAR_SPACE);
1173 
1174         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1175         the bits for 0xA0 and for code units >= 255, independently of UTF. */
1176 
1177 #if PCRE2_CODE_UNIT_WIDTH != 8
1178         SET_BIT(0xA0);
1179         SET_BIT(0xFF);
1180 #else
1181         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1182         units of horizontal space characters. */
1183 
1184 #ifdef SUPPORT_UNICODE
1185         if (utf)
1186           {
1187           SET_BIT(0xC2);  /* For U+00A0 */
1188           SET_BIT(0xE1);  /* For U+1680, U+180E */
1189           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1190           SET_BIT(0xE3);  /* For U+3000 */
1191           }
1192         else
1193 #endif
1194         /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1195         the code is EBCDIC. */
1196           {
1197 #ifndef EBCDIC
1198           SET_BIT(0xA0);
1199 #endif  /* Not EBCDIC */
1200           }
1201 #endif  /* 8-bit support */
1202         break;
1203 
1204         case OP_ANYNL:
1205         case OP_VSPACE:
1206         SET_BIT(CHAR_LF);
1207         SET_BIT(CHAR_VT);
1208         SET_BIT(CHAR_FF);
1209         SET_BIT(CHAR_CR);
1210 
1211         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1212         the bits for NEL and for code units >= 255, independently of UTF. */
1213 
1214 #if PCRE2_CODE_UNIT_WIDTH != 8
1215         SET_BIT(CHAR_NEL);
1216         SET_BIT(0xFF);
1217 #else
1218         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1219         units of vertical space characters. */
1220 
1221 #ifdef SUPPORT_UNICODE
1222         if (utf)
1223           {
1224           SET_BIT(0xC2);  /* For U+0085 (NEL) */
1225           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1226           }
1227         else
1228 #endif
1229         /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1230           {
1231           SET_BIT(CHAR_NEL);
1232           }
1233 #endif  /* 8-bit support */
1234         break;
1235 
1236         case OP_NOT_DIGIT:
1237         set_nottype_bits(re, cbit_digit, table_limit);
1238         break;
1239 
1240         case OP_DIGIT:
1241         set_type_bits(re, cbit_digit, table_limit);
1242         break;
1243 
1244         case OP_NOT_WHITESPACE:
1245         set_nottype_bits(re, cbit_space, table_limit);
1246         break;
1247 
1248         case OP_WHITESPACE:
1249         set_type_bits(re, cbit_space, table_limit);
1250         break;
1251 
1252         case OP_NOT_WORDCHAR:
1253         set_nottype_bits(re, cbit_word, table_limit);
1254         break;
1255 
1256         case OP_WORDCHAR:
1257         set_type_bits(re, cbit_word, table_limit);
1258         break;
1259         }
1260 
1261       tcode += 2;
1262       break;
1263 
1264       /* Extended class: if there are any property checks, or if this is a
1265       negative XCLASS without a map, give up. If there are no property checks,
1266       there must be wide characters on the XCLASS list, because otherwise an
1267       XCLASS would not have been created. This means that code points >= 255
1268       are always potential starters. */
1269 
1270 #ifdef SUPPORT_WIDE_CHARS
1271       case OP_XCLASS:
1272       if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
1273           (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1274         return SSB_FAIL;
1275 
1276       /* We have a positive XCLASS or a negative one without a map. Set up the
1277       map pointer if there is one, and fall through. */
1278 
1279       classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
1280         (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1281 #endif
1282 
1283       /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1284       in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1285       because it starts a character with a value > 255. In 8-bit non-UTF mode,
1286       there is no difference between CLASS and NCLASS. In all other wide
1287       character modes, set the 0xFF bit to indicate code units >= 255. */
1288 
1289       case OP_NCLASS:
1290 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1291       if (utf)
1292         {
1293         re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
1294         memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
1295         }
1296 #elif PCRE2_CODE_UNIT_WIDTH != 8
1297       SET_BIT(0xFF);                             /* For characters >= 255 */
1298 #endif
1299       /* Fall through */
1300 
1301       /* Enter here for a positive non-XCLASS. If we have fallen through from
1302       an XCLASS, classmap will already be set; just advance the code pointer.
1303       Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1304 
1305       case OP_CLASS:
1306       if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1307         {
1308         classmap = (uint8_t *)(++tcode);
1309         tcode += 32 / sizeof(PCRE2_UCHAR);
1310         }
1311 
1312       /* When wide characters are supported, classmap may be NULL. In UTF-8
1313       (sic) mode, the bits in a class bit map correspond to character values,
1314       not to byte values. However, the bit map we are constructing is for byte
1315       values. So we have to do a conversion for characters whose code point is
1316       greater than 127. In fact, there are only two possible starting bytes for
1317       characters in the range 128 - 255. */
1318 
1319       if (classmap != NULL)
1320         {
1321 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1322         if (utf)
1323           {
1324           for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1325           for (c = 128; c < 256; c++)
1326             {
1327             if ((classmap[c/8] && (1 << (c&7))) != 0)
1328               {
1329               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1330               re->start_bitmap[d/8] |= (1 << (d&7));  /* and then skip on to the */
1331               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1332               }
1333             }
1334           }
1335         else
1336 #endif
1337         /* In all modes except UTF-8, the two bit maps are compatible. */
1338 
1339           {
1340           for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1341           }
1342         }
1343 
1344       /* Act on what follows the class. For a zero minimum repeat, continue;
1345       otherwise stop processing. */
1346 
1347       switch (*tcode)
1348         {
1349         case OP_CRSTAR:
1350         case OP_CRMINSTAR:
1351         case OP_CRQUERY:
1352         case OP_CRMINQUERY:
1353         case OP_CRPOSSTAR:
1354         case OP_CRPOSQUERY:
1355         tcode++;
1356         break;
1357 
1358         case OP_CRRANGE:
1359         case OP_CRMINRANGE:
1360         case OP_CRPOSRANGE:
1361         if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1362           else try_next = FALSE;
1363         break;
1364 
1365         default:
1366         try_next = FALSE;
1367         break;
1368         }
1369       break; /* End of class handling case */
1370       }      /* End of switch for opcodes */
1371     }        /* End of try_next loop */
1372 
1373   code += GET(code, 1);   /* Advance to next branch */
1374   }
1375 while (*code == OP_ALT);
1376 
1377 return yield;
1378 }
1379 
1380 
1381 
1382 /*************************************************
1383 *          Study a compiled expression           *
1384 *************************************************/
1385 
1386 /* This function is handed a compiled expression that it must study to produce
1387 information that will speed up the matching.
1388 
1389 Argument:  points to the compiled expression
1390 Returns:   0 normally; non-zero should never normally occur
1391            1 unknown opcode in set_start_bits
1392            2 missing capturing bracket
1393            3 unknown opcode in find_minlength
1394 */
1395 
1396 int
PRIV(study)1397 PRIV(study)(pcre2_real_code *re)
1398 {
1399 int min;
1400 PCRE2_UCHAR *code;
1401 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1402 
1403 /* Find start of compiled code */
1404 
1405 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1406   re->name_entry_size * re->name_count;
1407 
1408 /* For an anchored pattern, or an unanchored pattern that has a first code
1409 unit, or a multiline pattern that matches only at "line start", there is no
1410 point in seeking a list of starting code units. */
1411 
1412 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
1413     (re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1414   {
1415   int rc = set_start_bits(re, code, utf);
1416   if (rc == SSB_UNKNOWN) return 1;
1417   if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
1418   }
1419 
1420 /* Find the minimum length of subject string. */
1421 
1422 switch(min = find_minlength(re, code, code, 0, utf))
1423   {
1424   case -1:  /* \C in UTF mode or (*ACCEPT) was encountered */
1425   break;
1426 
1427   case -2:
1428   return 2; /* missing capturing bracket */
1429 
1430   case -3:
1431   return 3; /* unrecognized opcode */
1432 
1433   default:
1434   re->minlength = min;
1435   break;
1436   }
1437 
1438 return 0;
1439 }
1440 
1441 /* End of pcre2_study.c */
1442