1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9            Copyright (c) 1997-2012 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17 
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21 
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43 
44 /* %ExternalCopyright% */
45 
46 #ifdef HAVE_CONFIG_H
47 #include "config.h"
48 #endif
49 
50 #include "pcre_internal.h"
51 
52 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
53 
54 /* Returns from set_start_bits() */
55 
56 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
57 
58 
59 
60 /*************************************************
61 *   Find the minimum subject length for a group  *
62 *************************************************/
63 
64 /* Scan a parenthesized group and compute the minimum length of subject that
65 is needed to match it. This is a lower bound; it does not mean there is a
66 string of that length that matches. In UTF8 mode, the result is in characters
67 rather than bytes.
68 
69 Arguments:
70   re              compiled pattern block
71   code            pointer to start of group (the bracket)
72   startcode       pointer to start of the whole pattern's code
73   options         the compiling options
74   recurses        chain of recurse_check to catch mutual recursion
75   countptr        pointer to call count (to catch over complexity)
76 
77 Returns:   the minimum length
78            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
79            -2 internal error (missing capturing bracket)
80            -3 internal error (opcode not listed)
81 */
82 
83 static int
find_minlength(const REAL_PCRE * re,const pcre_uchar * code,const pcre_uchar * startcode,int options,recurse_check * recurses,int * countptr)84 find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
85   const pcre_uchar *startcode, int options, recurse_check *recurses,
86   int *countptr)
87 {
88 int length = -1;
89 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
90 BOOL utf = (options & PCRE_UTF8) != 0;
91 BOOL had_recurse = FALSE;
92 recurse_check this_recurse;
93 register int branchlength = 0;
94 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
95 
96 if ((*countptr)++ > 1000) return -1;   /* too complex */
97 
98 if (*code == OP_CBRA || *code == OP_SCBRA ||
99     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
100 
101 /* Scan along the opcodes for this branch. If we get to the end of the
102 branch, check the length against that of the other branches. */
103 
104 for (;;)
105   {
106   int d, min;
107   pcre_uchar *cs, *ce;
108   register pcre_uchar op = *cc;
109 
110   switch (op)
111     {
112     case OP_COND:
113     case OP_SCOND:
114 
115     /* If there is only one branch in a condition, the implied branch has zero
116     length, so we don't add anything. This covers the DEFINE "condition"
117     automatically. */
118 
119     cs = cc + GET(cc, 1);
120     if (*cs != OP_ALT)
121       {
122       cc = cs + 1 + LINK_SIZE;
123       break;
124       }
125 
126     /* Otherwise we can fall through and treat it the same as any other
127     subpattern. */
128 
129     case OP_CBRA:
130     case OP_SCBRA:
131     case OP_BRA:
132     case OP_SBRA:
133     case OP_CBRAPOS:
134     case OP_SCBRAPOS:
135     case OP_BRAPOS:
136     case OP_SBRAPOS:
137     case OP_ONCE:
138     case OP_ONCE_NC:
139     d = find_minlength(re, cc, startcode, options, recurses, countptr);
140     if (d < 0) return d;
141     branchlength += d;
142     do cc += GET(cc, 1); while (*cc == OP_ALT);
143     cc += 1 + LINK_SIZE;
144     break;
145 
146     /* ACCEPT makes things far too complicated; we have to give up. */
147 
148     case OP_ACCEPT:
149     case OP_ASSERT_ACCEPT:
150     return -1;
151 
152     /* Reached end of a branch; if it's a ket it is the end of a nested
153     call. If it's ALT it is an alternation in a nested call. If it is END it's
154     the end of the outer call. All can be handled by the same code. If an
155     ACCEPT was previously encountered, use the length that was in force at that
156     time, and pass back the shortest ACCEPT length. */
157 
158     case OP_ALT:
159     case OP_KET:
160     case OP_KETRMAX:
161     case OP_KETRMIN:
162     case OP_KETRPOS:
163     case OP_END:
164     if (length < 0 || (!had_recurse && branchlength < length))
165       length = branchlength;
166     if (op != OP_ALT) return length;
167     cc += 1 + LINK_SIZE;
168     branchlength = 0;
169     had_recurse = FALSE;
170     break;
171 
172     /* Skip over assertive subpatterns */
173 
174     case OP_ASSERT:
175     case OP_ASSERT_NOT:
176     case OP_ASSERTBACK:
177     case OP_ASSERTBACK_NOT:
178     do cc += GET(cc, 1); while (*cc == OP_ALT);
179     /* Fall through */
180 
181     /* Skip over things that don't match chars */
182 
183     case OP_REVERSE:
184     case OP_CREF:
185     case OP_DNCREF:
186     case OP_RREF:
187     case OP_DNRREF:
188     case OP_DEF:
189     case OP_CALLOUT:
190     case OP_SOD:
191     case OP_SOM:
192     case OP_EOD:
193     case OP_EODN:
194     case OP_CIRC:
195     case OP_CIRCM:
196     case OP_DOLL:
197     case OP_DOLLM:
198     case OP_NOT_WORD_BOUNDARY:
199     case OP_WORD_BOUNDARY:
200     cc += PRIV(OP_lengths)[*cc];
201     break;
202 
203     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
204 
205     case OP_BRAZERO:
206     case OP_BRAMINZERO:
207     case OP_BRAPOSZERO:
208     case OP_SKIPZERO:
209     cc += PRIV(OP_lengths)[*cc];
210     do cc += GET(cc, 1); while (*cc == OP_ALT);
211     cc += 1 + LINK_SIZE;
212     break;
213 
214     /* Handle literal characters and + repetitions */
215 
216     case OP_CHAR:
217     case OP_CHARI:
218     case OP_NOT:
219     case OP_NOTI:
220     case OP_PLUS:
221     case OP_PLUSI:
222     case OP_MINPLUS:
223     case OP_MINPLUSI:
224     case OP_POSPLUS:
225     case OP_POSPLUSI:
226     case OP_NOTPLUS:
227     case OP_NOTPLUSI:
228     case OP_NOTMINPLUS:
229     case OP_NOTMINPLUSI:
230     case OP_NOTPOSPLUS:
231     case OP_NOTPOSPLUSI:
232     branchlength++;
233     cc += 2;
234 #ifdef SUPPORT_UTF
235     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
236 #endif
237     break;
238 
239     case OP_TYPEPLUS:
240     case OP_TYPEMINPLUS:
241     case OP_TYPEPOSPLUS:
242     branchlength++;
243     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
244     break;
245 
246     /* Handle exact repetitions. The count is already in characters, but we
247     need to skip over a multibyte character in UTF8 mode.  */
248 
249     case OP_EXACT:
250     case OP_EXACTI:
251     case OP_NOTEXACT:
252     case OP_NOTEXACTI:
253     branchlength += GET2(cc,1);
254     cc += 2 + IMM2_SIZE;
255 #ifdef SUPPORT_UTF
256     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
257 #endif
258     break;
259 
260     case OP_TYPEEXACT:
261     branchlength += GET2(cc,1);
262     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
263       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
264     break;
265 
266     /* Handle single-char non-literal matchers */
267 
268     case OP_PROP:
269     case OP_NOTPROP:
270     cc += 2;
271     /* Fall through */
272 
273     case OP_NOT_DIGIT:
274     case OP_DIGIT:
275     case OP_NOT_WHITESPACE:
276     case OP_WHITESPACE:
277     case OP_NOT_WORDCHAR:
278     case OP_WORDCHAR:
279     case OP_ANY:
280     case OP_ALLANY:
281     case OP_EXTUNI:
282     case OP_HSPACE:
283     case OP_NOT_HSPACE:
284     case OP_VSPACE:
285     case OP_NOT_VSPACE:
286     branchlength++;
287     cc++;
288     break;
289 
290     /* "Any newline" might match two characters, but it also might match just
291     one. */
292 
293     case OP_ANYNL:
294     branchlength += 1;
295     cc++;
296     break;
297 
298     /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
299     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
300     appear, but leave the code, just in case.) */
301 
302     case OP_ANYBYTE:
303 #ifdef SUPPORT_UTF
304     if (utf) return -1;
305 #endif
306     branchlength++;
307     cc++;
308     break;
309 
310     /* For repeated character types, we have to test for \p and \P, which have
311     an extra two bytes of parameters. */
312 
313     case OP_TYPESTAR:
314     case OP_TYPEMINSTAR:
315     case OP_TYPEQUERY:
316     case OP_TYPEMINQUERY:
317     case OP_TYPEPOSSTAR:
318     case OP_TYPEPOSQUERY:
319     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
320     cc += PRIV(OP_lengths)[op];
321     break;
322 
323     case OP_TYPEUPTO:
324     case OP_TYPEMINUPTO:
325     case OP_TYPEPOSUPTO:
326     if (cc[1 + IMM2_SIZE] == OP_PROP
327       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
328     cc += PRIV(OP_lengths)[op];
329     break;
330 
331     /* Check a class for variable quantification */
332 
333     case OP_CLASS:
334     case OP_NCLASS:
335 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
336     case OP_XCLASS:
337     /* The original code caused an unsigned overflow in 64 bit systems,
338     so now we use a conditional statement. */
339     if (op == OP_XCLASS)
340       cc += GET(cc, 1);
341     else
342       cc += PRIV(OP_lengths)[OP_CLASS];
343 #else
344     cc += PRIV(OP_lengths)[OP_CLASS];
345 #endif
346 
347     switch (*cc)
348       {
349       case OP_CRPLUS:
350       case OP_CRMINPLUS:
351       case OP_CRPOSPLUS:
352       branchlength++;
353       /* Fall through */
354 
355       case OP_CRSTAR:
356       case OP_CRMINSTAR:
357       case OP_CRQUERY:
358       case OP_CRMINQUERY:
359       case OP_CRPOSSTAR:
360       case OP_CRPOSQUERY:
361       cc++;
362       break;
363 
364       case OP_CRRANGE:
365       case OP_CRMINRANGE:
366       case OP_CRPOSRANGE:
367       branchlength += GET2(cc,1);
368       cc += 1 + 2 * IMM2_SIZE;
369       break;
370 
371       default:
372       branchlength++;
373       break;
374       }
375     break;
376 
377     /* Backreferences and subroutine calls are treated in the same way: we find
378     the minimum length for the subpattern. A recursion, however, causes an
379     a flag to be set that causes the length of this branch to be ignored. The
380     logic is that a recursion can only make sense if there is another
381     alternation that stops the recursing. That will provide the minimum length
382     (when no recursion happens). A backreference within the group that it is
383     referencing behaves in the same way.
384 
385     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
386     matches an empty string (by default it causes a matching failure), so in
387     that case we must set the minimum length to zero. */
388 
389     case OP_DNREF:     /* Duplicate named pattern back reference */
390     case OP_DNREFI:
391     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
392       {
393       int count = GET2(cc, 1+IMM2_SIZE);
394       pcre_uchar *slot = (pcre_uchar *)re +
395         re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
396       d = INT_MAX;
397       while (count-- > 0)
398         {
399         ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
400         if (cs == NULL) return -2;
401         do ce += GET(ce, 1); while (*ce == OP_ALT);
402         if (cc > cs && cc < ce)     /* Simple recursion */
403           {
404           d = 0;
405           had_recurse = TRUE;
406           break;
407           }
408         else
409           {
410           recurse_check *r = recurses;
411           for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
412           if (r != NULL)           /* Mutual recursion */
413             {
414             d = 0;
415             had_recurse = TRUE;
416             break;
417             }
418           else
419             {
420             int dd;
421             this_recurse.prev = recurses;
422             this_recurse.group = cs;
423             dd = find_minlength(re, cs, startcode, options, &this_recurse,
424               countptr);
425             if (dd < d) d = dd;
426             }
427           }
428         slot += re->name_entry_size;
429         }
430       }
431     else d = 0;
432     cc += 1 + 2*IMM2_SIZE;
433     goto REPEAT_BACK_REFERENCE;
434 
435     case OP_REF:      /* Single back reference */
436     case OP_REFI:
437     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
438       {
439       ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
440       if (cs == NULL) return -2;
441       do ce += GET(ce, 1); while (*ce == OP_ALT);
442       if (cc > cs && cc < ce)    /* Simple recursion */
443         {
444         d = 0;
445         had_recurse = TRUE;
446         }
447       else
448         {
449         recurse_check *r = recurses;
450         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
451         if (r != NULL)           /* Mutual recursion */
452           {
453           d = 0;
454           had_recurse = TRUE;
455           }
456         else
457           {
458           this_recurse.prev = recurses;
459           this_recurse.group = cs;
460           d = find_minlength(re, cs, startcode, options, &this_recurse,
461             countptr);
462           }
463         }
464       }
465     else d = 0;
466     cc += 1 + IMM2_SIZE;
467 
468     /* Handle repeated back references */
469 
470     REPEAT_BACK_REFERENCE:
471     switch (*cc)
472       {
473       case OP_CRSTAR:
474       case OP_CRMINSTAR:
475       case OP_CRQUERY:
476       case OP_CRMINQUERY:
477       case OP_CRPOSSTAR:
478       case OP_CRPOSQUERY:
479       min = 0;
480       cc++;
481       break;
482 
483       case OP_CRPLUS:
484       case OP_CRMINPLUS:
485       case OP_CRPOSPLUS:
486       min = 1;
487       cc++;
488       break;
489 
490       case OP_CRRANGE:
491       case OP_CRMINRANGE:
492       case OP_CRPOSRANGE:
493       min = GET2(cc, 1);
494       cc += 1 + 2 * IMM2_SIZE;
495       break;
496 
497       default:
498       min = 1;
499       break;
500       }
501 
502     branchlength += min * d;
503     break;
504 
505     /* We can easily detect direct recursion, but not mutual recursion. This is
506     caught by a recursion depth count. */
507 
508     case OP_RECURSE:
509     cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
510     do ce += GET(ce, 1); while (*ce == OP_ALT);
511     if (cc > cs && cc < ce)    /* Simple recursion */
512       had_recurse = TRUE;
513     else
514       {
515       recurse_check *r = recurses;
516       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
517       if (r != NULL)           /* Mutual recursion */
518         had_recurse = TRUE;
519       else
520         {
521         this_recurse.prev = recurses;
522         this_recurse.group = cs;
523         branchlength += find_minlength(re, cs, startcode, options,
524           &this_recurse, countptr);
525         }
526       }
527     cc += 1 + LINK_SIZE;
528     break;
529 
530     /* Anything else does not or need not match a character. We can get the
531     item's length from the table, but for those that can match zero occurrences
532     of a character, we must take special action for UTF-8 characters. As it
533     happens, the "NOT" versions of these opcodes are used at present only for
534     ASCII characters, so they could be omitted from this list. However, in
535     future that may change, so we include them here so as not to leave a
536     gotcha for a future maintainer. */
537 
538     case OP_UPTO:
539     case OP_UPTOI:
540     case OP_NOTUPTO:
541     case OP_NOTUPTOI:
542     case OP_MINUPTO:
543     case OP_MINUPTOI:
544     case OP_NOTMINUPTO:
545     case OP_NOTMINUPTOI:
546     case OP_POSUPTO:
547     case OP_POSUPTOI:
548     case OP_NOTPOSUPTO:
549     case OP_NOTPOSUPTOI:
550 
551     case OP_STAR:
552     case OP_STARI:
553     case OP_NOTSTAR:
554     case OP_NOTSTARI:
555     case OP_MINSTAR:
556     case OP_MINSTARI:
557     case OP_NOTMINSTAR:
558     case OP_NOTMINSTARI:
559     case OP_POSSTAR:
560     case OP_POSSTARI:
561     case OP_NOTPOSSTAR:
562     case OP_NOTPOSSTARI:
563 
564     case OP_QUERY:
565     case OP_QUERYI:
566     case OP_NOTQUERY:
567     case OP_NOTQUERYI:
568     case OP_MINQUERY:
569     case OP_MINQUERYI:
570     case OP_NOTMINQUERY:
571     case OP_NOTMINQUERYI:
572     case OP_POSQUERY:
573     case OP_POSQUERYI:
574     case OP_NOTPOSQUERY:
575     case OP_NOTPOSQUERYI:
576 
577     cc += PRIV(OP_lengths)[op];
578 #ifdef SUPPORT_UTF
579     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
580 #endif
581     break;
582 
583     /* Skip these, but we need to add in the name length. */
584 
585     case OP_MARK:
586     case OP_PRUNE_ARG:
587     case OP_SKIP_ARG:
588     case OP_THEN_ARG:
589     cc += PRIV(OP_lengths)[op] + cc[1];
590     break;
591 
592     /* The remaining opcodes are just skipped over. */
593 
594     case OP_CLOSE:
595     case OP_COMMIT:
596     case OP_FAIL:
597     case OP_PRUNE:
598     case OP_SET_SOM:
599     case OP_SKIP:
600     case OP_THEN:
601     cc += PRIV(OP_lengths)[op];
602     break;
603 
604     /* This should not occur: we list all opcodes explicitly so that when
605     new ones get added they are properly considered. */
606 
607     default:
608     return -3;
609     }
610   }
611 /* Control never gets here */
612 }
613 
614 
615 
616 /*************************************************
617 *      Set a bit and maybe its alternate case    *
618 *************************************************/
619 
620 /* Given a character, set its first byte's bit in the table, and also the
621 corresponding bit for the other version of a letter if we are caseless. In
622 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
623 when Unicode property support is available.
624 
625 Arguments:
626   start_bits    points to the bit map
627   p             points to the character
628   caseless      the caseless flag
629   cd            the block with char table pointers
630   utf           TRUE for UTF-8 / UTF-16 / UTF-32 mode
631 
632 Returns:        pointer after the character
633 */
634 
635 static const pcre_uchar *
set_table_bit(pcre_uint8 * start_bits,const pcre_uchar * p,BOOL caseless,compile_data * cd,BOOL utf)636 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
637   compile_data *cd, BOOL utf)
638 {
639 pcre_uint32 c = *p;
640 
641 #ifdef COMPILE_PCRE8
642 SET_BIT(c);
643 
644 #ifdef SUPPORT_UTF
645 if (utf && c > 127)
646   {
647   GETCHARINC(c, p);
648 #ifdef SUPPORT_UCP
649   if (caseless)
650     {
651     pcre_uchar buff[6];
652     c = UCD_OTHERCASE(c);
653     (void)PRIV(ord2utf)(c, buff);
654     SET_BIT(buff[0]);
655     }
656 #endif  /* Not SUPPORT_UCP */
657   return p;
658   }
659 #else   /* Not SUPPORT_UTF */
660 (void)(utf);   /* Stops warning for unused parameter */
661 #endif  /* SUPPORT_UTF */
662 
663 /* Not UTF-8 mode, or character is less than 127. */
664 
665 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
666 return p + 1;
667 #endif  /* COMPILE_PCRE8 */
668 
669 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
670 if (c > 0xff)
671   {
672   c = 0xff;
673   caseless = FALSE;
674   }
675 SET_BIT(c);
676 
677 #ifdef SUPPORT_UTF
678 if (utf && c > 127)
679   {
680   GETCHARINC(c, p);
681 #ifdef SUPPORT_UCP
682   if (caseless)
683     {
684     c = UCD_OTHERCASE(c);
685     if (c > 0xff)
686       c = 0xff;
687     SET_BIT(c);
688     }
689 #endif  /* SUPPORT_UCP */
690   return p;
691   }
692 #else   /* Not SUPPORT_UTF */
693 (void)(utf);   /* Stops warning for unused parameter */
694 #endif  /* SUPPORT_UTF */
695 
696 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
697 return p + 1;
698 #endif
699 }
700 
701 
702 
703 /*************************************************
704 *     Set bits for a positive character type     *
705 *************************************************/
706 
707 /* This function sets starting bits for a character type. In UTF-8 mode, we can
708 only do a direct setting for bytes less than 128, as otherwise there can be
709 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
710 environment, the tables will only recognize ASCII characters anyway, but in at
711 least one Windows environment, some higher bytes bits were set in the tables.
712 So we deal with that case by considering the UTF-8 encoding.
713 
714 Arguments:
715   start_bits     the starting bitmap
716   cbit type      the type of character wanted
717   table_limit    32 for non-UTF-8; 16 for UTF-8
718   cd             the block with char table pointers
719 
720 Returns:         nothing
721 */
722 
723 static void
set_type_bits(pcre_uint8 * start_bits,int cbit_type,unsigned int table_limit,compile_data * cd)724 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
725   compile_data *cd)
726 {
727 register pcre_uint32 c;
728 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
729 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
730 if (table_limit == 32) return;
731 for (c = 128; c < 256; c++)
732   {
733   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
734     {
735     pcre_uchar buff[6];
736     (void)PRIV(ord2utf)(c, buff);
737     SET_BIT(buff[0]);
738     }
739   }
740 #endif
741 }
742 
743 
744 /*************************************************
745 *     Set bits for a negative character type     *
746 *************************************************/
747 
748 /* This function sets starting bits for a negative character type such as \D.
749 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
750 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
751 Unlike in the positive case, where we can set appropriate starting bits for
752 specific high-valued UTF-8 characters, in this case we have to set the bits for
753 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
754 0xc0 (192) for simplicity.
755 
756 Arguments:
757   start_bits     the starting bitmap
758   cbit type      the type of character wanted
759   table_limit    32 for non-UTF-8; 16 for UTF-8
760   cd             the block with char table pointers
761 
762 Returns:         nothing
763 */
764 
765 static void
set_nottype_bits(pcre_uint8 * start_bits,int cbit_type,unsigned int table_limit,compile_data * cd)766 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
767   compile_data *cd)
768 {
769 register pcre_uint32 c;
770 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
771 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
772 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
773 #endif
774 }
775 
776 
777 
778 /*************************************************
779 *          Create bitmap of starting bytes       *
780 *************************************************/
781 
782 /* This function scans a compiled unanchored expression recursively and
783 attempts to build a bitmap of the set of possible starting bytes. As time goes
784 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
785 useful for parenthesized groups in patterns such as (a*)b where the group
786 provides some optional starting bytes but scanning must continue at the outer
787 level to find at least one mandatory byte. At the outermost level, this
788 function fails unless the result is SSB_DONE.
789 
790 Arguments:
791   code         points to an expression
792   start_bits   points to a 32-byte table, initialized to 0
793   utf          TRUE if in UTF-8 / UTF-16 / UTF-32 mode
794   cd           the block with char table pointers
795 
796 Returns:       SSB_FAIL     => Failed to find any starting bytes
797                SSB_DONE     => Found mandatory starting bytes
798                SSB_CONTINUE => Found optional starting bytes
799                SSB_UNKNOWN  => Hit an unrecognized opcode
800 */
801 
802 static int
set_start_bits(const pcre_uchar * code,pcre_uint8 * start_bits,BOOL utf,compile_data * cd)803 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
804   compile_data *cd)
805 {
806 register pcre_uint32 c;
807 int yield = SSB_DONE;
808 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
809 int table_limit = utf? 16:32;
810 #else
811 int table_limit = 32;
812 #endif
813 
814 #if 0
815 /* ========================================================================= */
816 /* The following comment and code was inserted in January 1999. In May 2006,
817 when it was observed to cause compiler warnings about unused values, I took it
818 out again. If anybody is still using OS/2, they will have to put it back
819 manually. */
820 
821 /* This next statement and the later reference to dummy are here in order to
822 trick the optimizer of the IBM C compiler for OS/2 into generating correct
823 code. Apparently IBM isn't going to fix the problem, and we would rather not
824 disable optimization (in this module it actually makes a big difference, and
825 the pcre module can use all the optimization it can get). */
826 
827 volatile int dummy;
828 /* ========================================================================= */
829 #endif
830 
831 do
832   {
833   BOOL try_next = TRUE;
834   const pcre_uchar *tcode = code + 1 + LINK_SIZE;
835 
836   if (*code == OP_CBRA || *code == OP_SCBRA ||
837       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
838 
839   while (try_next)    /* Loop for items in this branch */
840     {
841     int rc;
842 
843     switch(*tcode)
844       {
845       /* If we reach something we don't understand, it means a new opcode has
846       been created that hasn't been added to this code. Hopefully this problem
847       will be discovered during testing. */
848 
849       default:
850       return SSB_UNKNOWN;
851 
852       /* Fail for a valid opcode that implies no starting bits. */
853 
854       case OP_ACCEPT:
855       case OP_ASSERT_ACCEPT:
856       case OP_ALLANY:
857       case OP_ANY:
858       case OP_ANYBYTE:
859       case OP_CIRC:
860       case OP_CIRCM:
861       case OP_CLOSE:
862       case OP_COMMIT:
863       case OP_COND:
864       case OP_CREF:
865       case OP_DEF:
866       case OP_DNCREF:
867       case OP_DNREF:
868       case OP_DNREFI:
869       case OP_DNRREF:
870       case OP_DOLL:
871       case OP_DOLLM:
872       case OP_END:
873       case OP_EOD:
874       case OP_EODN:
875       case OP_EXTUNI:
876       case OP_FAIL:
877       case OP_MARK:
878       case OP_NOT:
879       case OP_NOTEXACT:
880       case OP_NOTEXACTI:
881       case OP_NOTI:
882       case OP_NOTMINPLUS:
883       case OP_NOTMINPLUSI:
884       case OP_NOTMINQUERY:
885       case OP_NOTMINQUERYI:
886       case OP_NOTMINSTAR:
887       case OP_NOTMINSTARI:
888       case OP_NOTMINUPTO:
889       case OP_NOTMINUPTOI:
890       case OP_NOTPLUS:
891       case OP_NOTPLUSI:
892       case OP_NOTPOSPLUS:
893       case OP_NOTPOSPLUSI:
894       case OP_NOTPOSQUERY:
895       case OP_NOTPOSQUERYI:
896       case OP_NOTPOSSTAR:
897       case OP_NOTPOSSTARI:
898       case OP_NOTPOSUPTO:
899       case OP_NOTPOSUPTOI:
900       case OP_NOTPROP:
901       case OP_NOTQUERY:
902       case OP_NOTQUERYI:
903       case OP_NOTSTAR:
904       case OP_NOTSTARI:
905       case OP_NOTUPTO:
906       case OP_NOTUPTOI:
907       case OP_NOT_HSPACE:
908       case OP_NOT_VSPACE:
909       case OP_PRUNE:
910       case OP_PRUNE_ARG:
911       case OP_RECURSE:
912       case OP_REF:
913       case OP_REFI:
914       case OP_REVERSE:
915       case OP_RREF:
916       case OP_SCOND:
917       case OP_SET_SOM:
918       case OP_SKIP:
919       case OP_SKIP_ARG:
920       case OP_SOD:
921       case OP_SOM:
922       case OP_THEN:
923       case OP_THEN_ARG:
924       return SSB_FAIL;
925 
926       /* A "real" property test implies no starting bits, but the fake property
927       PT_CLIST identifies a list of characters. These lists are short, as they
928       are used for characters with more than one "other case", so there is no
929       point in recognizing them for OP_NOTPROP. */
930 
931       case OP_PROP:
932       if (tcode[1] != PT_CLIST) return SSB_FAIL;
933         {
934         const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
935         while ((c = *p++) < NOTACHAR)
936           {
937 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
938           if (utf)
939             {
940             pcre_uchar buff[6];
941             (void)PRIV(ord2utf)(c, buff);
942             c = buff[0];
943             }
944 #endif
945           if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
946           }
947         }
948       try_next = FALSE;
949       break;
950 
951       /* We can ignore word boundary tests. */
952 
953       case OP_WORD_BOUNDARY:
954       case OP_NOT_WORD_BOUNDARY:
955       tcode++;
956       break;
957 
958       /* If we hit a bracket or a positive lookahead assertion, recurse to set
959       bits from within the subpattern. If it can't find anything, we have to
960       give up. If it finds some mandatory character(s), we are done for this
961       branch. Otherwise, carry on scanning after the subpattern. */
962 
963       case OP_BRA:
964       case OP_SBRA:
965       case OP_CBRA:
966       case OP_SCBRA:
967       case OP_BRAPOS:
968       case OP_SBRAPOS:
969       case OP_CBRAPOS:
970       case OP_SCBRAPOS:
971       case OP_ONCE:
972       case OP_ONCE_NC:
973       case OP_ASSERT:
974       rc = set_start_bits(tcode, start_bits, utf, cd);
975       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
976       if (rc == SSB_DONE) try_next = FALSE; else
977         {
978         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
979         tcode += 1 + LINK_SIZE;
980         }
981       break;
982 
983       /* If we hit ALT or KET, it means we haven't found anything mandatory in
984       this branch, though we might have found something optional. For ALT, we
985       continue with the next alternative, but we have to arrange that the final
986       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
987       return SSB_CONTINUE: if this is the top level, that indicates failure,
988       but after a nested subpattern, it causes scanning to continue. */
989 
990       case OP_ALT:
991       yield = SSB_CONTINUE;
992       try_next = FALSE;
993       break;
994 
995       case OP_KET:
996       case OP_KETRMAX:
997       case OP_KETRMIN:
998       case OP_KETRPOS:
999       return SSB_CONTINUE;
1000 
1001       /* Skip over callout */
1002 
1003       case OP_CALLOUT:
1004       tcode += 2 + 2*LINK_SIZE;
1005       break;
1006 
1007       /* Skip over lookbehind and negative lookahead assertions */
1008 
1009       case OP_ASSERT_NOT:
1010       case OP_ASSERTBACK:
1011       case OP_ASSERTBACK_NOT:
1012       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1013       tcode += 1 + LINK_SIZE;
1014       break;
1015 
1016       /* BRAZERO does the bracket, but carries on. */
1017 
1018       case OP_BRAZERO:
1019       case OP_BRAMINZERO:
1020       case OP_BRAPOSZERO:
1021       rc = set_start_bits(++tcode, start_bits, utf, cd);
1022       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1023 /* =========================================================================
1024       See the comment at the head of this function concerning the next line,
1025       which was an old fudge for the benefit of OS/2.
1026       dummy = 1;
1027   ========================================================================= */
1028       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1029       tcode += 1 + LINK_SIZE;
1030       break;
1031 
1032       /* SKIPZERO skips the bracket. */
1033 
1034       case OP_SKIPZERO:
1035       tcode++;
1036       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1037       tcode += 1 + LINK_SIZE;
1038       break;
1039 
1040       /* Single-char * or ? sets the bit and tries the next item */
1041 
1042       case OP_STAR:
1043       case OP_MINSTAR:
1044       case OP_POSSTAR:
1045       case OP_QUERY:
1046       case OP_MINQUERY:
1047       case OP_POSQUERY:
1048       tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1049       break;
1050 
1051       case OP_STARI:
1052       case OP_MINSTARI:
1053       case OP_POSSTARI:
1054       case OP_QUERYI:
1055       case OP_MINQUERYI:
1056       case OP_POSQUERYI:
1057       tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1058       break;
1059 
1060       /* Single-char upto sets the bit and tries the next */
1061 
1062       case OP_UPTO:
1063       case OP_MINUPTO:
1064       case OP_POSUPTO:
1065       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1066       break;
1067 
1068       case OP_UPTOI:
1069       case OP_MINUPTOI:
1070       case OP_POSUPTOI:
1071       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1072       break;
1073 
1074       /* At least one single char sets the bit and stops */
1075 
1076       case OP_EXACT:
1077       tcode += IMM2_SIZE;
1078       /* Fall through */
1079       case OP_CHAR:
1080       case OP_PLUS:
1081       case OP_MINPLUS:
1082       case OP_POSPLUS:
1083       (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1084       try_next = FALSE;
1085       break;
1086 
1087       case OP_EXACTI:
1088       tcode += IMM2_SIZE;
1089       /* Fall through */
1090       case OP_CHARI:
1091       case OP_PLUSI:
1092       case OP_MINPLUSI:
1093       case OP_POSPLUSI:
1094       (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1095       try_next = FALSE;
1096       break;
1097 
1098       /* Special spacing and line-terminating items. These recognize specific
1099       lists of characters. The difference between VSPACE and ANYNL is that the
1100       latter can match the two-character CRLF sequence, but that is not
1101       relevant for finding the first character, so their code here is
1102       identical. */
1103 
1104       case OP_HSPACE:
1105       SET_BIT(CHAR_HT);
1106       SET_BIT(CHAR_SPACE);
1107 #ifdef SUPPORT_UTF
1108       if (utf)
1109         {
1110 #ifdef COMPILE_PCRE8
1111         SET_BIT(0xC2);  /* For U+00A0 */
1112         SET_BIT(0xE1);  /* For U+1680, U+180E */
1113         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1114         SET_BIT(0xE3);  /* For U+3000 */
1115 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1116         SET_BIT(0xA0);
1117         SET_BIT(0xFF);  /* For characters > 255 */
1118 #endif  /* COMPILE_PCRE[8|16|32] */
1119         }
1120       else
1121 #endif /* SUPPORT_UTF */
1122         {
1123 #ifndef EBCDIC
1124         SET_BIT(0xA0);
1125 #endif  /* Not EBCDIC */
1126 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1127         SET_BIT(0xFF);  /* For characters > 255 */
1128 #endif  /* COMPILE_PCRE[16|32] */
1129         }
1130       try_next = FALSE;
1131       break;
1132 
1133       case OP_ANYNL:
1134       case OP_VSPACE:
1135       SET_BIT(CHAR_LF);
1136       SET_BIT(CHAR_VT);
1137       SET_BIT(CHAR_FF);
1138       SET_BIT(CHAR_CR);
1139 #ifdef SUPPORT_UTF
1140       if (utf)
1141         {
1142 #ifdef COMPILE_PCRE8
1143         SET_BIT(0xC2);  /* For U+0085 */
1144         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1145 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1146         SET_BIT(CHAR_NEL);
1147         SET_BIT(0xFF);  /* For characters > 255 */
1148 #endif  /* COMPILE_PCRE[8|16|32] */
1149         }
1150       else
1151 #endif /* SUPPORT_UTF */
1152         {
1153         SET_BIT(CHAR_NEL);
1154 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1155         SET_BIT(0xFF);  /* For characters > 255 */
1156 #endif
1157         }
1158       try_next = FALSE;
1159       break;
1160 
1161       /* Single character types set the bits and stop. Note that if PCRE_UCP
1162       is set, we do not see these op codes because \d etc are converted to
1163       properties. Therefore, these apply in the case when only characters less
1164       than 256 are recognized to match the types. */
1165 
1166       case OP_NOT_DIGIT:
1167       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1168       try_next = FALSE;
1169       break;
1170 
1171       case OP_DIGIT:
1172       set_type_bits(start_bits, cbit_digit, table_limit, cd);
1173       try_next = FALSE;
1174       break;
1175 
1176       /* The cbit_space table has vertical tab as whitespace; we no longer
1177       have to play fancy tricks because Perl added VT to its whitespace at
1178       release 5.18. PCRE added it at release 8.34. */
1179 
1180       case OP_NOT_WHITESPACE:
1181       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1182       try_next = FALSE;
1183       break;
1184 
1185       case OP_WHITESPACE:
1186       set_type_bits(start_bits, cbit_space, table_limit, cd);
1187       try_next = FALSE;
1188       break;
1189 
1190       case OP_NOT_WORDCHAR:
1191       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1192       try_next = FALSE;
1193       break;
1194 
1195       case OP_WORDCHAR:
1196       set_type_bits(start_bits, cbit_word, table_limit, cd);
1197       try_next = FALSE;
1198       break;
1199 
1200       /* One or more character type fudges the pointer and restarts, knowing
1201       it will hit a single character type and stop there. */
1202 
1203       case OP_TYPEPLUS:
1204       case OP_TYPEMINPLUS:
1205       case OP_TYPEPOSPLUS:
1206       tcode++;
1207       break;
1208 
1209       case OP_TYPEEXACT:
1210       tcode += 1 + IMM2_SIZE;
1211       break;
1212 
1213       /* Zero or more repeats of character types set the bits and then
1214       try again. */
1215 
1216       case OP_TYPEUPTO:
1217       case OP_TYPEMINUPTO:
1218       case OP_TYPEPOSUPTO:
1219       tcode += IMM2_SIZE;  /* Fall through */
1220 
1221       case OP_TYPESTAR:
1222       case OP_TYPEMINSTAR:
1223       case OP_TYPEPOSSTAR:
1224       case OP_TYPEQUERY:
1225       case OP_TYPEMINQUERY:
1226       case OP_TYPEPOSQUERY:
1227       switch(tcode[1])
1228         {
1229         default:
1230         case OP_ANY:
1231         case OP_ALLANY:
1232         return SSB_FAIL;
1233 
1234         case OP_HSPACE:
1235         SET_BIT(CHAR_HT);
1236         SET_BIT(CHAR_SPACE);
1237 #ifdef SUPPORT_UTF
1238         if (utf)
1239           {
1240 #ifdef COMPILE_PCRE8
1241           SET_BIT(0xC2);  /* For U+00A0 */
1242           SET_BIT(0xE1);  /* For U+1680, U+180E */
1243           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1244           SET_BIT(0xE3);  /* For U+3000 */
1245 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1246           SET_BIT(0xA0);
1247           SET_BIT(0xFF);  /* For characters > 255 */
1248 #endif  /* COMPILE_PCRE[8|16|32] */
1249           }
1250         else
1251 #endif /* SUPPORT_UTF */
1252 #ifndef EBCDIC
1253           SET_BIT(0xA0);
1254 #endif  /* Not EBCDIC */
1255         break;
1256 
1257         case OP_ANYNL:
1258         case OP_VSPACE:
1259         SET_BIT(CHAR_LF);
1260         SET_BIT(CHAR_VT);
1261         SET_BIT(CHAR_FF);
1262         SET_BIT(CHAR_CR);
1263 #ifdef SUPPORT_UTF
1264         if (utf)
1265           {
1266 #ifdef COMPILE_PCRE8
1267           SET_BIT(0xC2);  /* For U+0085 */
1268           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1269 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1270           SET_BIT(CHAR_NEL);
1271           SET_BIT(0xFF);  /* For characters > 255 */
1272 #endif  /* COMPILE_PCRE16 */
1273           }
1274         else
1275 #endif /* SUPPORT_UTF */
1276           SET_BIT(CHAR_NEL);
1277         break;
1278 
1279         case OP_NOT_DIGIT:
1280         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1281         break;
1282 
1283         case OP_DIGIT:
1284         set_type_bits(start_bits, cbit_digit, table_limit, cd);
1285         break;
1286 
1287         /* The cbit_space table has vertical tab as whitespace; we no longer
1288         have to play fancy tricks because Perl added VT to its whitespace at
1289         release 5.18. PCRE added it at release 8.34. */
1290 
1291         case OP_NOT_WHITESPACE:
1292         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1293         break;
1294 
1295         case OP_WHITESPACE:
1296         set_type_bits(start_bits, cbit_space, table_limit, cd);
1297         break;
1298 
1299         case OP_NOT_WORDCHAR:
1300         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1301         break;
1302 
1303         case OP_WORDCHAR:
1304         set_type_bits(start_bits, cbit_word, table_limit, cd);
1305         break;
1306         }
1307 
1308       tcode += 2;
1309       break;
1310 
1311       /* Character class where all the information is in a bit map: set the
1312       bits and either carry on or not, according to the repeat count. If it was
1313       a negative class, and we are operating with UTF-8 characters, any byte
1314       with a value >= 0xc4 is a potentially valid starter because it starts a
1315       character with a value > 255. */
1316 
1317 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1318       case OP_XCLASS:
1319       if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1320         return SSB_FAIL;
1321       /* All bits are set. */
1322       if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1323         return SSB_FAIL;
1324 #endif
1325       /* Fall through */
1326 
1327       case OP_NCLASS:
1328 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1329       if (utf)
1330         {
1331         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1332         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1333         }
1334 #endif
1335 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1336       SET_BIT(0xFF);                         /* For characters > 255 */
1337 #endif
1338       /* Fall through */
1339 
1340       case OP_CLASS:
1341         {
1342         pcre_uint8 *map;
1343 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1344         map = NULL;
1345         if (*tcode == OP_XCLASS)
1346           {
1347           if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1348             map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1349           tcode += GET(tcode, 1);
1350           }
1351         else
1352 #endif
1353           {
1354           tcode++;
1355           map = (pcre_uint8 *)tcode;
1356           tcode += 32 / sizeof(pcre_uchar);
1357           }
1358 
1359         /* In UTF-8 mode, the bits in a bit map correspond to character
1360         values, not to byte values. However, the bit map we are constructing is
1361         for byte values. So we have to do a conversion for characters whose
1362         value is > 127. In fact, there are only two possible starting bytes for
1363         characters in the range 128 - 255. */
1364 
1365 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1366         if (map != NULL)
1367 #endif
1368           {
1369 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1370           if (utf)
1371             {
1372             for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1373             for (c = 128; c < 256; c++)
1374               {
1375               if ((map[c/8] & (1 << (c&7))) != 0)
1376                 {
1377                 int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1378                 start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
1379                 c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1380                 }
1381               }
1382             }
1383           else
1384 #endif
1385             {
1386             /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1387             for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1388             }
1389           }
1390 
1391         /* Advance past the bit map, and act on what follows. For a zero
1392         minimum repeat, continue; otherwise stop processing. */
1393 
1394         switch (*tcode)
1395           {
1396           case OP_CRSTAR:
1397           case OP_CRMINSTAR:
1398           case OP_CRQUERY:
1399           case OP_CRMINQUERY:
1400           case OP_CRPOSSTAR:
1401           case OP_CRPOSQUERY:
1402           tcode++;
1403           break;
1404 
1405           case OP_CRRANGE:
1406           case OP_CRMINRANGE:
1407           case OP_CRPOSRANGE:
1408           if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1409             else try_next = FALSE;
1410           break;
1411 
1412           default:
1413           try_next = FALSE;
1414           break;
1415           }
1416         }
1417       break; /* End of bitmap class handling */
1418 
1419       }      /* End of switch */
1420     }        /* End of try_next loop */
1421 
1422   code += GET(code, 1);   /* Advance to next branch */
1423   }
1424 while (*code == OP_ALT);
1425 return yield;
1426 }
1427 
1428 
1429 
1430 
1431 
1432 /*************************************************
1433 *          Study a compiled expression           *
1434 *************************************************/
1435 
1436 /* This function is handed a compiled expression that it must study to produce
1437 information that will speed up the matching. It returns a pcre[16]_extra block
1438 which then gets handed back to pcre_exec().
1439 
1440 Arguments:
1441   re        points to the compiled expression
1442   options   contains option bits
1443   errorptr  points to where to place error messages;
1444             set NULL unless error
1445 
1446 Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1447               the appropriate flags set;
1448             NULL on error or if no optimization possible
1449 */
1450 
1451 #if defined COMPILE_PCRE8
1452 #if defined(ERLANG_INTEGRATION)
1453 PCRE_EXP_DEFN erts_pcre_extra * PCRE_CALL_CONVENTION
erts_pcre_study(const pcre * external_re,int options,const char ** errorptr)1454 erts_pcre_study(const pcre *external_re, int options, const char **errorptr)
1455 #else
1456 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1457 pcre_study(const pcre *external_re, int options, const char **errorptr)
1458 #endif
1459 #elif defined COMPILE_PCRE16
1460 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1461 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1462 #elif defined COMPILE_PCRE32
1463 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1464 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1465 #endif
1466 {
1467 int min;
1468 int count = 0;
1469 BOOL bits_set = FALSE;
1470 pcre_uint8 start_bits[32];
1471 PUBL(extra) *extra = NULL;
1472 pcre_study_data *study;
1473 const pcre_uint8 *tables;
1474 pcre_uchar *code;
1475 compile_data compile_block;
1476 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1477 
1478 
1479 *errorptr = NULL;
1480 
1481 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1482   {
1483   *errorptr = "argument is not a compiled regular expression";
1484   return NULL;
1485   }
1486 
1487 if ((re->flags & PCRE_MODE) == 0)
1488   {
1489 #if defined COMPILE_PCRE8
1490   *errorptr = "argument not compiled in 8 bit mode";
1491 #elif defined COMPILE_PCRE16
1492   *errorptr = "argument not compiled in 16 bit mode";
1493 #elif defined COMPILE_PCRE32
1494   *errorptr = "argument not compiled in 32 bit mode";
1495 #endif
1496   return NULL;
1497   }
1498 
1499 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1500   {
1501   *errorptr = "unknown or incorrect option bit(s) set";
1502   return NULL;
1503   }
1504 
1505 code = (pcre_uchar *)re + re->name_table_offset +
1506   (re->name_count * re->name_entry_size);
1507 
1508 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1509 a multiline pattern that matches only at "line starts", there is no point in
1510 seeking a list of starting bytes. */
1511 
1512 if ((re->options & PCRE_ANCHORED) == 0 &&
1513     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1514   {
1515   int rc;
1516 
1517   /* Set the character tables in the block that is passed around */
1518 
1519   tables = re->tables;
1520 
1521 #if defined COMPILE_PCRE8
1522 #if defined(ERLANG_INTEGRATION)
1523   if (tables == NULL)
1524     (void)erts_pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1525     (void *)(&tables));
1526 #else
1527   if (tables == NULL)
1528     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1529     (void *)(&tables));
1530 #endif
1531 #elif defined COMPILE_PCRE16
1532   if (tables == NULL)
1533     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1534     (void *)(&tables));
1535 #elif defined COMPILE_PCRE32
1536   if (tables == NULL)
1537     (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1538     (void *)(&tables));
1539 #endif
1540 
1541   compile_block.lcc = tables + lcc_offset;
1542   compile_block.fcc = tables + fcc_offset;
1543   compile_block.cbits = tables + cbits_offset;
1544   compile_block.ctypes = tables + ctypes_offset;
1545 
1546   /* See if we can find a fixed set of initial characters for the pattern. */
1547 
1548   memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1549   rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1550     &compile_block);
1551   bits_set = rc == SSB_DONE;
1552   if (rc == SSB_UNKNOWN)
1553     {
1554     *errorptr = "internal error: opcode not recognized";
1555     return NULL;
1556     }
1557   }
1558 
1559 /* Find the minimum length of subject string. */
1560 
1561 switch(min = find_minlength(re, code, code, re->options, NULL, &count))
1562   {
1563   case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1564   case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1565   default: break;
1566   }
1567 
1568 /* If a set of starting bytes has been identified, or if the minimum length is
1569 greater than zero, or if JIT optimization has been requested, or if
1570 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1571 pcre_study_data block. The study data is put in the latter, which is pointed to
1572 by the former, which may also get additional data set later by the calling
1573 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1574 save it in a field for returning via the pcre_fullinfo() function so that if it
1575 becomes variable in the future, we don't have to change that code. */
1576 
1577 if (bits_set || min > 0 || (options & (
1578 #ifdef SUPPORT_JIT
1579     PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1580     PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1581 #endif
1582     PCRE_STUDY_EXTRA_NEEDED)) != 0)
1583   {
1584   extra = (PUBL(extra) *)(PUBL(malloc))
1585     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1586   if (extra == NULL)
1587     {
1588     *errorptr = "failed to get memory";
1589     return NULL;
1590     }
1591 
1592   study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1593   extra->flags = PCRE_EXTRA_STUDY_DATA;
1594   extra->study_data = study;
1595 
1596   study->size = sizeof(pcre_study_data);
1597   study->flags = 0;
1598 
1599   /* Set the start bits always, to avoid unset memory errors if the
1600   study data is written to a file, but set the flag only if any of the bits
1601   are set, to save time looking when none are. */
1602 
1603   if (bits_set)
1604     {
1605     study->flags |= PCRE_STUDY_MAPPED;
1606     memcpy(study->start_bits, start_bits, sizeof(start_bits));
1607     }
1608   else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1609 
1610 #ifdef PCRE_DEBUG
1611   if (bits_set)
1612     {
1613     pcre_uint8 *ptr = start_bits;
1614     int i;
1615 
1616     printf("Start bits:\n");
1617     for (i = 0; i < 32; i++)
1618       printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1619     }
1620 #endif
1621 
1622   /* Always set the minlength value in the block, because the JIT compiler
1623   makes use of it. However, don't set the bit unless the length is greater than
1624   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1625   checking the zero case. */
1626 
1627   if (min > 0)
1628     {
1629     study->flags |= PCRE_STUDY_MINLEN;
1630     study->minlength = min;
1631     }
1632   else study->minlength = 0;
1633 
1634   /* If JIT support was compiled and requested, attempt the JIT compilation.
1635   If no starting bytes were found, and the minimum length is zero, and JIT
1636   compilation fails, abandon the extra block and return NULL, unless
1637   PCRE_STUDY_EXTRA_NEEDED is set. */
1638 
1639 #ifdef SUPPORT_JIT
1640   extra->executable_jit = NULL;
1641   if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1642     PRIV(jit_compile)(re, extra, JIT_COMPILE);
1643   if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1644     PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1645   if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1646     PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1647 
1648   if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1649       (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1650     {
1651 #if defined COMPILE_PCRE8
1652 #if defined(ERLANG_INTEGRATION)
1653     erts_pcre_free_study(extra);
1654 #else
1655     pcre_free_study(extra);
1656 #endif
1657 #elif defined COMPILE_PCRE16
1658     pcre16_free_study(extra);
1659 #elif defined COMPILE_PCRE32
1660     pcre32_free_study(extra);
1661 #endif
1662     extra = NULL;
1663     }
1664 #endif
1665   }
1666 
1667 return extra;
1668 }
1669 
1670 
1671 /*************************************************
1672 *          Free the study data                   *
1673 *************************************************/
1674 
1675 /* This function frees the memory that was obtained by pcre_study().
1676 
1677 Argument:   a pointer to the pcre[16]_extra block
1678 Returns:    nothing
1679 */
1680 
1681 #if defined COMPILE_PCRE8
1682 #if defined(ERLANG_INTEGRATION)
1683 PCRE_EXP_DEFN void
erts_pcre_free_study(erts_pcre_extra * extra)1684 erts_pcre_free_study(erts_pcre_extra *extra)
1685 #else
1686 PCRE_EXP_DEFN void
1687 pcre_free_study(pcre_extra *extra)
1688 #endif
1689 #elif defined COMPILE_PCRE16
1690 PCRE_EXP_DEFN void
1691 pcre16_free_study(pcre16_extra *extra)
1692 #elif defined COMPILE_PCRE32
1693 PCRE_EXP_DEFN void
1694 pcre32_free_study(pcre32_extra *extra)
1695 #endif
1696 {
1697 if (extra == NULL)
1698   return;
1699 #ifdef SUPPORT_JIT
1700 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1701      extra->executable_jit != NULL)
1702   PRIV(jit_free)(extra->executable_jit);
1703 #endif
1704 PUBL(free)(extra);
1705 }
1706 
1707 /* End of pcre_study.c */
1708