1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9            Copyright (c) 1997-2021 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17 
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21 
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43 
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 #define NLBLOCK cd             /* Block containing newline information */
50 #define PSSTART start_pattern  /* Field containing pattern start */
51 #define PSEND   end_pattern    /* Field containing pattern end */
52 
53 #include "pcre_internal.h"
54 
55 
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60 
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67 
68 
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a, using eight bits per byte with the least significant bit
first. The array argument and the whole expansion are parenthesized so that the
macro acts as a single expression whatever is passed to it, for example
SETBIT(map + 4, n). Note that b is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1U << ((b)&7)))
72 
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX (20 less) to allow for adding in group terminating bytes, so that we
don't have to check them every time. INT_MAX is not defined in this file;
presumably it comes from <limits.h> by way of pcre_internal.h - confirm. */

#define OFLOW_MAX (INT_MAX - 20)
79 
/* Forward declarations to allow mutual recursion. Both functions are static
(private to this file). The prototypes carry no parameter names; see the
function definitions for the meaning of each argument. */

static int
  add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
    const pcre_uint32 *, unsigned int);

static BOOL
  compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
    pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
    compile_data *, int *);
90 
91 
92 
93 /*************************************************
94 *      Code parameters and static tables         *
95 *************************************************/
96 
97 /* This value specifies the size of stack workspace that is used during the
98 first pre-compile phase that determines how much memory is required. The regex
99 is partly compiled into this space, but the compiled parts are discarded as
100 soon as they can be, so that hopefully there will never be an overrun. The code
101 does, however, check for an overrun. The largest amount I've seen used is 218,
102 so this number is very generous.
103 
104 The same workspace is used during the second, actual compile phase for
105 remembering forward references to groups so that they can be filled in at the
106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 is 4 there is plenty of room for most patterns. However, the memory can get
108 filled up by repetitions of forward references, for example patterns like
109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110 that the workspace is expanded using malloc() in this situation. The value
111 below is therefore a minimum, and we put a maximum on it for safety. The
112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113 kicks in at the same number of forward references in all cases. */
114 
#define COMPILE_WORK_SIZE (2048*LINK_SIZE)             /* Minimum workspace size */
#define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)  /* Cap: 100 times the minimum */
117 
118 /* This value determines the size of the initial vector that is used for
119 remembering named groups during the pre-compile. It is allocated on the stack,
120 but if it is too small, it is expanded using malloc(), in a similar way to the
121 workspace. The value is the number of slots in the list. */
122 
#define NAMED_GROUP_LIST_SIZE  20   /* Initial number of slots, not bytes */
124 
125 /* The overrun tests check for a slightly smaller size so that they detect the
126 overrun before it actually does run off the end of the data block. */
127 
#define WORK_SIZE_SAFETY_MARGIN (100)   /* How much smaller the checked size is */
129 
130 /* Private flags added to firstchar and reqchar. */
131 
#define REQ_CASELESS    (1U << 0)        /* Indicates caselessness */
#define REQ_VARY        (1U << 1)        /* Reqchar followed non-literal item */
/* Negative values for the firstchar and reqchar flags. Unlike the two unsigned
bit flags above, these are plain (signed) int constants. */
#define REQ_UNSET       (-2)
#define REQ_NONE        (-1)
137 
/* Repeated character flags. The constant has type long; its suffix is written
as an upper-case L because a lower-case l is easily misread as the digit 1. */

#define UTF_LENGTH     0x10000000L      /* The char contains its length. */
141 
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146 
147 #ifndef EBCDIC
148 
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
in UTF-8 mode. It holds one entry for each of the 75 characters from '0' to
'z', in ASCII order, two entries per row. The trailing comment on each row
names the characters that the row covers. */

static const short int escapes[] = {
     0,                       0,                          /* 0 1 */
     0,                       0,                          /* 2 3 */
     0,                       0,                          /* 4 5 */
     0,                       0,                          /* 6 7 */
     0,                       0,                          /* 8 9 */
     CHAR_COLON,              CHAR_SEMICOLON,             /* : ; */
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* < = */
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* > ? */
     CHAR_COMMERCIAL_AT,      -ESC_A,                     /* @ A */
     -ESC_B,                  -ESC_C,                     /* B C */
     -ESC_D,                  -ESC_E,                     /* D E */
     0,                       -ESC_G,                     /* F G */
     -ESC_H,                  0,                          /* H I */
     0,                       -ESC_K,                     /* J K */
     0,                       0,                          /* L M */
     -ESC_N,                  0,                          /* N O */
     -ESC_P,                  -ESC_Q,                     /* P Q */
     -ESC_R,                  -ESC_S,                     /* R S */
     0,                       0,                          /* T U */
     -ESC_V,                  -ESC_W,                     /* V W */
     -ESC_X,                  0,                          /* X Y */
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* Z [ */
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* \ ] */
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* ^ _ */
     CHAR_GRAVE_ACCENT,       ESC_a,                      /* ` a */
     -ESC_b,                  0,                          /* b c */
     -ESC_d,                  ESC_e,                      /* d e */
     ESC_f,                   0,                          /* f g */
     -ESC_h,                  0,                          /* h i */
     0,                       -ESC_k,                     /* j k */
     0,                       0,                          /* l m */
     ESC_n,                   0,                          /* n o */
     -ESC_p,                  0,                          /* p q */
     ESC_r,                   -ESC_s,                     /* r s */
     ESC_tee,                 0,                          /* t u */
     -ESC_v,                  -ESC_w,                     /* v w */
     0,                       0,                          /* x y */
     -ESC_z                                               /* z */
};
192 
193 #else
194 
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. Each
row holds eight entries; the comment at the start of a row gives, in
hexadecimal, the EBCDIC code of the row's first entry. The table therefore
covers codes 0x48 to 0xFF. */

static const short int escapes[] = {
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
/*  80 */     0, ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
/*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
/*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
};
222 
/* We also need a table of characters that may follow \c in an EBCDIC
environment for characters 0-31. The string holds 32 characters, '@' through
'_'. Presumably a character's position in the string is the control code that
it yields - confirm against the code that handles \c.
NOTE(review): the array is not declared const; if nothing writes to it, it
could be made const - confirm against its users before changing. */

static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
227 
228 #endif
229 
230 
231 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
232 searched linearly. Put all the names into a single string, in order to reduce
233 the number of relocations when a shared library is dynamically linked. The
234 string is built from string macros so that it works in UTF-8 mode on EBCDIC
235 platforms. */
236 
/* One entry in the table of verbs. The entry does not hold the verb's name
itself, only its length; the names are kept in the verbnames string. A value of
-1 in op or op_arg marks a form of the verb that has no opcode. */

typedef struct verbitem {
  int   len;                 /* Length of verb name */
  int   op;                  /* Op when no arg, or -1 if arg mandatory */
  int   op_arg;              /* Op when arg present, or -1 if not allowed */
} verbitem;
242 
/* The verb names, joined into one string in the same order as the entries of
the verbs table. The STRING_xxx0 macros are defined elsewhere; presumably each
expands to the name followed by a zero byte, with the last name (STRING_THEN)
relying on the terminator of the string literal itself - confirm. */

static const char verbnames[] =
  "\0"                       /* Empty name is a shorthand for MARK */
  STRING_MARK0
  STRING_ACCEPT0
  STRING_COMMIT0
  STRING_F0
  STRING_FAIL0
  STRING_PRUNE0
  STRING_SKIP0
  STRING_THEN;
253 
/* One entry per name in verbnames, in the same order. The trailing comments
give the name each entry belongs to, taken from the STRING_xxx macro names. */

static const verbitem verbs[] = {
  { 0, -1,        OP_MARK },               /* (empty name) */
  { 4, -1,        OP_MARK },               /* MARK */
  { 6, OP_ACCEPT, -1 },                    /* ACCEPT */
  { 6, OP_COMMIT, -1 },                    /* COMMIT */
  { 1, OP_FAIL,   -1 },                    /* F */
  { 4, OP_FAIL,   -1 },                    /* FAIL */
  { 5, OP_PRUNE,  OP_PRUNE_ARG },          /* PRUNE */
  { 4, OP_SKIP,   OP_SKIP_ARG  },          /* SKIP */
  { 4, OP_THEN,   OP_THEN_ARG  }           /* THEN */
};
265 
266 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
267 
268 
269 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
270 another regex library. */
271 
/* Replacement for [[:<:]] (start of word). The characters spell the
zero-terminated pattern text  \b(?=\w)  */
static const pcre_uchar sub_start_of_word[] = {
  CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
  CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
275 
/* Replacement for [[:>:]] (end of word). The characters spell the
zero-terminated pattern text  \b(?<=\w)  */
static const pcre_uchar sub_end_of_word[] = {
  CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
  CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
  CHAR_RIGHT_PARENTHESIS, '\0' };
280 
281 
282 /* Tables of names of POSIX character classes and their lengths. The names are
283 now all in a single string, to reduce the number of relocations when a shared
284 library is dynamically loaded. The list of lengths is terminated by a zero
285 length entry. The first three must be alpha, lower, upper, as this is assumed
286 for handling case independence. The indices for graph, print, and punct are
287 needed, so identify them. */
288 
/* The fourteen class names in one string. The STRING_xxx0 macros are defined
elsewhere; presumably each expands to the name followed by a zero byte, with
the last name (STRING_xdigit) relying on the string literal's own terminator -
confirm. */
static const char posix_names[] =
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  STRING_word0  STRING_xdigit;
294 
/* Lengths of the names in posix_names, in the same order: twelve names of
five characters (alpha to space), then word (4) and xdigit (6). The final zero
terminates the list. */
static const pcre_uint8 posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
297 
/* Zero-based positions of three classes within posix_names. */
#define PC_GRAPH  8          /* graph */
#define PC_PRINT  9          /* print */
#define PC_PUNCT 10          /* punct */
301 
302 
303 /* Table of class bit maps for each POSIX class. Each class is formed from a
304 base map, with an optional addition or removal of another map. Then, for some
305 classes, there is some additional tweaking: for [:blank:] the vertical space
306 characters are removed, and for [:alpha:] and [:alnum:] the underscore
307 character is removed. The triples in the table consist of the base map offset,
308 second map offset or -1 if no second map, and a non-negative value for map
309 addition or a negative value for map subtraction (if there are two maps). The
310 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
311 remove vertical space characters, 2 => remove underscore. */
312 
/* Fourteen triples, one per class, in the same order as the names in
posix_names. Each row below is one triple. */
static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii */
  cbit_space, -1,          1,             /* blank - a GNU extension */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
329 
330 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
331 Unicode property escapes. */
332 
333 #ifdef SUPPORT_UCP
/* Each array below is a zero-terminated pattern fragment; the comment in front
of it shows the text that its characters spell. */

/* \P{Nd} */
static const pcre_uchar string_PNd[]  = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Nd} */
static const pcre_uchar string_pNd[]  = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Xsp} */
static const pcre_uchar string_PXsp[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Xsp} */
static const pcre_uchar string_pXsp[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Xwd} */
static const pcre_uchar string_PXwd[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Xwd} */
static const pcre_uchar string_pXwd[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
352 
/* Six entries, in the order \D \d \S \s \W \w; each points to the property
escape that replaces the corresponding escape sequence.
NOTE(review): the pointed-to characters are const but the array of pointers
itself is not; if nothing assigns to its elements it could be declared
"const pcre_uchar * const" - confirm against its users before changing. */
static const pcre_uchar *substitutes[] = {
  string_PNd,           /* \D */
  string_pNd,           /* \d */
  string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
  string_pXsp,          /* \s */   /* space and POSIX space are the same. */
  string_PXwd,          /* \W */
  string_pXwd           /* \w */
};
361 
362 /* The POSIX class substitutes must be in the order of the POSIX class names,
363 defined above, and there are both positive and negative cases. NULL means no
364 general substitute of a Unicode property escape (\p or \P). However, for some
365 POSIX classes (e.g. graph, print, punct) a special property code is compiled
366 directly. */
367 
/* Each array below is a zero-terminated pattern fragment; the comment in front
of it shows the text that its characters spell. The lower-case \p (or \h)
forms come first, followed by their upper-case (negated) counterparts. */

/* \p{L} */
static const pcre_uchar string_pL[] =   {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Ll} */
static const pcre_uchar string_pLl[] =  {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Lu} */
static const pcre_uchar string_pLu[] =  {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Xan} */
static const pcre_uchar string_pXan[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \h */
static const pcre_uchar string_h[] =    {
  CHAR_BACKSLASH, CHAR_h, '\0' };
/* \p{Xps} */
static const pcre_uchar string_pXps[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{L} */
static const pcre_uchar string_PL[] =   {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Ll} */
static const pcre_uchar string_PLl[] =  {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Lu} */
static const pcre_uchar string_PLu[] =  {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Xan} */
static const pcre_uchar string_PXan[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \H */
static const pcre_uchar string_H[] =    {
  CHAR_BACKSLASH, CHAR_H, '\0' };
/* \P{Xps} */
static const pcre_uchar string_PXps[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
402 
/* Twenty-eight entries: the fourteen classes in posix_names order, followed by
the same fourteen classes negated. NULL marks a class with no substitute
string. */
static const pcre_uchar *posix_substitutes[] = {
  string_pL,            /* alpha */
  string_pLl,           /* lower */
  string_pLu,           /* upper */
  string_pXan,          /* alnum */
  NULL,                 /* ascii */
  string_h,             /* blank */
  NULL,                 /* cntrl */
  string_pNd,           /* digit */
  NULL,                 /* graph */
  NULL,                 /* print */
  NULL,                 /* punct */
  string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
  string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
  NULL,                 /* xdigit */
  /* Negated cases */
  string_PL,            /* ^alpha */
  string_PLl,           /* ^lower */
  string_PLu,           /* ^upper */
  string_PXan,          /* ^alnum */
  NULL,                 /* ^ascii */
  string_H,             /* ^blank */
  NULL,                 /* ^cntrl */
  string_PNd,           /* ^digit */
  NULL,                 /* ^graph */
  NULL,                 /* ^print */
  NULL,                 /* ^punct */
  string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
  string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
  NULL                  /* ^xdigit */
};
/* Number of entries in the posix_substitutes table. */
#define POSIX_SUBSIZE (sizeof posix_substitutes / sizeof posix_substitutes[0])
435 #endif
436 
/* STRING() turns its argument into a string literal exactly as written.
XSTRING() lets its argument be macro-expanded first and then stringifies the
result, so it can be used to embed the value of a macro in a string. */
#define STRING(text)   #text
#define XSTRING(macro) STRING(macro)
439 
440 /* The texts of compile-time error messages. These are "char *" because they
441 are passed to the outside world. Do not ever re-use any error number, because
442 they are documented. Always add a new error instead. Messages marked DEAD below
443 are no longer used. This used to be a table of strings, but in order to reduce
444 the number of relocations needed when a shared library is loaded dynamically,
445 it is now one long string. We cannot use a table of offsets, because the
446 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
447 simply count through to the one we want - this isn't a performance issue
448 because these strings are used only when there is a compilation error.
449 
450 Each substring ends with \0 to insert a null character. This includes the final
451 substring, so that the whole string ends with \0\0, which can be detected when
452 counting through. */
453 
/* The messages, in error-number order starting from zero. A numeric marker is
placed in front of every fifth message and gives the number of the message that
follows it. Error 68 has two alternative texts, selected by EBCDIC. */

static const char error_texts[] =
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "internal error: invalid forward reference offset\0"
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?( or (?(?C)\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is compiled without UTF support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{} or \\o{} is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  /* 60 */
  "(*VERB) not recognized or malformed\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */
  "different names for subpatterns of the same number are not allowed\0"
  "(*MARK) must have an argument\0"
  "this version of PCRE is not compiled with Unicode property support\0"
  /* Error 68: the wording depends on the character set in use */
#ifndef EBCDIC
  "\\c must be followed by an ASCII character\0"
#else
  "\\c must be followed by a letter or one of [\\]^_?\0"
#endif
  "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  /* 70 */
  "internal error: unknown opcode in find_fixedlength()\0"
  "\\N is not supported in a class\0"
  "too many forward references\0"
  "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  "invalid UTF-16 string\0"
  /* 75 */
  "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
  "character value in \\u.... sequence is too large\0"
  "invalid UTF-32 string\0"
  "setting UTF is disabled by the application\0"
  "non-hex character in \\x{} (closing brace missing?)\0"
  /* 80 */
  "non-octal character in \\o{} (closing brace missing?)\0"
  "missing opening brace after \\o\0"
  "parentheses are too deeply nested\0"
  "invalid range in character class\0"
  "group name must start with a non-digit\0"
  /* 85 */
  "parentheses are too deeply nested (stack check)\0"
  "digits missing in \\x{} or \\o{}\0"
  "regular expression is too complicated\0"
  ;
565 
566 /* Table to identify digits and hex digits. This is used when compiling
567 patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
570 a private table here. It costs 256 bytes, but it is a lot faster than doing
571 character value tests (at least in some simple cases I timed), and in some
572 applications one wants PCRE to compile efficiently as well as match
573 efficiently.
574 
575 For convenience, we use the same bit definitions as in chartables:
576 
577   0x04   decimal digit
578   0x08   hexadecimal digit
579 
580 Then we can use ctype_digit and ctype_xdigit in the code. */
581 
/* Using a simple comparison for decimal numbers rather than a memory read
is much faster, and the resulting code is simpler (the compiler turns it
into a subtraction and unsigned comparison). The macro yields non-zero when x
lies between CHAR_0 and CHAR_9 inclusive. Note that x is evaluated twice, so
the argument must not have side effects. */

#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
587 
588 #ifndef EBCDIC
589 
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
UTF-8 mode. The table has 256 entries, one per character code, eight per row;
the comment on each row shows the codes it covers. An entry of 0x0c (0x04 |
0x08) marks a character that is both a decimal and a hexadecimal digit; an
entry of 0x08 marks a hexadecimal digit that is not a decimal digit. */

static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
627 
628 #else
629 
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode.
The table has 256 entries, eight per row. The comment on each row shows the
codes it covers and, on every second row, the hexadecimal code of the first
entry. As in the ASCII table, 0x0c marks a decimal (and hexadecimal) digit and
0x08 marks a hexadecimal digit that is not a decimal digit. */

static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
666 
/* A second 256-entry table for the EBCDIC case, eight entries per row, with
the codes covered shown on each row. Its entries use more bits than digitab
(values 0x01, 0x10, 0x12, 0x1a, 0x1c and 0x80 appear). Only the 0x04 (decimal
digit) and 0x08 (hexadecimal digit) bits are explained in this part of the
file; presumably the remaining bits follow the ctype_xxx definitions used for
chartables - confirm. */

static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
700 #endif
701 
702 
703 /* This table is used to check whether auto-possessification is possible
704 between adjacent character-type opcodes. The left-hand (repeated) opcode is
705 used to select the row, and the right-hand opcode is used to select the column.
706 A value of 1 means that auto-possessification is OK. For example, the second
707 value in the first row means that \D+\d can be turned into \D++\d.
708 
709 The Unicode property types (\P and \p) have to be present to fill out the table
710 because of what their opcode values are, but the table values should always be
711 zero because property types are handled separately in the code. The last four
712 columns apply to items that cannot be repeated, so there is no need to have
713 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
714 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
715 
716 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
717 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
718 
719 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
720 /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
721   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
722   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
723   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
724   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
725   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
726   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
727   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
728   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
729   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
730   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
731   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
732   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
733   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
734   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
735   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
736   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
737   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
738 };
739 
740 
741 /* This table is used to check whether auto-possessification is possible
742 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
743 left-hand (repeated) opcode is used to select the row, and the right-hand
744 opcode is used to select the column. The values are as follows:
745 
746   0   Always return FALSE (never auto-possessify)
747   1   Character groups are distinct (possessify if both are OP_PROP)
748   2   Check character categories in the same group (general or particular)
749   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
750 
751   4   Check left general category vs right particular category
752   5   Check right general category vs left particular category
753 
754   6   Left alphanum vs right general category
755   7   Left space vs right general category
756   8   Left word vs right general category
757 
758   9   Right alphanum vs left general category
759  10   Right space vs left general category
760  11   Right word vs left general category
761 
762  12   Left alphanum vs right particular category
763  13   Left space vs right particular category
764  14   Left word vs right particular category
765 
766  15   Right alphanum vs left particular category
767  16   Right space vs left particular category
768  17   Right word vs left particular category
769 */
770 
771 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
772 /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
773   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
774   { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
775   { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
776   { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
777   { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
778   { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
779   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
780   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
781   { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
782   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
783   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
784 };
785 
786 /* This table is used to check whether auto-possessification is possible
787 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
788 specifies a general category and the other specifies a particular category. The
789 row is selected by the general category and the column by the particular
790 category. The value is 1 if the particular category is not part of the general
791 category. */
792 
793 static const pcre_uint8 catposstab[7][30] = {
794 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
795   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
796   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
797   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
798   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
799   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
800   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
801   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
802 };
803 
804 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
805 a general or particular category. The properties in each row are those
806 that apply to the character set in question. Duplication means that a little
807 unnecessary work is done when checking, but this keeps things much simpler
808 because they can all use the same code. For more details see the comment where
809 this table is used.
810 
811 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
812 "space", but from Perl 5.18 it's included, so both categories are treated the
813 same here. */
814 
815 static const pcre_uint8 posspropstab[3][4] = {
816   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
817   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
818   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
819 };
820 
821 /* This table is used when converting repeating opcodes into possessified
822 versions as a result of an explicit possessive quantifier such as ++. A zero
823 value means there is no possessified version - in those cases the item in
824 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
825 because all relevant opcodes are less than that. */
826 
827 static const pcre_uint8 opcode_possessify[] = {
828   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
829   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
830 
831   0,                       /* NOTI */
832   OP_POSSTAR, 0,           /* STAR, MINSTAR */
833   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
834   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
835   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
836   0,                       /* EXACT */
837   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
838 
839   OP_POSSTARI, 0,          /* STARI, MINSTARI */
840   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
841   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
842   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
843   0,                       /* EXACTI */
844   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
845 
846   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
847   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
848   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
849   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
850   0,                       /* NOTEXACT */
851   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
852 
853   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
854   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
855   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
856   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
857   0,                       /* NOTEXACTI */
858   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
859 
860   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
861   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
862   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
863   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
864   0,                       /* TYPEEXACT */
865   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
866 
867   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
868   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
869   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
870   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
871   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
872 
873   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
874   0, 0,                    /* REF, REFI */
875   0, 0,                    /* DNREF, DNREFI */
876   0, 0                     /* RECURSE, CALLOUT */
877 };
878 
879 
880 
881 /*************************************************
882 *            Find an error text                  *
883 *************************************************/
884 
885 /* The error texts are now all in one long string, to save on relocations. As
886 some of the text is of unknown length, we can't use a table of offsets.
887 Instead, just count through the strings. This is not a performance issue
888 because it happens only when there has been a compilation error.
889 
890 Argument:   the error number
891 Returns:    pointer to the error string
892 */
893 
894 static const char *
find_error_text(int n)895 find_error_text(int n)
896 {
897 const char *s = error_texts;
898 for (; n > 0; n--)
899   {
900   while (*s++ != CHAR_NULL) {};
901   if (*s == CHAR_NULL) return "Error text not found (please report)";
902   }
903 return s;
904 }
905 
906 
907 
908 /*************************************************
909 *           Expand the workspace                 *
910 *************************************************/
911 
912 /* This function is called during the second compiling phase, if the number of
913 forward references fills the existing workspace, which is originally a block on
914 the stack. A larger block is obtained from malloc() unless the ultimate limit
915 has been reached or the increase will be rather small.
916 
917 Argument: pointer to the compile data block
918 Returns:  0 if all went well, else an error number
919 */
920 
921 static int
expand_workspace(compile_data * cd)922 expand_workspace(compile_data *cd)
923 {
924 pcre_uchar *newspace;
925 int newsize = cd->workspace_size * 2;
926 
927 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
928 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
929     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
930  return ERR72;
931 
932 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
933 if (newspace == NULL) return ERR21;
934 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
935 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
936 if (cd->workspace_size > COMPILE_WORK_SIZE)
937   (PUBL(free))((void *)cd->start_workspace);
938 cd->start_workspace = newspace;
939 cd->workspace_size = newsize;
940 return 0;
941 }
942 
943 
944 
945 /*************************************************
946 *            Check for counted repeat            *
947 *************************************************/
948 
949 /* This function is called when a '{' is encountered in a place where it might
950 start a quantifier. It looks ahead to see if it really is a quantifier or not.
951 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
952 where the ddds are digits.
953 
954 Arguments:
955   p         pointer to the first char after '{'
956 
957 Returns:    TRUE or FALSE
958 */
959 
960 static BOOL
is_counted_repeat(const pcre_uchar * p)961 is_counted_repeat(const pcre_uchar *p)
962 {
963 if (!IS_DIGIT(*p)) return FALSE;
964 p++;
965 while (IS_DIGIT(*p)) p++;
966 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
967 
968 if (*p++ != CHAR_COMMA) return FALSE;
969 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
970 
971 if (!IS_DIGIT(*p)) return FALSE;
972 p++;
973 while (IS_DIGIT(*p)) p++;
974 
975 return (*p == CHAR_RIGHT_CURLY_BRACKET);
976 }
977 
978 
979 
980 /*************************************************
981 *            Handle escapes                      *
982 *************************************************/
983 
984 /* This function is called when a \ has been encountered. It either returns a
985 positive value for a simple escape such as \n, or 0 for a data character which
986 will be placed in chptr. A backreference to group n is returned as negative n.
987 When UTF-8 is enabled, a positive value greater than 255 may be returned in
988 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
989 character of the escape sequence.
990 
991 Arguments:
992   ptrptr         points to the pattern position pointer
993   chptr          points to a returned data character
994   errorcodeptr   points to the errorcode variable
995   bracount       number of previous extracting brackets
996   options        the options bits
997   isclass        TRUE if inside a character class
998 
999 Returns:         zero => a data character
1000                  positive => a special escape sequence
1001                  negative => a back reference
1002                  on error, errorcodeptr is set
1003 */
1004 
1005 static int
check_escape(const pcre_uchar ** ptrptr,pcre_uint32 * chptr,int * errorcodeptr,int bracount,int options,BOOL isclass)1006 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
1007   int bracount, int options, BOOL isclass)
1008 {
1009 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
1010 BOOL utf = (options & PCRE_UTF8) != 0;
1011 const pcre_uchar *ptr = *ptrptr + 1;
1012 pcre_uint32 c;
1013 int escape = 0;
1014 int i;
1015 
1016 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
1017 ptr--;                            /* Set pointer back to the last byte */
1018 
1019 /* If backslash is at the end of the pattern, it's an error. */
1020 
1021 if (c == CHAR_NULL) *errorcodeptr = ERR1;
1022 
1023 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
1024 in a table. A non-zero result is something that can be returned immediately.
1025 Otherwise further processing may be required. */
1026 
1027 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1028 /* Not alphanumeric */
1029 else if (c < CHAR_0 || c > CHAR_z) {}
1030 else if ((i = escapes[c - CHAR_0]) != 0)
1031   { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1032 
1033 #else           /* EBCDIC coding */
1034 /* Not alphanumeric */
1035 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1036 else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1037 #endif
1038 
1039 /* Escapes that need further processing, or are illegal. */
1040 
1041 else
1042   {
1043   const pcre_uchar *oldptr;
1044   BOOL braced, negated, overflow;
1045   int s;
1046 
1047   switch (c)
1048     {
1049     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1050     error. */
1051 
1052     case CHAR_l:
1053     case CHAR_L:
1054     *errorcodeptr = ERR37;
1055     break;
1056 
1057     case CHAR_u:
1058     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1059       {
1060       /* In JavaScript, \u must be followed by four hexadecimal numbers.
1061       Otherwise it is a lowercase u letter. */
1062       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1063         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1064         && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1065         && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1066         {
1067         c = 0;
1068         for (i = 0; i < 4; ++i)
1069           {
1070           register pcre_uint32 cc = *(++ptr);
1071 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1072           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1073           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1074 #else           /* EBCDIC coding */
1075           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1076           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1077 #endif
1078           }
1079 
1080 #if defined COMPILE_PCRE8
1081         if (c > (utf ? 0x10ffffU : 0xffU))
1082 #elif defined COMPILE_PCRE16
1083         if (c > (utf ? 0x10ffffU : 0xffffU))
1084 #elif defined COMPILE_PCRE32
1085         if (utf && c > 0x10ffffU)
1086 #endif
1087           {
1088           *errorcodeptr = ERR76;
1089           }
1090         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1091         }
1092       }
1093     else
1094       *errorcodeptr = ERR37;
1095     break;
1096 
1097     case CHAR_U:
1098     /* In JavaScript, \U is an uppercase U letter. */
1099     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1100     break;
1101 
1102     /* In a character class, \g is just a literal "g". Outside a character
1103     class, \g must be followed by one of a number of specific things:
1104 
1105     (1) A number, either plain or braced. If positive, it is an absolute
1106     backreference. If negative, it is a relative backreference. This is a Perl
1107     5.10 feature.
1108 
1109     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1110     is part of Perl's movement towards a unified syntax for back references. As
1111     this is synonymous with \k{name}, we fudge it up by pretending it really
1112     was \k.
1113 
1114     (3) For Oniguruma compatibility we also support \g followed by a name or a
1115     number either in angle brackets or in single quotes. However, these are
1116     (possibly recursive) subroutine calls, _not_ backreferences. Just return
1117     the ESC_g code (cf \k). */
1118 
1119     case CHAR_g:
1120     if (isclass) break;
1121     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1122       {
1123       escape = ESC_g;
1124       break;
1125       }
1126 
1127     /* Handle the Perl-compatible cases */
1128 
1129     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1130       {
1131       const pcre_uchar *p;
1132       for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1133         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1134       if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1135         {
1136         escape = ESC_k;
1137         break;
1138         }
1139       braced = TRUE;
1140       ptr++;
1141       }
1142     else braced = FALSE;
1143 
1144     if (ptr[1] == CHAR_MINUS)
1145       {
1146       negated = TRUE;
1147       ptr++;
1148       }
1149     else negated = FALSE;
1150 
1151     /* The integer range is limited by the machine's int representation. */
1152     s = 0;
1153     overflow = FALSE;
1154     while (IS_DIGIT(ptr[1]))
1155       {
1156       if (s > INT_MAX / 10 - 1) /* Integer overflow */
1157         {
1158         overflow = TRUE;
1159         break;
1160         }
1161       s = s * 10 + (int)(*(++ptr) - CHAR_0);
1162       }
1163     if (overflow) /* Integer overflow */
1164       {
1165       while (IS_DIGIT(ptr[1]))
1166         ptr++;
1167       *errorcodeptr = ERR61;
1168       break;
1169       }
1170 
1171     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1172       {
1173       *errorcodeptr = ERR57;
1174       break;
1175       }
1176 
1177     if (s == 0)
1178       {
1179       *errorcodeptr = ERR58;
1180       break;
1181       }
1182 
1183     if (negated)
1184       {
1185       if (s > bracount)
1186         {
1187         *errorcodeptr = ERR15;
1188         break;
1189         }
1190       s = bracount - (s - 1);
1191       }
1192 
1193     escape = -s;
1194     break;
1195 
1196     /* The handling of escape sequences consisting of a string of digits
1197     starting with one that is not zero is not straightforward. Perl has changed
1198     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1199     recommended to avoid the ambiguities in the old syntax.
1200 
1201     Outside a character class, the digits are read as a decimal number. If the
1202     number is less than 8 (used to be 10), or if there are that many previous
1203     extracting left brackets, then it is a back reference. Otherwise, up to
1204     three octal digits are read to form an escaped byte. Thus \123 is likely to
1205     be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1206     the octal value is greater than 377, the least significant 8 bits are
1207     taken. \8 and \9 are treated as the literal characters 8 and 9.
1208 
1209     Inside a character class, \ followed by a digit is always either a literal
1210     8 or 9 or an octal number. */
1211 
1212     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1213     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1214 
1215     if (!isclass)
1216       {
1217       oldptr = ptr;
1218       /* The integer range is limited by the machine's int representation. */
1219       s = (int)(c -CHAR_0);
1220       overflow = FALSE;
1221       while (IS_DIGIT(ptr[1]))
1222         {
1223         if (s > INT_MAX / 10 - 1) /* Integer overflow */
1224           {
1225           overflow = TRUE;
1226           break;
1227           }
1228         s = s * 10 + (int)(*(++ptr) - CHAR_0);
1229         }
1230       if (overflow) /* Integer overflow */
1231         {
1232         while (IS_DIGIT(ptr[1]))
1233           ptr++;
1234         *errorcodeptr = ERR61;
1235         break;
1236         }
1237       if (s < 8 || s <= bracount)  /* Check for back reference */
1238         {
1239         escape = -s;
1240         break;
1241         }
1242       ptr = oldptr;      /* Put the pointer back and fall through */
1243       }
1244 
1245     /* Handle a digit following \ when the number is not a back reference. If
1246     the first digit is 8 or 9, Perl used to generate a binary zero byte and
1247     then treat the digit as a following literal. At least by Perl 5.18 this
1248     changed so as not to insert the binary zero. */
1249 
1250     if ((c = *ptr) >= CHAR_8) break;
1251 
1252     /* fall through */
1253     /* Fall through with a digit less than 8 */
1254 
1255     /* \0 always starts an octal number, but we may drop through to here with a
1256     larger first octal digit. The original code used just to take the least
1257     significant 8 bits of octal numbers (I think this is what early Perls used
1258     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1259     but no more than 3 octal digits. */
1260 
1261     case CHAR_0:
1262     c -= CHAR_0;
1263     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1264         c = c * 8 + *(++ptr) - CHAR_0;
1265 #ifdef COMPILE_PCRE8
1266     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1267 #endif
1268     break;
1269 
1270     /* \o is a relatively new Perl feature, supporting a more general way of
1271     specifying character codes in octal. The only supported form is \o{ddd}. */
1272 
1273     case CHAR_o:
1274     if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1275     if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
1276       {
1277       ptr += 2;
1278       c = 0;
1279       overflow = FALSE;
1280       while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1281         {
1282         register pcre_uint32 cc = *ptr++;
1283         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1284 #ifdef COMPILE_PCRE32
1285         if (c >= 0x20000000l) { overflow = TRUE; break; }
1286 #endif
1287         c = (c << 3) + cc - CHAR_0 ;
1288 #if defined COMPILE_PCRE8
1289         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1290 #elif defined COMPILE_PCRE16
1291         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1292 #elif defined COMPILE_PCRE32
1293         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1294 #endif
1295         }
1296       if (overflow)
1297         {
1298         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1299         *errorcodeptr = ERR34;
1300         }
1301       else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1302         {
1303         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1304         }
1305       else *errorcodeptr = ERR80;
1306       }
1307     break;
1308 
1309     /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1310     numbers. Otherwise it is a lowercase x letter. */
1311 
1312     case CHAR_x:
1313     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1314       {
1315       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1316         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1317         {
1318         c = 0;
1319         for (i = 0; i < 2; ++i)
1320           {
1321           register pcre_uint32 cc = *(++ptr);
1322 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1323           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1324           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1325 #else           /* EBCDIC coding */
1326           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1327           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1328 #endif
1329           }
1330         }
1331       }    /* End JavaScript handling */
1332 
1333     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1334     greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1335     digits. If not, { used to be treated as a data character. However, Perl
1336     seems to read hex digits up to the first non-such, and ignore the rest, so
1337     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1338     now gives an error. */
1339 
1340     else
1341       {
1342       if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1343         {
1344         ptr += 2;
1345         if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1346           {
1347           *errorcodeptr = ERR86;
1348           break;
1349           }
1350         c = 0;
1351         overflow = FALSE;
1352         while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1353           {
1354           register pcre_uint32 cc = *ptr++;
1355           if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1356 
1357 #ifdef COMPILE_PCRE32
1358           if (c >= 0x10000000l) { overflow = TRUE; break; }
1359 #endif
1360 
1361 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1362           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1363           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1364 #else           /* EBCDIC coding */
1365           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1366           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1367 #endif
1368 
1369 #if defined COMPILE_PCRE8
1370           if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1371 #elif defined COMPILE_PCRE16
1372           if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1373 #elif defined COMPILE_PCRE32
1374           if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1375 #endif
1376           }
1377 
1378         if (overflow)
1379           {
1380           while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1381           *errorcodeptr = ERR34;
1382           }
1383 
1384         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1385           {
1386           if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1387           }
1388 
1389         /* If the sequence of hex digits does not end with '}', give an error.
1390         We used just to recognize this construct and fall through to the normal
1391         \x handling, but nowadays Perl gives an error, which seems much more
1392         sensible, so we do too. */
1393 
1394         else *errorcodeptr = ERR79;
1395         }   /* End of \x{} processing */
1396 
1397       /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1398 
1399       else
1400         {
1401         c = 0;
1402         while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1403           {
1404           pcre_uint32 cc;                          /* Some compilers don't like */
1405           cc = *(++ptr);                           /* ++ in initializers */
1406 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1407           if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1408           c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1409 #else           /* EBCDIC coding */
1410           if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1411           c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1412 #endif
1413           }
1414         }     /* End of \xdd handling */
1415       }       /* End of Perl-style \x handling */
1416     break;
1417 
1418     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1419     An error is given if the byte following \c is not an ASCII character. This
1420     coding is ASCII-specific, but then the whole concept of \cx is
1421     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1422 
1423     case CHAR_c:
1424     c = *(++ptr);
1425     if (c == CHAR_NULL)
1426       {
1427       *errorcodeptr = ERR2;
1428       break;
1429       }
1430 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1431     if (c > 127)  /* Excludes all non-ASCII in either mode */
1432       {
1433       *errorcodeptr = ERR68;
1434       break;
1435       }
1436     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1437     c ^= 0x40;
1438 #else             /* EBCDIC coding */
1439     if (c >= CHAR_a && c <= CHAR_z) c += 64;
1440     if (c == CHAR_QUESTION_MARK)
1441       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1442     else
1443       {
1444       for (i = 0; i < 32; i++)
1445         {
1446         if (c == ebcdic_escape_c[i]) break;
1447         }
1448       if (i < 32) c = i; else *errorcodeptr = ERR68;
1449       }
1450 #endif
1451     break;
1452 
1453     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1454     other alphanumeric following \ is an error if PCRE_EXTRA was set;
1455     otherwise, for Perl compatibility, it is a literal. This code looks a bit
1456     odd, but there used to be some cases other than the default, and there may
1457     be again in future, so I haven't "optimized" it. */
1458 
1459     default:
1460     if ((options & PCRE_EXTRA) != 0) switch(c)
1461       {
1462       default:
1463       *errorcodeptr = ERR3;
1464       break;
1465       }
1466     break;
1467     }
1468   }
1469 
1470 /* Perl supports \N{name} for character names, as well as plain \N for "not
1471 newline". PCRE does not support \N{name}. However, it does support
1472 quantification such as \N{2,3}. */
1473 
1474 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1475      !is_counted_repeat(ptr+2))
1476   *errorcodeptr = ERR37;
1477 
1478 /* If PCRE_UCP is set, we change the values for \d etc. */
1479 
1480 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1481   escape += (ESC_DU - ESC_D);
1482 
1483 /* Set the pointer to the final character before returning. */
1484 
1485 *ptrptr = ptr;
1486 *chptr = c;
1487 return escape;
1488 }
1489 
1490 
1491 
1492 #ifdef SUPPORT_UCP
1493 /*************************************************
1494 *               Handle \P and \p                 *
1495 *************************************************/
1496 
1497 /* This function is called after \P or \p has been encountered, provided that
1498 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1499 pointing at the P or p. On exit, it is pointing at the final character of the
1500 escape sequence.
1501 
1502 Argument:
1503   ptrptr         points to the pattern position pointer
1504   negptr         points to a boolean that is set TRUE for negation else FALSE
1505   ptypeptr       points to an unsigned int that is set to the type value
1506   pdataptr       points to an unsigned int that is set to the detailed property value
1507   errorcodeptr   points to the error code variable
1508 
1509 Returns:         TRUE if the type value was found, or FALSE for an invalid type
1510 */
1511 
1512 static BOOL
get_ucp(const pcre_uchar ** ptrptr,BOOL * negptr,unsigned int * ptypeptr,unsigned int * pdataptr,int * errorcodeptr)1513 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1514   unsigned int *pdataptr, int *errorcodeptr)
1515 {
1516 pcre_uchar c;
1517 int i, bot, top;
1518 const pcre_uchar *ptr = *ptrptr;
1519 pcre_uchar name[32];
1520 
1521 c = *(++ptr);
1522 if (c == CHAR_NULL) goto ERROR_RETURN;
1523 
1524 *negptr = FALSE;
1525 
1526 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1527 negation. */
1528 
1529 if (c == CHAR_LEFT_CURLY_BRACKET)
1530   {
1531   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1532     {
1533     *negptr = TRUE;
1534     ptr++;
1535     }
1536   for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1537     {
1538     c = *(++ptr);
1539     if (c == CHAR_NULL) goto ERROR_RETURN;
1540     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1541     name[i] = c;
1542     }
1543   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1544   name[i] = 0;
1545   }
1546 
1547 /* Otherwise there is just one following character */
1548 
1549 else
1550   {
1551   name[0] = c;
1552   name[1] = 0;
1553   }
1554 
1555 *ptrptr = ptr;
1556 
1557 /* Search for a recognized property name using binary chop */
1558 
1559 bot = 0;
1560 top = PRIV(utt_size);
1561 
1562 while (bot < top)
1563   {
1564   int r;
1565   i = (bot + top) >> 1;
1566   r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1567   if (r == 0)
1568     {
1569     *ptypeptr = PRIV(utt)[i].type;
1570     *pdataptr = PRIV(utt)[i].value;
1571     return TRUE;
1572     }
1573   if (r > 0) bot = i + 1; else top = i;
1574   }
1575 
1576 *errorcodeptr = ERR47;
1577 *ptrptr = ptr;
1578 return FALSE;
1579 
1580 ERROR_RETURN:
1581 *errorcodeptr = ERR46;
1582 *ptrptr = ptr;
1583 return FALSE;
1584 }
1585 #endif
1586 
1587 
1588 
1589 /*************************************************
1590 *         Read repeat counts                     *
1591 *************************************************/
1592 
1593 /* Read an item of the form {n,m} and return the values. This is called only
1594 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1595 so the syntax is guaranteed to be correct, but we need to check the values.
1596 
1597 Arguments:
1598   p              pointer to first char after '{'
1599   minp           pointer to int for min
1600   maxp           pointer to int for max
1601                  returned as -1 if no max
1602   errorcodeptr   points to error code variable
1603 
1604 Returns:         pointer to '}' on success;
1605                  current ptr on error, with errorcodeptr set non-zero
1606 */
1607 
1608 static const pcre_uchar *
read_repeat_counts(const pcre_uchar * p,int * minp,int * maxp,int * errorcodeptr)1609 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1610 {
1611 int min = 0;
1612 int max = -1;
1613 
1614 while (IS_DIGIT(*p))
1615   {
1616   min = min * 10 + (int)(*p++ - CHAR_0);
1617   if (min > 65535)
1618     {
1619     *errorcodeptr = ERR5;
1620     return p;
1621     }
1622   }
1623 
1624 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1625   {
1626   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1627     {
1628     max = 0;
1629     while(IS_DIGIT(*p))
1630       {
1631       max = max * 10 + (int)(*p++ - CHAR_0);
1632       if (max > 65535)
1633         {
1634         *errorcodeptr = ERR5;
1635         return p;
1636         }
1637       }
1638     if (max < min)
1639       {
1640       *errorcodeptr = ERR4;
1641       return p;
1642       }
1643     }
1644   }
1645 
1646 *minp = min;
1647 *maxp = max;
1648 return p;
1649 }
1650 
1651 
1652 
1653 /*************************************************
1654 *      Find first significant op code            *
1655 *************************************************/
1656 
1657 /* This is called by several functions that scan a compiled expression looking
1658 for a fixed first character, or an anchoring op code etc. It skips over things
1659 that do not influence this. For some calls, it makes sense to skip negative
1660 forward and all backward assertions, and also the \b assertion; for others it
1661 does not.
1662 
1663 Arguments:
1664   code         pointer to the start of the group
1665   skipassert   TRUE if certain assertions are to be skipped
1666 
1667 Returns:       pointer to the first significant opcode
1668 */
1669 
1670 static const pcre_uchar*
first_significant_code(const pcre_uchar * code,BOOL skipassert)1671 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1672 {
1673 for (;;)
1674   {
1675   switch ((int)*code)
1676     {
1677     case OP_ASSERT_NOT:
1678     case OP_ASSERTBACK:
1679     case OP_ASSERTBACK_NOT:
1680     if (!skipassert) return code;
1681     do code += GET(code, 1); while (*code == OP_ALT);
1682     code += PRIV(OP_lengths)[*code];
1683     break;
1684 
1685     case OP_WORD_BOUNDARY:
1686     case OP_NOT_WORD_BOUNDARY:
1687     if (!skipassert) return code;
1688     /* Fall through */
1689 
1690     case OP_CALLOUT:
1691     case OP_CREF:
1692     case OP_DNCREF:
1693     case OP_RREF:
1694     case OP_DNRREF:
1695     case OP_DEF:
1696     code += PRIV(OP_lengths)[*code];
1697     break;
1698 
1699     default:
1700     return code;
1701     }
1702   }
1703 /* Control never reaches here */
1704 }
1705 
1706 
1707 
1708 /*************************************************
1709 *        Find the fixed length of a branch       *
1710 *************************************************/
1711 
1712 /* Scan a branch and compute the fixed length of subject that will match it,
1713 if the length is fixed. This is needed for dealing with backward assertions.
1714 In UTF8 mode, the result is in characters rather than bytes. The branch is
1715 temporarily terminated with OP_END when this function is called.
1716 
1717 This function is called when a backward assertion is encountered, so that if it
1718 fails, the error message can point to the correct place in the pattern.
1719 However, we cannot do this when the assertion contains subroutine calls,
1720 because they can be forward references. We solve this by remembering this case
1721 and doing the check at the end; a flag specifies which mode we are running in.
1722 
1723 Arguments:
1724   code     points to the start of the pattern (the bracket)
1725   utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1726   atend    TRUE if called when the pattern is complete
1727   cd       the "compile data" structure
1728   recurses    chain of recurse_check to catch mutual recursion
1729 
1730 Returns:   the fixed length,
1731              or -1 if there is no fixed length,
1732              or -2 if \C was encountered (in UTF-8 mode only)
1733              or -3 if an OP_RECURSE item was encountered and atend is FALSE
1734              or -4 if an unknown opcode was encountered (internal error)
1735 */
1736 
1737 static int
find_fixedlength(pcre_uchar * code,BOOL utf,BOOL atend,compile_data * cd,recurse_check * recurses)1738 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
1739   recurse_check *recurses)
1740 {
1741 int length = -1;
1742 recurse_check this_recurse;
1743 register int branchlength = 0;
1744 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1745 
1746 /* Scan along the opcodes for this branch. If we get to the end of the
1747 branch, check the length against that of the other branches. */
1748 
1749 for (;;)
1750   {
1751   int d;
1752   pcre_uchar *ce, *cs;
1753   register pcre_uchar op = *cc;
1754 
1755   switch (op)
1756     {
1757     /* We only need to continue for OP_CBRA (normal capturing bracket) and
1758     OP_BRA (normal non-capturing bracket) because the other variants of these
1759     opcodes are all concerned with unlimited repeated groups, which of course
1760     are not of fixed length. */
1761 
1762     case OP_CBRA:
1763     case OP_BRA:
1764     case OP_ONCE:
1765     case OP_ONCE_NC:
1766     case OP_COND:
1767     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
1768       recurses);
1769     if (d < 0) return d;
1770     branchlength += d;
1771     do cc += GET(cc, 1); while (*cc == OP_ALT);
1772     cc += 1 + LINK_SIZE;
1773     break;
1774 
1775     /* Reached end of a branch; if it's a ket it is the end of a nested call.
1776     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1777     an ALT. If it is END it's the end of the outer call. All can be handled by
1778     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1779     because they all imply an unlimited repeat. */
1780 
1781     case OP_ALT:
1782     case OP_KET:
1783     case OP_END:
1784     case OP_ACCEPT:
1785     case OP_ASSERT_ACCEPT:
1786     if (length < 0) length = branchlength;
1787       else if (length != branchlength) return -1;
1788     if (*cc != OP_ALT) return length;
1789     cc += 1 + LINK_SIZE;
1790     branchlength = 0;
1791     break;
1792 
1793     /* A true recursion implies not fixed length, but a subroutine call may
1794     be OK. If the subroutine is a forward reference, we can't deal with
1795     it until the end of the pattern, so return -3. */
1796 
1797     case OP_RECURSE:
1798     if (!atend) return -3;
1799     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1800     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1801     if (cc > cs && cc < ce) return -1;                    /* Recursion */
1802     else   /* Check for mutual recursion */
1803       {
1804       recurse_check *r = recurses;
1805       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
1806       if (r != NULL) return -1;   /* Mutual recursion */
1807       }
1808     this_recurse.prev = recurses;
1809     this_recurse.group = cs;
1810     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1811     if (d < 0) return d;
1812     branchlength += d;
1813     cc += 1 + LINK_SIZE;
1814     break;
1815 
1816     /* Skip over assertive subpatterns */
1817 
1818     case OP_ASSERT:
1819     case OP_ASSERT_NOT:
1820     case OP_ASSERTBACK:
1821     case OP_ASSERTBACK_NOT:
1822     do cc += GET(cc, 1); while (*cc == OP_ALT);
1823     cc += 1 + LINK_SIZE;
1824     break;
1825 
1826     /* Skip over things that don't match chars */
1827 
1828     case OP_MARK:
1829     case OP_PRUNE_ARG:
1830     case OP_SKIP_ARG:
1831     case OP_THEN_ARG:
1832     cc += cc[1] + PRIV(OP_lengths)[*cc];
1833     break;
1834 
1835     case OP_CALLOUT:
1836     case OP_CIRC:
1837     case OP_CIRCM:
1838     case OP_CLOSE:
1839     case OP_COMMIT:
1840     case OP_CREF:
1841     case OP_DEF:
1842     case OP_DNCREF:
1843     case OP_DNRREF:
1844     case OP_DOLL:
1845     case OP_DOLLM:
1846     case OP_EOD:
1847     case OP_EODN:
1848     case OP_FAIL:
1849     case OP_NOT_WORD_BOUNDARY:
1850     case OP_PRUNE:
1851     case OP_REVERSE:
1852     case OP_RREF:
1853     case OP_SET_SOM:
1854     case OP_SKIP:
1855     case OP_SOD:
1856     case OP_SOM:
1857     case OP_THEN:
1858     case OP_WORD_BOUNDARY:
1859     cc += PRIV(OP_lengths)[*cc];
1860     break;
1861 
1862     /* Handle literal characters */
1863 
1864     case OP_CHAR:
1865     case OP_CHARI:
1866     case OP_NOT:
1867     case OP_NOTI:
1868     branchlength++;
1869     cc += 2;
1870 #ifdef SUPPORT_UTF
1871     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1872 #endif
1873     break;
1874 
1875     /* Handle exact repetitions. The count is already in characters, but we
1876     need to skip over a multibyte character in UTF8 mode.  */
1877 
1878     case OP_EXACT:
1879     case OP_EXACTI:
1880     case OP_NOTEXACT:
1881     case OP_NOTEXACTI:
1882     branchlength += (int)GET2(cc,1);
1883     cc += 2 + IMM2_SIZE;
1884 #ifdef SUPPORT_UTF
1885     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1886 #endif
1887     break;
1888 
1889     case OP_TYPEEXACT:
1890     branchlength += GET2(cc,1);
1891     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1892       cc += 2;
1893     cc += 1 + IMM2_SIZE + 1;
1894     break;
1895 
1896     /* Handle single-char matchers */
1897 
1898     case OP_PROP:
1899     case OP_NOTPROP:
1900     cc += 2;
1901     /* Fall through */
1902 
1903     case OP_HSPACE:
1904     case OP_VSPACE:
1905     case OP_NOT_HSPACE:
1906     case OP_NOT_VSPACE:
1907     case OP_NOT_DIGIT:
1908     case OP_DIGIT:
1909     case OP_NOT_WHITESPACE:
1910     case OP_WHITESPACE:
1911     case OP_NOT_WORDCHAR:
1912     case OP_WORDCHAR:
1913     case OP_ANY:
1914     case OP_ALLANY:
1915     branchlength++;
1916     cc++;
1917     break;
1918 
1919     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1920     otherwise \C is coded as OP_ALLANY. */
1921 
1922     case OP_ANYBYTE:
1923     return -2;
1924 
1925     /* Check a class for variable quantification */
1926 
1927     case OP_CLASS:
1928     case OP_NCLASS:
1929 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1930     case OP_XCLASS:
1931     /* The original code caused an unsigned overflow in 64 bit systems,
1932     so now we use a conditional statement. */
1933     if (op == OP_XCLASS)
1934       cc += GET(cc, 1);
1935     else
1936       cc += PRIV(OP_lengths)[OP_CLASS];
1937 #else
1938     cc += PRIV(OP_lengths)[OP_CLASS];
1939 #endif
1940 
1941     switch (*cc)
1942       {
1943       case OP_CRSTAR:
1944       case OP_CRMINSTAR:
1945       case OP_CRPLUS:
1946       case OP_CRMINPLUS:
1947       case OP_CRQUERY:
1948       case OP_CRMINQUERY:
1949       case OP_CRPOSSTAR:
1950       case OP_CRPOSPLUS:
1951       case OP_CRPOSQUERY:
1952       return -1;
1953 
1954       case OP_CRRANGE:
1955       case OP_CRMINRANGE:
1956       case OP_CRPOSRANGE:
1957       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1958       branchlength += (int)GET2(cc,1);
1959       cc += 1 + 2 * IMM2_SIZE;
1960       break;
1961 
1962       default:
1963       branchlength++;
1964       }
1965     break;
1966 
1967     /* Anything else is variable length */
1968 
1969     case OP_ANYNL:
1970     case OP_BRAMINZERO:
1971     case OP_BRAPOS:
1972     case OP_BRAPOSZERO:
1973     case OP_BRAZERO:
1974     case OP_CBRAPOS:
1975     case OP_EXTUNI:
1976     case OP_KETRMAX:
1977     case OP_KETRMIN:
1978     case OP_KETRPOS:
1979     case OP_MINPLUS:
1980     case OP_MINPLUSI:
1981     case OP_MINQUERY:
1982     case OP_MINQUERYI:
1983     case OP_MINSTAR:
1984     case OP_MINSTARI:
1985     case OP_MINUPTO:
1986     case OP_MINUPTOI:
1987     case OP_NOTMINPLUS:
1988     case OP_NOTMINPLUSI:
1989     case OP_NOTMINQUERY:
1990     case OP_NOTMINQUERYI:
1991     case OP_NOTMINSTAR:
1992     case OP_NOTMINSTARI:
1993     case OP_NOTMINUPTO:
1994     case OP_NOTMINUPTOI:
1995     case OP_NOTPLUS:
1996     case OP_NOTPLUSI:
1997     case OP_NOTPOSPLUS:
1998     case OP_NOTPOSPLUSI:
1999     case OP_NOTPOSQUERY:
2000     case OP_NOTPOSQUERYI:
2001     case OP_NOTPOSSTAR:
2002     case OP_NOTPOSSTARI:
2003     case OP_NOTPOSUPTO:
2004     case OP_NOTPOSUPTOI:
2005     case OP_NOTQUERY:
2006     case OP_NOTQUERYI:
2007     case OP_NOTSTAR:
2008     case OP_NOTSTARI:
2009     case OP_NOTUPTO:
2010     case OP_NOTUPTOI:
2011     case OP_PLUS:
2012     case OP_PLUSI:
2013     case OP_POSPLUS:
2014     case OP_POSPLUSI:
2015     case OP_POSQUERY:
2016     case OP_POSQUERYI:
2017     case OP_POSSTAR:
2018     case OP_POSSTARI:
2019     case OP_POSUPTO:
2020     case OP_POSUPTOI:
2021     case OP_QUERY:
2022     case OP_QUERYI:
2023     case OP_REF:
2024     case OP_REFI:
2025     case OP_DNREF:
2026     case OP_DNREFI:
2027     case OP_SBRA:
2028     case OP_SBRAPOS:
2029     case OP_SCBRA:
2030     case OP_SCBRAPOS:
2031     case OP_SCOND:
2032     case OP_SKIPZERO:
2033     case OP_STAR:
2034     case OP_STARI:
2035     case OP_TYPEMINPLUS:
2036     case OP_TYPEMINQUERY:
2037     case OP_TYPEMINSTAR:
2038     case OP_TYPEMINUPTO:
2039     case OP_TYPEPLUS:
2040     case OP_TYPEPOSPLUS:
2041     case OP_TYPEPOSQUERY:
2042     case OP_TYPEPOSSTAR:
2043     case OP_TYPEPOSUPTO:
2044     case OP_TYPEQUERY:
2045     case OP_TYPESTAR:
2046     case OP_TYPEUPTO:
2047     case OP_UPTO:
2048     case OP_UPTOI:
2049     return -1;
2050 
2051     /* Catch unrecognized opcodes so that when new ones are added they
2052     are not forgotten, as has happened in the past. */
2053 
2054     default:
2055     return -4;
2056     }
2057   }
2058 /* Control never gets here */
2059 }
2060 
2061 
2062 
2063 /*************************************************
2064 *    Scan compiled regex for specific bracket    *
2065 *************************************************/
2066 
2067 /* This little function scans through a compiled pattern until it finds a
2068 capturing bracket with the given number, or, if the number is negative, an
2069 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2070 so that it can be called from pcre_study() when finding the minimum matching
2071 length.
2072 
2073 Arguments:
2074   code        points to start of expression
2075   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2076   number      the required bracket number or negative to find a lookbehind
2077 
2078 Returns:      pointer to the opcode for the bracket, or NULL if not found
2079 */
2080 
2081 const pcre_uchar *
PRIV(find_bracket)2082 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2083 {
2084 for (;;)
2085   {
2086   register pcre_uchar c = *code;
2087 
2088   if (c == OP_END) return NULL;
2089 
2090   /* XCLASS is used for classes that cannot be represented just by a bit
2091   map. This includes negated single high-valued characters. The length in
2092   the table is zero; the actual length is stored in the compiled code. */
2093 
2094   if (c == OP_XCLASS) code += GET(code, 1);
2095 
2096   /* Handle recursion */
2097 
2098   else if (c == OP_REVERSE)
2099     {
2100     if (number < 0) return (pcre_uchar *)code;
2101     code += PRIV(OP_lengths)[c];
2102     }
2103 
2104   /* Handle capturing bracket */
2105 
2106   else if (c == OP_CBRA || c == OP_SCBRA ||
2107            c == OP_CBRAPOS || c == OP_SCBRAPOS)
2108     {
2109     int n = (int)GET2(code, 1+LINK_SIZE);
2110     if (n == number) return (pcre_uchar *)code;
2111     code += PRIV(OP_lengths)[c];
2112     }
2113 
2114   /* Otherwise, we can get the item's length from the table, except that for
2115   repeated character types, we have to test for \p and \P, which have an extra
2116   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2117   must add in its length. */
2118 
2119   else
2120     {
2121     switch(c)
2122       {
2123       case OP_TYPESTAR:
2124       case OP_TYPEMINSTAR:
2125       case OP_TYPEPLUS:
2126       case OP_TYPEMINPLUS:
2127       case OP_TYPEQUERY:
2128       case OP_TYPEMINQUERY:
2129       case OP_TYPEPOSSTAR:
2130       case OP_TYPEPOSPLUS:
2131       case OP_TYPEPOSQUERY:
2132       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2133       break;
2134 
2135       case OP_TYPEUPTO:
2136       case OP_TYPEMINUPTO:
2137       case OP_TYPEEXACT:
2138       case OP_TYPEPOSUPTO:
2139       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2140         code += 2;
2141       break;
2142 
2143       case OP_MARK:
2144       case OP_PRUNE_ARG:
2145       case OP_SKIP_ARG:
2146       case OP_THEN_ARG:
2147       code += code[1];
2148       break;
2149       }
2150 
2151     /* Add in the fixed length from the table */
2152 
2153     code += PRIV(OP_lengths)[c];
2154 
2155   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2156   a multi-byte character. The length in the table is a minimum, so we have to
2157   arrange to skip the extra bytes. */
2158 
2159 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2160     if (utf) switch(c)
2161       {
2162       case OP_CHAR:
2163       case OP_CHARI:
2164       case OP_NOT:
2165       case OP_NOTI:
2166       case OP_EXACT:
2167       case OP_EXACTI:
2168       case OP_NOTEXACT:
2169       case OP_NOTEXACTI:
2170       case OP_UPTO:
2171       case OP_UPTOI:
2172       case OP_NOTUPTO:
2173       case OP_NOTUPTOI:
2174       case OP_MINUPTO:
2175       case OP_MINUPTOI:
2176       case OP_NOTMINUPTO:
2177       case OP_NOTMINUPTOI:
2178       case OP_POSUPTO:
2179       case OP_POSUPTOI:
2180       case OP_NOTPOSUPTO:
2181       case OP_NOTPOSUPTOI:
2182       case OP_STAR:
2183       case OP_STARI:
2184       case OP_NOTSTAR:
2185       case OP_NOTSTARI:
2186       case OP_MINSTAR:
2187       case OP_MINSTARI:
2188       case OP_NOTMINSTAR:
2189       case OP_NOTMINSTARI:
2190       case OP_POSSTAR:
2191       case OP_POSSTARI:
2192       case OP_NOTPOSSTAR:
2193       case OP_NOTPOSSTARI:
2194       case OP_PLUS:
2195       case OP_PLUSI:
2196       case OP_NOTPLUS:
2197       case OP_NOTPLUSI:
2198       case OP_MINPLUS:
2199       case OP_MINPLUSI:
2200       case OP_NOTMINPLUS:
2201       case OP_NOTMINPLUSI:
2202       case OP_POSPLUS:
2203       case OP_POSPLUSI:
2204       case OP_NOTPOSPLUS:
2205       case OP_NOTPOSPLUSI:
2206       case OP_QUERY:
2207       case OP_QUERYI:
2208       case OP_NOTQUERY:
2209       case OP_NOTQUERYI:
2210       case OP_MINQUERY:
2211       case OP_MINQUERYI:
2212       case OP_NOTMINQUERY:
2213       case OP_NOTMINQUERYI:
2214       case OP_POSQUERY:
2215       case OP_POSQUERYI:
2216       case OP_NOTPOSQUERY:
2217       case OP_NOTPOSQUERYI:
2218       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2219       break;
2220       }
2221 #else
2222     (void)(utf);  /* Keep compiler happy by referencing function argument */
2223 #endif
2224     }
2225   }
2226 }
2227 
2228 
2229 
2230 /*************************************************
2231 *   Scan compiled regex for recursion reference  *
2232 *************************************************/
2233 
2234 /* This little function scans through a compiled pattern until it finds an
2235 instance of OP_RECURSE.
2236 
2237 Arguments:
2238   code        points to start of expression
2239   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2240 
2241 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2242 */
2243 
2244 static const pcre_uchar *
find_recurse(const pcre_uchar * code,BOOL utf)2245 find_recurse(const pcre_uchar *code, BOOL utf)
2246 {
2247 for (;;)
2248   {
2249   register pcre_uchar c = *code;
2250   if (c == OP_END) return NULL;
2251   if (c == OP_RECURSE) return code;
2252 
2253   /* XCLASS is used for classes that cannot be represented just by a bit
2254   map. This includes negated single high-valued characters. The length in
2255   the table is zero; the actual length is stored in the compiled code. */
2256 
2257   if (c == OP_XCLASS) code += GET(code, 1);
2258 
2259   /* Otherwise, we can get the item's length from the table, except that for
2260   repeated character types, we have to test for \p and \P, which have an extra
2261   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2262   must add in its length. */
2263 
2264   else
2265     {
2266     switch(c)
2267       {
2268       case OP_TYPESTAR:
2269       case OP_TYPEMINSTAR:
2270       case OP_TYPEPLUS:
2271       case OP_TYPEMINPLUS:
2272       case OP_TYPEQUERY:
2273       case OP_TYPEMINQUERY:
2274       case OP_TYPEPOSSTAR:
2275       case OP_TYPEPOSPLUS:
2276       case OP_TYPEPOSQUERY:
2277       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2278       break;
2279 
2280       case OP_TYPEPOSUPTO:
2281       case OP_TYPEUPTO:
2282       case OP_TYPEMINUPTO:
2283       case OP_TYPEEXACT:
2284       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2285         code += 2;
2286       break;
2287 
2288       case OP_MARK:
2289       case OP_PRUNE_ARG:
2290       case OP_SKIP_ARG:
2291       case OP_THEN_ARG:
2292       code += code[1];
2293       break;
2294       }
2295 
2296     /* Add in the fixed length from the table */
2297 
2298     code += PRIV(OP_lengths)[c];
2299 
2300     /* In UTF-8 mode, opcodes that are followed by a character may be followed
2301     by a multi-byte character. The length in the table is a minimum, so we have
2302     to arrange to skip the extra bytes. */
2303 
2304 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2305     if (utf) switch(c)
2306       {
2307       case OP_CHAR:
2308       case OP_CHARI:
2309       case OP_NOT:
2310       case OP_NOTI:
2311       case OP_EXACT:
2312       case OP_EXACTI:
2313       case OP_NOTEXACT:
2314       case OP_NOTEXACTI:
2315       case OP_UPTO:
2316       case OP_UPTOI:
2317       case OP_NOTUPTO:
2318       case OP_NOTUPTOI:
2319       case OP_MINUPTO:
2320       case OP_MINUPTOI:
2321       case OP_NOTMINUPTO:
2322       case OP_NOTMINUPTOI:
2323       case OP_POSUPTO:
2324       case OP_POSUPTOI:
2325       case OP_NOTPOSUPTO:
2326       case OP_NOTPOSUPTOI:
2327       case OP_STAR:
2328       case OP_STARI:
2329       case OP_NOTSTAR:
2330       case OP_NOTSTARI:
2331       case OP_MINSTAR:
2332       case OP_MINSTARI:
2333       case OP_NOTMINSTAR:
2334       case OP_NOTMINSTARI:
2335       case OP_POSSTAR:
2336       case OP_POSSTARI:
2337       case OP_NOTPOSSTAR:
2338       case OP_NOTPOSSTARI:
2339       case OP_PLUS:
2340       case OP_PLUSI:
2341       case OP_NOTPLUS:
2342       case OP_NOTPLUSI:
2343       case OP_MINPLUS:
2344       case OP_MINPLUSI:
2345       case OP_NOTMINPLUS:
2346       case OP_NOTMINPLUSI:
2347       case OP_POSPLUS:
2348       case OP_POSPLUSI:
2349       case OP_NOTPOSPLUS:
2350       case OP_NOTPOSPLUSI:
2351       case OP_QUERY:
2352       case OP_QUERYI:
2353       case OP_NOTQUERY:
2354       case OP_NOTQUERYI:
2355       case OP_MINQUERY:
2356       case OP_MINQUERYI:
2357       case OP_NOTMINQUERY:
2358       case OP_NOTMINQUERYI:
2359       case OP_POSQUERY:
2360       case OP_POSQUERYI:
2361       case OP_NOTPOSQUERY:
2362       case OP_NOTPOSQUERYI:
2363       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2364       break;
2365       }
2366 #else
2367     (void)(utf);  /* Keep compiler happy by referencing function argument */
2368 #endif
2369     }
2370   }
2371 }
2372 
2373 
2374 
2375 /*************************************************
2376 *    Scan compiled branch for non-emptiness      *
2377 *************************************************/
2378 
2379 /* This function scans through a branch of a compiled pattern to see whether it
2380 can match the empty string or not. It is called from could_be_empty()
2381 below and from compile_branch() when checking for an unlimited repeat of a
2382 group that can match nothing. Note that first_significant_code() skips over
2383 backward and negative forward assertions when its final argument is TRUE. If we
2384 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2385 bracket whose current branch will already have been scanned.
2386 
2387 Arguments:
2388   code        points to start of search
2389   endcode     points to where to stop
2390   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2391   cd          contains pointers to tables etc.
2392   recurses    chain of recurse_check to catch mutual recursion
2393 
2394 Returns:      TRUE if what is matched could be empty
2395 */
2396 
2397 static BOOL
could_be_empty_branch(const pcre_uchar * code,const pcre_uchar * endcode,BOOL utf,compile_data * cd,recurse_check * recurses)2398 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2399   BOOL utf, compile_data *cd, recurse_check *recurses)
2400 {
2401 register pcre_uchar c;
2402 recurse_check this_recurse;
2403 
2404 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2405      code < endcode;
2406      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2407   {
2408   const pcre_uchar *ccode;
2409 
2410   c = *code;
2411 
2412   /* Skip over forward assertions; the other assertions are skipped by
2413   first_significant_code() with a TRUE final argument. */
2414 
2415   if (c == OP_ASSERT)
2416     {
2417     do code += GET(code, 1); while (*code == OP_ALT);
2418     c = *code;
2419     continue;
2420     }
2421 
2422   /* For a recursion/subroutine call, if its end has been reached, which
2423   implies a backward reference subroutine call, we can scan it. If it's a
2424   forward reference subroutine call, we can't. To detect forward reference
2425   we have to scan up the list that is kept in the workspace. This function is
2426   called only when doing the real compile, not during the pre-compile that
2427   measures the size of the compiled pattern. */
2428 
2429   if (c == OP_RECURSE)
2430     {
2431     const pcre_uchar *scode = cd->start_code + GET(code, 1);
2432     const pcre_uchar *endgroup = scode;
2433     BOOL empty_branch;
2434 
2435     /* Test for forward reference or uncompleted reference. This is disabled
2436     when called to scan a completed pattern by setting cd->start_workspace to
2437     NULL. */
2438 
2439     if (cd->start_workspace != NULL)
2440       {
2441       const pcre_uchar *tcode;
2442       for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2443         if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2444       if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2445       }
2446 
2447     /* If the reference is to a completed group, we need to detect whether this
2448     is a recursive call, as otherwise there will be an infinite loop. If it is
2449     a recursion, just skip over it. Simple recursions are easily detected. For
2450     mutual recursions we keep a chain on the stack. */
2451 
2452     do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2453     if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2454     else
2455       {
2456       recurse_check *r = recurses;
2457       for (r = recurses; r != NULL; r = r->prev)
2458         if (r->group == scode) break;
2459       if (r != NULL) continue;   /* Mutual recursion */
2460       }
2461 
2462     /* Completed reference; scan the referenced group, remembering it on the
2463     stack chain to detect mutual recursions. */
2464 
2465     empty_branch = FALSE;
2466     this_recurse.prev = recurses;
2467     this_recurse.group = scode;
2468 
2469     do
2470       {
2471       if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2472         {
2473         empty_branch = TRUE;
2474         break;
2475         }
2476       scode += GET(scode, 1);
2477       }
2478     while (*scode == OP_ALT);
2479 
2480     if (!empty_branch) return FALSE;  /* All branches are non-empty */
2481     continue;
2482     }
2483 
2484   /* Groups with zero repeats can of course be empty; skip them. */
2485 
2486   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2487       c == OP_BRAPOSZERO)
2488     {
2489     code += PRIV(OP_lengths)[c];
2490     do code += GET(code, 1); while (*code == OP_ALT);
2491     c = *code;
2492     continue;
2493     }
2494 
2495   /* A nested group that is already marked as "could be empty" can just be
2496   skipped. */
2497 
2498   if (c == OP_SBRA  || c == OP_SBRAPOS ||
2499       c == OP_SCBRA || c == OP_SCBRAPOS)
2500     {
2501     do code += GET(code, 1); while (*code == OP_ALT);
2502     c = *code;
2503     continue;
2504     }
2505 
2506   /* For other groups, scan the branches. */
2507 
2508   if (c == OP_BRA  || c == OP_BRAPOS ||
2509       c == OP_CBRA || c == OP_CBRAPOS ||
2510       c == OP_ONCE || c == OP_ONCE_NC ||
2511       c == OP_COND || c == OP_SCOND)
2512     {
2513     BOOL empty_branch;
2514     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2515 
2516     /* If a conditional group has only one branch, there is a second, implied,
2517     empty branch, so just skip over the conditional, because it could be empty.
2518     Otherwise, scan the individual branches of the group. */
2519 
2520     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2521       code += GET(code, 1);
2522     else
2523       {
2524       empty_branch = FALSE;
2525       do
2526         {
2527         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
2528           recurses)) empty_branch = TRUE;
2529         code += GET(code, 1);
2530         }
2531       while (*code == OP_ALT);
2532       if (!empty_branch) return FALSE;   /* All branches are non-empty */
2533       }
2534 
2535     c = *code;
2536     continue;
2537     }
2538 
2539   /* Handle the other opcodes */
2540 
2541   switch (c)
2542     {
2543     /* Check for quantifiers after a class. XCLASS is used for classes that
2544     cannot be represented just by a bit map. This includes negated single
2545     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2546     actual length is stored in the compiled code, so we must update "code"
2547     here. */
2548 
2549 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2550     case OP_XCLASS:
2551     ccode = code += GET(code, 1);
2552     goto CHECK_CLASS_REPEAT;
2553 #endif
2554 
2555     case OP_CLASS:
2556     case OP_NCLASS:
2557     ccode = code + PRIV(OP_lengths)[OP_CLASS];
2558 
2559 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2560     CHECK_CLASS_REPEAT:
2561 #endif
2562 
2563     switch (*ccode)
2564       {
2565       case OP_CRSTAR:            /* These could be empty; continue */
2566       case OP_CRMINSTAR:
2567       case OP_CRQUERY:
2568       case OP_CRMINQUERY:
2569       case OP_CRPOSSTAR:
2570       case OP_CRPOSQUERY:
2571       break;
2572 
2573       default:                   /* Non-repeat => class must match */
2574       case OP_CRPLUS:            /* These repeats aren't empty */
2575       case OP_CRMINPLUS:
2576       case OP_CRPOSPLUS:
2577       return FALSE;
2578 
2579       case OP_CRRANGE:
2580       case OP_CRMINRANGE:
2581       case OP_CRPOSRANGE:
2582       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2583       break;
2584       }
2585     break;
2586 
2587     /* Opcodes that must match a character */
2588 
2589     case OP_ANY:
2590     case OP_ALLANY:
2591     case OP_ANYBYTE:
2592 
2593     case OP_PROP:
2594     case OP_NOTPROP:
2595     case OP_ANYNL:
2596 
2597     case OP_NOT_HSPACE:
2598     case OP_HSPACE:
2599     case OP_NOT_VSPACE:
2600     case OP_VSPACE:
2601     case OP_EXTUNI:
2602 
2603     case OP_NOT_DIGIT:
2604     case OP_DIGIT:
2605     case OP_NOT_WHITESPACE:
2606     case OP_WHITESPACE:
2607     case OP_NOT_WORDCHAR:
2608     case OP_WORDCHAR:
2609 
2610     case OP_CHAR:
2611     case OP_CHARI:
2612     case OP_NOT:
2613     case OP_NOTI:
2614 
2615     case OP_PLUS:
2616     case OP_PLUSI:
2617     case OP_MINPLUS:
2618     case OP_MINPLUSI:
2619 
2620     case OP_NOTPLUS:
2621     case OP_NOTPLUSI:
2622     case OP_NOTMINPLUS:
2623     case OP_NOTMINPLUSI:
2624 
2625     case OP_POSPLUS:
2626     case OP_POSPLUSI:
2627     case OP_NOTPOSPLUS:
2628     case OP_NOTPOSPLUSI:
2629 
2630     case OP_EXACT:
2631     case OP_EXACTI:
2632     case OP_NOTEXACT:
2633     case OP_NOTEXACTI:
2634 
2635     case OP_TYPEPLUS:
2636     case OP_TYPEMINPLUS:
2637     case OP_TYPEPOSPLUS:
2638     case OP_TYPEEXACT:
2639 
2640     return FALSE;
2641 
2642     /* These are going to continue, as they may be empty, but we have to
2643     fudge the length for the \p and \P cases. */
2644 
2645     case OP_TYPESTAR:
2646     case OP_TYPEMINSTAR:
2647     case OP_TYPEPOSSTAR:
2648     case OP_TYPEQUERY:
2649     case OP_TYPEMINQUERY:
2650     case OP_TYPEPOSQUERY:
2651     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2652     break;
2653 
2654     /* Same for these */
2655 
2656     case OP_TYPEUPTO:
2657     case OP_TYPEMINUPTO:
2658     case OP_TYPEPOSUPTO:
2659     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2660       code += 2;
2661     break;
2662 
2663     /* End of branch */
2664 
2665     case OP_KET:
2666     case OP_KETRMAX:
2667     case OP_KETRMIN:
2668     case OP_KETRPOS:
2669     case OP_ALT:
2670     return TRUE;
2671 
2672     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2673     MINUPTO, and POSUPTO and their caseless and negative versions may be
2674     followed by a multibyte character. */
2675 
2676 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2677     case OP_STAR:
2678     case OP_STARI:
2679     case OP_NOTSTAR:
2680     case OP_NOTSTARI:
2681 
2682     case OP_MINSTAR:
2683     case OP_MINSTARI:
2684     case OP_NOTMINSTAR:
2685     case OP_NOTMINSTARI:
2686 
2687     case OP_POSSTAR:
2688     case OP_POSSTARI:
2689     case OP_NOTPOSSTAR:
2690     case OP_NOTPOSSTARI:
2691 
2692     case OP_QUERY:
2693     case OP_QUERYI:
2694     case OP_NOTQUERY:
2695     case OP_NOTQUERYI:
2696 
2697     case OP_MINQUERY:
2698     case OP_MINQUERYI:
2699     case OP_NOTMINQUERY:
2700     case OP_NOTMINQUERYI:
2701 
2702     case OP_POSQUERY:
2703     case OP_POSQUERYI:
2704     case OP_NOTPOSQUERY:
2705     case OP_NOTPOSQUERYI:
2706 
2707     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2708     break;
2709 
2710     case OP_UPTO:
2711     case OP_UPTOI:
2712     case OP_NOTUPTO:
2713     case OP_NOTUPTOI:
2714 
2715     case OP_MINUPTO:
2716     case OP_MINUPTOI:
2717     case OP_NOTMINUPTO:
2718     case OP_NOTMINUPTOI:
2719 
2720     case OP_POSUPTO:
2721     case OP_POSUPTOI:
2722     case OP_NOTPOSUPTO:
2723     case OP_NOTPOSUPTOI:
2724 
2725     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2726     break;
2727 #endif
2728 
2729     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2730     string. */
2731 
2732     case OP_MARK:
2733     case OP_PRUNE_ARG:
2734     case OP_SKIP_ARG:
2735     case OP_THEN_ARG:
2736     code += code[1];
2737     break;
2738 
2739     /* None of the remaining opcodes are required to match a character. */
2740 
2741     default:
2742     break;
2743     }
2744   }
2745 
2746 return TRUE;
2747 }
2748 
2749 
2750 
2751 /*************************************************
2752 *    Scan compiled regex for non-emptiness       *
2753 *************************************************/
2754 
2755 /* This function is called to check for left recursive calls. We want to check
2756 the current branch of the current pattern to see if it could match the empty
2757 string. If it could, we must look outwards for branches at other levels,
2758 stopping when we pass beyond the bracket which is the subject of the recursion.
2759 This function is called only during the real compile, not during the
2760 pre-compile.
2761 
2762 Arguments:
2763   code        points to start of the recursion
2764   endcode     points to where to stop (current RECURSE item)
2765   bcptr       points to the chain of current (unclosed) branch starts
2766   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2767   cd          pointers to tables etc
2768 
2769 Returns:      TRUE if what is matched could be empty
2770 */
2771 
2772 static BOOL
could_be_empty(const pcre_uchar * code,const pcre_uchar * endcode,branch_chain * bcptr,BOOL utf,compile_data * cd)2773 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2774   branch_chain *bcptr, BOOL utf, compile_data *cd)
2775 {
2776 while (bcptr != NULL && bcptr->current_branch >= code)
2777   {
2778   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2779     return FALSE;
2780   bcptr = bcptr->outer;
2781   }
2782 return TRUE;
2783 }
2784 
2785 
2786 
2787 /*************************************************
2788 *        Base opcode of repeated opcodes         *
2789 *************************************************/
2790 
2791 /* Returns the base opcode for repeated single character type opcodes. If the
2792 opcode is not a repeated character type, it returns with the original value.
2793 
2794 Arguments:  c opcode
2795 Returns:    base opcode for the type
2796 */
2797 
2798 static pcre_uchar
get_repeat_base(pcre_uchar c)2799 get_repeat_base(pcre_uchar c)
2800 {
2801 return (c > OP_TYPEPOSUPTO)? c :
2802        (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2803        (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2804        (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2805        (c >= OP_STARI)?      OP_STARI :
2806                              OP_STAR;
2807 }
2808 
2809 
2810 
#ifdef SUPPORT_UCP
/*************************************************
*        Check a character and a property        *
*************************************************/

/* This function is called by check_auto_possessive() when a property item
is adjacent to a fixed character. For most property types it works out whether
the character belongs to the property and then compares that with "negated":
the result is TRUE when the two agree, which is when the character cannot
match the (possibly negated) property item.

Arguments:
  c            the character
  ptype        the property type
  pdata        the data for the type
  negated      TRUE if it's a negated property (\P or \p{^)

Returns:       TRUE if auto-possessifying is OK
               FALSE also for a property type that is not handled here
*/

static BOOL
check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
  BOOL negated)
{
const ucd_record *prop = GET_UCD(c);
const pcre_uint32 *p;
BOOL member;    /* Whether c belongs to the property */

switch(ptype)
  {
  case PT_LAMP:
  member = prop->chartype == ucp_Lu ||
           prop->chartype == ucp_Ll ||
           prop->chartype == ucp_Lt;
  break;

  case PT_GC:
  member = pdata == PRIV(ucp_gentype)[prop->chartype];
  break;

  case PT_PC:
  member = pdata == prop->chartype;
  break;

  case PT_SC:
  member = pdata == prop->script;
  break;

  /* These are specials */

  case PT_ALNUM:
  member = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
           PRIV(ucp_gentype)[prop->chartype] == ucp_N;
  break;

  /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
  means that Perl space and POSIX space are now identical. PCRE was changed
  at release 8.34. The explicitly listed space characters give "negated" back
  directly; anything else is decided by its general category. */

  case PT_SPACE:    /* Perl space */
  case PT_PXSPACE:  /* POSIX space */
  switch(c)
    {
    HSPACE_CASES:
    VSPACE_CASES:
    return negated;

    default:
    member = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
    break;
    }
  break;

  case PT_WORD:
  member = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
           PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
           c == CHAR_UNDERSCORE;
  break;

  /* Walk the caseless set selected by the character's own record. The loop
  stops at the first entry that is greater than c, so the set is expected to
  be in ascending order and to end with a value larger than any character. */

  case PT_CLIST:
  p = PRIV(ucd_caseless_sets) + prop->caseset;
  while (c >= *p)
    {
    if (c == *p++) return negated;
    }
  return !negated;

  default:
  return FALSE;
  }

return member == negated;
}
#endif  /* SUPPORT_UCP */
2892 
2893 
2894 
2895 /*************************************************
2896 *        Fill the character property list        *
2897 *************************************************/
2898 
2899 /* Checks whether the code points to an opcode that can take part in auto-
2900 possessification, and if so, fills a list with its properties.
2901 
2902 Arguments:
2903   code        points to start of expression
2904   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2905   fcc         points to case-flipping table
2906   list        points to output list
2907               list[0] will be filled with the opcode
2908               list[1] will be non-zero if this opcode
2909                 can match an empty character string
2910               list[2..7] depends on the opcode
2911 
2912 Returns:      points to the start of the next opcode if *code is accepted
2913               NULL if *code is not accepted
2914 */
2915 
2916 static const pcre_uchar *
get_chr_property_list(const pcre_uchar * code,BOOL utf,const pcre_uint8 * fcc,pcre_uint32 * list)2917 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2918   const pcre_uint8 *fcc, pcre_uint32 *list)
2919 {
2920 pcre_uchar c = *code;
2921 pcre_uchar base;
2922 const pcre_uchar *end;
2923 pcre_uint32 chr;
2924 
2925 #ifdef SUPPORT_UCP
2926 pcre_uint32 *clist_dest;
2927 const pcre_uint32 *clist_src;
2928 #else
2929 utf = utf;  /* Suppress "unused parameter" compiler warning */
2930 #endif
2931 
2932 list[0] = c;
2933 list[1] = FALSE;
2934 code++;
2935 
2936 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2937   {
2938   base = get_repeat_base(c);
2939   c -= (base - OP_STAR);
2940 
2941   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2942     code += IMM2_SIZE;
2943 
2944   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2945 
2946   switch(base)
2947     {
2948     case OP_STAR:
2949     list[0] = OP_CHAR;
2950     break;
2951 
2952     case OP_STARI:
2953     list[0] = OP_CHARI;
2954     break;
2955 
2956     case OP_NOTSTAR:
2957     list[0] = OP_NOT;
2958     break;
2959 
2960     case OP_NOTSTARI:
2961     list[0] = OP_NOTI;
2962     break;
2963 
2964     case OP_TYPESTAR:
2965     list[0] = *code;
2966     code++;
2967     break;
2968     }
2969   c = list[0];
2970   }
2971 
2972 switch(c)
2973   {
2974   case OP_NOT_DIGIT:
2975   case OP_DIGIT:
2976   case OP_NOT_WHITESPACE:
2977   case OP_WHITESPACE:
2978   case OP_NOT_WORDCHAR:
2979   case OP_WORDCHAR:
2980   case OP_ANY:
2981   case OP_ALLANY:
2982   case OP_ANYNL:
2983   case OP_NOT_HSPACE:
2984   case OP_HSPACE:
2985   case OP_NOT_VSPACE:
2986   case OP_VSPACE:
2987   case OP_EXTUNI:
2988   case OP_EODN:
2989   case OP_EOD:
2990   case OP_DOLL:
2991   case OP_DOLLM:
2992   return code;
2993 
2994   case OP_CHAR:
2995   case OP_NOT:
2996   GETCHARINCTEST(chr, code);
2997   list[2] = chr;
2998   list[3] = NOTACHAR;
2999   return code;
3000 
3001   case OP_CHARI:
3002   case OP_NOTI:
3003   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
3004   GETCHARINCTEST(chr, code);
3005   list[2] = chr;
3006 
3007 #ifdef SUPPORT_UCP
3008   if (chr < 128 || (chr < 256 && !utf))
3009     list[3] = fcc[chr];
3010   else
3011     list[3] = UCD_OTHERCASE(chr);
3012 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
3013   list[3] = (chr < 256) ? fcc[chr] : chr;
3014 #else
3015   list[3] = fcc[chr];
3016 #endif
3017 
3018   /* The othercase might be the same value. */
3019 
3020   if (chr == list[3])
3021     list[3] = NOTACHAR;
3022   else
3023     list[4] = NOTACHAR;
3024   return code;
3025 
3026 #ifdef SUPPORT_UCP
3027   case OP_PROP:
3028   case OP_NOTPROP:
3029   if (code[0] != PT_CLIST)
3030     {
3031     list[2] = code[0];
3032     list[3] = code[1];
3033     return code + 2;
3034     }
3035 
3036   /* Convert only if we have enough space. */
3037 
3038   clist_src = PRIV(ucd_caseless_sets) + code[1];
3039   clist_dest = list + 2;
3040   code += 2;
3041 
3042   do {
3043      if (clist_dest >= list + 8)
3044        {
3045        /* Early return if there is not enough space. This should never
3046        happen, since all clists are shorter than 5 character now. */
3047        list[2] = code[0];
3048        list[3] = code[1];
3049        return code;
3050        }
3051      *clist_dest++ = *clist_src;
3052      }
3053   while(*clist_src++ != NOTACHAR);
3054 
3055   /* All characters are stored. The terminating NOTACHAR
3056   is copied form the clist itself. */
3057 
3058   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3059   return code;
3060 #endif
3061 
3062   case OP_NCLASS:
3063   case OP_CLASS:
3064 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3065   case OP_XCLASS:
3066   if (c == OP_XCLASS)
3067     end = code + GET(code, 0) - 1;
3068   else
3069 #endif
3070     end = code + 32 / sizeof(pcre_uchar);
3071 
3072   switch(*end)
3073     {
3074     case OP_CRSTAR:
3075     case OP_CRMINSTAR:
3076     case OP_CRQUERY:
3077     case OP_CRMINQUERY:
3078     case OP_CRPOSSTAR:
3079     case OP_CRPOSQUERY:
3080     list[1] = TRUE;
3081     end++;
3082     break;
3083 
3084     case OP_CRPLUS:
3085     case OP_CRMINPLUS:
3086     case OP_CRPOSPLUS:
3087     end++;
3088     break;
3089 
3090     case OP_CRRANGE:
3091     case OP_CRMINRANGE:
3092     case OP_CRPOSRANGE:
3093     list[1] = (GET2(end, 1) == 0);
3094     end += 1 + 2 * IMM2_SIZE;
3095     break;
3096     }
3097   list[2] = (pcre_uint32)(end - code);
3098   return end;
3099   }
3100 return NULL;    /* Opcode not accepted */
3101 }
3102 
3103 
3104 
3105 /*************************************************
3106 *    Scan further character sets for match       *
3107 *************************************************/
3108 
3109 /* Checks whether the base and the current opcode have a common character, in
3110 which case the base cannot be possessified.
3111 
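Arguments:
  code        points to the byte code
  utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
  cd          static compile data
  base_list   the data list of the base opcode
  base_end    points just past the base opcode in the compiled code; the
                class data of the base opcode is located relative to it
  rec_limit   points to a counter of the calls that are still allowed; it
                is decremented on every call, and FALSE is returned once
                it has reached zero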
3117 
3118 Returns:      TRUE if the auto-possessification is possible
3119 */
3120 
3121 static BOOL
compare_opcodes(const pcre_uchar * code,BOOL utf,const compile_data * cd,const pcre_uint32 * base_list,const pcre_uchar * base_end,int * rec_limit)3122 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3123   const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3124 {
3125 pcre_uchar c;
3126 pcre_uint32 list[8];
3127 const pcre_uint32 *chr_ptr;
3128 const pcre_uint32 *ochr_ptr;
3129 const pcre_uint32 *list_ptr;
3130 const pcre_uchar *next_code;
3131 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3132 const pcre_uchar *xclass_flags;
3133 #endif
3134 const pcre_uint8 *class_bitset;
3135 const pcre_uint8 *set1, *set2, *set_end;
3136 pcre_uint32 chr;
3137 BOOL accepted, invert_bits;
3138 BOOL entered_a_group = FALSE;
3139 
3140 if (*rec_limit == 0) return FALSE;
3141 --(*rec_limit);
3142 
3143 /* Note: the base_list[1] contains whether the current opcode has greedy
3144 (represented by a non-zero value) quantifier. This is a different from
3145 other character type lists, which stores here that the character iterator
3146 matches to an empty string (also represented by a non-zero value). */
3147 
3148 for(;;)
3149   {
3150   /* All operations move the code pointer forward.
3151   Therefore infinite recursions are not possible. */
3152 
3153   c = *code;
3154 
3155   /* Skip over callouts */
3156 
3157   if (c == OP_CALLOUT)
3158     {
3159     code += PRIV(OP_lengths)[c];
3160     continue;
3161     }
3162 
3163   if (c == OP_ALT)
3164     {
3165     do code += GET(code, 1); while (*code == OP_ALT);
3166     c = *code;
3167     }
3168 
3169   switch(c)
3170     {
3171     case OP_END:
3172     case OP_KETRPOS:
3173     /* TRUE only in greedy case. The non-greedy case could be replaced by
3174     an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3175     uses more memory, which we cannot get at this stage.) */
3176 
3177     return base_list[1] != 0;
3178 
3179     case OP_KET:
3180     /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3181     it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3182     cannot be converted to a possessive form. */
3183 
3184     if (base_list[1] == 0) return FALSE;
3185 
3186     switch(*(code - GET(code, 1)))
3187       {
3188       case OP_ASSERT:
3189       case OP_ASSERT_NOT:
3190       case OP_ASSERTBACK:
3191       case OP_ASSERTBACK_NOT:
3192       case OP_ONCE:
3193       case OP_ONCE_NC:
3194       /* Atomic sub-patterns and assertions can always auto-possessify their
3195       last iterator. However, if the group was entered as a result of checking
3196       a previous iterator, this is not possible. */
3197 
3198       return !entered_a_group;
3199       }
3200 
3201     code += PRIV(OP_lengths)[c];
3202     continue;
3203 
3204     case OP_ONCE:
3205     case OP_ONCE_NC:
3206     case OP_BRA:
3207     case OP_CBRA:
3208     next_code = code + GET(code, 1);
3209     code += PRIV(OP_lengths)[c];
3210 
3211     while (*next_code == OP_ALT)
3212       {
3213       if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3214         return FALSE;
3215       code = next_code + 1 + LINK_SIZE;
3216       next_code += GET(next_code, 1);
3217       }
3218 
3219     entered_a_group = TRUE;
3220     continue;
3221 
3222     case OP_BRAZERO:
3223     case OP_BRAMINZERO:
3224 
3225     next_code = code + 1;
3226     if (*next_code != OP_BRA && *next_code != OP_CBRA
3227         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3228 
3229     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3230 
3231     /* The bracket content will be checked by the
3232     OP_BRA/OP_CBRA case above. */
3233     next_code += 1 + LINK_SIZE;
3234     if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3235       return FALSE;
3236 
3237     code += PRIV(OP_lengths)[c];
3238     continue;
3239 
3240     default:
3241     break;
3242     }
3243 
3244   /* Check for a supported opcode, and load its properties. */
3245 
3246   code = get_chr_property_list(code, utf, cd->fcc, list);
3247   if (code == NULL) return FALSE;    /* Unsupported */
3248 
3249   /* If either opcode is a small character list, set pointers for comparing
3250   characters from that list with another list, or with a property. */
3251 
3252   if (base_list[0] == OP_CHAR)
3253     {
3254     chr_ptr = base_list + 2;
3255     list_ptr = list;
3256     }
3257   else if (list[0] == OP_CHAR)
3258     {
3259     chr_ptr = list + 2;
3260     list_ptr = base_list;
3261     }
3262 
3263   /* Character bitsets can also be compared to certain opcodes. */
3264 
3265   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3266 #ifdef COMPILE_PCRE8
3267       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3268       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3269 #endif
3270       )
3271     {
3272 #ifdef COMPILE_PCRE8
3273     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3274 #else
3275     if (base_list[0] == OP_CLASS)
3276 #endif
3277       {
3278       set1 = (pcre_uint8 *)(base_end - base_list[2]);
3279       list_ptr = list;
3280       }
3281     else
3282       {
3283       set1 = (pcre_uint8 *)(code - list[2]);
3284       list_ptr = base_list;
3285       }
3286 
3287     invert_bits = FALSE;
3288     switch(list_ptr[0])
3289       {
3290       case OP_CLASS:
3291       case OP_NCLASS:
3292       set2 = (pcre_uint8 *)
3293         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3294       break;
3295 
3296 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3297       case OP_XCLASS:
3298       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3299       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3300       if ((*xclass_flags & XCL_MAP) == 0)
3301         {
3302         /* No bits are set for characters < 256. */
3303         if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0;
3304         /* Might be an empty repeat. */
3305         continue;
3306         }
3307       set2 = (pcre_uint8 *)(xclass_flags + 1);
3308       break;
3309 #endif
3310 
3311       case OP_NOT_DIGIT:
3312       invert_bits = TRUE;
3313       /* Fall through */
3314       case OP_DIGIT:
3315       set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3316       break;
3317 
3318       case OP_NOT_WHITESPACE:
3319       invert_bits = TRUE;
3320       /* Fall through */
3321       case OP_WHITESPACE:
3322       set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3323       break;
3324 
3325       case OP_NOT_WORDCHAR:
3326       invert_bits = TRUE;
3327       /* Fall through */
3328       case OP_WORDCHAR:
3329       set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3330       break;
3331 
3332       default:
3333       return FALSE;
3334       }
3335 
3336     /* Because the sets are unaligned, we need
3337     to perform byte comparison here. */
3338     set_end = set1 + 32;
3339     if (invert_bits)
3340       {
3341       do
3342         {
3343         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3344         }
3345       while (set1 < set_end);
3346       }
3347     else
3348       {
3349       do
3350         {
3351         if ((*set1++ & *set2++) != 0) return FALSE;
3352         }
3353       while (set1 < set_end);
3354       }
3355 
3356     if (list[1] == 0) return TRUE;
3357     /* Might be an empty repeat. */
3358     continue;
3359     }
3360 
3361   /* Some property combinations also acceptable. Unicode property opcodes are
3362   processed specially; the rest can be handled with a lookup table. */
3363 
3364   else
3365     {
3366     pcre_uint32 leftop, rightop;
3367 
3368     leftop = base_list[0];
3369     rightop = list[0];
3370 
3371 #ifdef SUPPORT_UCP
3372     accepted = FALSE; /* Always set in non-unicode case. */
3373     if (leftop == OP_PROP || leftop == OP_NOTPROP)
3374       {
3375       if (rightop == OP_EOD)
3376         accepted = TRUE;
3377       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3378         {
3379         int n;
3380         const pcre_uint8 *p;
3381         BOOL same = leftop == rightop;
3382         BOOL lisprop = leftop == OP_PROP;
3383         BOOL risprop = rightop == OP_PROP;
3384         BOOL bothprop = lisprop && risprop;
3385 
3386         /* There's a table that specifies how each combination is to be
3387         processed:
3388           0   Always return FALSE (never auto-possessify)
3389           1   Character groups are distinct (possessify if both are OP_PROP)
3390           2   Check character categories in the same group (general or particular)
3391           3   Return TRUE if the two opcodes are not the same
3392           ... see comments below
3393         */
3394 
3395         n = propposstab[base_list[2]][list[2]];
3396         switch(n)
3397           {
3398           case 0: break;
3399           case 1: accepted = bothprop; break;
3400           case 2: accepted = (base_list[3] == list[3]) != same; break;
3401           case 3: accepted = !same; break;
3402 
3403           case 4:  /* Left general category, right particular category */
3404           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3405           break;
3406 
3407           case 5:  /* Right general category, left particular category */
3408           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3409           break;
3410 
3411           /* This code is logically tricky. Think hard before fiddling with it.
3412           The posspropstab table has four entries per row. Each row relates to
3413           one of PCRE's special properties such as ALNUM or SPACE or WORD.
3414           Only WORD actually needs all four entries, but using repeats for the
3415           others means they can all use the same code below.
3416 
3417           The first two entries in each row are Unicode general categories, and
3418           apply always, because all the characters they include are part of the
3419           PCRE character set. The third and fourth entries are a general and a
3420           particular category, respectively, that include one or more relevant
3421           characters. One or the other is used, depending on whether the check
3422           is for a general or a particular category. However, in both cases the
3423           category contains more characters than the specials that are defined
3424           for the property being tested against. Therefore, it cannot be used
3425           in a NOTPROP case.
3426 
3427           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3428           Underscore is covered by ucp_P or ucp_Po. */
3429 
3430           case 6:  /* Left alphanum vs right general category */
3431           case 7:  /* Left space vs right general category */
3432           case 8:  /* Left word vs right general category */
3433           p = posspropstab[n-6];
3434           accepted = risprop && lisprop ==
3435             (list[3] != p[0] &&
3436              list[3] != p[1] &&
3437             (list[3] != p[2] || !lisprop));
3438           break;
3439 
3440           case 9:   /* Right alphanum vs left general category */
3441           case 10:  /* Right space vs left general category */
3442           case 11:  /* Right word vs left general category */
3443           p = posspropstab[n-9];
3444           accepted = lisprop && risprop ==
3445             (base_list[3] != p[0] &&
3446              base_list[3] != p[1] &&
3447             (base_list[3] != p[2] || !risprop));
3448           break;
3449 
3450           case 12:  /* Left alphanum vs right particular category */
3451           case 13:  /* Left space vs right particular category */
3452           case 14:  /* Left word vs right particular category */
3453           p = posspropstab[n-12];
3454           accepted = risprop && lisprop ==
3455             (catposstab[p[0]][list[3]] &&
3456              catposstab[p[1]][list[3]] &&
3457             (list[3] != p[3] || !lisprop));
3458           break;
3459 
3460           case 15:  /* Right alphanum vs left particular category */
3461           case 16:  /* Right space vs left particular category */
3462           case 17:  /* Right word vs left particular category */
3463           p = posspropstab[n-15];
3464           accepted = lisprop && risprop ==
3465             (catposstab[p[0]][base_list[3]] &&
3466              catposstab[p[1]][base_list[3]] &&
3467             (base_list[3] != p[3] || !risprop));
3468           break;
3469           }
3470         }
3471       }
3472 
3473     else
3474 #endif  /* SUPPORT_UCP */
3475 
3476     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3477            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3478            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3479 
3480     if (!accepted) return FALSE;
3481 
3482     if (list[1] == 0) return TRUE;
3483     /* Might be an empty repeat. */
3484     continue;
3485     }
3486 
3487   /* Control reaches here only if one of the items is a small character list.
3488   All characters are checked against the other side. */
3489 
3490   do
3491     {
3492     chr = *chr_ptr;
3493 
3494     switch(list_ptr[0])
3495       {
3496       case OP_CHAR:
3497       ochr_ptr = list_ptr + 2;
3498       do
3499         {
3500         if (chr == *ochr_ptr) return FALSE;
3501         ochr_ptr++;
3502         }
3503       while(*ochr_ptr != NOTACHAR);
3504       break;
3505 
3506       case OP_NOT:
3507       ochr_ptr = list_ptr + 2;
3508       do
3509         {
3510         if (chr == *ochr_ptr)
3511           break;
3512         ochr_ptr++;
3513         }
3514       while(*ochr_ptr != NOTACHAR);
3515       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3516       break;
3517 
3518       /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3519       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3520 
3521       case OP_DIGIT:
3522       if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3523       break;
3524 
3525       case OP_NOT_DIGIT:
3526       if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3527       break;
3528 
3529       case OP_WHITESPACE:
3530       if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3531       break;
3532 
3533       case OP_NOT_WHITESPACE:
3534       if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3535       break;
3536 
3537       case OP_WORDCHAR:
3538       if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3539       break;
3540 
3541       case OP_NOT_WORDCHAR:
3542       if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3543       break;
3544 
3545       case OP_HSPACE:
3546       switch(chr)
3547         {
3548         HSPACE_CASES: return FALSE;
3549         default: break;
3550         }
3551       break;
3552 
3553       case OP_NOT_HSPACE:
3554       switch(chr)
3555         {
3556         HSPACE_CASES: break;
3557         default: return FALSE;
3558         }
3559       break;
3560 
3561       case OP_ANYNL:
3562       case OP_VSPACE:
3563       switch(chr)
3564         {
3565         VSPACE_CASES: return FALSE;
3566         default: break;
3567         }
3568       break;
3569 
3570       case OP_NOT_VSPACE:
3571       switch(chr)
3572         {
3573         VSPACE_CASES: break;
3574         default: return FALSE;
3575         }
3576       break;
3577 
3578       case OP_DOLL:
3579       case OP_EODN:
3580       switch (chr)
3581         {
3582         case CHAR_CR:
3583         case CHAR_LF:
3584         case CHAR_VT:
3585         case CHAR_FF:
3586         case CHAR_NEL:
3587 #ifndef EBCDIC
3588         case 0x2028:
3589         case 0x2029:
3590 #endif  /* Not EBCDIC */
3591         return FALSE;
3592         }
3593       break;
3594 
3595       case OP_EOD:    /* Can always possessify before \z */
3596       break;
3597 
3598 #ifdef SUPPORT_UCP
3599       case OP_PROP:
3600       case OP_NOTPROP:
3601       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3602             list_ptr[0] == OP_NOTPROP))
3603         return FALSE;
3604       break;
3605 #endif
3606 
3607       case OP_NCLASS:
3608       if (chr > 255) return FALSE;
3609       /* Fall through */
3610 
3611       case OP_CLASS:
3612       if (chr > 255) break;
3613       class_bitset = (pcre_uint8 *)
3614         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3615       if ((class_bitset[chr >> 3] & (1U << (chr & 7))) != 0) return FALSE;
3616       break;
3617 
3618 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3619       case OP_XCLASS:
3620       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3621           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3622       break;
3623 #endif
3624 
3625       default:
3626       return FALSE;
3627       }
3628 
3629     chr_ptr++;
3630     }
3631   while(*chr_ptr != NOTACHAR);
3632 
3633   /* At least one character must be matched from this opcode. */
3634 
3635   if (list[1] == 0) return TRUE;
3636   }
3637 
/* Control never reaches here. There used to be a fail-safe return FALSE; here,
but some compilers complain about an unreachable statement. */
3640 
3641 }
3642 
3643 
3644 
3645 /*************************************************
3646 *    Scan compiled regex for auto-possession     *
3647 *************************************************/
3648 
3649 /* Replaces single character iterations with their possessive alternatives
3650 if appropriate. This function modifies the compiled opcode!
3651 
3652 Arguments:
3653   code        points to start of the byte code
3654   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3655   cd          static compile data
3656 
3657 Returns:      nothing
3658 */
3659 
3660 static void
auto_possessify(pcre_uchar * code,BOOL utf,const compile_data * cd)3661 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3662 {
3663 register pcre_uchar c;
3664 const pcre_uchar *end;
3665 pcre_uchar *repeat_opcode;
3666 pcre_uint32 list[8];
3667 int rec_limit;
3668 
3669 for (;;)
3670   {
3671   c = *code;
3672 
3673   /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
3674   it may compile without complaining, but may get into a loop here if the code
3675   pointer points to a bad value. This is, of course a documentated possibility,
3676   when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
3677   just give up on this optimization. */
3678 
3679   if (c >= OP_TABLE_LENGTH) return;
3680 
3681   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3682     {
3683     c -= get_repeat_base(c) - OP_STAR;
3684     end = (c <= OP_MINUPTO) ?
3685       get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3686     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3687 
3688     rec_limit = 1000;
3689     if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
3690       {
3691       switch(c)
3692         {
3693         case OP_STAR:
3694         *code += OP_POSSTAR - OP_STAR;
3695         break;
3696 
3697         case OP_MINSTAR:
3698         *code += OP_POSSTAR - OP_MINSTAR;
3699         break;
3700 
3701         case OP_PLUS:
3702         *code += OP_POSPLUS - OP_PLUS;
3703         break;
3704 
3705         case OP_MINPLUS:
3706         *code += OP_POSPLUS - OP_MINPLUS;
3707         break;
3708 
3709         case OP_QUERY:
3710         *code += OP_POSQUERY - OP_QUERY;
3711         break;
3712 
3713         case OP_MINQUERY:
3714         *code += OP_POSQUERY - OP_MINQUERY;
3715         break;
3716 
3717         case OP_UPTO:
3718         *code += OP_POSUPTO - OP_UPTO;
3719         break;
3720 
3721         case OP_MINUPTO:
3722         *code += OP_POSUPTO - OP_MINUPTO;
3723         break;
3724         }
3725       }
3726     c = *code;
3727     }
3728   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3729     {
3730 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3731     if (c == OP_XCLASS)
3732       repeat_opcode = code + GET(code, 1);
3733     else
3734 #endif
3735       repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3736 
3737     c = *repeat_opcode;
3738     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3739       {
3740       /* end must not be NULL. */
3741       end = get_chr_property_list(code, utf, cd->fcc, list);
3742 
3743       list[1] = (c & 1) == 0;
3744 
3745       rec_limit = 1000;
3746       if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
3747         {
3748         switch (c)
3749           {
3750           case OP_CRSTAR:
3751           case OP_CRMINSTAR:
3752           *repeat_opcode = OP_CRPOSSTAR;
3753           break;
3754 
3755           case OP_CRPLUS:
3756           case OP_CRMINPLUS:
3757           *repeat_opcode = OP_CRPOSPLUS;
3758           break;
3759 
3760           case OP_CRQUERY:
3761           case OP_CRMINQUERY:
3762           *repeat_opcode = OP_CRPOSQUERY;
3763           break;
3764 
3765           case OP_CRRANGE:
3766           case OP_CRMINRANGE:
3767           *repeat_opcode = OP_CRPOSRANGE;
3768           break;
3769           }
3770         }
3771       }
3772     c = *code;
3773     }
3774 
3775   switch(c)
3776     {
3777     case OP_END:
3778     return;
3779 
3780     case OP_TYPESTAR:
3781     case OP_TYPEMINSTAR:
3782     case OP_TYPEPLUS:
3783     case OP_TYPEMINPLUS:
3784     case OP_TYPEQUERY:
3785     case OP_TYPEMINQUERY:
3786     case OP_TYPEPOSSTAR:
3787     case OP_TYPEPOSPLUS:
3788     case OP_TYPEPOSQUERY:
3789     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3790     break;
3791 
3792     case OP_TYPEUPTO:
3793     case OP_TYPEMINUPTO:
3794     case OP_TYPEEXACT:
3795     case OP_TYPEPOSUPTO:
3796     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3797       code += 2;
3798     break;
3799 
3800 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3801     case OP_XCLASS:
3802     code += GET(code, 1);
3803     break;
3804 #endif
3805 
3806     case OP_MARK:
3807     case OP_PRUNE_ARG:
3808     case OP_SKIP_ARG:
3809     case OP_THEN_ARG:
3810     code += code[1];
3811     break;
3812     }
3813 
3814   /* Add in the fixed length from the table */
3815 
3816   code += PRIV(OP_lengths)[c];
3817 
3818   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3819   a multi-byte character. The length in the table is a minimum, so we have to
3820   arrange to skip the extra bytes. */
3821 
3822 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3823   if (utf) switch(c)
3824     {
3825     case OP_CHAR:
3826     case OP_CHARI:
3827     case OP_NOT:
3828     case OP_NOTI:
3829     case OP_STAR:
3830     case OP_MINSTAR:
3831     case OP_PLUS:
3832     case OP_MINPLUS:
3833     case OP_QUERY:
3834     case OP_MINQUERY:
3835     case OP_UPTO:
3836     case OP_MINUPTO:
3837     case OP_EXACT:
3838     case OP_POSSTAR:
3839     case OP_POSPLUS:
3840     case OP_POSQUERY:
3841     case OP_POSUPTO:
3842     case OP_STARI:
3843     case OP_MINSTARI:
3844     case OP_PLUSI:
3845     case OP_MINPLUSI:
3846     case OP_QUERYI:
3847     case OP_MINQUERYI:
3848     case OP_UPTOI:
3849     case OP_MINUPTOI:
3850     case OP_EXACTI:
3851     case OP_POSSTARI:
3852     case OP_POSPLUSI:
3853     case OP_POSQUERYI:
3854     case OP_POSUPTOI:
3855     case OP_NOTSTAR:
3856     case OP_NOTMINSTAR:
3857     case OP_NOTPLUS:
3858     case OP_NOTMINPLUS:
3859     case OP_NOTQUERY:
3860     case OP_NOTMINQUERY:
3861     case OP_NOTUPTO:
3862     case OP_NOTMINUPTO:
3863     case OP_NOTEXACT:
3864     case OP_NOTPOSSTAR:
3865     case OP_NOTPOSPLUS:
3866     case OP_NOTPOSQUERY:
3867     case OP_NOTPOSUPTO:
3868     case OP_NOTSTARI:
3869     case OP_NOTMINSTARI:
3870     case OP_NOTPLUSI:
3871     case OP_NOTMINPLUSI:
3872     case OP_NOTQUERYI:
3873     case OP_NOTMINQUERYI:
3874     case OP_NOTUPTOI:
3875     case OP_NOTMINUPTOI:
3876     case OP_NOTEXACTI:
3877     case OP_NOTPOSSTARI:
3878     case OP_NOTPOSPLUSI:
3879     case OP_NOTPOSQUERYI:
3880     case OP_NOTPOSUPTOI:
3881     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3882     break;
3883     }
3884 #else
3885   (void)(utf);  /* Keep compiler happy by referencing function argument */
3886 #endif
3887   }
3888 }
3889 
3890 
3891 
3892 /*************************************************
3893 *           Check for POSIX class syntax         *
3894 *************************************************/
3895 
3896 /* This function is called when the sequence "[:" or "[." or "[=" is
3897 encountered in a character class. It checks whether this is followed by a
3898 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3899 reach an unescaped ']' without the special preceding character, return FALSE.
3900 
3901 Originally, this function only recognized a sequence of letters between the
3902 terminators, but it seems that Perl recognizes any sequence of characters,
3903 though of course unknown POSIX names are subsequently rejected. Perl gives an
3904 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3905 didn't consider this to be a POSIX class. Likewise for [:1234:].
3906 
3907 The problem in trying to be exactly like Perl is in the handling of escapes. We
3908 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3909 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3910 below handles the special cases \\ and \], but does not try to do any other
3911 escape processing. This makes it different from Perl for cases such as
3912 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
3913 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
3914 when Perl does, I think.
3915 
3916 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3917 It seems that the appearance of a nested POSIX class supersedes an apparent
3918 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3919 a digit.
3920 
3921 In Perl, unescaped square brackets may also appear as part of class names. For
3922 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3923 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3924 seem right at all. PCRE does not allow closing square brackets in POSIX class
3925 names.
3926 
3927 Arguments:
3928   ptr      pointer to the initial [
3929   endptr   where to return the end pointer
3930 
3931 Returns:   TRUE or FALSE
3932 */
3933 
3934 static BOOL
check_posix_syntax(const pcre_uchar * ptr,const pcre_uchar ** endptr)3935 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3936 {
3937 pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
3938 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3939 for (++ptr; *ptr != CHAR_NULL; ptr++)
3940   {
3941   if (*ptr == CHAR_BACKSLASH &&
3942       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
3943        ptr[1] == CHAR_BACKSLASH))
3944     ptr++;
3945   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
3946             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3947   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3948     {
3949     *endptr = ptr;
3950     return TRUE;
3951     }
3952   }
3953 return FALSE;
3954 }
3955 
3956 
3957 
3958 
3959 /*************************************************
3960 *          Check POSIX class name                *
3961 *************************************************/
3962 
3963 /* This function is called to check the name given in a POSIX-style class entry
3964 such as [:alnum:].
3965 
3966 Arguments:
3967   ptr        points to the first letter
3968   len        the length of the name
3969 
3970 Returns:     a value representing the name, or -1 if unknown
3971 */
3972 
3973 static int
check_posix_name(const pcre_uchar * ptr,int len)3974 check_posix_name(const pcre_uchar *ptr, int len)
3975 {
3976 const char *pn = posix_names;
3977 register int yield = 0;
3978 while (posix_name_lengths[yield] != 0)
3979   {
3980   if (len == posix_name_lengths[yield] &&
3981     STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3982   pn += posix_name_lengths[yield] + 1;
3983   yield++;
3984   }
3985 return -1;
3986 }
3987 
3988 
3989 /*************************************************
3990 *    Adjust OP_RECURSE items in repeated group   *
3991 *************************************************/
3992 
3993 /* OP_RECURSE items contain an offset from the start of the regex to the group
3994 that is referenced. This means that groups can be replicated for fixed
3995 repetition simply by copying (because the recursion is allowed to refer to
3996 earlier groups that are outside the current group). However, when a group is
3997 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3998 inserted before it, after it has been compiled. This means that any OP_RECURSE
3999 items within it that refer to the group itself or any contained groups have to
4000 have their offsets adjusted. That one of the jobs of this function. Before it
4001 is called, the partially compiled regex must be temporarily terminated with
4002 OP_END.
4003 
4004 This function has been extended to cope with forward references for recursions
4005 and subroutine calls. It must check the list of such references for the
4006 group we are dealing with. If it finds that one of the recursions in the
4007 current group is on this list, it does not adjust the value in the reference
4008 (which is a group number). After the group has been scanned, all the offsets in
4009 the forward reference list for the group are adjusted.
4010 
4011 Arguments:
4012   group      points to the start of the group
4013   adjust     the amount by which the group is to be moved
4014   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
4015   cd         contains pointers to tables etc.
4016   save_hwm_offset   the hwm forward reference offset at the start of the group
4017 
4018 Returns:     nothing
4019 */
4020 
4021 static void
adjust_recurse(pcre_uchar * group,int adjust,BOOL utf,compile_data * cd,size_t save_hwm_offset)4022 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4023   size_t save_hwm_offset)
4024 {
4025 int offset;
4026 pcre_uchar *hc;
4027 pcre_uchar *ptr = group;
4028 
4029 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4030   {
4031   for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4032        hc += LINK_SIZE)
4033     {
4034     offset = (int)GET(hc, 0);
4035     if (cd->start_code + offset == ptr + 1) break;
4036     }
4037 
4038   /* If we have not found this recursion on the forward reference list, adjust
4039   the recursion's offset if it's after the start of this group. */
4040 
4041   if (hc >= cd->hwm)
4042     {
4043     offset = (int)GET(ptr, 1);
4044     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
4045     }
4046 
4047   ptr += 1 + LINK_SIZE;
4048   }
4049 
4050 /* Now adjust all forward reference offsets for the group. */
4051 
4052 for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4053      hc += LINK_SIZE)
4054   {
4055   offset = (int)GET(hc, 0);
4056   PUT(hc, 0, offset + adjust);
4057   }
4058 }
4059 
4060 
4061 
4062 /*************************************************
4063 *        Insert an automatic callout point       *
4064 *************************************************/
4065 
4066 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
4067 callout points before each pattern item.
4068 
4069 Arguments:
4070   code           current code pointer
4071   ptr            current pattern pointer
4072   cd             pointers to tables etc
4073 
4074 Returns:         new code pointer
4075 */
4076 
4077 static pcre_uchar *
auto_callout(pcre_uchar * code,const pcre_uchar * ptr,compile_data * cd)4078 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4079 {
4080 *code++ = OP_CALLOUT;
4081 *code++ = 255;
4082 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
4083 PUT(code, LINK_SIZE, 0);                       /* Default length */
4084 return code + 2 * LINK_SIZE;
4085 }
4086 
4087 
4088 
4089 /*************************************************
4090 *         Complete a callout item                *
4091 *************************************************/
4092 
4093 /* A callout item contains the length of the next item in the pattern, which
4094 we can't fill in till after we have reached the relevant point. This is used
4095 for both automatic and manual callouts.
4096 
4097 Arguments:
4098   previous_callout   points to previous callout item
4099   ptr                current pattern pointer
4100   cd                 pointers to tables etc
4101 
4102 Returns:             nothing
4103 */
4104 
4105 static void
complete_callout(pcre_uchar * previous_callout,const pcre_uchar * ptr,compile_data * cd)4106 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4107 {
4108 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4109 PUT(previous_callout, 2 + LINK_SIZE, length);
4110 }
4111 
4112 
4113 
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* Given the start and end of a class range (UTF-8 mode with UCP support), this
function searches upwards through the characters for ranges of characters in
the "other" case. Each call hands back the next such range and moves the start
value on. A character that has several other cases is returned by itself, with
a special return value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original
*/

static int
get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
  pcre_uint32 *odptr)
{
pcre_uint32 ch, first_other, expected;
unsigned int caseset;

/* Look for the first character that has an other case. One that belongs to a
caseless set is reported straight away by its case offset value. */

ch = *cptr;
for (;;)
  {
  if (ch > d) return -1;        /* Ran off the end of the range */

  caseset = UCD_CASESET(ch);
  if (caseset != 0)
    {
    *ocptr = ch;                /* The character that has the set */
    *cptr = ch + 1;             /* What is left of the input range */
    return (int)caseset;
    }

  first_other = UCD_OTHERCASE(ch);
  if (first_other != ch) break;
  ch++;
  }

/* ch has exactly one other case. The range continues for as long as the input
lasts, no caseless set turns up, and the other cases stay consecutive. */

*ocptr = first_other;
expected = first_other + 1;
ch++;

while (ch <= d && UCD_CASESET(ch) == 0 && UCD_OTHERCASE(ch) == expected)
  {
  expected++;
  ch++;
  }

*odptr = expected - 1;          /* Last character of the othercase range */
*cptr = ch;                     /* What is left of the input range */
return 0;
}
#endif  /* SUPPORT_UCP */
4178 
4179 
4180 
4181 /*************************************************
4182 *        Add a character or range to a class     *
4183 *************************************************/
4184 
4185 /* This function packages up the logic of adding a character or range of
4186 characters to a class. The character values in the arguments will be within the
4187 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4188 mutually recursive with the function immediately below.
4189 
4190 Arguments:
4191   classbits     the bit map for characters < 256
4192   uchardptr     points to the pointer for extra data
4193   options       the options word
4194   cd            contains pointers to tables etc.
4195   start         start of range character
4196   end           end of range character
4197 
4198 Returns:        the number of < 256 characters added
4199                 the pointer to extra data is updated
4200 */
4201 
4202 static int
add_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,pcre_uint32 start,pcre_uint32 end)4203 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4204   compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4205 {
4206 pcre_uint32 c;
4207 pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4208 int n8 = 0;
4209 
4210 /* If caseless matching is required, scan the range and process alternate
4211 cases. In Unicode, there are 8-bit characters that have alternate cases that
4212 are greater than 255 and vice-versa. Sometimes we can just extend the original
4213 range. */
4214 
4215 if ((options & PCRE_CASELESS) != 0)
4216   {
4217 #ifdef SUPPORT_UCP
4218   if ((options & PCRE_UTF8) != 0)
4219     {
4220     int rc;
4221     pcre_uint32 oc, od;
4222 
4223     options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
4224     c = start;
4225 
4226     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4227       {
4228       /* Handle a single character that has more than one other case. */
4229 
4230       if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4231         PRIV(ucd_caseless_sets) + rc, oc);
4232 
4233       /* Do nothing if the other case range is within the original range. */
4234 
4235       else if (oc >= start && od <= end) continue;
4236 
4237       /* Extend the original range if there is overlap, noting that if oc < c, we
4238       can't have od > end because a subrange is always shorter than the basic
4239       range. Otherwise, use a recursive call to add the additional range. */
4240 
4241       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4242       else if (od > end && oc <= end + 1)
4243         {
4244         end = od;       /* Extend upwards */
4245         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4246         }
4247       else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4248       }
4249     }
4250   else
4251 #endif  /* SUPPORT_UCP */
4252 
4253   /* Not UTF-mode, or no UCP */
4254 
4255   for (c = start; c <= classbits_end; c++)
4256     {
4257     SETBIT(classbits, cd->fcc[c]);
4258     n8++;
4259     }
4260   }
4261 
4262 /* Now handle the original range. Adjust the final value according to the bit
4263 length - this means that the same lists of (e.g.) horizontal spaces can be used
4264 in all cases. */
4265 
4266 #if defined COMPILE_PCRE8
4267 #ifdef SUPPORT_UTF
4268   if ((options & PCRE_UTF8) == 0)
4269 #endif
4270   if (end > 0xff) end = 0xff;
4271 
4272 #elif defined COMPILE_PCRE16
4273 #ifdef SUPPORT_UTF
4274   if ((options & PCRE_UTF16) == 0)
4275 #endif
4276   if (end > 0xffff) end = 0xffff;
4277 
4278 #endif /* COMPILE_PCRE[8|16] */
4279 
4280 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4281 
4282 for (c = start; c <= classbits_end; c++)
4283   {
4284   /* Regardless of start, c will always be <= 255. */
4285   SETBIT(classbits, c);
4286   n8++;
4287   }
4288 
4289 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4290 if (start <= 0xff) start = 0xff + 1;
4291 
4292 if (end >= start)
4293   {
4294   pcre_uchar *uchardata = *uchardptr;
4295 #ifdef SUPPORT_UTF
4296   if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
4297     {
4298     if (start < end)
4299       {
4300       *uchardata++ = XCL_RANGE;
4301       uchardata += PRIV(ord2utf)(start, uchardata);
4302       uchardata += PRIV(ord2utf)(end, uchardata);
4303       }
4304     else if (start == end)
4305       {
4306       *uchardata++ = XCL_SINGLE;
4307       uchardata += PRIV(ord2utf)(start, uchardata);
4308       }
4309     }
4310   else
4311 #endif  /* SUPPORT_UTF */
4312 
4313   /* Without UTF support, character values are constrained by the bit length,
4314   and can only be > 256 for 16-bit and 32-bit libraries. */
4315 
4316 #ifdef COMPILE_PCRE8
4317     {}
4318 #else
4319   if (start < end)
4320     {
4321     *uchardata++ = XCL_RANGE;
4322     *uchardata++ = start;
4323     *uchardata++ = end;
4324     }
4325   else if (start == end)
4326     {
4327     *uchardata++ = XCL_SINGLE;
4328     *uchardata++ = start;
4329     }
4330 #endif
4331 
4332   *uchardptr = uchardata;   /* Updata extra data pointer */
4333   }
4334 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4335 
4336 return n8;    /* Number of 8-bit characters */
4337 }
4338 
4339 
4340 
4341 
4342 /*************************************************
4343 *        Add a list of characters to a class     *
4344 *************************************************/
4345 
4346 /* This function is used for adding a list of case-equivalent characters to a
4347 class, and also for adding a list of horizontal or vertical whitespace. If the
4348 list is in order (which it should be), ranges of characters are detected and
4349 handled appropriately. This function is mutually recursive with the function
4350 above.
4351 
4352 Arguments:
4353   classbits     the bit map for characters < 256
4354   uchardptr     points to the pointer for extra data
4355   options       the options word
4356   cd            contains pointers to tables etc.
4357   p             points to row of 32-bit values, terminated by NOTACHAR
4358   except        character to omit; this is used when adding lists of
4359                   case-equivalent characters to avoid including the one we
4360                   already know about
4361 
4362 Returns:        the number of < 256 characters added
4363                 the pointer to extra data is updated
4364 */
4365 
4366 static int
add_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p,unsigned int except)4367 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4368   compile_data *cd, const pcre_uint32 *p, unsigned int except)
4369 {
4370 int n8 = 0;
4371 while (p[0] < NOTACHAR)
4372   {
4373   int n = 0;
4374   if (p[0] != except)
4375     {
4376     while(p[n+1] == p[0] + n + 1) n++;
4377     n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4378     }
4379   p += n + 1;
4380   }
4381 return n8;
4382 }
4383 
4384 
4385 
4386 /*************************************************
4387 *    Add characters not in a list to a class     *
4388 *************************************************/
4389 
4390 /* This function is used for adding the complement of a list of horizontal or
4391 vertical whitespace to a class. The list must be in order.
4392 
4393 Arguments:
4394   classbits     the bit map for characters < 256
4395   uchardptr     points to the pointer for extra data
4396   options       the options word
4397   cd            contains pointers to tables etc.
4398   p             points to row of 32-bit values, terminated by NOTACHAR
4399 
4400 Returns:        the number of < 256 characters added
4401                 the pointer to extra data is updated
4402 */
4403 
4404 static int
add_not_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p)4405 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4406   int options, compile_data *cd, const pcre_uint32 *p)
4407 {
4408 BOOL utf = (options & PCRE_UTF8) != 0;
4409 int n8 = 0;
4410 if (p[0] > 0)
4411   n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4412 while (p[0] < NOTACHAR)
4413   {
4414   while (p[1] == p[0] + 1) p++;
4415   n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4416     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4417   p++;
4418   }
4419 return n8;
4420 }
4421 
4422 
4423 
4424 /*************************************************
4425 *           Compile one branch                   *
4426 *************************************************/
4427 
4428 /* Scan the pattern, compiling it into a vector. If the options are
4429 changed during the branch, the pointer is used to change the external options
4430 bits. This function is used during the pre-compile phase when we are trying
4431 to find out the amount of memory needed, as well as during the real compile
4432 phase. The value of lengthptr distinguishes the two phases.
4433 
4434 Arguments:
4435   optionsptr        pointer to the option bits
4436   codeptr           points to the pointer to the current code point
4437   ptrptr            points to the current pattern pointer
4438   errorcodeptr      points to error code variable
4439   firstcharptr      place to put the first required character
4440   firstcharflagsptr place to put the first character flags, or a negative number
4441   reqcharptr        place to put the last required character
4442   reqcharflagsptr   place to put the last required character flags, or a negative number
4443   bcptr             points to current branch chain
4444   cond_depth        conditional nesting depth
4445   cd                contains pointers to tables etc.
4446   lengthptr         NULL during the real compile phase
4447                     points to length accumulator during pre-compile phase
4448 
4449 Returns:            TRUE on success
4450                     FALSE, with *errorcodeptr set non-zero on error
4451 */
4452 
4453 static BOOL
compile_branch(int * optionsptr,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,pcre_uint32 * firstcharptr,pcre_int32 * firstcharflagsptr,pcre_uint32 * reqcharptr,pcre_int32 * reqcharflagsptr,branch_chain * bcptr,int cond_depth,compile_data * cd,int * lengthptr)4454 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4455   const pcre_uchar **ptrptr, int *errorcodeptr,
4456   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4457   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4458   branch_chain *bcptr, int cond_depth,
4459   compile_data *cd, int *lengthptr)
4460 {
4461 int repeat_type, op_type;
4462 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
4463 int bravalue = 0;
4464 int greedy_default, greedy_non_default;
4465 pcre_uint32 firstchar, reqchar;
4466 pcre_int32 firstcharflags, reqcharflags;
4467 pcre_uint32 zeroreqchar, zerofirstchar;
4468 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4469 pcre_int32 req_caseopt, reqvary, tempreqvary;
4470 int options = *optionsptr;               /* May change dynamically */
4471 int after_manual_callout = 0;
4472 int length_prevgroup = 0;
4473 register pcre_uint32 c;
4474 int escape;
4475 register pcre_uchar *code = *codeptr;
4476 pcre_uchar *last_code = code;
4477 pcre_uchar *orig_code = code;
4478 pcre_uchar *tempcode;
4479 BOOL inescq = FALSE;
4480 BOOL groupsetfirstchar = FALSE;
4481 const pcre_uchar *ptr = *ptrptr;
4482 const pcre_uchar *tempptr;
4483 const pcre_uchar *nestptr = NULL;
4484 pcre_uchar *previous = NULL;
4485 pcre_uchar *previous_callout = NULL;
4486 size_t item_hwm_offset = 0;
4487 pcre_uint8 classbits[32];
4488 
4489 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4490 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4491 dynamically as we process the pattern. */
4492 
4493 #ifdef SUPPORT_UTF
4494 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4495 BOOL utf = (options & PCRE_UTF8) != 0;
4496 #ifndef COMPILE_PCRE32
4497 pcre_uchar utf_chars[6];
4498 #endif
4499 #else
4500 BOOL utf = FALSE;
4501 #endif
4502 
4503 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4504 class_uchardata always so that it can be passed to add_to_class() always,
4505 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4506 alternative calls for the different cases. */
4507 
4508 pcre_uchar *class_uchardata;
4509 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4510 BOOL xclass;
4511 pcre_uchar *class_uchardata_base;
4512 #endif
4513 
4514 #ifdef PCRE_DEBUG
4515 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4516 #endif
4517 
4518 /* Set up the default and non-default settings for greediness */
4519 
4520 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4521 greedy_non_default = greedy_default ^ 1;
4522 
4523 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4524 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4525 matches a non-fixed char first char; reqchar just remains unset if we never
4526 find one.
4527 
4528 When we hit a repeat whose minimum is zero, we may have to adjust these values
4529 to take the zero repeat into account. This is implemented by setting them to
4530 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4531 item types that can be repeated set these backoff variables appropriately. */
4532 
4533 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4534 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4535 
4536 /* The variable req_caseopt contains either the REQ_CASELESS value
4537 or zero, according to the current setting of the caseless flag. The
4538 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4539 firstchar or reqchar variables to record the case status of the
4540 value. This is used only for ASCII characters. */
4541 
4542 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4543 
4544 /* Switch on next character until the end of the branch */
4545 
4546 for (;; ptr++)
4547   {
4548   BOOL negate_class;
4549   BOOL should_flip_negation;
4550   BOOL possessive_quantifier;
4551   BOOL is_quantifier;
4552   BOOL is_recurse;
4553   BOOL reset_bracount;
4554   int class_has_8bitchar;
4555   int class_one_char;
4556 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4557   BOOL xclass_has_prop;
4558 #endif
4559   int newoptions;
4560   int recno;
4561   int refsign;
4562   int skipbytes;
4563   pcre_uint32 subreqchar, subfirstchar;
4564   pcre_int32 subreqcharflags, subfirstcharflags;
4565   int terminator;
4566   unsigned int mclength;
4567   unsigned int tempbracount;
4568   pcre_uint32 ec;
4569   pcre_uchar mcbuffer[8];
4570 
4571   /* Come here to restart the loop without advancing the pointer. */
4572 
4573   REDO_LOOP:
4574 
4575   /* Get next character in the pattern */
4576 
4577   c = *ptr;
4578 
4579   /* If we are at the end of a nested substitution, revert to the outer level
4580   string. Nesting only happens one level deep. */
4581 
4582   if (c == CHAR_NULL && nestptr != NULL)
4583     {
4584     ptr = nestptr;
4585     nestptr = NULL;
4586     c = *ptr;
4587     }
4588 
4589   /* If we are in the pre-compile phase, accumulate the length used for the
4590   previous cycle of this loop. */
4591 
4592   if (lengthptr != NULL)
4593     {
4594 #ifdef PCRE_DEBUG
4595     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
4596 #endif
4597     if (code > cd->start_workspace + cd->workspace_size -
4598         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
4599       {
4600       *errorcodeptr = (code >= cd->start_workspace + cd->workspace_size)?
4601         ERR52 : ERR87;
4602       goto FAILED;
4603       }
4604 
4605     /* There is at least one situation where code goes backwards: this is the
4606     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4607     the class is simply eliminated. However, it is created first, so we have to
4608     allow memory for it. Therefore, don't ever reduce the length at this point.
4609     */
4610 
4611     if (code < last_code) code = last_code;
4612 
4613     /* Paranoid check for integer overflow */
4614 
4615     if (OFLOW_MAX - *lengthptr < code - last_code)
4616       {
4617       *errorcodeptr = ERR20;
4618       goto FAILED;
4619       }
4620 
4621     *lengthptr += (int)(code - last_code);
4622     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4623       (int)(code - last_code), c, c));
4624 
4625     /* If "previous" is set and it is not at the start of the work space, move
4626     it back to there, in order to avoid filling up the work space. Otherwise,
4627     if "previous" is NULL, reset the current code pointer to the start. */
4628 
4629     if (previous != NULL)
4630       {
4631       if (previous > orig_code)
4632         {
4633         memmove(orig_code, previous, IN_UCHARS(code - previous));
4634         code -= previous - orig_code;
4635         previous = orig_code;
4636         }
4637       }
4638     else code = orig_code;
4639 
4640     /* Remember where this code item starts so we can pick up the length
4641     next time round. */
4642 
4643     last_code = code;
4644     }
4645 
4646   /* In the real compile phase, just check the workspace used by the forward
4647   reference list. */
4648 
4649   else if (cd->hwm > cd->start_workspace + cd->workspace_size)
4650     {
4651     *errorcodeptr = ERR52;
4652     goto FAILED;
4653     }
4654 
4655   /* If in \Q...\E, check for the end; if not, we have a literal. Otherwise an
4656   isolated \E is ignored. */
4657 
4658   if (c != CHAR_NULL)
4659     {
4660     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4661       {
4662       inescq = FALSE;
4663       ptr++;
4664       continue;
4665       }
4666     else if (inescq)
4667       {
4668       if (previous_callout != NULL)
4669         {
4670         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
4671           complete_callout(previous_callout, ptr, cd);
4672         previous_callout = NULL;
4673         }
4674       if ((options & PCRE_AUTO_CALLOUT) != 0)
4675         {
4676         previous_callout = code;
4677         code = auto_callout(code, ptr, cd);
4678         }
4679       goto NORMAL_CHAR;
4680       }
4681 
4682     /* Check for the start of a \Q...\E sequence. We must do this here rather
4683     than later in case it is immediately followed by \E, which turns it into a
4684     "do nothing" sequence. */
4685 
4686     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4687       {
4688       inescq = TRUE;
4689       ptr++;
4690       continue;
4691       }
4692     }
4693 
4694   /* In extended mode, skip white space and comments. */
4695 
4696   if ((options & PCRE_EXTENDED) != 0)
4697     {
4698     const pcre_uchar *wscptr = ptr;
4699     while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4700     if (c == CHAR_NUMBER_SIGN)
4701       {
4702       ptr++;
4703       while (*ptr != CHAR_NULL)
4704         {
4705         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4706           {                          /* IS_NEWLINE sets cd->nllen. */
4707           ptr += cd->nllen;
4708           break;
4709           }
4710         ptr++;
4711 #ifdef SUPPORT_UTF
4712         if (utf) FORWARDCHAR(ptr);
4713 #endif
4714         }
4715       }
4716 
4717     /* If we skipped any characters, restart the loop. Otherwise, we didn't see
4718     a comment. */
4719 
4720     if (ptr > wscptr) goto REDO_LOOP;
4721     }
4722 
4723   /* Skip over (?# comments. We need to do this here because we want to know if
4724   the next thing is a quantifier, and these comments may come between an item
4725   and its quantifier. */
4726 
4727   if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
4728       ptr[2] == CHAR_NUMBER_SIGN)
4729     {
4730     ptr += 3;
4731     while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4732     if (*ptr == CHAR_NULL)
4733       {
4734       *errorcodeptr = ERR18;
4735       goto FAILED;
4736       }
4737     continue;
4738     }
4739 
4740   /* See if the next thing is a quantifier. */
4741 
4742   is_quantifier =
4743     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4744     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4745 
4746   /* Fill in length of a previous callout, except when the next thing is a
4747   quantifier or when processing a property substitution string in UCP mode. */
4748 
4749   if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4750        after_manual_callout-- <= 0)
4751     {
4752     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4753       complete_callout(previous_callout, ptr, cd);
4754     previous_callout = NULL;
4755     }
4756 
4757   /* Create auto callout, except for quantifiers, or while processing property
4758   strings that are substituted for \w etc in UCP mode. */
4759 
4760   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4761     {
4762     previous_callout = code;
4763     code = auto_callout(code, ptr, cd);
4764     }
4765 
4766   /* Process the next pattern item. */
4767 
4768   switch(c)
4769     {
4770     /* ===================================================================*/
4771     case CHAR_NULL:                /* The branch terminates at string end */
4772     case CHAR_VERTICAL_LINE:       /* or | or ) */
4773     case CHAR_RIGHT_PARENTHESIS:
4774     *firstcharptr = firstchar;
4775     *firstcharflagsptr = firstcharflags;
4776     *reqcharptr = reqchar;
4777     *reqcharflagsptr = reqcharflags;
4778     *codeptr = code;
4779     *ptrptr = ptr;
4780     if (lengthptr != NULL)
4781       {
4782       if (OFLOW_MAX - *lengthptr < code - last_code)
4783         {
4784         *errorcodeptr = ERR20;
4785         goto FAILED;
4786         }
4787       *lengthptr += (int)(code - last_code);   /* To include callout length */
4788       DPRINTF((">> end branch\n"));
4789       }
4790     return TRUE;
4791 
4792 
4793     /* ===================================================================*/
4794     /* Handle single-character metacharacters. In multiline mode, ^ disables
4795     the setting of any following char as a first character. */
4796 
4797     case CHAR_CIRCUMFLEX_ACCENT:
4798     previous = NULL;
4799     if ((options & PCRE_MULTILINE) != 0)
4800       {
4801       if (firstcharflags == REQ_UNSET)
4802         zerofirstcharflags = firstcharflags = REQ_NONE;
4803       *code++ = OP_CIRCM;
4804       }
4805     else *code++ = OP_CIRC;
4806     break;
4807 
4808     case CHAR_DOLLAR_SIGN:
4809     previous = NULL;
4810     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4811     break;
4812 
4813     /* There can never be a first char if '.' is first, whatever happens about
4814     repeats. The value of reqchar doesn't change either. */
4815 
4816     case CHAR_DOT:
4817     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4818     zerofirstchar = firstchar;
4819     zerofirstcharflags = firstcharflags;
4820     zeroreqchar = reqchar;
4821     zeroreqcharflags = reqcharflags;
4822     previous = code;
4823     item_hwm_offset = cd->hwm - cd->start_workspace;
4824     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4825     break;
4826 
4827 
4828     /* ===================================================================*/
4829     /* Character classes. If the included characters are all < 256, we build a
4830     32-byte bitmap of the permitted characters, except in the special case
4831     where there is only one such character. For negated classes, we build the
4832     map as usual, then invert it at the end. However, we use a different opcode
4833     so that data characters > 255 can be handled correctly.
4834 
4835     If the class contains characters outside the 0-255 range, a different
4836     opcode is compiled. It may optionally have a bit map for characters < 256,
4837     but those above are explicitly listed afterwards. A flag byte tells
4838     whether the bitmap is present, and whether this is a negated class or not.
4839 
4840     In JavaScript compatibility mode, an isolated ']' causes an error. In
4841     default (Perl) mode, it is treated as a data character. */
4842 
4843     case CHAR_RIGHT_SQUARE_BRACKET:
4844     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4845       {
4846       *errorcodeptr = ERR64;
4847       goto FAILED;
4848       }
4849     goto NORMAL_CHAR;
4850 
4851     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4852     used for "start of word" and "end of word". As these are otherwise illegal
4853     sequences, we don't break anything by recognizing them. They are replaced
4854     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4855     erroneous and are handled by the normal code below. */
4856 
4857     case CHAR_LEFT_SQUARE_BRACKET:
4858     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4859       {
4860       nestptr = ptr + 7;
4861       ptr = sub_start_of_word;
4862       goto REDO_LOOP;
4863       }
4864 
4865     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4866       {
4867       nestptr = ptr + 7;
4868       ptr = sub_end_of_word;
4869       goto REDO_LOOP;
4870       }
4871 
4872     /* Handle a real character class. */
4873 
4874     previous = code;
4875     item_hwm_offset = cd->hwm - cd->start_workspace;
4876 
4877     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4878     they are encountered at the top level, so we'll do that too. */
4879 
4880     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4881          ptr[1] == CHAR_EQUALS_SIGN) &&
4882         check_posix_syntax(ptr, &tempptr))
4883       {
4884       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4885       goto FAILED;
4886       }
4887 
4888     /* If the first character is '^', set the negation flag and skip it. Also,
4889     if the first few characters (either before or after ^) are \Q\E or \E we
4890     skip them too. This makes for compatibility with Perl. */
4891 
4892     negate_class = FALSE;
4893     for (;;)
4894       {
4895       c = *(++ptr);
4896       if (c == CHAR_BACKSLASH)
4897         {
4898         if (ptr[1] == CHAR_E)
4899           ptr++;
4900         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4901           ptr += 3;
4902         else
4903           break;
4904         }
4905       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4906         negate_class = TRUE;
4907       else break;
4908       }
4909 
4910     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4911     an initial ']' is taken as a data character -- the code below handles
4912     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4913     [^] must match any character, so generate OP_ALLANY. */
4914 
4915     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4916         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4917       {
4918       *code++ = negate_class? OP_ALLANY : OP_FAIL;
4919       if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4920       zerofirstchar = firstchar;
4921       zerofirstcharflags = firstcharflags;
4922       break;
4923       }
4924 
4925     /* If a class contains a negative special such as \S, we need to flip the
4926     negation flag at the end, so that support for characters > 255 works
4927     correctly (they are all included in the class). */
4928 
4929     should_flip_negation = FALSE;
4930 
4931     /* Extended class (xclass) will be used when characters > 255
4932     might match. */
4933 
4934 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4935     xclass = FALSE;
4936     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4937     class_uchardata_base = class_uchardata;   /* Save the start */
4938 #endif
4939 
4940     /* For optimization purposes, we track some properties of the class:
4941     class_has_8bitchar will be non-zero if the class contains at least one <
4942     256 character; class_one_char will be 1 if the class contains just one
4943     character; xclass_has_prop will be TRUE if unicode property checks
4944     are present in the class. */
4945 
4946     class_has_8bitchar = 0;
4947     class_one_char = 0;
4948 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4949     xclass_has_prop = FALSE;
4950 #endif
4951 
4952     /* Initialize the 32-char bit map to all zeros. We build the map in a
4953     temporary bit of memory, in case the class contains fewer than two
4954     8-bit characters because in that case the compiled code doesn't use the bit
4955     map. */
4956 
4957     memset(classbits, 0, 32 * sizeof(pcre_uint8));
4958 
4959     /* Process characters until ] is reached. By writing this as a "do" it
4960     means that an initial ] is taken as a data character. At the start of the
4961     loop, c contains the first byte of the character. */
4962 
4963     if (c != CHAR_NULL) do
4964       {
4965       const pcre_uchar *oldptr;
4966 
4967 #ifdef SUPPORT_UTF
4968       if (utf && HAS_EXTRALEN(c))
4969         {                           /* Braces are required because the */
4970         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
4971         }
4972 #endif
4973 
4974 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4975       /* In the pre-compile phase, accumulate the length of any extra
4976       data and reset the pointer. This is so that very large classes that
4977       contain a zillion > 255 characters no longer overwrite the work space
4978       (which is on the stack). We have to remember that there was XCLASS data,
4979       however. */
4980 
4981       if (class_uchardata > class_uchardata_base) xclass = TRUE;
4982 
4983       if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4984         {
4985         *lengthptr += (int)(class_uchardata - class_uchardata_base);
4986         class_uchardata = class_uchardata_base;
4987         }
4988 #endif
4989 
4990       /* Inside \Q...\E everything is literal except \E */
4991 
4992       if (inescq)
4993         {
4994         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
4995           {
4996           inescq = FALSE;                   /* Reset literal state */
4997           ptr++;                            /* Skip the 'E' */
4998           continue;                         /* Carry on with next */
4999           }
5000         goto CHECK_RANGE;                   /* Could be range if \E follows */
5001         }
5002 
5003       /* Handle POSIX class names. Perl allows a negation extension of the
5004       form [:^name:]. A square bracket that doesn't match the syntax is
5005       treated as a literal. We also recognize the POSIX constructions
5006       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
5007       5.6 and 5.8 do. */
5008 
5009       if (c == CHAR_LEFT_SQUARE_BRACKET &&
5010           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5011            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
5012         {
5013         BOOL local_negate = FALSE;
5014         int posix_class, taboffset, tabopt;
5015         register const pcre_uint8 *cbits = cd->cbits;
5016         pcre_uint8 pbits[32];
5017 
5018         if (ptr[1] != CHAR_COLON)
5019           {
5020           *errorcodeptr = ERR31;
5021           goto FAILED;
5022           }
5023 
5024         ptr += 2;
5025         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
5026           {
5027           local_negate = TRUE;
5028           should_flip_negation = TRUE;  /* Note negative special */
5029           ptr++;
5030           }
5031 
5032         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
5033         if (posix_class < 0)
5034           {
5035           *errorcodeptr = ERR30;
5036           goto FAILED;
5037           }
5038 
5039         /* If matching is caseless, upper and lower are converted to
5040         alpha. This relies on the fact that the class table starts with
5041         alpha, lower, upper as the first 3 entries. */
5042 
5043         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
5044           posix_class = 0;
5045 
5046         /* When PCRE_UCP is set, some of the POSIX classes are converted to
5047         different escape sequences that use Unicode properties \p or \P. Others
5048         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5049         directly. */
5050 
5051 #ifdef SUPPORT_UCP
5052         if ((options & PCRE_UCP) != 0)
5053           {
5054           unsigned int ptype = 0;
5055           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5056 
5057           /* The posix_substitutes table specifies which POSIX classes can be
5058           converted to \p or \P items. */
5059 
5060           if (posix_substitutes[pc] != NULL)
5061             {
5062             nestptr = tempptr + 1;
5063             ptr = posix_substitutes[pc] - 1;
5064             continue;
5065             }
5066 
5067           /* There are three other classes that generate special property calls
5068           that are recognized only in an XCLASS. */
5069 
5070           else switch(posix_class)
5071             {
5072             case PC_GRAPH:
5073             ptype = PT_PXGRAPH;
5074             /* Fall through */
5075             case PC_PRINT:
5076             if (ptype == 0) ptype = PT_PXPRINT;
5077             /* Fall through */
5078             case PC_PUNCT:
5079             if (ptype == 0) ptype = PT_PXPUNCT;
5080             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5081             *class_uchardata++ = ptype;
5082             *class_uchardata++ = 0;
5083             xclass_has_prop = TRUE;
5084             ptr = tempptr + 1;
5085             continue;
5086 
5087             /* For the other POSIX classes (ascii, cntrl, xdigit) we are going
5088             to fall through to the non-UCP case and build a bit map for
5089             characters with code points less than 256. If we are in a negated
5090             POSIX class, characters with code points greater than 255 must
5091             either all match or all not match. In the special case where we
5092             have not yet generated any xclass data, and this is the final item
5093             in the overall class, we need do nothing: later on, the opcode
5094             OP_NCLASS will be used to indicate that characters greater than 255
5095             are acceptable. If we have already seen an xclass item or one may
5096             follow (we have to assume that it might if this is not the end of
5097             the class), explicitly list all wide codepoints, which will then
5098             either not match or match, depending on whether the class is or is
5099             not negated. */
5100 
5101             /* fall through */
5102 
5103             default:
5104             if (local_negate &&
5105                 (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
5106               {
5107               *class_uchardata++ = XCL_RANGE;
5108               class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5109               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5110               }
5111             break;
5112             }
5113           }
5114 #endif
5115         /* In the non-UCP case, or when UCP makes no difference, we build the
5116         bit map for the POSIX class in a chunk of local store because we may be
5117         adding and subtracting from it, and we don't want to subtract bits that
5118         may be in the main map already. At the end we or the result into the
5119         bit map that is being built. */
5120 
5121         posix_class *= 3;
5122 
5123         /* Copy in the first table (always present) */
5124 
5125         memcpy(pbits, cbits + posix_class_maps[posix_class],
5126           32 * sizeof(pcre_uint8));
5127 
5128         /* If there is a second table, add or remove it as required. */
5129 
5130         taboffset = posix_class_maps[posix_class + 1];
5131         tabopt = posix_class_maps[posix_class + 2];
5132 
5133         if (taboffset >= 0)
5134           {
5135           if (tabopt >= 0)
5136             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5137           else
5138             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5139           }
5140 
5141         /* Now see if we need to remove any special characters. An option
5142         value of 1 removes vertical space and 2 removes underscore. */
5143 
5144         if (tabopt < 0) tabopt = -tabopt;
5145         if (tabopt == 1) pbits[1] &= ~0x3c;
5146           else if (tabopt == 2) pbits[11] &= 0x7f;
5147 
5148         /* Add the POSIX table or its complement into the main table that is
5149         being built and we are done. */
5150 
5151         if (local_negate)
5152           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5153         else
5154           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5155 
5156         ptr = tempptr + 1;
5157         /* Every class contains at least one < 256 character. */
5158         class_has_8bitchar = 1;
5159         /* Every class contains at least two characters. */
5160         class_one_char = 2;
5161         continue;    /* End of POSIX syntax handling */
5162         }
5163 
5164       /* Backslash may introduce a single character, or it may introduce one
5165       of the specials, which just set a flag. The sequence \b is a special
5166       case. Inside a class (and only there) it is treated as backspace. We
5167       assume that other escapes have more than one character in them, so
5168       speculatively set both class_has_8bitchar and class_one_char bigger
5169       than one. Unrecognized escapes fall through and are either treated
5170       as literal characters (by default), or are faulted if
5171       PCRE_EXTRA is set. */
5172 
5173       if (c == CHAR_BACKSLASH)
5174         {
5175         escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5176           TRUE);
5177         if (*errorcodeptr != 0) goto FAILED;
5178         if (escape == 0) c = ec;
5179         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5180         else if (escape == ESC_N)          /* \N is not supported in a class */
5181           {
5182           *errorcodeptr = ERR71;
5183           goto FAILED;
5184           }
5185         else if (escape == ESC_Q)            /* Handle start of quoted string */
5186           {
5187           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5188             {
5189             ptr += 2; /* avoid empty string */
5190             }
5191           else inescq = TRUE;
5192           continue;
5193           }
5194         else if (escape == ESC_E) continue;  /* Ignore orphan \E */
5195 
5196         else
5197           {
5198           register const pcre_uint8 *cbits = cd->cbits;
5199           /* Every class contains at least two < 256 characters. */
5200           class_has_8bitchar++;
5201           /* Every class contains at least two characters. */
5202           class_one_char += 2;
5203 
5204           switch (escape)
5205             {
5206 #ifdef SUPPORT_UCP
5207             case ESC_du:     /* These are the values given for \d etc */
5208             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
5209             case ESC_wu:     /* escape sequence with an appropriate \p */
5210             case ESC_WU:     /* or \P to test Unicode properties instead */
5211             case ESC_su:     /* of the default ASCII testing. */
5212             case ESC_SU:
5213             nestptr = ptr;
5214             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
5215             class_has_8bitchar--;                /* Undo! */
5216             continue;
5217 #endif
5218             case ESC_d:
5219             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5220             continue;
5221 
5222             case ESC_D:
5223             should_flip_negation = TRUE;
5224             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5225             continue;
5226 
5227             case ESC_w:
5228             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5229             continue;
5230 
5231             case ESC_W:
5232             should_flip_negation = TRUE;
5233             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5234             continue;
5235 
5236             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5237             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5238             previously set by something earlier in the character class.
5239             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5240             we could just adjust the appropriate bit. From PCRE 8.34 we no
5241             longer treat \s and \S specially. */
5242 
5243             case ESC_s:
5244             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5245             continue;
5246 
5247             case ESC_S:
5248             should_flip_negation = TRUE;
5249             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5250             continue;
5251 
5252             /* The rest apply in both UCP and non-UCP cases. */
5253 
5254             case ESC_h:
5255             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5256               PRIV(hspace_list), NOTACHAR);
5257             continue;
5258 
5259             case ESC_H:
5260             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5261               cd, PRIV(hspace_list));
5262             continue;
5263 
5264             case ESC_v:
5265             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5266               PRIV(vspace_list), NOTACHAR);
5267             continue;
5268 
5269             case ESC_V:
5270             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5271               cd, PRIV(vspace_list));
5272             continue;
5273 
5274             case ESC_p:
5275             case ESC_P:
5276 #ifdef SUPPORT_UCP
5277               {
5278               BOOL negated;
5279               unsigned int ptype = 0, pdata = 0;
5280               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5281                 goto FAILED;
5282               *class_uchardata++ = ((escape == ESC_p) != negated)?
5283                 XCL_PROP : XCL_NOTPROP;
5284               *class_uchardata++ = ptype;
5285               *class_uchardata++ = pdata;
5286               xclass_has_prop = TRUE;
5287               class_has_8bitchar--;                /* Undo! */
5288               continue;
5289               }
5290 #else
5291             *errorcodeptr = ERR45;
5292             goto FAILED;
5293 #endif
5294             /* Unrecognized escapes are faulted if PCRE is running in its
5295             strict mode. By default, for compatibility with Perl, they are
5296             treated as literals. */
5297 
5298             default:
5299             if ((options & PCRE_EXTRA) != 0)
5300               {
5301               *errorcodeptr = ERR7;
5302               goto FAILED;
5303               }
5304             class_has_8bitchar--;    /* Undo the speculative increase. */
5305             class_one_char -= 2;     /* Undo the speculative increase. */
5306             c = *ptr;                /* Get the final character and fall through */
5307             break;
5308             }
5309           }
5310 
5311         /* Fall through if the escape just defined a single character (c >= 0).
5312         This may be greater than 256. */
5313 
5314         escape = 0;
5315 
5316         }   /* End of backslash handling */
5317 
5318       /* A character may be followed by '-' to form a range. However, Perl does
5319       not permit ']' to be the end of the range. A '-' character at the end is
5320       treated as a literal. Perl ignores orphaned \E sequences entirely. The
5321       code for handling \Q and \E is messy. */
5322 
5323       CHECK_RANGE:
5324       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5325         {
5326         inescq = FALSE;
5327         ptr += 2;
5328         }
5329       oldptr = ptr;
5330 
5331       /* Remember if \r or \n were explicitly used */
5332 
5333       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5334 
5335       /* Check for range */
5336 
5337       if (!inescq && ptr[1] == CHAR_MINUS)
5338         {
5339         pcre_uint32 d;
5340         ptr += 2;
5341         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5342 
5343         /* If we hit \Q (not followed by \E) at this point, go into escaped
5344         mode. */
5345 
5346         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5347           {
5348           ptr += 2;
5349           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5350             { ptr += 2; continue; }
5351           inescq = TRUE;
5352           break;
5353           }
5354 
5355         /* Minus (hyphen) at the end of a class is treated as a literal, so put
5356         back the pointer and jump to handle the character that preceded it. */
5357 
5358         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5359           {
5360           ptr = oldptr;
5361           goto CLASS_SINGLE_CHARACTER;
5362           }
5363 
5364         /* Otherwise, we have a potential range; pick up the next character */
5365 
5366 #ifdef SUPPORT_UTF
5367         if (utf)
5368           {                           /* Braces are required because the */
5369           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
5370           }
5371         else
5372 #endif
5373         d = *ptr;  /* Not UTF-8 mode */
5374 
5375         /* The second part of a range can be a single-character escape
5376         sequence, but not any of the other escapes. Perl treats a hyphen as a
5377         literal in such circumstances. However, in Perl's warning mode, a
5378         warning is given, so PCRE now faults it as it is almost certainly a
5379         mistake on the user's part. */
5380 
5381         if (!inescq)
5382           {
5383           if (d == CHAR_BACKSLASH)
5384             {
5385             int descape;
5386             descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5387             if (*errorcodeptr != 0) goto FAILED;
5388 
5389             /* 0 means a character was put into d; \b is backspace; any other
5390             special causes an error. */
5391 
5392             if (descape != 0)
5393               {
5394               if (descape == ESC_b) d = CHAR_BS; else
5395                 {
5396                 *errorcodeptr = ERR83;
5397                 goto FAILED;
5398                 }
5399               }
5400             }
5401 
5402           /* A hyphen followed by a POSIX class is treated in the same way. */
5403 
5404           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5405                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5406                     ptr[1] == CHAR_EQUALS_SIGN) &&
5407                    check_posix_syntax(ptr, &tempptr))
5408             {
5409             *errorcodeptr = ERR83;
5410             goto FAILED;
5411             }
5412           }
5413 
5414         /* Check that the two values are in the correct order. Optimize
5415         one-character ranges. */
5416 
5417         if (d < c)
5418           {
5419           *errorcodeptr = ERR8;
5420           goto FAILED;
5421           }
5422         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
5423 
5424         /* We have found a character range, so single character optimizations
5425         cannot be done anymore. Any value greater than 1 indicates that there
5426         is more than one character. */
5427 
5428         class_one_char = 2;
5429 
5430         /* Remember an explicit \r or \n, and add the range to the class. */
5431 
5432         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5433 
5434         class_has_8bitchar +=
5435           add_to_class(classbits, &class_uchardata, options, cd, c, d);
5436 
5437         continue;   /* Go get the next char in the class */
5438         }
5439 
5440       /* Handle a single character - we can get here for a normal non-escape
5441       char, or after \ that introduces a single character or for an apparent
5442       range that isn't. Only the value 1 matters for class_one_char, so don't
5443       increase it if it is already 2 or more ... just in case there's a class
5444       with a zillion characters in it. */
5445 
5446       CLASS_SINGLE_CHARACTER:
5447       if (class_one_char < 2) class_one_char++;
5448 
5449       /* If xclass_has_prop is false and class_one_char is 1, we have the first
5450       single character in the class, and there have been no prior ranges, or
5451       XCLASS items generated by escapes. If this is the final character in the
5452       class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5453       if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5454       can cause firstchar to be set. Otherwise, there can be no first char if
5455       this item is first, whatever repeat count may follow. In the case of
5456       reqchar, save the previous value for reinstating. */
5457 
5458       if (!inescq &&
5459 #ifdef SUPPORT_UCP
5460           !xclass_has_prop &&
5461 #endif
5462           class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5463         {
5464         ptr++;
5465         zeroreqchar = reqchar;
5466         zeroreqcharflags = reqcharflags;
5467 
5468         if (negate_class)
5469           {
5470 #ifdef SUPPORT_UCP
5471           int d;
5472 #endif
5473           if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5474           zerofirstchar = firstchar;
5475           zerofirstcharflags = firstcharflags;
5476 
5477           /* For caseless UTF-8 mode when UCP support is available, check
5478           whether this character has more than one other case. If so, generate
5479           a special OP_NOTPROP item instead of OP_NOTI. */
5480 
5481 #ifdef SUPPORT_UCP
5482           if (utf && (options & PCRE_CASELESS) != 0 &&
5483               (d = UCD_CASESET(c)) != 0)
5484             {
5485             *code++ = OP_NOTPROP;
5486             *code++ = PT_CLIST;
5487             *code++ = d;
5488             }
5489           else
5490 #endif
5491           /* Char has only one other case, or UCP not available */
5492 
5493             {
5494             *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5495 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5496             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5497               code += PRIV(ord2utf)(c, code);
5498             else
5499 #endif
5500               *code++ = c;
5501             }
5502 
5503           /* We are finished with this character class */
5504 
5505           goto END_CLASS;
5506           }
5507 
5508         /* For a single, positive character, get the value into mcbuffer, and
5509         then we can handle this with the normal one-character code. */
5510 
5511 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5512         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5513           mclength = PRIV(ord2utf)(c, mcbuffer);
5514         else
5515 #endif
5516           {
5517           mcbuffer[0] = c;
5518           mclength = 1;
5519           }
5520         goto ONE_CHAR;
5521         }       /* End of 1-char optimization */
5522 
5523       /* There is more than one character in the class, or an XCLASS item
5524       has been generated. Add this character to the class. */
5525 
5526       class_has_8bitchar +=
5527         add_to_class(classbits, &class_uchardata, options, cd, c, c);
5528       }
5529 
5530     /* Loop until ']' reached. This "while" is the end of the "do" far above.
5531     If we are at the end of an internal nested string, revert to the outer
5532     string. */
5533 
5534     while (((c = *(++ptr)) != CHAR_NULL ||
5535            (nestptr != NULL &&
5536              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5537            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5538 
5539     /* Check for missing terminating ']' */
5540 
5541     if (c == CHAR_NULL)
5542       {
5543       *errorcodeptr = ERR6;
5544       goto FAILED;
5545       }
5546 
5547     /* We will need an XCLASS if data has been placed in class_uchardata. In
5548     the second phase this is a sufficient test. However, in the pre-compile
5549     phase, class_uchardata gets emptied to prevent workspace overflow, so
5550     only if the very last character in the class needs XCLASS will it contain
5551     anything at this point. For this reason, xclass gets set TRUE above when
5552     class_uchardata is emptied, and that's why this code is the way it is here
5553     instead of just doing a test on class_uchardata below. */
5554 
5555 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5556     if (class_uchardata > class_uchardata_base) xclass = TRUE;
5557 #endif
5558 
5559     /* If this is the first thing in the branch, there can be no first char
5560     setting, whatever the repeat count. Any reqchar setting must remain
5561     unchanged after any kind of repeat. */
5562 
5563     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5564     zerofirstchar = firstchar;
5565     zerofirstcharflags = firstcharflags;
5566     zeroreqchar = reqchar;
5567     zeroreqcharflags = reqcharflags;
5568 
5569     /* If there are characters with values > 255, we have to compile an
5570     extended class, with its own opcode, unless there was a negated special
5571     such as \S in the class, and PCRE_UCP is not set, because in that case all
5572     characters > 255 are in the class, so any that were explicitly given as
5573     well can be ignored. If (when there are explicit characters > 255 that must
5574     be listed) there are no characters < 256, we can omit the bitmap in the
5575     actual compiled code. */
5576 
5577 #ifdef SUPPORT_UTF
5578     if (xclass && (xclass_has_prop || !should_flip_negation ||
5579         (options & PCRE_UCP) != 0))
5580 #elif !defined COMPILE_PCRE8
5581     if (xclass && (xclass_has_prop || !should_flip_negation))
5582 #endif
5583 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5584       {
5585       /* For non-UCP wide characters, in a non-negative class containing \S or
5586       similar (should_flip_negation is set), all characters greater than 255
5587       must be in the class. */
5588 
5589       if (
5590 #if defined COMPILE_PCRE8
5591            utf &&
5592 #endif
5593            should_flip_negation && !negate_class && (options & PCRE_UCP) == 0)
5594         {
5595         *class_uchardata++ = XCL_RANGE;
5596         if (utf)   /* Will always be utf in the 8-bit library */
5597           {
5598           class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5599           class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5600           }
5601         else       /* Can only happen for the 16-bit & 32-bit libraries */
5602           {
5603 #if defined COMPILE_PCRE16
5604           *class_uchardata++ = 0x100;
5605           *class_uchardata++ = 0xffffu;
5606 #elif defined COMPILE_PCRE32
5607           *class_uchardata++ = 0x100;
5608           *class_uchardata++ = 0xffffffffu;
5609 #endif
5610           }
5611         }
5612 
5613       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
5614       *code++ = OP_XCLASS;
5615       code += LINK_SIZE;
5616       *code = negate_class? XCL_NOT:0;
5617       if (xclass_has_prop) *code |= XCL_HASPROP;
5618 
5619       /* If the map is required, move up the extra data to make room for it;
5620       otherwise just move the code pointer to the end of the extra data. */
5621 
5622       if (class_has_8bitchar > 0)
5623         {
5624         *code++ |= XCL_MAP;
5625         memmove(code + (32 / sizeof(pcre_uchar)), code,
5626           IN_UCHARS(class_uchardata - code));
5627         if (negate_class && !xclass_has_prop)
5628           for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5629         memcpy(code, classbits, 32);
5630         code = class_uchardata + (32 / sizeof(pcre_uchar));
5631         }
5632       else code = class_uchardata;
5633 
5634       /* Now fill in the complete length of the item */
5635 
5636       PUT(previous, 1, (int)(code - previous));
5637       break;   /* End of class handling */
5638       }
5639 
5640     /* Even though any XCLASS list is now discarded, we must allow for
5641     its memory. */
5642 
5643     if (lengthptr != NULL)
5644       *lengthptr += (int)(class_uchardata - class_uchardata_base);
5645 #endif
5646 
5647     /* If there are no characters > 255, or they are all to be included or
5648     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5649     whole class was negated and whether there were negative specials such as \S
5650     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5651     negating it if necessary. */
5652 
5653     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5654     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
5655       {
5656       if (negate_class)
5657         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5658       memcpy(code, classbits, 32);
5659       }
5660     code += 32 / sizeof(pcre_uchar);
5661 
5662     END_CLASS:
5663     break;
5664 
5665 
5666     /* ===================================================================*/
5667     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5668     has been tested above. */
5669 
5670     case CHAR_LEFT_CURLY_BRACKET:
5671     if (!is_quantifier) goto NORMAL_CHAR;
5672     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5673     if (*errorcodeptr != 0) goto FAILED;
5674     goto REPEAT;
5675 
5676     case CHAR_ASTERISK:
5677     repeat_min = 0;
5678     repeat_max = -1;
5679     goto REPEAT;
5680 
5681     case CHAR_PLUS:
5682     repeat_min = 1;
5683     repeat_max = -1;
5684     goto REPEAT;
5685 
5686     case CHAR_QUESTION_MARK:
5687     repeat_min = 0;
5688     repeat_max = 1;
5689 
5690     REPEAT:
5691     if (previous == NULL)
5692       {
5693       *errorcodeptr = ERR9;
5694       goto FAILED;
5695       }
5696 
5697     if (repeat_min == 0)
5698       {
5699       firstchar = zerofirstchar;    /* Adjust for zero repeat */
5700       firstcharflags = zerofirstcharflags;
5701       reqchar = zeroreqchar;        /* Ditto */
5702       reqcharflags = zeroreqcharflags;
5703       }
5704 
5705     /* Remember whether this is a variable length repeat */
5706 
5707     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5708 
5709     op_type = 0;                    /* Default single-char op codes */
5710     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
5711 
5712     /* Save start of previous item, in case we have to move it up in order to
5713     insert something before it. */
5714 
5715     tempcode = previous;
5716 
5717     /* Before checking for a possessive quantifier, we must skip over
5718     whitespace and comments in extended mode because Perl allows white space at
5719     this point. */
5720 
5721     if ((options & PCRE_EXTENDED) != 0)
5722       {
5723       const pcre_uchar *p = ptr + 1;
5724       for (;;)
5725         {
5726         while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5727         if (*p != CHAR_NUMBER_SIGN) break;
5728         p++;
5729         while (*p != CHAR_NULL)
5730           {
5731           if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5732             {                        /* IS_NEWLINE sets cd->nllen. */
5733             p += cd->nllen;
5734             break;
5735             }
5736           p++;
5737 #ifdef SUPPORT_UTF
5738           if (utf) FORWARDCHAR(p);
5739 #endif
5740           }           /* Loop for comment characters */
5741         }             /* Loop for multiple comments */
5742       ptr = p - 1;    /* Character before the next significant one. */
5743       }
5744 
5745     /* We also need to skip over (?# comments, which are not dependent on
5746     extended mode. */
5747 
5748     if (ptr[1] == CHAR_LEFT_PARENTHESIS && ptr[2] == CHAR_QUESTION_MARK &&
5749         ptr[3] == CHAR_NUMBER_SIGN)
5750       {
5751       ptr += 4;
5752       while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5753       if (*ptr == CHAR_NULL)
5754         {
5755         *errorcodeptr = ERR18;
5756         goto FAILED;
5757         }
5758       }
5759 
5760     /* If the next character is '+', we have a possessive quantifier. This
5761     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5762     If the next character is '?' this is a minimizing repeat, by default,
5763     but if PCRE_UNGREEDY is set, it works the other way round. We change the
5764     repeat type to the non-default. */
5765 
5766     if (ptr[1] == CHAR_PLUS)
5767       {
5768       repeat_type = 0;                  /* Force greedy */
5769       possessive_quantifier = TRUE;
5770       ptr++;
5771       }
5772     else if (ptr[1] == CHAR_QUESTION_MARK)
5773       {
5774       repeat_type = greedy_non_default;
5775       ptr++;
5776       }
5777     else repeat_type = greedy_default;
5778 
5779     /* If previous was a recursion call, wrap it in atomic brackets so that
5780     previous becomes the atomic group. All recursions were so wrapped in the
5781     past, but it no longer happens for non-repeated recursions. In fact, the
5782     repeated ones could be re-implemented independently so as not to need this,
5783     but for the moment we rely on the code for repeating groups. */
5784 
5785     if (*previous == OP_RECURSE)
5786       {
5787       memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5788       *previous = OP_ONCE;
5789       PUT(previous, 1, 2 + 2*LINK_SIZE);
5790       previous[2 + 2*LINK_SIZE] = OP_KET;
5791       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5792       code += 2 + 2 * LINK_SIZE;
5793       length_prevgroup = 3 + 3*LINK_SIZE;
5794 
5795       /* When actually compiling, we need to check whether this was a forward
5796       reference, and if so, adjust the offset. */
5797 
5798       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5799         {
5800         int offset = GET(cd->hwm, -LINK_SIZE);
5801         if (offset == previous + 1 - cd->start_code)
5802           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5803         }
5804       }
5805 
5806     /* Now handle repetition for the different types of item. */
5807 
5808     /* If previous was a character or negated character match, abolish the item
5809     and generate a repeat item instead. If a char item has a minimum of more
5810     than one, ensure that it is set in reqchar - it might not be if a sequence
5811     such as x{3} is the first thing in a branch because the x will have gone
5812     into firstchar instead.  */
5813 
5814     if (*previous == OP_CHAR || *previous == OP_CHARI
5815         || *previous == OP_NOT || *previous == OP_NOTI)
5816       {
5817       switch (*previous)
5818         {
5819         default: /* Make compiler happy. */
5820         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
5821         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5822         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
5823         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
5824         }
5825 
5826       /* Deal with UTF characters that take up more than one character. It's
5827       easier to write this out separately than try to macrify it. Use c to
5828       hold the length of the character in bytes, plus UTF_LENGTH to flag that
5829       it's a length rather than a small character. */
5830 
5831 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5832       if (utf && NOT_FIRSTCHAR(code[-1]))
5833         {
5834         pcre_uchar *lastchar = code - 1;
5835         BACKCHAR(lastchar);
5836         c = (int)(code - lastchar);     /* Length of UTF-8 character */
5837         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5838         c |= UTF_LENGTH;                /* Flag c as a length */
5839         }
5840       else
5841 #endif /* SUPPORT_UTF */
5842 
5843       /* Handle the case of a single character - either with no UTF support, or
5844       with UTF disabled, or for a single character UTF character. */
5845         {
5846         c = code[-1];
5847         if (*previous <= OP_CHARI && repeat_min > 1)
5848           {
5849           reqchar = c;
5850           reqcharflags = req_caseopt | cd->req_varyopt;
5851           }
5852         }
5853 
5854       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5855       }
5856 
5857     /* If previous was a character type match (\d or similar), abolish it and
5858     create a suitable repeat item. The code is shared with single-character
5859     repeats by setting op_type to add a suitable offset into repeat_type. Note
5860     that the Unicode property types will be present only when SUPPORT_UCP is
5861     defined, but we don't wrap the little bits of code here because it just
5862     makes it horribly messy. */
5863 
5864     else if (*previous < OP_EODN)
5865       {
5866       pcre_uchar *oldcode;
5867       int prop_type, prop_value;
5868       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
5869       c = *previous;
5870 
5871       OUTPUT_SINGLE_REPEAT:
5872       if (*previous == OP_PROP || *previous == OP_NOTPROP)
5873         {
5874         prop_type = previous[1];
5875         prop_value = previous[2];
5876         }
5877       else prop_type = prop_value = -1;
5878 
5879       oldcode = code;
5880       code = previous;                  /* Usually overwrite previous item */
5881 
5882       /* If the maximum is zero then the minimum must also be zero; Perl allows
5883       this case, so we do too - by simply omitting the item altogether. */
5884 
5885       if (repeat_max == 0) goto END_REPEAT;
5886 
5887       /* Combine the op_type with the repeat_type */
5888 
5889       repeat_type += op_type;
5890 
5891       /* A minimum of zero is handled either as the special case * or ?, or as
5892       an UPTO, with the maximum given. */
5893 
5894       if (repeat_min == 0)
5895         {
5896         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5897           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5898         else
5899           {
5900           *code++ = OP_UPTO + repeat_type;
5901           PUT2INC(code, 0, repeat_max);
5902           }
5903         }
5904 
5905       /* A repeat minimum of 1 is optimized into some special cases. If the
5906       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5907       left in place and, if the maximum is greater than 1, we use OP_UPTO with
5908       one less than the maximum. */
5909 
5910       else if (repeat_min == 1)
5911         {
5912         if (repeat_max == -1)
5913           *code++ = OP_PLUS + repeat_type;
5914         else
5915           {
5916           code = oldcode;                 /* leave previous item in place */
5917           if (repeat_max == 1) goto END_REPEAT;
5918           *code++ = OP_UPTO + repeat_type;
5919           PUT2INC(code, 0, repeat_max - 1);
5920           }
5921         }
5922 
5923       /* The case {n,n} is just an EXACT, while the general case {n,m} is
5924       handled as an EXACT followed by an UPTO. */
5925 
5926       else
5927         {
5928         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
5929         PUT2INC(code, 0, repeat_min);
5930 
5931         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5932         we have to insert the character for the previous code. For a repeated
5933         Unicode property match, there are two extra bytes that define the
5934         required property. In UTF-8 mode, long characters have their length in
5935         c, with the UTF_LENGTH bit as a flag. */
5936 
5937         if (repeat_max < 0)
5938           {
5939 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5940           if (utf && (c & UTF_LENGTH) != 0)
5941             {
5942             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5943             code += c & 7;
5944             }
5945           else
5946 #endif
5947             {
5948             *code++ = c;
5949             if (prop_type >= 0)
5950               {
5951               *code++ = prop_type;
5952               *code++ = prop_value;
5953               }
5954             }
5955           *code++ = OP_STAR + repeat_type;
5956           }
5957 
5958         /* Else insert an UPTO if the max is greater than the min, again
5959         preceded by the character, for the previously inserted code. If the
5960         UPTO is just for 1 instance, we can use QUERY instead. */
5961 
5962         else if (repeat_max != repeat_min)
5963           {
5964 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5965           if (utf && (c & UTF_LENGTH) != 0)
5966             {
5967             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5968             code += c & 7;
5969             }
5970           else
5971 #endif
5972           *code++ = c;
5973           if (prop_type >= 0)
5974             {
5975             *code++ = prop_type;
5976             *code++ = prop_value;
5977             }
5978           repeat_max -= repeat_min;
5979 
5980           if (repeat_max == 1)
5981             {
5982             *code++ = OP_QUERY + repeat_type;
5983             }
5984           else
5985             {
5986             *code++ = OP_UPTO + repeat_type;
5987             PUT2INC(code, 0, repeat_max);
5988             }
5989           }
5990         }
5991 
5992       /* The character or character type itself comes last in all cases. */
5993 
5994 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5995       if (utf && (c & UTF_LENGTH) != 0)
5996         {
5997         memcpy(code, utf_chars, IN_UCHARS(c & 7));
5998         code += c & 7;
5999         }
6000       else
6001 #endif
6002       *code++ = c;
6003 
6004       /* For a repeated Unicode property match, there are two extra bytes that
6005       define the required property. */
6006 
6007 #ifdef SUPPORT_UCP
6008       if (prop_type >= 0)
6009         {
6010         *code++ = prop_type;
6011         *code++ = prop_value;
6012         }
6013 #endif
6014       }
6015 
6016     /* If previous was a character class or a back reference, we put the repeat
6017     stuff after it, but just skip the item if the repeat was {0,0}. */
6018 
6019     else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
6020 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6021              *previous == OP_XCLASS ||
6022 #endif
6023              *previous == OP_REF   || *previous == OP_REFI ||
6024              *previous == OP_DNREF || *previous == OP_DNREFI)
6025       {
6026       if (repeat_max == 0)
6027         {
6028         code = previous;
6029         goto END_REPEAT;
6030         }
6031 
6032       if (repeat_min == 0 && repeat_max == -1)
6033         *code++ = OP_CRSTAR + repeat_type;
6034       else if (repeat_min == 1 && repeat_max == -1)
6035         *code++ = OP_CRPLUS + repeat_type;
6036       else if (repeat_min == 0 && repeat_max == 1)
6037         *code++ = OP_CRQUERY + repeat_type;
6038       else
6039         {
6040         *code++ = OP_CRRANGE + repeat_type;
6041         PUT2INC(code, 0, repeat_min);
6042         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
6043         PUT2INC(code, 0, repeat_max);
6044         }
6045       }
6046 
6047     /* If previous was a bracket group, we may have to replicate it in certain
6048     cases. Note that at this point we can encounter only the "basic" bracket
6049     opcodes such as BRA and CBRA, as this is the place where they get converted
6050     into the more special varieties such as BRAPOS and SBRA. A test for >=
6051     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
6052     ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
6053     Originally, PCRE did not allow repetition of assertions, but now it does,
6054     for Perl compatibility. */
6055 
6056     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
6057       {
6058       register int i;
6059       int len = (int)(code - previous);
6060       size_t base_hwm_offset = item_hwm_offset;
6061       pcre_uchar *bralink = NULL;
6062       pcre_uchar *brazeroptr = NULL;
6063 
6064       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
6065       we just ignore the repeat. */
6066 
6067       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
6068         goto END_REPEAT;
6069 
6070       /* There is no sense in actually repeating assertions. The only potential
6071       use of repetition is in cases when the assertion is optional. Therefore,
6072       if the minimum is greater than zero, just ignore the repeat. If the
6073       maximum is not zero or one, set it to 1. */
6074 
6075       if (*previous < OP_ONCE)    /* Assertion */
6076         {
6077         if (repeat_min > 0) goto END_REPEAT;
6078         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
6079         }
6080 
6081       /* The case of a zero minimum is special because of the need to stick
6082       OP_BRAZERO in front of it, and because the group appears once in the
6083       data, whereas in other cases it appears the minimum number of times. For
6084       this reason, it is simplest to treat this case separately, as otherwise
6085       the code gets far too messy. There are several special subcases when the
6086       minimum is zero. */
6087 
6088       if (repeat_min == 0)
6089         {
6090         /* If the maximum is also zero, we used to just omit the group from the
6091         output altogether, like this:
6092 
6093         ** if (repeat_max == 0)
6094         **   {
6095         **   code = previous;
6096         **   goto END_REPEAT;
6097         **   }
6098 
6099         However, that fails when a group or a subgroup within it is referenced
6100         as a subroutine from elsewhere in the pattern, so now we stick in
6101         OP_SKIPZERO in front of it so that it is skipped on execution. As we
6102         don't have a list of which groups are referenced, we cannot do this
6103         selectively.
6104 
6105         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
6106         and do no more at this point. However, we do need to adjust any
6107         OP_RECURSE calls inside the group that refer to the group itself or any
6108         internal or forward referenced group, because the offset is from the
6109         start of the whole regex. Temporarily terminate the pattern while doing
6110         this. */
6111 
6112         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
6113           {
6114           *code = OP_END;
6115           adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
6116           memmove(previous + 1, previous, IN_UCHARS(len));
6117           code++;
6118           if (repeat_max == 0)
6119             {
6120             *previous++ = OP_SKIPZERO;
6121             goto END_REPEAT;
6122             }
6123           brazeroptr = previous;    /* Save for possessive optimizing */
6124           *previous++ = OP_BRAZERO + repeat_type;
6125           }
6126 
6127         /* If the maximum is greater than 1 and limited, we have to replicate
6128         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6129         The first one has to be handled carefully because it's the original
6130         copy, which has to be moved up. The remainder can be handled by code
6131         that is common with the non-zero minimum case below. We have to
6132         adjust the value of repeat_max, since one less copy is required. Once
6133         again, we may have to adjust any OP_RECURSE calls inside the group. */
6134 
6135         else
6136           {
6137           int offset;
6138           *code = OP_END;
6139           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
6140           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6141           code += 2 + LINK_SIZE;
6142           *previous++ = OP_BRAZERO + repeat_type;
6143           *previous++ = OP_BRA;
6144 
6145           /* We chain together the bracket offset fields that have to be
6146           filled in later when the ends of the brackets are reached. */
6147 
6148           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
6149           bralink = previous;
6150           PUTINC(previous, 0, offset);
6151           }
6152 
6153         repeat_max--;
6154         }
6155 
6156       /* If the minimum is greater than zero, replicate the group as many
6157       times as necessary, and adjust the maximum to the number of subsequent
6158       copies that we need. If we set a first char from the group, and didn't
6159       set a required char, copy the latter from the former. If there are any
6160       forward reference subroutine calls in the group, there will be entries on
6161       the workspace list; replicate these with an appropriate increment. */
6162 
6163       else
6164         {
6165         if (repeat_min > 1)
6166           {
6167           /* In the pre-compile phase, we don't actually do the replication. We
6168           just adjust the length as if we had. Do some paranoid checks for
6169           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6170           integer type when available, otherwise double. */
6171 
6172           if (lengthptr != NULL)
6173             {
6174             int delta = (repeat_min - 1)*length_prevgroup;
6175             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6176                   (INT64_OR_DOUBLE)length_prevgroup >
6177                     (INT64_OR_DOUBLE)INT_MAX ||
6178                 OFLOW_MAX - *lengthptr < delta)
6179               {
6180               *errorcodeptr = ERR20;
6181               goto FAILED;
6182               }
6183             *lengthptr += delta;
6184             }
6185 
6186           /* This is compiling for real. If there is a set first byte for
6187           the group, and we have not yet set a "required byte", set it. Make
6188           sure there is enough workspace for copying forward references before
6189           doing the copy. */
6190 
6191           else
6192             {
6193             if (groupsetfirstchar && reqcharflags < 0)
6194               {
6195               reqchar = firstchar;
6196               reqcharflags = firstcharflags;
6197               }
6198 
6199             for (i = 1; i < repeat_min; i++)
6200               {
6201               pcre_uchar *hc;
6202               size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6203               memcpy(code, previous, IN_UCHARS(len));
6204 
6205               while (cd->hwm > cd->start_workspace + cd->workspace_size -
6206                      WORK_SIZE_SAFETY_MARGIN -
6207                      (this_hwm_offset - base_hwm_offset))
6208                 {
6209                 *errorcodeptr = expand_workspace(cd);
6210                 if (*errorcodeptr != 0) goto FAILED;
6211                 }
6212 
6213               for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6214                    hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6215                    hc += LINK_SIZE)
6216                 {
6217                 PUT(cd->hwm, 0, GET(hc, 0) + len);
6218                 cd->hwm += LINK_SIZE;
6219                 }
6220               base_hwm_offset = this_hwm_offset;
6221               code += len;
6222               }
6223             }
6224           }
6225 
6226         if (repeat_max > 0) repeat_max -= repeat_min;
6227         }
6228 
6229       /* This code is common to both the zero and non-zero minimum cases. If
6230       the maximum is limited, it replicates the group in a nested fashion,
6231       remembering the bracket starts on a stack. In the case of a zero minimum,
6232       the first one was set up above. In all cases the repeat_max now specifies
6233       the number of additional copies needed. Again, we must remember to
6234       replicate entries on the forward reference list. */
6235 
6236       if (repeat_max >= 0)
6237         {
6238         /* In the pre-compile phase, we don't actually do the replication. We
6239         just adjust the length as if we had. For each repetition we must add 1
6240         to the length for BRAZERO and for all but the last repetition we must
6241         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6242         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6243         a 64-bit integer type when available, otherwise double. */
6244 
6245         if (lengthptr != NULL && repeat_max > 0)
6246           {
6247           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6248                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
6249           if ((INT64_OR_DOUBLE)repeat_max *
6250                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6251                   > (INT64_OR_DOUBLE)INT_MAX ||
6252               OFLOW_MAX - *lengthptr < delta)
6253             {
6254             *errorcodeptr = ERR20;
6255             goto FAILED;
6256             }
6257           *lengthptr += delta;
6258           }
6259 
6260         /* This is compiling for real */
6261 
6262         else for (i = repeat_max - 1; i >= 0; i--)
6263           {
6264           pcre_uchar *hc;
6265           size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6266 
6267           *code++ = OP_BRAZERO + repeat_type;
6268 
6269           /* All but the final copy start a new nesting, maintaining the
6270           chain of brackets outstanding. */
6271 
6272           if (i != 0)
6273             {
6274             int offset;
6275             *code++ = OP_BRA;
6276             offset = (bralink == NULL)? 0 : (int)(code - bralink);
6277             bralink = code;
6278             PUTINC(code, 0, offset);
6279             }
6280 
6281           memcpy(code, previous, IN_UCHARS(len));
6282 
6283           /* Ensure there is enough workspace for forward references before
6284           copying them. */
6285 
6286           while (cd->hwm > cd->start_workspace + cd->workspace_size -
6287                  WORK_SIZE_SAFETY_MARGIN -
6288                  (this_hwm_offset - base_hwm_offset))
6289             {
6290             *errorcodeptr = expand_workspace(cd);
6291             if (*errorcodeptr != 0) goto FAILED;
6292             }
6293 
6294           for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6295                hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6296                hc += LINK_SIZE)
6297             {
6298             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6299             cd->hwm += LINK_SIZE;
6300             }
6301           base_hwm_offset = this_hwm_offset;
6302           code += len;
6303           }
6304 
6305         /* Now chain through the pending brackets, and fill in their length
6306         fields (which are holding the chain links pro tem). */
6307 
6308         while (bralink != NULL)
6309           {
6310           int oldlinkoffset;
6311           int offset = (int)(code - bralink + 1);
6312           pcre_uchar *bra = code - offset;
6313           oldlinkoffset = GET(bra, 1);
6314           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6315           *code++ = OP_KET;
6316           PUTINC(code, 0, offset);
6317           PUT(bra, 1, offset);
6318           }
6319         }
6320 
6321       /* If the maximum is unlimited, set a repeater in the final copy. For
6322       ONCE brackets, that's all we need to do. However, possessively repeated
6323       ONCE brackets can be converted into non-capturing brackets, as the
6324       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6325       deal with possessive ONCEs specially.
6326 
6327       Otherwise, when we are doing the actual compile phase, check to see
6328       whether this group is one that could match an empty string. If so,
6329       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6330       that runtime checking can be done. [This check is also applied to ONCE
6331       groups at runtime, but in a different way.]
6332 
6333       Then, if the quantifier was possessive and the bracket is not a
6334       conditional, we convert the BRA code to the POS form, and the KET code to
6335       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6336       subpattern at both the start and at the end.) The use of special opcodes
6337       makes it possible to reduce greatly the stack usage in pcre_exec(). If
6338       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6339 
6340       Then, if the minimum number of matches is 1 or 0, cancel the possessive
6341       flag so that the default action below, of wrapping everything inside
6342       atomic brackets, does not happen. When the minimum is greater than 1,
6343       there will be earlier copies of the group, and so we still have to wrap
6344       the whole thing. */
6345 
6346       else
6347         {
6348         pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6349         pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6350 
6351         /* Convert possessive ONCE brackets to non-capturing */
6352 
6353         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6354             possessive_quantifier) *bracode = OP_BRA;
6355 
6356         /* For non-possessive ONCE brackets, all we need to do is to
6357         set the KET. */
6358 
6359         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6360           *ketcode = OP_KETRMAX + repeat_type;
6361 
6362         /* Handle non-ONCE brackets and possessive ONCEs (which have been
6363         converted to non-capturing above). */
6364 
6365         else
6366           {
6367           /* In the compile phase, check for empty string matching. */
6368 
6369           if (lengthptr == NULL)
6370             {
6371             pcre_uchar *scode = bracode;
6372             do
6373               {
6374               if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6375                 {
6376                 *bracode += OP_SBRA - OP_BRA;
6377                 break;
6378                 }
6379               scode += GET(scode, 1);
6380               }
6381             while (*scode == OP_ALT);
6382             }
6383 
6384           /* A conditional group with only one branch has an implicit empty
6385           alternative branch. */
6386 
6387           if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6388             *bracode = OP_SCOND;
6389 
6390           /* Handle possessive quantifiers. */
6391 
6392           if (possessive_quantifier)
6393             {
6394             /* For COND brackets, we wrap the whole thing in a possessively
6395             repeated non-capturing bracket, because we have not invented POS
6396             versions of the COND opcodes. Because we are moving code along, we
6397             must ensure that any pending recursive references are updated. */
6398 
6399             if (*bracode == OP_COND || *bracode == OP_SCOND)
6400               {
6401               int nlen = (int)(code - bracode);
6402               *code = OP_END;
6403               adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6404               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6405               code += 1 + LINK_SIZE;
6406               nlen += 1 + LINK_SIZE;
6407               *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6408               *code++ = OP_KETRPOS;
6409               PUTINC(code, 0, nlen);
6410               PUT(bracode, 1, nlen);
6411               }
6412 
6413             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6414 
6415             else
6416               {
6417               *bracode += 1;              /* Switch to xxxPOS opcodes */
6418               *ketcode = OP_KETRPOS;
6419               }
6420 
6421             /* If the minimum is zero, mark it as possessive, then unset the
6422             possessive flag when the minimum is 0 or 1. */
6423 
6424             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6425             if (repeat_min < 2) possessive_quantifier = FALSE;
6426             }
6427 
6428           /* Non-possessive quantifier */
6429 
6430           else *ketcode = OP_KETRMAX + repeat_type;
6431           }
6432         }
6433       }
6434 
6435     /* If previous is OP_FAIL, it was generated by an empty class [] in
6436     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6437     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6438     error above. We can just ignore the repeat in JS case. */
6439 
6440     else if (*previous == OP_FAIL) goto END_REPEAT;
6441 
6442     /* Else there's some kind of shambles */
6443 
6444     else
6445       {
6446       *errorcodeptr = ERR11;
6447       goto FAILED;
6448       }
6449 
6450     /* If the character following a repeat is '+', possessive_quantifier is
6451     TRUE. For some opcodes, there are special alternative opcodes for this
6452     case. For anything else, we wrap the entire repeated item inside OP_ONCE
6453     brackets. Logically, the '+' notation is just syntactic sugar, taken from
6454     Sun's Java package, but the special opcodes can optimize it.
6455 
6456     Some (but not all) possessively repeated subpatterns have already been
6457     completely handled in the code just above. For them, possessive_quantifier
6458     is always FALSE at this stage. Note that the repeated item starts at
6459     tempcode, not at previous, which might be the first part of a string whose
6460     (former) last char we repeated. */
6461 
6462     if (possessive_quantifier)
6463       {
6464       int len;
6465 
6466       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6467       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6468       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6469       remains is greater than zero, there's a further opcode that can be
6470       handled. If not, do nothing, leaving the EXACT alone. */
6471 
6472       switch(*tempcode)
6473         {
6474         case OP_TYPEEXACT:
6475         tempcode += PRIV(OP_lengths)[*tempcode] +
6476           ((tempcode[1 + IMM2_SIZE] == OP_PROP
6477           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6478         break;
6479 
6480         /* CHAR opcodes are used for exacts whose count is 1. */
6481 
6482         case OP_CHAR:
6483         case OP_CHARI:
6484         case OP_NOT:
6485         case OP_NOTI:
6486         case OP_EXACT:
6487         case OP_EXACTI:
6488         case OP_NOTEXACT:
6489         case OP_NOTEXACTI:
6490         tempcode += PRIV(OP_lengths)[*tempcode];
6491 #ifdef SUPPORT_UTF
6492         if (utf && HAS_EXTRALEN(tempcode[-1]))
6493           tempcode += GET_EXTRALEN(tempcode[-1]);
6494 #endif
6495         break;
6496 
6497         /* For the class opcodes, the repeat operator appears at the end;
6498         adjust tempcode to point to it. */
6499 
6500         case OP_CLASS:
6501         case OP_NCLASS:
6502         tempcode += 1 + 32/sizeof(pcre_uchar);
6503         break;
6504 
6505 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6506         case OP_XCLASS:
6507         tempcode += GET(tempcode, 1);
6508         break;
6509 #endif
6510         }
6511 
6512       /* If tempcode is equal to code (which points to the end of the repeated
6513       item), it means we have skipped an EXACT item but there is no following
6514       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6515       all other cases, tempcode will be pointing to the repeat opcode, and will
6516       be less than code, so the value of len will be greater than 0. */
6517 
6518       len = (int)(code - tempcode);
6519       if (len > 0)
6520         {
6521         unsigned int repcode = *tempcode;
6522 
6523         /* There is a table for possessifying opcodes, all of which are less
6524         than OP_CALLOUT. A zero entry means there is no possessified version.
6525         */
6526 
6527         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6528           *tempcode = opcode_possessify[repcode];
6529 
6530         /* For opcode without a special possessified version, wrap the item in
6531         ONCE brackets. Because we are moving code along, we must ensure that any
6532         pending recursive references are updated. */
6533 
6534         else
6535           {
6536           *code = OP_END;
6537           adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6538           memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6539           code += 1 + LINK_SIZE;
6540           len += 1 + LINK_SIZE;
6541           tempcode[0] = OP_ONCE;
6542           *code++ = OP_KET;
6543           PUTINC(code, 0, len);
6544           PUT(tempcode, 1, len);
6545           }
6546         }
6547 
6548 #ifdef NEVER
6549       if (len > 0) switch (*tempcode)
6550         {
6551         case OP_STAR:  *tempcode = OP_POSSTAR; break;
6552         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
6553         case OP_QUERY: *tempcode = OP_POSQUERY; break;
6554         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
6555 
6556         case OP_STARI:  *tempcode = OP_POSSTARI; break;
6557         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
6558         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6559         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
6560 
6561         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
6562         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
6563         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6564         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
6565 
6566         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
6567         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
6568         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6569         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
6570 
6571         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
6572         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
6573         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6574         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6575 
6576         case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6577         case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6578         case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6579         case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6580 
6581         /* Because we are moving code along, we must ensure that any
6582         pending recursive references are updated. */
6583 
6584         default:
6585         *code = OP_END;
6586         adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6587         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6588         code += 1 + LINK_SIZE;
6589         len += 1 + LINK_SIZE;
6590         tempcode[0] = OP_ONCE;
6591         *code++ = OP_KET;
6592         PUTINC(code, 0, len);
6593         PUT(tempcode, 1, len);
6594         break;
6595         }
6596 #endif
6597       }
6598 
6599     /* In all cases we no longer have a previous item. We also set the
6600     "follows varying string" flag for subsequently encountered reqchars if
6601     it isn't already set and we have just passed a varying length item. */
6602 
6603     END_REPEAT:
6604     previous = NULL;
6605     cd->req_varyopt |= reqvary;
6606     break;
6607 
6608 
6609     /* ===================================================================*/
6610     /* Start of nested parenthesized sub-expression, or comment or lookahead or
6611     lookbehind or option setting or condition or all the other extended
6612     parenthesis forms.  */
6613 
6614     case CHAR_LEFT_PARENTHESIS:
6615     ptr++;
6616 
6617     /* Now deal with various "verbs" that can be introduced by '*'. */
6618 
6619     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6620          || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6621       {
6622       int i, namelen;
6623       int arglen = 0;
6624       const char *vn = verbnames;
6625       const pcre_uchar *name = ptr + 1;
6626       const pcre_uchar *arg = NULL;
6627       previous = NULL;
6628       ptr++;
6629       while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6630       namelen = (int)(ptr - name);
6631 
6632       /* It appears that Perl allows any characters whatsoever, other than
6633       a closing parenthesis, to appear in arguments, so we no longer insist on
6634       letters, digits, and underscores. */
6635 
6636       if (*ptr == CHAR_COLON)
6637         {
6638         arg = ++ptr;
6639         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6640         arglen = (int)(ptr - arg);
6641         if ((unsigned int)arglen > MAX_MARK)
6642           {
6643           *errorcodeptr = ERR75;
6644           goto FAILED;
6645           }
6646         }
6647 
6648       if (*ptr != CHAR_RIGHT_PARENTHESIS)
6649         {
6650         *errorcodeptr = ERR60;
6651         goto FAILED;
6652         }
6653 
6654       /* Scan the table of verb names */
6655 
6656       for (i = 0; i < verbcount; i++)
6657         {
6658         if (namelen == verbs[i].len &&
6659             STRNCMP_UC_C8(name, vn, namelen) == 0)
6660           {
6661           int setverb;
6662 
6663           /* Check for open captures before ACCEPT and convert it to
6664           ASSERT_ACCEPT if in an assertion. */
6665 
6666           if (verbs[i].op == OP_ACCEPT)
6667             {
6668             open_capitem *oc;
6669             if (arglen != 0)
6670               {
6671               *errorcodeptr = ERR59;
6672               goto FAILED;
6673               }
6674             cd->had_accept = TRUE;
6675             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6676               {
6677               if (lengthptr != NULL)
6678                 {
6679 #ifdef COMPILE_PCRE8
6680                 *lengthptr += 1 + IMM2_SIZE;
6681 #elif defined COMPILE_PCRE16
6682                 *lengthptr += 2 + IMM2_SIZE;
6683 #elif defined COMPILE_PCRE32
6684                 *lengthptr += 4 + IMM2_SIZE;
6685 #endif
6686                 }
6687               else
6688                 {
6689                 *code++ = OP_CLOSE;
6690                 PUT2INC(code, 0, oc->number);
6691                 }
6692               }
6693             setverb = *code++ =
6694               (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6695 
6696             /* Do not set firstchar after *ACCEPT */
6697             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6698             }
6699 
6700           /* Handle other cases with/without an argument */
6701 
6702           else if (arglen == 0)
6703             {
6704             if (verbs[i].op < 0)   /* Argument is mandatory */
6705               {
6706               *errorcodeptr = ERR66;
6707               goto FAILED;
6708               }
6709             setverb = *code++ = verbs[i].op;
6710             }
6711 
6712           else
6713             {
6714             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
6715               {
6716               *errorcodeptr = ERR59;
6717               goto FAILED;
6718               }
6719             setverb = *code++ = verbs[i].op_arg;
6720             if (lengthptr != NULL)    /* In pass 1 just add in the length */
6721               {                       /* to avoid potential workspace */
6722               *lengthptr += arglen;   /* overflow. */
6723               *code++ = 0;
6724               }
6725             else
6726               {
6727               *code++ = arglen;
6728               memcpy(code, arg, IN_UCHARS(arglen));
6729               code += arglen;
6730               }
6731             *code++ = 0;
6732             }
6733 
6734           switch (setverb)
6735             {
6736             case OP_THEN:
6737             case OP_THEN_ARG:
6738             cd->external_flags |= PCRE_HASTHEN;
6739             break;
6740 
6741             case OP_PRUNE:
6742             case OP_PRUNE_ARG:
6743             case OP_SKIP:
6744             case OP_SKIP_ARG:
6745             cd->had_pruneorskip = TRUE;
6746             break;
6747             }
6748 
6749           break;  /* Found verb, exit loop */
6750           }
6751 
6752         vn += verbs[i].len + 1;
6753         }
6754 
6755       if (i < verbcount) continue;    /* Successfully handled a verb */
6756       *errorcodeptr = ERR60;          /* Verb not recognized */
6757       goto FAILED;
6758       }
6759 
6760     /* Initialize for "real" parentheses */
6761 
6762     newoptions = options;
6763     skipbytes = 0;
6764     bravalue = OP_CBRA;
6765     item_hwm_offset = cd->hwm - cd->start_workspace;
6766     reset_bracount = FALSE;
6767 
6768     /* Deal with the extended parentheses; all are introduced by '?', and the
6769     appearance of any of them means that this is not a capturing group. */
6770 
6771     if (*ptr == CHAR_QUESTION_MARK)
6772       {
6773       int i, set, unset, namelen;
6774       int *optset;
6775       const pcre_uchar *name;
6776       pcre_uchar *slot;
6777 
6778       switch (*(++ptr))
6779         {
6780         /* ------------------------------------------------------------ */
6781         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
6782         reset_bracount = TRUE;
6783         cd->dupgroups = TRUE;     /* Record (?| encountered */
6784         /* Fall through */
6785 
6786         /* ------------------------------------------------------------ */
6787         case CHAR_COLON:          /* Non-capturing bracket */
6788         bravalue = OP_BRA;
6789         ptr++;
6790         break;
6791 
6792 
6793         /* ------------------------------------------------------------ */
6794         case CHAR_LEFT_PARENTHESIS:
6795         bravalue = OP_COND;       /* Conditional group */
6796         tempptr = ptr;
6797 
6798         /* A condition can be an assertion, a number (referring to a numbered
6799         group's having been set), a name (referring to a named group), or 'R',
6800         referring to recursion. R<digits> and R&name are also permitted for
6801         recursion tests.
6802 
6803         There are ways of testing a named group: (?(name)) is used by Python;
6804         Perl 5.10 onwards uses (?(<name>) or (?('name')).
6805 
6806         There is one unfortunate ambiguity, caused by history. 'R' can be the
6807         recursive thing or the name 'R' (and similarly for 'R' followed by
6808         digits). We look for a name first; if not found, we try the other case.
6809 
6810         For compatibility with auto-callouts, we allow a callout to be
6811         specified before a condition that is an assertion. First, check for the
6812         syntax of a callout; if found, adjust the temporary pointer that is
6813         used to check for an assertion condition. That's all that is needed! */
6814 
6815         if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6816           {
6817           for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6818           if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6819             tempptr += i + 1;
6820 
6821           /* tempptr should now be pointing to the opening parenthesis of the
6822           assertion condition. */
6823 
6824           if (*tempptr != CHAR_LEFT_PARENTHESIS)
6825             {
6826             *errorcodeptr = ERR28;
6827             goto FAILED;
6828             }
6829           }
6830 
6831         /* For conditions that are assertions, check the syntax, and then exit
6832         the switch. This will take control down to where bracketed groups,
6833         including assertions, are processed. */
6834 
6835         if (tempptr[1] == CHAR_QUESTION_MARK &&
6836               (tempptr[2] == CHAR_EQUALS_SIGN ||
6837                tempptr[2] == CHAR_EXCLAMATION_MARK ||
6838                  (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6839                    (tempptr[3] == CHAR_EQUALS_SIGN ||
6840                     tempptr[3] == CHAR_EXCLAMATION_MARK))))
6841           {
6842           cd->iscondassert = TRUE;
6843           break;
6844           }
6845 
6846         /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6847         need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6848 
6849         code[1+LINK_SIZE] = OP_CREF;
6850         skipbytes = 1+IMM2_SIZE;
6851         refsign = -1;     /* => not a number */
6852         namelen = -1;     /* => not a name; must set to avoid warning */
6853         name = NULL;      /* Always set to avoid warning */
6854         recno = 0;        /* Always set to avoid warning */
6855 
6856         /* Check for a test for recursion in a named group. */
6857 
6858         ptr++;
6859         if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6860           {
6861           terminator = -1;
6862           ptr += 2;
6863           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
6864           }
6865 
6866         /* Check for a test for a named group's having been set, using the Perl
6867         syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6868         syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6869 
6870         else if (*ptr == CHAR_LESS_THAN_SIGN)
6871           {
6872           terminator = CHAR_GREATER_THAN_SIGN;
6873           ptr++;
6874           }
6875         else if (*ptr == CHAR_APOSTROPHE)
6876           {
6877           terminator = CHAR_APOSTROPHE;
6878           ptr++;
6879           }
6880         else
6881           {
6882           terminator = CHAR_NULL;
6883           if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6884             else if (IS_DIGIT(*ptr)) refsign = 0;
6885           }
6886 
6887         /* Handle a number */
6888 
6889         if (refsign >= 0)
6890           {
6891           while (IS_DIGIT(*ptr))
6892             {
6893             if (recno > INT_MAX / 10 - 1)  /* Integer overflow */
6894               {
6895               while (IS_DIGIT(*ptr)) ptr++;
6896               *errorcodeptr = ERR61;
6897               goto FAILED;
6898               }
6899             recno = recno * 10 + (int)(*ptr - CHAR_0);
6900             ptr++;
6901             }
6902           }
6903 
6904         /* Otherwise we expect to read a name; anything else is an error. When
6905         a name is one of a number of duplicates, a different opcode is used and
6906         it needs more memory. Unfortunately we cannot tell whether a name is a
6907         duplicate in the first pass, so we have to allow for more memory. */
6908 
6909         else
6910           {
6911           if (IS_DIGIT(*ptr))
6912             {
6913             *errorcodeptr = ERR84;
6914             goto FAILED;
6915             }
6916           if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6917             {
6918             *errorcodeptr = ERR28;   /* Assertion expected */
6919             goto FAILED;
6920             }
6921           name = ptr++;
6922           while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6923             {
6924             ptr++;
6925             }
6926           namelen = (int)(ptr - name);
6927           if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6928           }
6929 
6930         /* Check the terminator */
6931 
6932         if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6933             *ptr++ != CHAR_RIGHT_PARENTHESIS)
6934           {
6935           ptr--;                  /* Error offset */
6936           *errorcodeptr = ERR26;  /* Malformed number or name */
6937           goto FAILED;
6938           }
6939 
6940         /* Do no further checking in the pre-compile phase. */
6941 
6942         if (lengthptr != NULL) break;
6943 
6944         /* In the real compile we do the work of looking for the actual
6945         reference. If refsign is not negative, it means we have a number in
6946         recno. */
6947 
6948         if (refsign >= 0)
6949           {
6950           if (recno <= 0)
6951             {
6952             *errorcodeptr = ERR35;
6953             goto FAILED;
6954             }
6955           if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6956             cd->bracount - recno + 1 : recno + cd->bracount;
6957           if (recno <= 0 || recno > cd->final_bracount)
6958             {
6959             *errorcodeptr = ERR15;
6960             goto FAILED;
6961             }
6962           PUT2(code, 2+LINK_SIZE, recno);
6963           if (recno > cd->top_backref) cd->top_backref = recno;
6964           break;
6965           }
6966 
6967         /* Otherwise look for the name. */
6968 
6969         slot = cd->name_table;
6970         for (i = 0; i < cd->names_found; i++)
6971           {
6972           if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6973             slot[IMM2_SIZE+namelen] == 0) break;
6974           slot += cd->name_entry_size;
6975           }
6976 
6977         /* Found the named subpattern. If the name is duplicated, add one to
6978         the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6979         appropriate data values. Otherwise, just insert the unique subpattern
6980         number. */
6981 
6982         if (i < cd->names_found)
6983           {
6984           int offset = i++;
6985           int count = 1;
6986           recno = GET2(slot, 0);   /* Number from first found */
6987           if (recno > cd->top_backref) cd->top_backref = recno;
6988           for (; i < cd->names_found; i++)
6989             {
6990             slot += cd->name_entry_size;
6991             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6992               (slot+IMM2_SIZE)[namelen] != 0) break;
6993             count++;
6994             }
6995 
6996           if (count > 1)
6997             {
6998             PUT2(code, 2+LINK_SIZE, offset);
6999             PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
7000             skipbytes += IMM2_SIZE;
7001             code[1+LINK_SIZE]++;
7002             }
7003           else  /* Not a duplicated name */
7004             {
7005             PUT2(code, 2+LINK_SIZE, recno);
7006             }
7007           }
7008 
7009         /* If terminator == CHAR_NULL it means that the name followed directly
7010         after the opening parenthesis [e.g. (?(abc)...] and in this case there
7011         are some further alternatives to try. For the cases where terminator !=
7012         CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
7013         we have now checked all the possibilities, so give an error. */
7014 
7015         else if (terminator != CHAR_NULL)
7016           {
7017           *errorcodeptr = ERR15;
7018           goto FAILED;
7019           }
7020 
7021         /* Check for (?(R) for recursion. Allow digits after R to specify a
7022         specific group number. */
7023 
7024         else if (*name == CHAR_R)
7025           {
7026           recno = 0;
7027           for (i = 1; i < namelen; i++)
7028             {
7029             if (!IS_DIGIT(name[i]))
7030               {
7031               *errorcodeptr = ERR15;
7032               goto FAILED;
7033               }
7034             if (recno > INT_MAX / 10 - 1)   /* Integer overflow */
7035               {
7036               *errorcodeptr = ERR61;
7037               goto FAILED;
7038               }
7039             recno = recno * 10 + name[i] - CHAR_0;
7040             }
7041           if (recno == 0) recno = RREF_ANY;
7042           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
7043           PUT2(code, 2+LINK_SIZE, recno);
7044           }
7045 
7046         /* Similarly, check for the (?(DEFINE) "condition", which is always
7047         false. */
7048 
7049         else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
7050           {
7051           code[1+LINK_SIZE] = OP_DEF;
7052           skipbytes = 1;
7053           }
7054 
7055         /* Reference to an unidentified subpattern. */
7056 
7057         else
7058           {
7059           *errorcodeptr = ERR15;
7060           goto FAILED;
7061           }
7062         break;
7063 
7064 
7065         /* ------------------------------------------------------------ */
7066         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
7067         bravalue = OP_ASSERT;
7068         cd->assert_depth += 1;
7069         ptr++;
7070         break;
7071 
7072         /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
7073         thing to do, but Perl allows all assertions to be quantified, and when
7074         they contain capturing parentheses there may be a potential use for
7075         this feature. Not that that applies to a quantified (?!) but we allow
7076         it for uniformity. */
7077 
7078         /* ------------------------------------------------------------ */
7079         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
7080         ptr++;
7081         if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
7082              ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
7083             (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
7084           {
7085           *code++ = OP_FAIL;
7086           previous = NULL;
7087           continue;
7088           }
7089         bravalue = OP_ASSERT_NOT;
7090         cd->assert_depth += 1;
7091         break;
7092 
7093 
7094         /* ------------------------------------------------------------ */
7095         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
7096         switch (ptr[1])
7097           {
7098           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
7099           bravalue = OP_ASSERTBACK;
7100           cd->assert_depth += 1;
7101           ptr += 2;
7102           break;
7103 
7104           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
7105           bravalue = OP_ASSERTBACK_NOT;
7106           cd->assert_depth += 1;
7107           ptr += 2;
7108           break;
7109 
7110           default:                /* Could be name define, else bad */
7111           if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
7112             goto DEFINE_NAME;
7113           ptr++;                  /* Correct offset for error */
7114           *errorcodeptr = ERR24;
7115           goto FAILED;
7116           }
7117         break;
7118 
7119 
7120         /* ------------------------------------------------------------ */
7121         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
7122         bravalue = OP_ONCE;
7123         ptr++;
7124         break;
7125 
7126 
7127         /* ------------------------------------------------------------ */
7128         case CHAR_C:                 /* Callout - may be followed by digits; */
7129         previous_callout = code;     /* Save for later completion */
7130         after_manual_callout = 1;    /* Skip one item before completing */
7131         *code++ = OP_CALLOUT;
7132           {
7133           int n = 0;
7134           ptr++;
7135           while(IS_DIGIT(*ptr))
7136             {
7137             n = n * 10 + *ptr++ - CHAR_0;
7138             if (n > 255)
7139               {
7140               *errorcodeptr = ERR38;
7141               goto FAILED;
7142               }
7143             }
7144           if (*ptr != CHAR_RIGHT_PARENTHESIS)
7145             {
7146             *errorcodeptr = ERR39;
7147             goto FAILED;
7148             }
7149           *code++ = n;
7150           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
7151           PUT(code, LINK_SIZE, 0);                          /* Default length */
7152           code += 2 * LINK_SIZE;
7153           }
7154         previous = NULL;
7155         continue;
7156 
7157 
7158         /* ------------------------------------------------------------ */
7159         case CHAR_P:              /* Python-style named subpattern handling */
7160         if (*(++ptr) == CHAR_EQUALS_SIGN ||
7161             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
7162           {
7163           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7164           terminator = CHAR_RIGHT_PARENTHESIS;
7165           goto NAMED_REF_OR_RECURSE;
7166           }
7167         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
7168           {
7169           *errorcodeptr = ERR41;
7170           goto FAILED;
7171           }
7172         /* Fall through to handle (?P< as (?< is handled */
7173         /* fall through */
7174 
7175         /* ------------------------------------------------------------ */
7176         DEFINE_NAME:    /* Come here from (?< handling */
7177         case CHAR_APOSTROPHE:
7178         terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7179           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7180         name = ++ptr;
7181         if (IS_DIGIT(*ptr))
7182           {
7183           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7184           goto FAILED;
7185           }
7186         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7187         namelen = (int)(ptr - name);
7188 
7189         /* In the pre-compile phase, do a syntax check, remember the longest
7190         name, and then remember the group in a vector, expanding it if
7191         necessary. Duplicates for the same number are skipped; other duplicates
7192         are checked for validity. In the actual compile, there is nothing to
7193         do. */
7194 
7195         if (lengthptr != NULL)
7196           {
7197           named_group *ng;
7198           pcre_uint32 number = cd->bracount + 1;
7199 
7200           if (*ptr != (pcre_uchar)terminator)
7201             {
7202             *errorcodeptr = ERR42;
7203             goto FAILED;
7204             }
7205 
7206           if (cd->names_found >= MAX_NAME_COUNT)
7207             {
7208             *errorcodeptr = ERR49;
7209             goto FAILED;
7210             }
7211 
7212           if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
7213             {
7214             cd->name_entry_size = namelen + IMM2_SIZE + 1;
7215             if (namelen > MAX_NAME_SIZE)
7216               {
7217               *errorcodeptr = ERR48;
7218               goto FAILED;
7219               }
7220             }
7221 
7222           /* Scan the list to check for duplicates. For duplicate names, if the
7223           number is the same, break the loop, which causes the name to be
7224           discarded; otherwise, if DUPNAMES is not set, give an error.
7225           If it is set, allow the name with a different number, but continue
7226           scanning in case this is a duplicate with the same number. For
7227           non-duplicate names, give an error if the number is duplicated. */
7228 
7229           ng = cd->named_groups;
7230           for (i = 0; i < cd->names_found; i++, ng++)
7231             {
7232             if (namelen == ng->length &&
7233                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7234               {
7235               if (ng->number == number) break;
7236               if ((options & PCRE_DUPNAMES) == 0)
7237                 {
7238                 *errorcodeptr = ERR43;
7239                 goto FAILED;
7240                 }
7241               cd->dupnames = TRUE;  /* Duplicate names exist */
7242               }
7243             else if (ng->number == number)
7244               {
7245               *errorcodeptr = ERR65;
7246               goto FAILED;
7247               }
7248             }
7249 
7250           if (i >= cd->names_found)     /* Not a duplicate with same number */
7251             {
7252             /* Increase the list size if necessary */
7253 
7254             if (cd->names_found >= cd->named_group_list_size)
7255               {
7256               int newsize = cd->named_group_list_size * 2;
7257               named_group *newspace = (PUBL(malloc))
7258                 (newsize * sizeof(named_group));
7259 
7260               if (newspace == NULL)
7261                 {
7262                 *errorcodeptr = ERR21;
7263                 goto FAILED;
7264                 }
7265 
7266               memcpy(newspace, cd->named_groups,
7267                 cd->named_group_list_size * sizeof(named_group));
7268               if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7269                 (PUBL(free))((void *)cd->named_groups);
7270               cd->named_groups = newspace;
7271               cd->named_group_list_size = newsize;
7272               }
7273 
7274             cd->named_groups[cd->names_found].name = name;
7275             cd->named_groups[cd->names_found].length = namelen;
7276             cd->named_groups[cd->names_found].number = number;
7277             cd->names_found++;
7278             }
7279           }
7280 
7281         ptr++;                    /* Move past > or ' in both passes. */
7282         goto NUMBERED_GROUP;
7283 
7284 
7285         /* ------------------------------------------------------------ */
7286         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
7287         terminator = CHAR_RIGHT_PARENTHESIS;
7288         is_recurse = TRUE;
7289         /* Fall through */
7290 
7291         /* We come here from the Python syntax above that handles both
7292         references (?P=name) and recursion (?P>name), as well as falling
7293         through from the Perl recursion syntax (?&name). We also come here from
7294         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7295         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7296 
7297         NAMED_REF_OR_RECURSE:
7298         name = ++ptr;
7299         if (IS_DIGIT(*ptr))
7300           {
7301           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7302           goto FAILED;
7303           }
7304         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7305         namelen = (int)(ptr - name);
7306 
7307         /* In the pre-compile phase, do a syntax check. We used to just set
7308         a dummy reference number, because it was not used in the first pass.
7309         However, with the change of recursive back references to be atomic,
7310         we have to look for the number so that this state can be identified, as
7311         otherwise the incorrect length is computed. If it's not a backwards
7312         reference, the dummy number will do. */
7313 
7314         if (lengthptr != NULL)
7315           {
7316           named_group *ng;
7317           recno = 0;
7318 
7319           if (namelen == 0)
7320             {
7321             *errorcodeptr = ERR62;
7322             goto FAILED;
7323             }
7324           if (*ptr != (pcre_uchar)terminator)
7325             {
7326             *errorcodeptr = ERR42;
7327             goto FAILED;
7328             }
7329           if (namelen > MAX_NAME_SIZE)
7330             {
7331             *errorcodeptr = ERR48;
7332             goto FAILED;
7333             }
7334 
7335           /* Count named back references. */
7336 
7337           if (!is_recurse) cd->namedrefcount++;
7338 
7339           /* We have to allow for a named reference to a duplicated name (this
7340           cannot be determined until the second pass). This needs an extra
7341           16-bit data item. */
7342 
7343           *lengthptr += IMM2_SIZE;
7344 
7345           /* If this is a forward reference and we are within a (?|...) group,
7346           the reference may end up as the number of a group which we are
7347           currently inside, that is, it could be a recursive reference. In the
7348           real compile this will be picked up and the reference wrapped with
7349           OP_ONCE to make it atomic, so we must space in case this occurs. */
7350 
7351           /* In fact, this can happen for a non-forward reference because
7352           another group with the same number might be created later. This
7353           issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7354           only mode, we finesse the bug by allowing more memory always. */
7355 
7356           *lengthptr += 4 + 4*LINK_SIZE;
7357 
7358           /* It is even worse than that. The current reference may be to an
7359           existing named group with a different number (so apparently not
7360           recursive) but which later on is also attached to a group with the
          current number. This can only happen if (?| has been previously
          encountered. In that case, we allow yet more memory, just in case.
          (Again, this is fixed "properly" in PCRE2.) */
7364 
7365           if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
7366 
7367           /* Otherwise, check for recursion here. The name table does not exist
7368           in the first pass; instead we must scan the list of names encountered
7369           so far in order to get the number. If the name is not found, leave
7370           the value of recno as 0 for a forward reference. */
7371 
7372           /* This patch (removing "else") fixes a problem when a reference is
7373           to multiple identically named nested groups from within the nest.
7374           Once again, it is not the "proper" fix, and it results in an
7375           over-allocation of memory. */
7376 
7377           /* else */
7378             {
7379             ng = cd->named_groups;
7380             for (i = 0; i < cd->names_found; i++, ng++)
7381               {
7382               if (namelen == ng->length &&
7383                   STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7384                 {
7385                 open_capitem *oc;
7386                 recno = ng->number;
7387                 if (is_recurse) break;
7388                 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7389                   {
7390                   if (oc->number == recno)
7391                     {
7392                     oc->flag = TRUE;
7393                     break;
7394                     }
7395                   }
7396                 }
7397               }
7398             }
7399           }
7400 
7401         /* In the real compile, search the name table. We check the name
7402         first, and then check that we have reached the end of the name in the
7403         table. That way, if the name is longer than any in the table, the
7404         comparison will fail without reading beyond the table entry. */
7405 
7406         else
7407           {
7408           slot = cd->name_table;
7409           for (i = 0; i < cd->names_found; i++)
7410             {
7411             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
7412                 slot[IMM2_SIZE+namelen] == 0)
7413               break;
7414             slot += cd->name_entry_size;
7415             }
7416 
7417           if (i < cd->names_found)
7418             {
7419             recno = GET2(slot, 0);
7420             }
7421           else
7422             {
7423             *errorcodeptr = ERR15;
7424             goto FAILED;
7425             }
7426           }
7427 
        /* In both phases, for recursions, we can now go to the code that
7429         handles numerical recursion. */
7430 
7431         if (is_recurse) goto HANDLE_RECURSION;
7432 
7433         /* In the second pass we must see if the name is duplicated. If so, we
7434         generate a different opcode. */
7435 
7436         if (lengthptr == NULL && cd->dupnames)
7437           {
7438           int count = 1;
7439           unsigned int index = i;
7440           pcre_uchar *cslot = slot + cd->name_entry_size;
7441 
7442           for (i++; i < cd->names_found; i++)
7443             {
7444             if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7445             count++;
7446             cslot += cd->name_entry_size;
7447             }
7448 
7449           if (count > 1)
7450             {
7451             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7452             previous = code;
7453             item_hwm_offset = cd->hwm - cd->start_workspace;
7454             *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7455             PUT2INC(code, 0, index);
7456             PUT2INC(code, 0, count);
7457 
7458             /* Process each potentially referenced group. */
7459 
7460             for (; slot < cslot; slot += cd->name_entry_size)
7461               {
7462               open_capitem *oc;
7463               recno = GET2(slot, 0);
7464               cd->backref_map |= (recno < 32)? (1U << recno) : 1;
7465               if (recno > cd->top_backref) cd->top_backref = recno;
7466 
              /* Check to see if this back reference is recursive, that is, it
7468               is inside the group that it references. A flag is set so that the
7469               group can be made atomic. */
7470 
7471               for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7472                 {
7473                 if (oc->number == recno)
7474                   {
7475                   oc->flag = TRUE;
7476                   break;
7477                   }
7478                 }
7479               }
7480 
7481             continue;  /* End of back ref handling */
7482             }
7483           }
7484 
7485         /* First pass, or a non-duplicated name. */
7486 
7487         goto HANDLE_REFERENCE;
7488 
7489 
7490         /* ------------------------------------------------------------ */
7491         case CHAR_R:              /* Recursion, same as (?0) */
7492         recno = 0;
7493         if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7494           {
7495           *errorcodeptr = ERR29;
7496           goto FAILED;
7497           }
7498         goto HANDLE_RECURSION;
7499 
7500 
7501         /* ------------------------------------------------------------ */
7502         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
7503         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7504         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7505           {
7506           const pcre_uchar *called;
7507           terminator = CHAR_RIGHT_PARENTHESIS;
7508 
7509           /* Come here from the \g<...> and \g'...' code (Oniguruma
7510           compatibility). However, the syntax has been checked to ensure that
7511           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
          be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7513           ever be taken. */
7514 
7515           HANDLE_NUMERICAL_RECURSION:
7516 
7517           if ((refsign = *ptr) == CHAR_PLUS)
7518             {
7519             ptr++;
7520             if (!IS_DIGIT(*ptr))
7521               {
7522               *errorcodeptr = ERR63;
7523               goto FAILED;
7524               }
7525             }
7526           else if (refsign == CHAR_MINUS)
7527             {
7528             if (!IS_DIGIT(ptr[1]))
7529               goto OTHER_CHAR_AFTER_QUERY;
7530             ptr++;
7531             }
7532 
7533           recno = 0;
7534           while(IS_DIGIT(*ptr))
7535             {
7536             if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7537               {
7538               while (IS_DIGIT(*ptr)) ptr++;
7539               *errorcodeptr = ERR61;
7540               goto FAILED;
7541               }
7542             recno = recno * 10 + *ptr++ - CHAR_0;
7543             }
7544 
7545           if (*ptr != (pcre_uchar)terminator)
7546             {
7547             *errorcodeptr = ERR29;
7548             goto FAILED;
7549             }
7550 
7551           if (refsign == CHAR_MINUS)
7552             {
7553             if (recno == 0)
7554               {
7555               *errorcodeptr = ERR58;
7556               goto FAILED;
7557               }
7558             recno = cd->bracount - recno + 1;
7559             if (recno <= 0)
7560               {
7561               *errorcodeptr = ERR15;
7562               goto FAILED;
7563               }
7564             }
7565           else if (refsign == CHAR_PLUS)
7566             {
7567             if (recno == 0)
7568               {
7569               *errorcodeptr = ERR58;
7570               goto FAILED;
7571               }
7572             recno += cd->bracount;
7573             }
7574 
7575           /* Come here from code above that handles a named recursion */
7576 
7577           HANDLE_RECURSION:
7578 
7579           previous = code;
7580           item_hwm_offset = cd->hwm - cd->start_workspace;
7581           called = cd->start_code;
7582 
7583           /* When we are actually compiling, find the bracket that is being
7584           referenced. Temporarily end the regex in case it doesn't exist before
7585           this point. If we end up with a forward reference, first check that
7586           the bracket does occur later so we can give the error (and position)
7587           now. Then remember this forward reference in the workspace so it can
7588           be filled in at the end. */
7589 
7590           if (lengthptr == NULL)
7591             {
7592             *code = OP_END;
7593             if (recno != 0)
7594               called = PRIV(find_bracket)(cd->start_code, utf, recno);
7595 
7596             /* Forward reference */
7597 
7598             if (called == NULL)
7599               {
7600               if (recno > cd->final_bracount)
7601                 {
7602                 *errorcodeptr = ERR15;
7603                 goto FAILED;
7604                 }
7605 
7606               /* Fudge the value of "called" so that when it is inserted as an
7607               offset below, what is actually inserted is the reference number
7608               of the group. Then remember the forward reference. */
7609 
7610               called = cd->start_code + recno;
7611               if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7612                   WORK_SIZE_SAFETY_MARGIN)
7613                 {
7614                 *errorcodeptr = expand_workspace(cd);
7615                 if (*errorcodeptr != 0) goto FAILED;
7616                 }
7617               PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7618               }
7619 
7620             /* If not a forward reference, and the subpattern is still open,
7621             this is a recursive call. We check to see if this is a left
7622             recursion that could loop for ever, and diagnose that case. We
7623             must not, however, do this check if we are in a conditional
7624             subpattern because the condition might be testing for recursion in
7625             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7626             Forever loops are also detected at runtime, so those that occur in
7627             conditional subpatterns will be picked up then. */
7628 
7629             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7630                      could_be_empty(called, code, bcptr, utf, cd))
7631               {
7632               *errorcodeptr = ERR40;
7633               goto FAILED;
7634               }
7635             }
7636 
7637           /* Insert the recursion/subroutine item. It does not have a set first
7638           character (relevant if it is repeated, because it will then be
7639           wrapped with ONCE brackets). */
7640 
7641           *code = OP_RECURSE;
7642           PUT(code, 1, (int)(called - cd->start_code));
7643           code += 1 + LINK_SIZE;
7644           groupsetfirstchar = FALSE;
7645           }
7646 
7647         /* Can't determine a first byte now */
7648 
7649         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7650         zerofirstchar = firstchar;
7651         zerofirstcharflags = firstcharflags;
7652         continue;
7653 
7654 
7655         /* ------------------------------------------------------------ */
7656         default:              /* Other characters: check option setting */
7657         OTHER_CHAR_AFTER_QUERY:
7658         set = unset = 0;
7659         optset = &set;
7660 
7661         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
7662           {
7663           switch (*ptr++)
7664             {
7665             case CHAR_MINUS: optset = &unset; break;
7666 
7667             case CHAR_J:    /* Record that it changed in the external options */
7668             *optset |= PCRE_DUPNAMES;
7669             cd->external_flags |= PCRE_JCHANGED;
7670             break;
7671 
7672             case CHAR_i: *optset |= PCRE_CASELESS; break;
7673             case CHAR_m: *optset |= PCRE_MULTILINE; break;
7674             case CHAR_s: *optset |= PCRE_DOTALL; break;
7675             case CHAR_x: *optset |= PCRE_EXTENDED; break;
7676             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7677             case CHAR_X: *optset |= PCRE_EXTRA; break;
7678 
7679             default:  *errorcodeptr = ERR12;
7680                       ptr--;    /* Correct the offset */
7681                       goto FAILED;
7682             }
7683           }
7684 
7685         /* Set up the changed option bits, but don't change anything yet. */
7686 
7687         newoptions = (options | set) & (~unset);
7688 
7689         /* If the options ended with ')' this is not the start of a nested
7690         group with option changes, so the options change at this level.
7691         If we are not at the pattern start, reset the greedy defaults and the
7692         case value for firstchar and reqchar. */
7693 
7694         if (*ptr == CHAR_RIGHT_PARENTHESIS)
7695           {
7696           greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7697           greedy_non_default = greedy_default ^ 1;
7698           req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7699 
7700           /* Change options at this level, and pass them back for use
7701           in subsequent branches. */
7702 
7703           *optionsptr = options = newoptions;
7704           previous = NULL;       /* This item can't be repeated */
7705           continue;              /* It is complete */
7706           }
7707 
7708         /* If the options ended with ':' we are heading into a nested group
7709         with possible change of options. Such groups are non-capturing and are
7710         not assertions of any kind. All we need to do is skip over the ':';
7711         the newoptions value is handled below. */
7712 
7713         bravalue = OP_BRA;
7714         ptr++;
7715         }     /* End of switch for character following (? */
7716       }       /* End of (? handling */
7717 
7718     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7719     is set, all unadorned brackets become non-capturing and behave like (?:...)
7720     brackets. */
7721 
7722     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7723       {
7724       bravalue = OP_BRA;
7725       }
7726 
7727     /* Else we have a capturing group. */
7728 
7729     else
7730       {
7731       NUMBERED_GROUP:
7732       cd->bracount += 1;
7733       PUT2(code, 1+LINK_SIZE, cd->bracount);
7734       skipbytes = IMM2_SIZE;
7735       }
7736 
7737     /* Process nested bracketed regex. First check for parentheses nested too
7738     deeply. */
7739 
7740     if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7741       {
7742       *errorcodeptr = ERR82;
7743       goto FAILED;
7744       }
7745 
7746     /* All assertions used not to be repeatable, but this was changed for Perl
7747     compatibility. All kinds can now be repeated except for assertions that are
7748     conditions (Perl also forbids these to be repeated). We copy code into a
7749     non-register variable (tempcode) in order to be able to pass its address
7750     because some compilers complain otherwise. At the start of a conditional
7751     group whose condition is an assertion, cd->iscondassert is set. We unset it
7752     here so as to allow assertions later in the group to be quantified. */
7753 
7754     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7755         cd->iscondassert)
7756       {
7757       previous = NULL;
7758       cd->iscondassert = FALSE;
7759       }
7760     else
7761       {
7762       previous = code;
7763       item_hwm_offset = cd->hwm - cd->start_workspace;
7764       }
7765 
7766     *code = bravalue;
7767     tempcode = code;
7768     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
7769     tempbracount = cd->bracount;          /* Save value before bracket */
7770     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
7771 
7772     if (!compile_regex(
7773          newoptions,                      /* The complete new option state */
7774          &tempcode,                       /* Where to put code (updated) */
7775          &ptr,                            /* Input pointer (updated) */
7776          errorcodeptr,                    /* Where to put an error message */
7777          (bravalue == OP_ASSERTBACK ||
7778           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7779          reset_bracount,                  /* True if (?| group */
7780          skipbytes,                       /* Skip over bracket number */
7781          cond_depth +
7782            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
7783          &subfirstchar,                   /* For possible first char */
7784          &subfirstcharflags,
7785          &subreqchar,                     /* For possible last char */
7786          &subreqcharflags,
7787          bcptr,                           /* Current branch chain */
7788          cd,                              /* Tables block */
7789          (lengthptr == NULL)? NULL :      /* Actual compile phase */
7790            &length_prevgroup              /* Pre-compile phase */
7791          ))
7792       goto FAILED;
7793 
7794     cd->parens_depth -= 1;
7795 
7796     /* If this was an atomic group and there are no capturing groups within it,
7797     generate OP_ONCE_NC instead of OP_ONCE. */
7798 
7799     if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7800       *code = OP_ONCE_NC;
7801 
7802     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7803       cd->assert_depth -= 1;
7804 
7805     /* At the end of compiling, code is still pointing to the start of the
7806     group, while tempcode has been updated to point past the end of the group.
7807     The pattern pointer (ptr) is on the bracket.
7808 
7809     If this is a conditional bracket, check that there are no more than
7810     two branches in the group, or just one if it's a DEFINE group. We do this
7811     in the real compile phase, not in the pre-pass, where the whole group may
7812     not be available. */
7813 
7814     if (bravalue == OP_COND && lengthptr == NULL)
7815       {
7816       pcre_uchar *tc = code;
7817       int condcount = 0;
7818 
7819       do {
7820          condcount++;
7821          tc += GET(tc,1);
7822          }
7823       while (*tc != OP_KET);
7824 
7825       /* A DEFINE group is never obeyed inline (the "condition" is always
7826       false). It must have only one branch. */
7827 
7828       if (code[LINK_SIZE+1] == OP_DEF)
7829         {
7830         if (condcount > 1)
7831           {
7832           *errorcodeptr = ERR54;
7833           goto FAILED;
7834           }
7835         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
7836         }
7837 
7838       /* A "normal" conditional group. If there is just one branch, we must not
7839       make use of its firstchar or reqchar, because this is equivalent to an
7840       empty second branch. */
7841 
7842       else
7843         {
7844         if (condcount > 2)
7845           {
7846           *errorcodeptr = ERR27;
7847           goto FAILED;
7848           }
7849         if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7850         }
7851       }
7852 
7853     /* Error if hit end of pattern */
7854 
7855     if (*ptr != CHAR_RIGHT_PARENTHESIS)
7856       {
7857       *errorcodeptr = ERR14;
7858       goto FAILED;
7859       }
7860 
7861     /* In the pre-compile phase, update the length by the length of the group,
7862     less the brackets at either end. Then reduce the compiled code to just a
7863     set of non-capturing brackets so that it doesn't use much memory if it is
7864     duplicated by a quantifier.*/
7865 
7866     if (lengthptr != NULL)
7867       {
7868       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7869         {
7870         *errorcodeptr = ERR20;
7871         goto FAILED;
7872         }
7873       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7874       code++;   /* This already contains bravalue */
7875       PUTINC(code, 0, 1 + LINK_SIZE);
7876       *code++ = OP_KET;
7877       PUTINC(code, 0, 1 + LINK_SIZE);
7878       break;    /* No need to waste time with special character handling */
7879       }
7880 
7881     /* Otherwise update the main code pointer to the end of the group. */
7882 
7883     code = tempcode;
7884 
7885     /* For a DEFINE group, required and first character settings are not
7886     relevant. */
7887 
7888     if (bravalue == OP_DEF) break;
7889 
7890     /* Handle updating of the required and first characters for other types of
7891     group. Update for normal brackets of all kinds, and conditions with two
7892     branches (see code above). If the bracket is followed by a quantifier with
7893     zero repeat, we have to back off. Hence the definition of zeroreqchar and
7894     zerofirstchar outside the main loop so that they can be accessed for the
7895     back off. */
7896 
7897     zeroreqchar = reqchar;
7898     zeroreqcharflags = reqcharflags;
7899     zerofirstchar = firstchar;
7900     zerofirstcharflags = firstcharflags;
7901     groupsetfirstchar = FALSE;
7902 
7903     if (bravalue >= OP_ONCE)
7904       {
7905       /* If we have not yet set a firstchar in this branch, take it from the
7906       subpattern, remembering that it was set here so that a repeat of more
7907       than one can replicate it as reqchar if necessary. If the subpattern has
7908       no firstchar, set "none" for the whole branch. In both cases, a zero
7909       repeat forces firstchar to "none". */
7910 
7911       if (firstcharflags == REQ_UNSET)
7912         {
7913         if (subfirstcharflags >= 0)
7914           {
7915           firstchar = subfirstchar;
7916           firstcharflags = subfirstcharflags;
7917           groupsetfirstchar = TRUE;
7918           }
7919         else firstcharflags = REQ_NONE;
7920         zerofirstcharflags = REQ_NONE;
7921         }
7922 
7923       /* If firstchar was previously set, convert the subpattern's firstchar
7924       into reqchar if there wasn't one, using the vary flag that was in
7925       existence beforehand. */
7926 
7927       else if (subfirstcharflags >= 0 && subreqcharflags < 0)
7928         {
7929         subreqchar = subfirstchar;
7930         subreqcharflags = subfirstcharflags | tempreqvary;
7931         }
7932 
7933       /* If the subpattern set a required byte (or set a first byte that isn't
7934       really the first byte - see above), set it. */
7935 
7936       if (subreqcharflags >= 0)
7937         {
7938         reqchar = subreqchar;
7939         reqcharflags = subreqcharflags;
7940         }
7941       }
7942 
7943     /* For a forward assertion, we take the reqchar, if set, provided that the
7944     group has also set a first char. This can be helpful if the pattern that
7945     follows the assertion doesn't set a different char. For example, it's
7946     useful for /(?=abcde).+/. We can't set firstchar for an assertion, however
7947     because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7948     the "real" "a" would then become a reqchar instead of a firstchar. This is
7949     overcome by a scan at the end if there's no firstchar, looking for an
7950     asserted first char. */
7951 
7952     else if (bravalue == OP_ASSERT && subreqcharflags >= 0 &&
7953              subfirstcharflags >= 0)
7954       {
7955       reqchar = subreqchar;
7956       reqcharflags = subreqcharflags;
7957       }
7958     break;     /* End of processing '(' */
7959 
7960 
7961     /* ===================================================================*/
7962     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7963     are arranged to be the negation of the corresponding OP_values in the
7964     default case when PCRE_UCP is not set. For the back references, the values
7965     are negative the reference number. Only back references and those types
7966     that consume a character may be repeated. We can test for values between
7967     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7968     ever created. */
7969 
7970     case CHAR_BACKSLASH:
7971     tempptr = ptr;
7972     escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7973     if (*errorcodeptr != 0) goto FAILED;
7974 
7975     if (escape == 0)                  /* The escape coded a single character */
7976       c = ec;
7977     else
7978       {
7979       /* For metasequences that actually match a character, we disable the
7980       setting of a first character if it hasn't already been set. */
7981 
7982       if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7983         firstcharflags = REQ_NONE;
7984 
7985       /* Set values to reset to if this is followed by a zero repeat. */
7986 
7987       zerofirstchar = firstchar;
7988       zerofirstcharflags = firstcharflags;
7989       zeroreqchar = reqchar;
7990       zeroreqcharflags = reqcharflags;
7991 
7992       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7993       is a subroutine call by number (Oniguruma syntax). In fact, the value
7994       ESC_g is returned only for these cases. So we don't need to check for <
7995       or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7996       -n, and for the Perl syntax \g{name} the result is ESC_k (as
7997       that is a synonym for a named back reference). */
7998 
7999       if (escape == ESC_g)
8000         {
8001         const pcre_uchar *p;
8002         pcre_uint32 cf;
8003 
8004         item_hwm_offset = cd->hwm - cd->start_workspace;   /* Normally this is set when '(' is read */
8005         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
8006           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
8007 
8008         /* These two statements stop the compiler from warning about possibly
8009         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
8010         fact, because we do the check for a number below, the paths that
8011         would actually be in error are never taken. */
8012 
8013         skipbytes = 0;
8014         reset_bracount = FALSE;
8015 
8016         /* If it's not a signed or unsigned number, treat it as a name. */
8017 
8018         cf = ptr[1];
8019         if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
8020           {
8021           is_recurse = TRUE;
8022           goto NAMED_REF_OR_RECURSE;
8023           }
8024 
8025         /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
8026         or a digit. */
8027 
8028         p = ptr + 2;
8029         while (IS_DIGIT(*p)) p++;
8030         if (*p != (pcre_uchar)terminator)
8031           {
8032           *errorcodeptr = ERR57;
8033           goto FAILED;
8034           }
8035         ptr++;
8036         goto HANDLE_NUMERICAL_RECURSION;
8037         }
8038 
8039       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
8040       We also support \k{name} (.NET syntax).  */
8041 
8042       if (escape == ESC_k)
8043         {
8044         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
8045           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
8046           {
8047           *errorcodeptr = ERR69;
8048           goto FAILED;
8049           }
8050         is_recurse = FALSE;
8051         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
8052           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
8053           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
8054         goto NAMED_REF_OR_RECURSE;
8055         }
8056 
8057       /* Back references are handled specially; must disable firstchar if
8058       not set to cope with cases like (?=(\w+))\1: which would otherwise set
8059       ':' later. */
8060 
8061       if (escape < 0)
8062         {
8063         open_capitem *oc;
8064         recno = -escape;
8065 
8066         /* Come here from named backref handling when the reference is to a
8067         single group (i.e. not to a duplicated name). */
8068 
8069         HANDLE_REFERENCE:
8070         if (firstcharflags == REQ_UNSET) zerofirstcharflags = firstcharflags = REQ_NONE;
8071         previous = code;
8072         item_hwm_offset = cd->hwm - cd->start_workspace;
8073         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
8074         PUT2INC(code, 0, recno);
8075         cd->backref_map |= (recno < 32)? (1U << recno) : 1;
8076         if (recno > cd->top_backref) cd->top_backref = recno;
8077 
8078         /* Check to see if this back reference is recursive, that is, it
8079         is inside the group that it references. A flag is set so that the
8080         group can be made atomic. */
8081 
8082         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
8083           {
8084           if (oc->number == recno)
8085             {
8086             oc->flag = TRUE;
8087             break;
8088             }
8089           }
8090         }
8091 
8092       /* So are Unicode property matches, if supported. */
8093 
8094 #ifdef SUPPORT_UCP
8095       else if (escape == ESC_P || escape == ESC_p)
8096         {
8097         BOOL negated;
8098         unsigned int ptype = 0, pdata = 0;
8099         if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8100           goto FAILED;
8101         previous = code;
8102         item_hwm_offset = cd->hwm - cd->start_workspace;
8103         *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8104         *code++ = ptype;
8105         *code++ = pdata;
8106         }
8107 #else
8108 
8109       /* If Unicode properties are not supported, \X, \P, and \p are not
8110       allowed. */
8111 
8112       else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
8113         {
8114         *errorcodeptr = ERR45;
8115         goto FAILED;
8116         }
8117 #endif
8118 
8119       /* For the rest (including \X when Unicode properties are supported), we
8120       can obtain the OP value by negating the escape value in the default
8121       situation when PCRE_UCP is not set. When it *is* set, we substitute
8122       Unicode property tests. Note that \b and \B do a one-character
8123       lookbehind, and \A also behaves as if it does. */
8124 
8125       else
8126         {
8127         if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
8128              cd->max_lookbehind == 0)
8129           cd->max_lookbehind = 1;
8130 #ifdef SUPPORT_UCP
8131         if (escape >= ESC_DU && escape <= ESC_wu)
8132           {
8133           nestptr = ptr + 1;                   /* Where to resume */
8134           ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
8135           }
8136         else
8137 #endif
8138         /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
8139         so that it works in DFA mode and in lookbehinds. */
8140 
8141           {
8142           previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
8143           item_hwm_offset = cd->hwm - cd->start_workspace;
8144           *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
8145           }
8146         }
8147       continue;
8148       }
8149 
8150     /* We have a data character whose value is in c. In UTF-8 mode it may have
8151     a value > 127. We set its representation in the length/buffer, and then
8152     handle it as a data character. */
8153 
8154 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
8155     if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
8156       mclength = PRIV(ord2utf)(c, mcbuffer);
8157     else
8158 #endif
8159 
8160      {
8161      mcbuffer[0] = c;
8162      mclength = 1;
8163      }
8164     goto ONE_CHAR;
8165 
8166 
8167     /* ===================================================================*/
8168     /* Handle a literal character. It is guaranteed not to be whitespace or #
8169     when the extended flag is set. If we are in a UTF mode, it may be a
8170     multi-unit literal character. */
8171 
8172     default:
8173     NORMAL_CHAR:
8174     mclength = 1;
8175     mcbuffer[0] = c;
8176 
8177 #ifdef SUPPORT_UTF
8178     if (utf && HAS_EXTRALEN(c))
8179       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
8180 #endif
8181 
8182     /* At this point we have the character's bytes in mcbuffer, and the length
8183     in mclength. When not in UTF-8 mode, the length is always 1. */
8184 
8185     ONE_CHAR:
8186     previous = code;
8187     item_hwm_offset = cd->hwm - cd->start_workspace;
8188 
8189     /* For caseless UTF-8 mode when UCP support is available, check whether
8190     this character has more than one other case. If so, generate a special
8191     OP_PROP item instead of OP_CHARI. */
8192 
8193 #ifdef SUPPORT_UCP
8194     if (utf && (options & PCRE_CASELESS) != 0)
8195       {
8196       GETCHAR(c, mcbuffer);
8197       if ((c = UCD_CASESET(c)) != 0)
8198         {
8199         *code++ = OP_PROP;
8200         *code++ = PT_CLIST;
8201         *code++ = c;
8202         if (firstcharflags == REQ_UNSET)
8203           firstcharflags = zerofirstcharflags = REQ_NONE;
8204         break;
8205         }
8206       }
8207 #endif
8208 
8209     /* Caseful matches, or not one of the multicase characters. */
8210 
8211     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8212     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
8213 
8214     /* Remember if \r or \n were seen */
8215 
8216     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8217       cd->external_flags |= PCRE_HASCRORLF;
8218 
8219     /* Set the first and required bytes appropriately. If no previous first
8220     byte, set it from this character, but revert to none on a zero repeat.
8221     Otherwise, leave the firstchar value alone, and don't change it on a zero
8222     repeat. */
8223 
8224     if (firstcharflags == REQ_UNSET)
8225       {
8226       zerofirstcharflags = REQ_NONE;
8227       zeroreqchar = reqchar;
8228       zeroreqcharflags = reqcharflags;
8229 
8230       /* If the character is more than one byte long, we can set firstchar
8231       only if it is not to be matched caselessly. */
8232 
8233       if (mclength == 1 || req_caseopt == 0)
8234         {
8235         firstchar = mcbuffer[0];
8236         firstcharflags = req_caseopt;
8237 
8238         if (mclength != 1)
8239           {
8240           reqchar = code[-1];
8241           reqcharflags = cd->req_varyopt;
8242           }
8243         }
8244       else firstcharflags = reqcharflags = REQ_NONE;
8245       }
8246 
8247     /* firstchar was previously set; we can set reqchar only if the length is
8248     1 or the matching is caseful. */
8249 
8250     else
8251       {
8252       zerofirstchar = firstchar;
8253       zerofirstcharflags = firstcharflags;
8254       zeroreqchar = reqchar;
8255       zeroreqcharflags = reqcharflags;
8256       if (mclength == 1 || req_caseopt == 0)
8257         {
8258         reqchar = code[-1];
8259         reqcharflags = req_caseopt | cd->req_varyopt;
8260         }
8261       }
8262 
8263     break;            /* End of literal character handling */
8264     }
8265   }                   /* end of big loop */
8266 
8267 
8268 /* Control never reaches here by falling through, only by a goto for all the
8269 error states. Pass back the position in the pattern so that it can be displayed
8270 to the user for diagnosing the error. */
8271 
8272 FAILED:
8273 *ptrptr = ptr;
8274 return FALSE;
8275 }
8276 
8277 
8278 
8279 /*************************************************
8280 *     Compile sequence of alternatives           *
8281 *************************************************/
8282 
8283 /* On entry, ptr is pointing past the bracket character, but on return it
8284 points to the closing bracket, or vertical bar, or end of string. The code
8285 variable is pointing at the byte into which the BRA operator has been stored.
8286 This function is used during the pre-compile phase when we are trying to find
8287 out the amount of memory needed, as well as during the real compile phase. The
8288 value of lengthptr distinguishes the two phases.
8289 
8290 Arguments:
8291   options           option bits, including any changes for this subpattern
8292   codeptr           -> the address of the current code pointer
8293   ptrptr            -> the address of the current pattern pointer
8294   errorcodeptr      -> pointer to error code variable
8295   lookbehind        TRUE if this is a lookbehind assertion
8296   reset_bracount    TRUE to reset the count for each branch
8297   skipbytes         skip this many bytes at start (for brackets and OP_COND)
8298   cond_depth        depth of nesting for conditional subpatterns
8299   firstcharptr      place to put the first required character
8300   firstcharflagsptr place to put the first character flags, or a negative number
8301   reqcharptr        place to put the last required character
8302   reqcharflagsptr   place to put the last required character flags, or a negative number
8303   bcptr             pointer to the chain of currently open branches
8304   cd                points to the data block with tables pointers etc.
8305   lengthptr         NULL during the real compile phase
8306                     points to length accumulator during pre-compile phase
8307 
8308 Returns:            TRUE on success
8309 */
8310 
8311 static BOOL
compile_regex(int options,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,BOOL lookbehind,BOOL reset_bracount,int skipbytes,int cond_depth,pcre_uint32 * firstcharptr,pcre_int32 * firstcharflagsptr,pcre_uint32 * reqcharptr,pcre_int32 * reqcharflagsptr,branch_chain * bcptr,compile_data * cd,int * lengthptr)8312 compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
8313   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
8314   int cond_depth,
8315   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
8316   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
8317   branch_chain *bcptr, compile_data *cd, int *lengthptr)
8318 {
8319 const pcre_uchar *ptr = *ptrptr;
8320 pcre_uchar *code = *codeptr;
8321 pcre_uchar *last_branch = code;
8322 pcre_uchar *start_bracket = code;
8323 pcre_uchar *reverse_count = NULL;
8324 open_capitem capitem;
8325 int capnumber = 0;
8326 pcre_uint32 firstchar, reqchar;
8327 pcre_int32 firstcharflags, reqcharflags;
8328 pcre_uint32 branchfirstchar, branchreqchar;
8329 pcre_int32 branchfirstcharflags, branchreqcharflags;
8330 int length;
8331 unsigned int orig_bracount;
8332 unsigned int max_bracount;
8333 branch_chain bc;
8334 size_t save_hwm_offset;
8335 
8336 /* If set, call the external function that checks for stack availability. */
8337 
8338 if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
8339   {
8340   *errorcodeptr= ERR85;
8341   return FALSE;
8342   }
8343 
8344 /* Miscellaneous initialization */
8345 
8346 bc.outer = bcptr;
8347 bc.current_branch = code;
8348 
8349 firstchar = reqchar = 0;
8350 firstcharflags = reqcharflags = REQ_UNSET;
8351 
8352 save_hwm_offset = cd->hwm - cd->start_workspace;
8353 
8354 /* Accumulate the length for use in the pre-compile phase. Start with the
8355 length of the BRA and KET and any extra bytes that are required at the
8356 beginning. We accumulate in a local variable to save frequent testing of
8357 lengthptr for NULL. We cannot do this by looking at the value of code at the
8358 start and end of each alternative, because compiled items are discarded during
8359 the pre-compile phase so that the work space is not exceeded. */
8360 
8361 length = 2 + 2*LINK_SIZE + skipbytes;
8362 
8363 /* WARNING: If the above line is changed for any reason, you must also change
8364 the code that abstracts option settings at the start of the pattern and makes
8365 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
8366 pre-compile phase to find out whether anything has yet been compiled or not. */
8367 
8368 /* If this is a capturing subpattern, add to the chain of open capturing items
8369 so that we can detect them if (*ACCEPT) is encountered. This is also used to
8370 detect groups that contain recursive back references to themselves. Note that
8371 only OP_CBRA need be tested here; changing this opcode to one of its variants,
8372 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
8373 
8374 if (*code == OP_CBRA)
8375   {
8376   capnumber = GET2(code, 1 + LINK_SIZE);
8377   capitem.number = capnumber;
8378   capitem.next = cd->open_caps;
8379   capitem.flag = FALSE;
8380   cd->open_caps = &capitem;
8381   }
8382 
8383 /* Offset is set zero to mark that this bracket is still open */
8384 
8385 PUT(code, 1, 0);
8386 code += 1 + LINK_SIZE + skipbytes;
8387 
8388 /* Loop for each alternative branch */
8389 
8390 orig_bracount = max_bracount = cd->bracount;
8391 for (;;)
8392   {
8393   /* For a (?| group, reset the capturing bracket count so that each branch
8394   uses the same numbers. */
8395 
8396   if (reset_bracount) cd->bracount = orig_bracount;
8397 
8398   /* Set up dummy OP_REVERSE if lookbehind assertion */
8399 
8400   if (lookbehind)
8401     {
8402     *code++ = OP_REVERSE;
8403     reverse_count = code;
8404     PUTINC(code, 0, 0);
8405     length += 1 + LINK_SIZE;
8406     }
8407 
8408   /* Now compile the branch; in the pre-compile phase its length gets added
8409   into the length. */
8410 
8411   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
8412         &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
8413         cond_depth, cd, (lengthptr == NULL)? NULL : &length))
8414     {
8415     *ptrptr = ptr;
8416     return FALSE;
8417     }
8418 
8419   /* Keep the highest bracket count in case (?| was used and some branch
8420   has fewer than the rest. */
8421 
8422   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
8423 
8424   /* In the real compile phase, there is some post-processing to be done. */
8425 
8426   if (lengthptr == NULL)
8427     {
8428     /* If this is the first branch, the firstchar and reqchar values for the
8429     branch become the values for the regex. */
8430 
8431     if (*last_branch != OP_ALT)
8432       {
8433       firstchar = branchfirstchar;
8434       firstcharflags = branchfirstcharflags;
8435       reqchar = branchreqchar;
8436       reqcharflags = branchreqcharflags;
8437       }
8438 
8439     /* If this is not the first branch, the first char and reqchar have to
8440     match the values from all the previous branches, except that if the
8441     previous value for reqchar didn't have REQ_VARY set, it can still match,
8442     and we set REQ_VARY for the regex. */
8443 
8444     else
8445       {
8446       /* If we previously had a firstchar, but it doesn't match the new branch,
8447       we have to abandon the firstchar for the regex, but if there was
8448       previously no reqchar, it takes on the value of the old firstchar. */
8449 
8450       if (firstcharflags >= 0 &&
8451           (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
8452         {
8453         if (reqcharflags < 0)
8454           {
8455           reqchar = firstchar;
8456           reqcharflags = firstcharflags;
8457           }
8458         firstcharflags = REQ_NONE;
8459         }
8460 
8461       /* If we (now or from before) have no firstchar, a firstchar from the
8462       branch becomes a reqchar if there isn't a branch reqchar. */
8463 
8464       if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
8465         {
8466         branchreqchar = branchfirstchar;
8467         branchreqcharflags = branchfirstcharflags;
8468         }
8469 
8470       /* Now ensure that the reqchars match */
8471 
8472       if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
8473           reqchar != branchreqchar)
8474         reqcharflags = REQ_NONE;
8475       else
8476         {
8477         reqchar = branchreqchar;
8478         reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
8479         }
8480       }
8481 
8482     /* If lookbehind, check that this branch matches a fixed-length string, and
8483     put the length into the OP_REVERSE item. Temporarily mark the end of the
8484     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
8485     because there may be forward references that we can't check here. Set a
8486     flag to cause another lookbehind check at the end. Why not do it all at the
8487     end? Because common, erroneous checks are picked up here and the offset of
8488     the problem can be shown. */
8489 
8490     if (lookbehind)
8491       {
8492       int fixed_length;
8493       *code = OP_END;
8494       fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
8495         FALSE, cd, NULL);
8496       DPRINTF(("fixed length = %d\n", fixed_length));
8497       if (fixed_length == -3)
8498         {
8499         cd->check_lookbehind = TRUE;
8500         }
8501       else if (fixed_length < 0)
8502         {
8503         *errorcodeptr = (fixed_length == -2)? ERR36 :
8504                         (fixed_length == -4)? ERR70: ERR25;
8505         *ptrptr = ptr;
8506         return FALSE;
8507         }
8508       else
8509         {
8510         if (fixed_length > cd->max_lookbehind)
8511           cd->max_lookbehind = fixed_length;
8512         PUT(reverse_count, 0, fixed_length);
8513         }
8514       }
8515     }
8516 
8517   /* Reached end of expression, either ')' or end of pattern. In the real
8518   compile phase, go back through the alternative branches and reverse the chain
8519   of offsets, with the field in the BRA item now becoming an offset to the
8520   first alternative. If there are no alternatives, it points to the end of the
8521   group. The length in the terminating ket is always the length of the whole
8522   bracketed item. Return leaving the pointer at the terminating char. */
8523 
8524   if (*ptr != CHAR_VERTICAL_LINE)
8525     {
8526     if (lengthptr == NULL)
8527       {
8528       int branch_length = (int)(code - last_branch);
8529       do
8530         {
8531         int prev_length = GET(last_branch, 1);
8532         PUT(last_branch, 1, branch_length);
8533         branch_length = prev_length;
8534         last_branch -= branch_length;
8535         }
8536       while (branch_length > 0);
8537       }
8538 
8539     /* Fill in the ket */
8540 
8541     *code = OP_KET;
8542     PUT(code, 1, (int)(code - start_bracket));
8543     code += 1 + LINK_SIZE;
8544 
8545     /* If it was a capturing subpattern, check to see if it contained any
8546     recursive back references. If so, we must wrap it in atomic brackets.
8547     Because we are moving code along, we must ensure that any pending recursive
8548     references are updated. In any event, remove the block from the chain. */
8549 
8550     if (capnumber > 0)
8551       {
8552       if (cd->open_caps->flag)
8553         {
8554         *code = OP_END;
8555         adjust_recurse(start_bracket, 1 + LINK_SIZE,
8556           (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
8557         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8558           IN_UCHARS(code - start_bracket));
8559         *start_bracket = OP_ONCE;
8560         code += 1 + LINK_SIZE;
8561         PUT(start_bracket, 1, (int)(code - start_bracket));
8562         *code = OP_KET;
8563         PUT(code, 1, (int)(code - start_bracket));
8564         code += 1 + LINK_SIZE;
8565         length += 2 + 2*LINK_SIZE;
8566         }
8567       cd->open_caps = cd->open_caps->next;
8568       }
8569 
8570     /* Retain the highest bracket number, in case resetting was used. */
8571 
8572     cd->bracount = max_bracount;
8573 
8574     /* Set values to pass back */
8575 
8576     *codeptr = code;
8577     *ptrptr = ptr;
8578     *firstcharptr = firstchar;
8579     *firstcharflagsptr = firstcharflags;
8580     *reqcharptr = reqchar;
8581     *reqcharflagsptr = reqcharflags;
8582     if (lengthptr != NULL)
8583       {
8584       if (OFLOW_MAX - *lengthptr < length)
8585         {
8586         *errorcodeptr = ERR20;
8587         return FALSE;
8588         }
8589       *lengthptr += length;
8590       }
8591     return TRUE;
8592     }
8593 
8594   /* Another branch follows. In the pre-compile phase, we can move the code
8595   pointer back to where it was for the start of the first branch. (That is,
8596   pretend that each branch is the only one.)
8597 
8598   In the real compile phase, insert an ALT node. Its length field points back
8599   to the previous branch while the bracket remains open. At the end the chain
8600   is reversed. It's done like this so that the start of the bracket has a
8601   zero offset until it is closed, making it possible to detect recursion. */
8602 
8603   if (lengthptr != NULL)
8604     {
8605     code = *codeptr + 1 + LINK_SIZE + skipbytes;
8606     length += 1 + LINK_SIZE;
8607     }
8608   else
8609     {
8610     *code = OP_ALT;
8611     PUT(code, 1, (int)(code - last_branch));
8612     bc.current_branch = last_branch = code;
8613     code += 1 + LINK_SIZE;
8614     }
8615 
8616   ptr++;
8617   }
8618 /* Control never reaches here */
8619 }
8620 
8621 
8622 
8623 
8624 /*************************************************
8625 *          Check for anchored expression         *
8626 *************************************************/
8627 
8628 /* Try to find out if this is an anchored regular expression. Consider each
8629 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8630 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8631 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8632 be found, because ^ generates OP_CIRCM in that mode.
8633 
8634 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8635 This is the code for \G, which means "match at start of match position, taking
8636 into account the match offset".
8637 
8638 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8639 because that will try the rest of the pattern at all possible matching points,
8640 so there is no point trying again.... er ....
8641 
8642 .... except when the .* appears inside capturing parentheses, and there is a
8643 subsequent back reference to those parentheses. We haven't enough information
8644 to catch that case precisely.
8645 
8646 At first, the best we could do was to detect when .* was in capturing brackets
8647 and the highest back reference was greater than or equal to that level.
8648 However, by keeping a bitmap of the first 31 back references, we can catch some
8649 of the more common cases more precisely.
8650 
8651 ... A second exception is when the .* appears inside an atomic group, because
8652 this prevents the number of characters it matches from being adjusted.
8653 
8654 Arguments:
8655   code           points to start of expression (the bracket)
8656   bracket_map    a bitmap of which brackets we are inside while testing; this
8657                   handles up to substring 31; after that we just have to take
8658                   the less precise approach
8659   cd             points to the compile data block
8660   atomcount      atomic group level
8661 
8662 Returns:     TRUE or FALSE
8663 */
8664 
8665 static BOOL
is_anchored(register const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount)8666 is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8667   compile_data *cd, int atomcount)
8668 {
8669 do {
8670    const pcre_uchar *scode = first_significant_code(
8671      code + PRIV(OP_lengths)[*code], FALSE);
8672    register int op = *scode;
8673 
8674    /* Non-capturing brackets */
8675 
8676    if (op == OP_BRA  || op == OP_BRAPOS ||
8677        op == OP_SBRA || op == OP_SBRAPOS)
8678      {
8679      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8680      }
8681 
8682    /* Capturing brackets */
8683 
8684    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8685             op == OP_SCBRA || op == OP_SCBRAPOS)
8686      {
8687      int n = GET2(scode, 1+LINK_SIZE);
8688      int new_map = bracket_map | ((n < 32)? (1U << n) : 1);
8689      if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8690      }
8691 
8692    /* Positive forward assertion */
8693 
8694    else if (op == OP_ASSERT)
8695      {
8696      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8697      }
8698 
8699    /* Condition; not anchored if no second branch */
8700 
8701    else if (op == OP_COND)
8702      {
8703      if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8704      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8705      }
8706 
8707    /* Atomic groups */
8708 
8709    else if (op == OP_ONCE || op == OP_ONCE_NC)
8710      {
8711      if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8712        return FALSE;
8713      }
8714 
8715    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8716    it isn't in brackets that are or may be referenced or inside an atomic
8717    group. */
8718 
8719    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8720              op == OP_TYPEPOSSTAR))
8721      {
8722      if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
8723          atomcount > 0 || cd->had_pruneorskip)
8724        return FALSE;
8725      }
8726 
8727    /* Check for explicit anchoring */
8728 
8729    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8730 
8731    code += GET(code, 1);
8732    }
8733 while (*code == OP_ALT);   /* Loop for each alternative */
8734 return TRUE;
8735 }
8736 
8737 
8738 
8739 /*************************************************
8740 *         Check for starting with ^ or .*        *
8741 *************************************************/
8742 
8743 /* This is called to find out if every branch starts with ^ or .* so that
8744 "first char" processing can be done to speed things up in multiline
8745 matching and for non-DOTALL patterns that start with .* (which must start at
8746 the beginning or after \n). As in the case of is_anchored() (see above), we
8747 have to take account of back references to capturing brackets that contain .*
8748 because in that case we can't make the assumption. Also, the appearance of .*
8749 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8750 or *SKIP does not count, because once again the assumption no longer holds.
8751 
8752 Arguments:
8753   code           points to start of expression (the bracket)
8754   bracket_map    a bitmap of which brackets we are inside while testing; this
8755                   handles up to substring 31; after that we just have to take
8756                   the less precise approach
8757   cd             points to the compile data
8758   atomcount      atomic group level
8759   inassert       TRUE if in an assertion
8760 
8761 Returns:         TRUE or FALSE
8762 */
8763 
8764 static BOOL
is_startline(const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount,BOOL inassert)8765 is_startline(const pcre_uchar *code, unsigned int bracket_map,
8766   compile_data *cd, int atomcount, BOOL inassert)
8767 {
8768 do {
8769    const pcre_uchar *scode = first_significant_code(
8770      code + PRIV(OP_lengths)[*code], FALSE);
8771    register int op = *scode;
8772 
8773    /* If we are at the start of a conditional assertion group, *both* the
8774    conditional assertion *and* what follows the condition must satisfy the test
8775    for start of line. Other kinds of condition fail. Note that there may be an
8776    auto-callout at the start of a condition. */
8777 
8778    if (op == OP_COND)
8779      {
8780      scode += 1 + LINK_SIZE;
8781      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8782      switch (*scode)
8783        {
8784        case OP_CREF:
8785        case OP_DNCREF:
8786        case OP_RREF:
8787        case OP_DNRREF:
8788        case OP_DEF:
8789        case OP_FAIL:
8790        return FALSE;
8791 
8792        default:     /* Assertion */
8793        if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
8794        do scode += GET(scode, 1); while (*scode == OP_ALT);
8795        scode += 1 + LINK_SIZE;
8796        break;
8797        }
8798      scode = first_significant_code(scode, FALSE);
8799      op = *scode;
8800      }
8801 
8802    /* Non-capturing brackets */
8803 
8804    if (op == OP_BRA  || op == OP_BRAPOS ||
8805        op == OP_SBRA || op == OP_SBRAPOS)
8806      {
8807      if (!is_startline(scode, bracket_map, cd, atomcount, inassert)) return FALSE;
8808      }
8809 
8810    /* Capturing brackets */
8811 
8812    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8813             op == OP_SCBRA || op == OP_SCBRAPOS)
8814      {
8815      int n = GET2(scode, 1+LINK_SIZE);
8816      int new_map = bracket_map | ((n < 32)? (1U << n) : 1);
8817      if (!is_startline(scode, new_map, cd, atomcount, inassert)) return FALSE;
8818      }
8819 
8820    /* Positive forward assertions */
8821 
8822    else if (op == OP_ASSERT)
8823      {
8824      if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
8825      }
8826 
8827    /* Atomic brackets */
8828 
8829    else if (op == OP_ONCE || op == OP_ONCE_NC)
8830      {
8831      if (!is_startline(scode, bracket_map, cd, atomcount + 1, inassert)) return FALSE;
8832      }
8833 
8834    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8835    brackets that may be referenced or an assertion, as long as the pattern does
8836    not contain *PRUNE or *SKIP, because these break the feature. Consider, for
8837    example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e.
8838    not at the start of a line. */
8839 
8840    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8841      {
8842      if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
8843          atomcount > 0 || cd->had_pruneorskip || inassert)
8844        return FALSE;
8845      }
8846 
8847    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8848    in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8849    because the number of characters matched by .* cannot be adjusted inside
8850    them. */
8851 
8852    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8853 
8854    /* Move on to the next alternative */
8855 
8856    code += GET(code, 1);
8857    }
8858 while (*code == OP_ALT);  /* Loop for each alternative */
8859 return TRUE;
8860 }
8861 
8862 
8863 
8864 /*************************************************
8865 *       Check for asserted fixed first char      *
8866 *************************************************/
8867 
8868 /* During compilation, the "first char" settings from forward assertions are
8869 discarded, because they can cause conflicts with actual literals that follow.
8870 However, if we end up without a first char setting for an unanchored pattern,
8871 it is worth scanning the regex to see if there is an initial asserted first
8872 char. If all branches start with the same asserted char, or with a
8873 non-conditional bracket all of whose alternatives start with the same asserted
8874 char (recurse ad lib), then we return that char, with the flags set to zero or
8875 REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8876 
8877 Arguments:
8878   code       points to start of expression (the bracket)
8879   flags      points to the first char flags, or to REQ_NONE
8880   inassert   TRUE if in an assertion
8881 
8882 Returns:     the fixed first char, or 0 with REQ_NONE in flags
8883 */
8884 
8885 static pcre_uint32
find_firstassertedchar(const pcre_uchar * code,pcre_int32 * flags,BOOL inassert)8886 find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8887   BOOL inassert)
8888 {
8889 register pcre_uint32 c = 0;
8890 int cflags = REQ_NONE;
8891 
8892 *flags = REQ_NONE;
8893 do {
8894    pcre_uint32 d;
8895    int dflags;
8896    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8897              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8898    const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8899      TRUE);
8900    register pcre_uchar op = *scode;
8901 
8902    switch(op)
8903      {
8904      default:
8905      return 0;
8906 
8907      case OP_BRA:
8908      case OP_BRAPOS:
8909      case OP_CBRA:
8910      case OP_SCBRA:
8911      case OP_CBRAPOS:
8912      case OP_SCBRAPOS:
8913      case OP_ASSERT:
8914      case OP_ONCE:
8915      case OP_ONCE_NC:
8916      d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8917      if (dflags < 0)
8918        return 0;
8919      if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
8920      break;
8921 
8922      case OP_EXACT:
8923      scode += IMM2_SIZE;
8924      /* Fall through */
8925 
8926      case OP_CHAR:
8927      case OP_PLUS:
8928      case OP_MINPLUS:
8929      case OP_POSPLUS:
8930      if (!inassert) return 0;
8931      if (cflags < 0) { c = scode[1]; cflags = 0; }
8932        else if (c != scode[1]) return 0;
8933      break;
8934 
8935      case OP_EXACTI:
8936      scode += IMM2_SIZE;
8937      /* Fall through */
8938 
8939      case OP_CHARI:
8940      case OP_PLUSI:
8941      case OP_MINPLUSI:
8942      case OP_POSPLUSI:
8943      if (!inassert) return 0;
8944      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8945        else if (c != scode[1]) return 0;
8946      break;
8947      }
8948 
8949    code += GET(code, 1);
8950    }
8951 while (*code == OP_ALT);
8952 
8953 *flags = cflags;
8954 return c;
8955 }
8956 
8957 
8958 
8959 /*************************************************
8960 *     Add an entry to the name/number table      *
8961 *************************************************/
8962 
8963 /* This function is called between compiling passes to add an entry to the
8964 name/number table, maintaining alphabetical order. Checking for permitted
8965 and forbidden duplicates has already been done.
8966 
8967 Arguments:
8968   cd           the compile data block
8969   name         the name to add
8970   length       the length of the name
8971   groupno      the group number
8972 
8973 Returns:       nothing
8974 */
8975 
8976 static void
add_name(compile_data * cd,const pcre_uchar * name,int length,unsigned int groupno)8977 add_name(compile_data *cd, const pcre_uchar *name, int length,
8978   unsigned int groupno)
8979 {
8980 int i;
8981 pcre_uchar *slot = cd->name_table;
8982 
8983 for (i = 0; i < cd->names_found; i++)
8984   {
8985   int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8986   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8987     crc = -1; /* Current name is a substring */
8988 
8989   /* Make space in the table and break the loop for an earlier name. For a
8990   duplicate or later name, carry on. We do this for duplicates so that in the
8991   simple case (when ?(| is not used) they are in order of their numbers. In all
8992   cases they are in the order in which they appear in the pattern. */
8993 
8994   if (crc < 0)
8995     {
8996     memmove(slot + cd->name_entry_size, slot,
8997       IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8998     break;
8999     }
9000 
9001   /* Continue the loop for a later or duplicate name */
9002 
9003   slot += cd->name_entry_size;
9004   }
9005 
9006 PUT2(slot, 0, groupno);
9007 memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
9008 slot[IMM2_SIZE + length] = 0;
9009 cd->names_found++;
9010 }
9011 
9012 
9013 
/*************************************************
*        Compile a Regular Expression            *
*************************************************/

/* This function takes a pattern string and returns a pointer to a block of
store holding its compiled form. It is the original API, which has no error
code return variable, and is retained for backwards compatibility. All it does
is call the newer function (which has a new name) with a NULL error code
pointer. One definition is given for each supported code unit width.

Arguments:
  pattern       the regular expression
  options       various option bits
  errorcodeptr  pointer to error code variable (pcre_compile2() only)
                  can be NULL if you don't want a code value
  errorptr      pointer to pointer to error text
  erroroffset   ptr offset in pattern where error was detected
  tables        pointer to character tables or NULL

Returns:        pointer to compiled data block, or NULL on error,
                with errorptr and erroroffset set
*/

#if defined COMPILE_PCRE8

PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE16

PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE32

PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#endif
9058 
9059 
9060 #if defined COMPILE_PCRE8
9061 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile2(const char * pattern,int options,int * errorcodeptr,const char ** errorptr,int * erroroffset,const unsigned char * tables)9062 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
9063   const char **errorptr, int *erroroffset, const unsigned char *tables)
9064 #elif defined COMPILE_PCRE16
9065 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
9066 pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
9067   const char **errorptr, int *erroroffset, const unsigned char *tables)
9068 #elif defined COMPILE_PCRE32
9069 PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
9070 pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
9071   const char **errorptr, int *erroroffset, const unsigned char *tables)
9072 #endif
9073 {
9074 REAL_PCRE *re;
9075 int length = 1;  /* For final END opcode */
9076 pcre_int32 firstcharflags, reqcharflags;
9077 pcre_uint32 firstchar, reqchar;
9078 pcre_uint32 limit_match = PCRE_UINT32_MAX;
9079 pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
9080 int newline;
9081 int errorcode = 0;
9082 int skipatstart = 0;
9083 BOOL utf;
9084 BOOL never_utf = FALSE;
9085 size_t size;
9086 pcre_uchar *code;
9087 const pcre_uchar *codestart;
9088 const pcre_uchar *ptr;
9089 compile_data compile_block;
9090 compile_data *cd = &compile_block;
9091 
9092 /* This space is used for "compiling" into during the first phase, when we are
9093 computing the amount of memory that is needed. Compiled items are thrown away
9094 as soon as possible, so that a fairly large buffer should be sufficient for
9095 this purpose. The same space is used in the second phase for remembering where
9096 to fill in forward references to subpatterns. That may overflow, in which case
9097 new memory is obtained from malloc(). */
9098 
9099 pcre_uchar cworkspace[COMPILE_WORK_SIZE];
9100 
9101 /* This vector is used for remembering name groups during the pre-compile. In a
9102 similar way to cworkspace, it can be expanded using malloc() if necessary. */
9103 
9104 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9105 cd->named_groups = named_groups;
9106 cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9107 
9108 /* Set this early so that early errors get offset 0. */
9109 
9110 ptr = (const pcre_uchar *)pattern;
9111 
9112 /* We can't pass back an error message if errorptr is NULL; I guess the best we
9113 can do is just return NULL, but we can set a code value if there is a code
9114 pointer. */
9115 
9116 if (errorptr == NULL)
9117   {
9118   if (errorcodeptr != NULL) *errorcodeptr = 99;
9119   return NULL;
9120   }
9121 
9122 *errorptr = NULL;
9123 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
9124 
9125 /* However, we can give a message for this error */
9126 
9127 if (erroroffset == NULL)
9128   {
9129   errorcode = ERR16;
9130   goto PCRE_EARLY_ERROR_RETURN2;
9131   }
9132 
9133 *erroroffset = 0;
9134 
9135 /* Set up pointers to the individual character tables */
9136 
9137 if (tables == NULL) tables = PRIV(default_tables);
9138 cd->lcc = tables + lcc_offset;
9139 cd->fcc = tables + fcc_offset;
9140 cd->cbits = tables + cbits_offset;
9141 cd->ctypes = tables + ctypes_offset;
9142 
9143 /* Check that all undefined public option bits are zero */
9144 
9145 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
9146   {
9147   errorcode = ERR17;
9148   goto PCRE_EARLY_ERROR_RETURN;
9149   }
9150 
9151 /* If PCRE_NEVER_UTF is set, remember it. */
9152 
9153 if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
9154 
9155 /* Check for global one-time settings at the start of the pattern, and remember
9156 the offset for later. */
9157 
9158 cd->external_flags = 0;   /* Initialize here for LIMIT_MATCH/RECURSION */
9159 
9160 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9161        ptr[skipatstart+1] == CHAR_ASTERISK)
9162   {
9163   int newnl = 0;
9164   int newbsr = 0;
9165 
9166 /* For completeness and backward compatibility, (*UTFn) is supported in the
9167 relevant libraries, but (*UTF) is generic and always supported. Note that
9168 PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
9169 
9170 #ifdef COMPILE_PCRE8
9171   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
9172     { skipatstart += 7; options |= PCRE_UTF8; continue; }
9173 #endif
9174 #ifdef COMPILE_PCRE16
9175   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
9176     { skipatstart += 8; options |= PCRE_UTF16; continue; }
9177 #endif
9178 #ifdef COMPILE_PCRE32
9179   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
9180     { skipatstart += 8; options |= PCRE_UTF32; continue; }
9181 #endif
9182 
9183   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
9184     { skipatstart += 6; options |= PCRE_UTF8; continue; }
9185   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
9186     { skipatstart += 6; options |= PCRE_UCP; continue; }
9187   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
9188     { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
9189   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
9190     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
9191 
9192   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
9193     {
9194     pcre_uint32 c = 0;
9195     int p = skipatstart + 14;
9196     while (isdigit(ptr[p]))
9197       {
9198       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow */
9199       c = c*10 + ptr[p++] - CHAR_0;
9200       }
9201     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9202     if (c < limit_match)
9203       {
9204       limit_match = c;
9205       cd->external_flags |= PCRE_MLSET;
9206       }
9207     skipatstart = p;
9208     continue;
9209     }
9210 
9211   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
9212     {
9213     pcre_uint32 c = 0;
9214     int p = skipatstart + 18;
9215     while (isdigit(ptr[p]))
9216       {
9217       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow check */
9218       c = c*10 + ptr[p++] - CHAR_0;
9219       }
9220     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9221     if (c < limit_recursion)
9222       {
9223       limit_recursion = c;
9224       cd->external_flags |= PCRE_RLSET;
9225       }
9226     skipatstart = p;
9227     continue;
9228     }
9229 
9230   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
9231     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
9232   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
9233     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
9234   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
9235     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9236   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
9237     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
9238   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
9239     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
9240 
9241   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
9242     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
9243   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
9244     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
9245 
9246   if (newnl != 0)
9247     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
9248   else if (newbsr != 0)
9249     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
9250   else break;
9251   }
9252 
9253 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
9254 utf = (options & PCRE_UTF8) != 0;
9255 if (utf && never_utf)
9256   {
9257   errorcode = ERR78;
9258   goto PCRE_EARLY_ERROR_RETURN2;
9259   }
9260 
9261 /* Can't support UTF unless PCRE has been compiled to include the code. The
9262 return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9263 release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9264 not used here. */
9265 
9266 #ifdef SUPPORT_UTF
9267 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
9268      (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
9269   {
9270 #if defined COMPILE_PCRE8
9271   errorcode = ERR44;
9272 #elif defined COMPILE_PCRE16
9273   errorcode = ERR74;
9274 #elif defined COMPILE_PCRE32
9275   errorcode = ERR77;
9276 #endif
9277   goto PCRE_EARLY_ERROR_RETURN2;
9278   }
9279 #else
9280 if (utf)
9281   {
9282   errorcode = ERR32;
9283   goto PCRE_EARLY_ERROR_RETURN;
9284   }
9285 #endif
9286 
9287 /* Can't support UCP unless PCRE has been compiled to include the code. */
9288 
9289 #ifndef SUPPORT_UCP
9290 if ((options & PCRE_UCP) != 0)
9291   {
9292   errorcode = ERR67;
9293   goto PCRE_EARLY_ERROR_RETURN;
9294   }
9295 #endif
9296 
9297 /* Check validity of \R options. */
9298 
9299 if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9300      (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9301   {
9302   errorcode = ERR56;
9303   goto PCRE_EARLY_ERROR_RETURN;
9304   }
9305 
9306 /* Handle different types of newline. The three bits give seven cases. The
9307 current code allows for fixed one- or two-byte sequences, plus "any" and
9308 "anycrlf". */
9309 
9310 switch (options & PCRE_NEWLINE_BITS)
9311   {
9312   case 0: newline = NEWLINE; break;   /* Build-time default */
9313   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9314   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9315   case PCRE_NEWLINE_CR+
9316        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9317   case PCRE_NEWLINE_ANY: newline = -1; break;
9318   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9319   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9320   }
9321 
9322 if (newline == -2)
9323   {
9324   cd->nltype = NLTYPE_ANYCRLF;
9325   }
9326 else if (newline < 0)
9327   {
9328   cd->nltype = NLTYPE_ANY;
9329   }
9330 else
9331   {
9332   cd->nltype = NLTYPE_FIXED;
9333   if (newline > 255)
9334     {
9335     cd->nllen = 2;
9336     cd->nl[0] = (newline >> 8) & 255;
9337     cd->nl[1] = newline & 255;
9338     }
9339   else
9340     {
9341     cd->nllen = 1;
9342     cd->nl[0] = newline;
9343     }
9344   }
9345 
9346 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9347 references to help in deciding whether (.*) can be treated as anchored or not.
9348 */
9349 
9350 cd->top_backref = 0;
9351 cd->backref_map = 0;
9352 
9353 /* Reflect pattern for debugging output */
9354 
9355 DPRINTF(("------------------------------------------------------------------\n"));
9356 #ifdef PCRE_DEBUG
9357 print_puchar(stdout, (PCRE_PUCHAR)pattern);
9358 #endif
9359 DPRINTF(("\n"));
9360 
9361 /* Pretend to compile the pattern while actually just accumulating the length
9362 of memory required. This behaviour is triggered by passing a non-NULL final
9363 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9364 to compile parts of the pattern into; the compiled code is discarded when it is
9365 no longer needed, so hopefully this workspace will never overflow, though there
9366 is a test for its doing so. */
9367 
9368 cd->bracount = cd->final_bracount = 0;
9369 cd->names_found = 0;
9370 cd->name_entry_size = 0;
9371 cd->name_table = NULL;
9372 cd->dupnames = FALSE;
9373 cd->dupgroups = FALSE;
9374 cd->namedrefcount = 0;
9375 cd->start_code = cworkspace;
9376 cd->hwm = cworkspace;
9377 cd->iscondassert = FALSE;
9378 cd->start_workspace = cworkspace;
9379 cd->workspace_size = COMPILE_WORK_SIZE;
9380 cd->start_pattern = (const pcre_uchar *)pattern;
9381 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9382 cd->req_varyopt = 0;
9383 cd->parens_depth = 0;
9384 cd->assert_depth = 0;
9385 cd->max_lookbehind = 0;
9386 cd->external_options = options;
9387 cd->open_caps = NULL;
9388 
9389 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9390 don't need to look at the result of the function here. The initial options have
9391 been put into the cd block so that they can be changed if an option setting is
9392 found within the regex right at the beginning. Bringing initial option settings
9393 outside can help speed up starting point checks. */
9394 
9395 ptr += skipatstart;
9396 code = cworkspace;
9397 *code = OP_BRA;
9398 
9399 (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9400   FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9401   cd, &length);
9402 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9403 
9404 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9405   (int)(cd->hwm - cworkspace)));
9406 
9407 if (length > MAX_PATTERN_SIZE)
9408   {
9409   errorcode = ERR20;
9410   goto PCRE_EARLY_ERROR_RETURN;
9411   }
9412 
9413 /* Compute the size of the data block for storing the compiled pattern. Integer
9414 overflow should no longer be possible because nowadays we limit the maximum
9415 value of cd->names_found and cd->name_entry_size. */
9416 
9417 size = sizeof(REAL_PCRE) +
9418   (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9419 
9420 /* Get the memory. */
9421 
9422 re = (REAL_PCRE *)(PUBL(malloc))(size);
9423 if (re == NULL)
9424   {
9425   errorcode = ERR21;
9426   goto PCRE_EARLY_ERROR_RETURN;
9427   }
9428 
9429 /* Put in the magic number, and save the sizes, initial options, internal
9430 flags, and character table pointer. NULL is used for the default character
9431 tables. The nullpad field is at the end; it's there to help in the case when a
9432 regex compiled on a system with 4-byte pointers is run on another with 8-byte
9433 pointers. */
9434 
9435 re->magic_number = MAGIC_NUMBER;
9436 re->size = (int)size;
9437 re->options = cd->external_options;
9438 re->flags = cd->external_flags;
9439 re->limit_match = limit_match;
9440 re->limit_recursion = limit_recursion;
9441 re->first_char = 0;
9442 re->req_char = 0;
9443 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9444 re->name_entry_size = cd->name_entry_size;
9445 re->name_count = cd->names_found;
9446 re->ref_count = 0;
9447 re->tables = (tables == PRIV(default_tables))? NULL : tables;
9448 re->nullpad = NULL;
9449 #ifdef COMPILE_PCRE32
9450 re->dummy = 0;
9451 #else
9452 re->dummy1 = re->dummy2 = re->dummy3 = 0;
9453 #endif
9454 
9455 /* The starting points of the name/number translation table and of the code are
9456 passed around in the compile data block. The start/end pattern and initial
9457 options are already set from the pre-compile phase, as is the name_entry_size
9458 field. Reset the bracket count and the names_found field. Also reset the hwm
9459 field; this time it's used for remembering forward references to subpatterns.
9460 */
9461 
9462 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9463 cd->parens_depth = 0;
9464 cd->assert_depth = 0;
9465 cd->bracount = 0;
9466 cd->max_lookbehind = 0;
9467 cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9468 codestart = cd->name_table + re->name_entry_size * re->name_count;
9469 cd->start_code = codestart;
9470 cd->hwm = (pcre_uchar *)(cd->start_workspace);
9471 cd->iscondassert = FALSE;
9472 cd->req_varyopt = 0;
9473 cd->had_accept = FALSE;
9474 cd->had_pruneorskip = FALSE;
9475 cd->check_lookbehind = FALSE;
9476 cd->open_caps = NULL;
9477 
9478 /* If any named groups were found, create the name/number table from the list
9479 created in the first pass. */
9480 
9481 if (cd->names_found > 0)
9482   {
9483   int i = cd->names_found;
9484   named_group *ng = cd->named_groups;
9485   cd->names_found = 0;
9486   for (; i > 0; i--, ng++)
9487     add_name(cd, ng->name, ng->length, ng->number);
9488   if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9489     (PUBL(free))((void *)cd->named_groups);
9490   cd->named_group_list_size = 0;   /* So we don't free it twice */
9491   }
9492 
9493 /* Set up a starting, non-extracting bracket, then compile the expression. On
9494 error, errorcode will be set non-zero, so we don't need to look at the result
9495 of the function here. */
9496 
9497 ptr = (const pcre_uchar *)pattern + skipatstart;
9498 code = (pcre_uchar *)codestart;
9499 *code = OP_BRA;
9500 (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9501   &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9502 re->top_bracket = cd->bracount;
9503 re->top_backref = cd->top_backref;
9504 re->max_lookbehind = cd->max_lookbehind;
9505 re->flags = cd->external_flags | PCRE_MODE;
9506 
9507 if (cd->had_accept)
9508   {
9509   reqchar = 0;              /* Must disable after (*ACCEPT) */
9510   reqcharflags = REQ_NONE;
9511   }
9512 
9513 /* If not reached end of pattern on success, there's an excess bracket. */
9514 
9515 if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
9516 
9517 /* Fill in the terminating state and check for disastrous overflow, but
9518 if debugging, leave the test till after things are printed out. */
9519 
9520 *code++ = OP_END;
9521 
9522 #ifndef PCRE_DEBUG
9523 if (code - codestart > length) errorcode = ERR23;
9524 #endif
9525 
9526 #ifdef SUPPORT_VALGRIND
9527 /* If the estimated length exceeds the really used length, mark the extra
9528 allocated memory as unaddressable, so that any out-of-bound reads can be
9529 detected. */
9530 VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9531 #endif
9532 
9533 /* Fill in any forward references that are required. There may be repeated
9534 references; optimize for them, as searching a large regex takes time. */
9535 
9536 if (cd->hwm > cd->start_workspace)
9537   {
9538   int prev_recno = -1;
9539   const pcre_uchar *groupptr = NULL;
9540   while (errorcode == 0 && cd->hwm > cd->start_workspace)
9541     {
9542     int offset, recno;
9543     cd->hwm -= LINK_SIZE;
9544     offset = GET(cd->hwm, 0);
9545 
9546     /* Check that the hwm handling hasn't gone wrong. This whole area is
9547     rewritten in PCRE2 because there are some obscure cases. */
9548 
9549     if (offset == 0 || codestart[offset-1] != OP_RECURSE)
9550       {
9551       errorcode = ERR10;
9552       break;
9553       }
9554 
9555     recno = GET(codestart, offset);
9556     if (recno != prev_recno)
9557       {
9558       groupptr = PRIV(find_bracket)(codestart, utf, recno);
9559       prev_recno = recno;
9560       }
9561     if (groupptr == NULL) errorcode = ERR53;
9562       else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9563     }
9564   }
9565 
9566 /* If the workspace had to be expanded, free the new memory. Set the pointer to
9567 NULL to indicate that forward references have been filled in. */
9568 
9569 if (cd->workspace_size > COMPILE_WORK_SIZE)
9570   (PUBL(free))((void *)cd->start_workspace);
9571 cd->start_workspace = NULL;
9572 
9573 /* Give an error if there's a back reference to a non-existent capturing
9574 subpattern. */
9575 
9576 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9577 
9578 /* Unless disabled, check whether any single character iterators can be
9579 auto-possessified. The function overwrites the appropriate opcode values, so
9580 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9581 used in this code because at least one compiler gives a warning about loss of
9582 "const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9583 function call. */
9584 
9585 if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
9586   {
9587   pcre_uchar *temp = (pcre_uchar *)codestart;
9588   auto_possessify(temp, utf, cd);
9589   }
9590 
9591 /* If there were any lookbehind assertions that contained OP_RECURSE
9592 (recursions or subroutine calls), a flag is set for them to be checked here,
9593 because they may contain forward references. Actual recursions cannot be fixed
9594 length, but subroutine calls can. It is done like this so that those without
9595 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
9596 exceptional ones forgo this. We scan the pattern to check that they are fixed
9597 length, and set their lengths. */
9598 
9599 if (errorcode == 0 && cd->check_lookbehind)
9600   {
9601   pcre_uchar *cc = (pcre_uchar *)codestart;
9602 
9603   /* Loop, searching for OP_REVERSE items, and process those that do not have
9604   their length set. (Actually, it will also re-process any that have a length
9605   of zero, but that is a pathological case, and it does no harm.) When we find
9606   one, we temporarily terminate the branch it is in while we scan it. */
9607 
9608   for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9609        cc != NULL;
9610        cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9611     {
9612     if (GET(cc, 1) == 0)
9613       {
9614       int fixed_length;
9615       pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9616       int end_op = *be;
9617       *be = OP_END;
9618       fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9619         cd, NULL);
9620       *be = end_op;
9621       DPRINTF(("fixed length = %d\n", fixed_length));
9622       if (fixed_length < 0)
9623         {
9624         errorcode = (fixed_length == -2)? ERR36 :
9625                     (fixed_length == -4)? ERR70 : ERR25;
9626         break;
9627         }
9628       if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9629       PUT(cc, 1, fixed_length);
9630       }
9631     cc += 1 + LINK_SIZE;
9632     }
9633   }
9634 
9635 /* Failed to compile, or error while post-processing */
9636 
9637 if (errorcode != 0)
9638   {
9639   (PUBL(free))(re);
9640   PCRE_EARLY_ERROR_RETURN:
9641   if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9642     (PUBL(free))((void *)cd->named_groups);
9643   *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9644   PCRE_EARLY_ERROR_RETURN2:
9645   *errorptr = find_error_text(errorcode);
9646   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9647   return NULL;
9648   }
9649 
9650 /* If the anchored option was not passed, set the flag if we can determine that
9651 the pattern is anchored by virtue of ^ characters or \A or anything else, such
9652 as starting with non-atomic .* when DOTALL is set and there are no occurrences
9653 of *PRUNE or *SKIP.
9654 
9655 Otherwise, if we know what the first byte has to be, save it, because that
9656 speeds up unanchored matches no end. If not, see if we can set the
9657 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9658 start with ^, and also when all branches start with non-atomic .* for
9659 non-DOTALL matches when *PRUNE and *SKIP are not present. */
9660 
9661 if ((re->options & PCRE_ANCHORED) == 0)
9662   {
9663   if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9664   else
9665     {
9666     if (firstcharflags < 0)
9667       firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9668     if (firstcharflags >= 0)   /* Remove caseless flag for non-caseable chars */
9669       {
9670 #if defined COMPILE_PCRE8
9671       re->first_char = firstchar & 0xff;
9672 #elif defined COMPILE_PCRE16
9673       re->first_char = firstchar & 0xffff;
9674 #elif defined COMPILE_PCRE32
9675       re->first_char = firstchar;
9676 #endif
9677       if ((firstcharflags & REQ_CASELESS) != 0)
9678         {
9679 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9680         /* We ignore non-ASCII first chars in 8 bit mode. */
9681         if (utf)
9682           {
9683           if (re->first_char < 128)
9684             {
9685             if (cd->fcc[re->first_char] != re->first_char)
9686               re->flags |= PCRE_FCH_CASELESS;
9687             }
9688           else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9689             re->flags |= PCRE_FCH_CASELESS;
9690           }
9691         else
9692 #endif
9693         if (MAX_255(re->first_char)
9694             && cd->fcc[re->first_char] != re->first_char)
9695           re->flags |= PCRE_FCH_CASELESS;
9696         }
9697 
9698       re->flags |= PCRE_FIRSTSET;
9699       }
9700 
9701     else if (is_startline(codestart, 0, cd, 0, FALSE)) re->flags |= PCRE_STARTLINE;
9702     }
9703   }
9704 
9705 /* For an anchored pattern, we use the "required byte" only if it follows a
9706 variable length item in the regex. Remove the caseless flag for non-caseable
9707 bytes. */
9708 
9709 if (reqcharflags >= 0 &&
9710      ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9711   {
9712 #if defined COMPILE_PCRE8
9713   re->req_char = reqchar & 0xff;
9714 #elif defined COMPILE_PCRE16
9715   re->req_char = reqchar & 0xffff;
9716 #elif defined COMPILE_PCRE32
9717   re->req_char = reqchar;
9718 #endif
9719   if ((reqcharflags & REQ_CASELESS) != 0)
9720     {
9721 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9722     /* We ignore non-ASCII first chars in 8 bit mode. */
9723     if (utf)
9724       {
9725       if (re->req_char < 128)
9726         {
9727         if (cd->fcc[re->req_char] != re->req_char)
9728           re->flags |= PCRE_RCH_CASELESS;
9729         }
9730       else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9731         re->flags |= PCRE_RCH_CASELESS;
9732       }
9733     else
9734 #endif
9735     if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9736       re->flags |= PCRE_RCH_CASELESS;
9737     }
9738 
9739   re->flags |= PCRE_REQCHSET;
9740   }
9741 
9742 /* Print out the compiled data if debugging is enabled. This is never the
9743 case when building a production library. */
9744 
9745 #ifdef PCRE_DEBUG
9746 printf("Length = %d top_bracket = %d top_backref = %d\n",
9747   length, re->top_bracket, re->top_backref);
9748 
9749 printf("Options=%08x\n", re->options);
9750 
9751 if ((re->flags & PCRE_FIRSTSET) != 0)
9752   {
9753   pcre_uchar ch = re->first_char;
9754   const char *caseless =
9755     ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9756   if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9757     else printf("First char = \\x%02x%s\n", ch, caseless);
9758   }
9759 
9760 if ((re->flags & PCRE_REQCHSET) != 0)
9761   {
9762   pcre_uchar ch = re->req_char;
9763   const char *caseless =
9764     ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9765   if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9766     else printf("Req char = \\x%02x%s\n", ch, caseless);
9767   }
9768 
9769 #if defined COMPILE_PCRE8
9770 pcre_printint((pcre *)re, stdout, TRUE);
9771 #elif defined COMPILE_PCRE16
9772 pcre16_printint((pcre *)re, stdout, TRUE);
9773 #elif defined COMPILE_PCRE32
9774 pcre32_printint((pcre *)re, stdout, TRUE);
9775 #endif
9776 
9777 /* This check is done here in the debugging case so that the code that
9778 was compiled can be seen. */
9779 
9780 if (code - codestart > length)
9781   {
9782   (PUBL(free))(re);
9783   *errorptr = find_error_text(ERR23);
9784   *erroroffset = ptr - (pcre_uchar *)pattern;
9785   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9786   return NULL;
9787   }
9788 #endif   /* PCRE_DEBUG */
9789 
9790 /* Check for a pattern that can match an empty string, so that this information
9791 can be provided to applications. */
9792 
9793 do
9794   {
9795   if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9796     {
9797     re->flags |= PCRE_MATCH_EMPTY;
9798     break;
9799     }
9800   codestart += GET(codestart, 1);
9801   }
9802 while (*codestart == OP_ALT);
9803 
9804 #if defined COMPILE_PCRE8
9805 return (pcre *)re;
9806 #elif defined COMPILE_PCRE16
9807 return (pcre16 *)re;
9808 #elif defined COMPILE_PCRE32
9809 return (pcre32 *)re;
9810 #endif
9811 }
9812 
9813 /* End of pcre_compile.c */
9814