1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9            Copyright (c) 1997-2020 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17 
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21 
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43 
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 #define NLBLOCK cd             /* Block containing newline information */
50 #define PSSTART start_pattern  /* Field containing pattern start */
51 #define PSEND   end_pattern    /* Field containing pattern end */
52 
53 #include "pcre_internal.h"
54 
55 
56 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57 is also used by pcretest. PCRE_DEBUG is not defined when building a production
58 library. We do not need to select pcre16_printint.c specially, because the
59 COMPILE_PCREx macro will already be appropriately set. */
60 
61 #ifdef PCRE_DEBUG
62 /* pcre_printint.c should not include any headers */
63 #define PCRE_INCLUDED
64 #include "pcre_printint.c"
65 #undef PCRE_INCLUDED
66 #endif
67 
68 
/* Macro for setting individual bits in class bitmaps. The bitmap a is indexed
in units of eight bits: bit b lives in element b/8 at position b&7. The
argument a and the whole expansion are parenthesized so that the macro is safe
when a is a pointer expression (for example SETBIT(map + n, c)) and when the
macro is used inside a larger expression. Note that b is evaluated twice, so
it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1U << ((b)&7)))
72 
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. It is deliberately a
little less than INT_MAX (by 20) to leave room for adding in group terminating
bytes, so that those additions do not have to be checked individually. */

#define OFLOW_MAX (INT_MAX - 20)
79 
/* Forward declarations to allow mutual recursion */
81 
82 static int
83   add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84     const pcre_uint32 *, unsigned int);
85 
86 static BOOL
87   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88     pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89     compile_data *, int *);
90 
91 
92 
93 /*************************************************
94 *      Code parameters and static tables         *
95 *************************************************/
96 
97 /* This value specifies the size of stack workspace that is used during the
98 first pre-compile phase that determines how much memory is required. The regex
99 is partly compiled into this space, but the compiled parts are discarded as
100 soon as they can be, so that hopefully there will never be an overrun. The code
101 does, however, check for an overrun. The largest amount I've seen used is 218,
102 so this number is very generous.
103 
104 The same workspace is used during the second, actual compile phase for
105 remembering forward references to groups so that they can be filled in at the
106 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107 is 4 there is plenty of room for most patterns. However, the memory can get
108 filled up by repetitions of forward references, for example patterns like
109 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110 that the workspace is expanded using malloc() in this situation. The value
111 below is therefore a minimum, and we put a maximum on it for safety. The
112 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113 kicks in at the same number of forward references in all cases. */
114 
/* Minimum workspace size (scaled by LINK_SIZE) and the upper limit beyond
which the workspace is never expanded. */

#define COMPILE_WORK_SIZE     ((LINK_SIZE)*2048)
#define COMPILE_WORK_SIZE_MAX ((COMPILE_WORK_SIZE)*100)

/* Number of slots in the initial vector that remembers named groups during
the pre-compile. The vector starts on the stack; if it turns out to be too
small it is expanded using malloc(), in the same way as the workspace. */

#define NAMED_GROUP_LIST_SIZE 20

/* The overrun tests compare against a size reduced by this margin, so that an
overrun is detected before the code actually runs off the end of the data
block. */

#define WORK_SIZE_SAFETY_MARGIN 100
129 
/* Private flags added to firstchar and reqchar. */

#define REQ_CASELESS    (1U << 0)        /* Indicates caselessness */
#define REQ_VARY        (1U << 1)        /* Reqchar followed non-literal item */

/* Negative values for the firstchar and reqchar flags */

#define REQ_UNSET       (-2)
#define REQ_NONE        (-1)

/* Repeated character flags. The long suffix is written in upper case: a
lower-case "l" is easily misread as the digit 1, which would make the constant
look like 0x100000001. */

#define UTF_LENGTH     0x10000000L      /* The char contains its length. */
141 
142 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
143 are simple data values; negative values are for special things like \d and so
144 on. Zero means further processing is needed (for things like \x), or the escape
145 is invalid. */
146 
147 #ifndef EBCDIC
148 
149 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
150 in UTF-8 mode. */
151 
152 static const short int escapes[] = {
153      0,                       0,
154      0,                       0,
155      0,                       0,
156      0,                       0,
157      0,                       0,
158      CHAR_COLON,              CHAR_SEMICOLON,
159      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
160      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
161      CHAR_COMMERCIAL_AT,      -ESC_A,
162      -ESC_B,                  -ESC_C,
163      -ESC_D,                  -ESC_E,
164      0,                       -ESC_G,
165      -ESC_H,                  0,
166      0,                       -ESC_K,
167      0,                       0,
168      -ESC_N,                  0,
169      -ESC_P,                  -ESC_Q,
170      -ESC_R,                  -ESC_S,
171      0,                       0,
172      -ESC_V,                  -ESC_W,
173      -ESC_X,                  0,
174      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
175      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
176      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
177      CHAR_GRAVE_ACCENT,       ESC_a,
178      -ESC_b,                  0,
179      -ESC_d,                  ESC_e,
180      ESC_f,                   0,
181      -ESC_h,                  0,
182      0,                       -ESC_k,
183      0,                       0,
184      ESC_n,                   0,
185      -ESC_p,                  0,
186      ESC_r,                   -ESC_s,
187      ESC_tee,                 0,
188      -ESC_v,                  -ESC_w,
189      0,                       0,
190      -ESC_z
191 };
192 
193 #else
194 
195 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196 
/* Each row holds eight entries. The label at the start of a row is written in
hexadecimal; the first row is labelled 48, so entry 0 presumably corresponds
to code point 0x48 - TODO confirm against the code that indexes this table. */

static const short int escapes[] = {
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
/*  80 */     0, ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
/*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
/*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
};
222 
/* We also need a table of characters that may follow \c in an EBCDIC
environment for characters 0-31. The string holds 32 characters, one for each
of those values. The table is only ever read, so it is declared const like
the other file-scope tables in this module. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
227 
228 #endif
229 
230 
231 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
232 searched linearly. Put all the names into a single string, in order to reduce
233 the number of relocations when a shared library is dynamically linked. The
234 string is built from string macros so that it works in UTF-8 mode on EBCDIC
235 platforms. */
236 
237 typedef struct verbitem {
238   int   len;                 /* Length of verb name */
239   int   op;                  /* Op when no arg, or -1 if arg mandatory */
240   int   op_arg;              /* Op when arg present, or -1 if not allowed */
241 } verbitem;
242 
243 static const char verbnames[] =
244   "\0"                       /* Empty name is a shorthand for MARK */
245   STRING_MARK0
246   STRING_ACCEPT0
247   STRING_COMMIT0
248   STRING_F0
249   STRING_FAIL0
250   STRING_PRUNE0
251   STRING_SKIP0
252   STRING_THEN;
253 
254 static const verbitem verbs[] = {
255   { 0, -1,        OP_MARK },
256   { 4, -1,        OP_MARK },
257   { 6, OP_ACCEPT, -1 },
258   { 6, OP_COMMIT, -1 },
259   { 1, OP_FAIL,   -1 },
260   { 4, OP_FAIL,   -1 },
261   { 5, OP_PRUNE,  OP_PRUNE_ARG },
262   { 4, OP_SKIP,   OP_SKIP_ARG  },
263   { 4, OP_THEN,   OP_THEN_ARG  }
264 };
265 
266 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
267 
268 
269 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
270 another regex library. */
271 
272 static const pcre_uchar sub_start_of_word[] = {
273   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
274   CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
275 
276 static const pcre_uchar sub_end_of_word[] = {
277   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
278   CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
279   CHAR_RIGHT_PARENTHESIS, '\0' };
280 
281 
282 /* Tables of names of POSIX character classes and their lengths. The names are
283 now all in a single string, to reduce the number of relocations when a shared
284 library is dynamically loaded. The list of lengths is terminated by a zero
285 length entry. The first three must be alpha, lower, upper, as this is assumed
286 for handling case independence. The indices for graph, print, and punct are
287 needed, so identify them. */
288 
289 static const char posix_names[] =
290   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
291   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
292   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
293   STRING_word0  STRING_xdigit;
294 
295 static const pcre_uint8 posix_name_lengths[] = {
296   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
297 
298 #define PC_GRAPH  8
299 #define PC_PRINT  9
300 #define PC_PUNCT 10
301 
302 
303 /* Table of class bit maps for each POSIX class. Each class is formed from a
304 base map, with an optional addition or removal of another map. Then, for some
305 classes, there is some additional tweaking: for [:blank:] the vertical space
306 characters are removed, and for [:alpha:] and [:alnum:] the underscore
307 character is removed. The triples in the table consist of the base map offset,
308 second map offset or -1 if no second map, and a non-negative value for map
309 addition or a negative value for map subtraction (if there are two maps). The
310 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
311 remove vertical space characters, 2 => remove underscore. */
312 
313 static const int posix_class_maps[] = {
314   cbit_word,  cbit_digit, -2,             /* alpha */
315   cbit_lower, -1,          0,             /* lower */
316   cbit_upper, -1,          0,             /* upper */
317   cbit_word,  -1,          2,             /* alnum - word without underscore */
318   cbit_print, cbit_cntrl,  0,             /* ascii */
319   cbit_space, -1,          1,             /* blank - a GNU extension */
320   cbit_cntrl, -1,          0,             /* cntrl */
321   cbit_digit, -1,          0,             /* digit */
322   cbit_graph, -1,          0,             /* graph */
323   cbit_print, -1,          0,             /* print */
324   cbit_punct, -1,          0,             /* punct */
325   cbit_space, -1,          0,             /* space */
326   cbit_word,  -1,          0,             /* word - a Perl extension */
327   cbit_xdigit,-1,          0              /* xdigit */
328 };
329 
330 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
331 Unicode property escapes. */
332 
333 #ifdef SUPPORT_UCP
334 static const pcre_uchar string_PNd[]  = {
335   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
336   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
337 static const pcre_uchar string_pNd[]  = {
338   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
339   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
340 static const pcre_uchar string_PXsp[] = {
341   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
342   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
343 static const pcre_uchar string_pXsp[] = {
344   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
345   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
346 static const pcre_uchar string_PXwd[] = {
347   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
348   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
349 static const pcre_uchar string_pXwd[] = {
350   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
351   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
352 
353 static const pcre_uchar *substitutes[] = {
354   string_PNd,           /* \D */
355   string_pNd,           /* \d */
356   string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
357   string_pXsp,          /* \s */   /* space and POSIX space are the same. */
358   string_PXwd,          /* \W */
359   string_pXwd           /* \w */
360 };
361 
362 /* The POSIX class substitutes must be in the order of the POSIX class names,
363 defined above, and there are both positive and negative cases. NULL means no
364 general substitute of a Unicode property escape (\p or \P). However, for some
365 POSIX classes (e.g. graph, print, punct) a special property code is compiled
366 directly. */
367 
368 static const pcre_uchar string_pL[] =   {
369   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
370   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
371 static const pcre_uchar string_pLl[] =  {
372   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
373   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
374 static const pcre_uchar string_pLu[] =  {
375   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
376   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
377 static const pcre_uchar string_pXan[] = {
378   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
379   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
380 static const pcre_uchar string_h[] =    {
381   CHAR_BACKSLASH, CHAR_h, '\0' };
382 static const pcre_uchar string_pXps[] = {
383   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
384   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
385 static const pcre_uchar string_PL[] =   {
386   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
387   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
388 static const pcre_uchar string_PLl[] =  {
389   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
390   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
391 static const pcre_uchar string_PLu[] =  {
392   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
393   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
394 static const pcre_uchar string_PXan[] = {
395   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
396   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
397 static const pcre_uchar string_H[] =    {
398   CHAR_BACKSLASH, CHAR_H, '\0' };
399 static const pcre_uchar string_PXps[] = {
400   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
401   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
402 
403 static const pcre_uchar *posix_substitutes[] = {
404   string_pL,            /* alpha */
405   string_pLl,           /* lower */
406   string_pLu,           /* upper */
407   string_pXan,          /* alnum */
408   NULL,                 /* ascii */
409   string_h,             /* blank */
410   NULL,                 /* cntrl */
411   string_pNd,           /* digit */
412   NULL,                 /* graph */
413   NULL,                 /* print */
414   NULL,                 /* punct */
415   string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
416   string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
417   NULL,                 /* xdigit */
418   /* Negated cases */
419   string_PL,            /* ^alpha */
420   string_PLl,           /* ^lower */
421   string_PLu,           /* ^upper */
422   string_PXan,          /* ^alnum */
423   NULL,                 /* ^ascii */
424   string_H,             /* ^blank */
425   NULL,                 /* ^cntrl */
426   string_PNd,           /* ^digit */
427   NULL,                 /* ^graph */
428   NULL,                 /* ^print */
429   NULL,                 /* ^punct */
430   string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
431   string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
432   NULL                  /* ^xdigit */
433 };
434 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
435 #endif
436 
/* STRING turns its argument into a string literal exactly as written; XSTRING
expands macros in its argument first and then stringizes the result. */

#define STRING(text)    #text
#define XSTRING(macro)  STRING(macro)
439 
440 /* The texts of compile-time error messages. These are "char *" because they
441 are passed to the outside world. Do not ever re-use any error number, because
442 they are documented. Always add a new error instead. Messages marked DEAD below
443 are no longer used. This used to be a table of strings, but in order to reduce
444 the number of relocations needed when a shared library is loaded dynamically,
445 it is now one long string. We cannot use a table of offsets, because the
446 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
447 simply count through to the one we want - this isn't a performance issue
448 because these strings are used only when there is a compilation error.
449 
450 Each substring ends with \0 to insert a null character. This includes the final
451 substring, so that the whole string ends with \0\0, which can be detected when
452 counting through. */
453 
/* The numeric comments below give the error number of the message that
follows them; a message's number is its position in the list, counting from
zero. Messages flagged DEAD are kept only so that the numbering of the later
messages does not change. */

static const char error_texts[] =
  /* 0 */
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "internal error: invalid forward reference offset\0"
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?( or (?(?C)\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is compiled without UTF support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{} or \\o{} is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  /* 60 */
  "(*VERB) not recognized or malformed\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */
  "different names for subpatterns of the same number are not allowed\0"
  "(*MARK) must have an argument\0"
  "this version of PCRE is not compiled with Unicode property support\0"
#ifndef EBCDIC
  "\\c must be followed by an ASCII character\0"
#else
  "\\c must be followed by a letter or one of [\\]^_?\0"
#endif
  "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  /* 70 */
  "internal error: unknown opcode in find_fixedlength()\0"
  "\\N is not supported in a class\0"
  "too many forward references\0"
  "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  "invalid UTF-16 string\0"
  /* 75 */
  "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
  "character value in \\u.... sequence is too large\0"
  "invalid UTF-32 string\0"
  "setting UTF is disabled by the application\0"
  "non-hex character in \\x{} (closing brace missing?)\0"
  /* 80 */
  "non-octal character in \\o{} (closing brace missing?)\0"
  "missing opening brace after \\o\0"
  "parentheses are too deeply nested\0"
  "invalid range in character class\0"
  "group name must start with a non-digit\0"
  /* 85 */
  "parentheses are too deeply nested (stack check)\0"
  "digits missing in \\x{} or \\o{}\0"
  "regular expression is too complicated\0"
  ;  /* The literal's own terminator supplies the second \0 of the final \0\0 */
565 
566 /* Table to identify digits and hex digits. This is used when compiling
567 patterns. Note that the tables in chartables are dependent on the locale, and
568 may mark arbitrary characters as digits - but the PCRE compiling code expects
569 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
570 a private table here. It costs 256 bytes, but it is a lot faster than doing
571 character value tests (at least in some simple cases I timed), and in some
572 applications one wants PCRE to compile efficiently as well as match
573 efficiently.
574 
575 For convenience, we use the same bit definitions as in chartables:
576 
577   0x04   decimal digit
578   0x08   hexadecimal digit
579 
580 Then we can use ctype_digit and ctype_xdigit in the code. */
581 
582 /* Using a simple comparison for decimal numbers rather than a memory read
583 is much faster, and the resulting code is simpler (the compiler turns it
584 into a subtraction and unsigned comparison). */
585 
/* The argument is evaluated twice, so it must not have side effects. */

#define IS_DIGIT(x) (CHAR_0 <= (x) && (x) <= CHAR_9)
587 
588 #ifndef EBCDIC
589 
590 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
591 UTF-8 mode. */
592 
/* One entry per 8-bit character value. 0x04 marks a decimal digit and 0x08 a
hexadecimal digit, so 0x0c (both bits) appears for 0-9 and 0x08 alone for A-F
and a-f. The comment on each row gives the range of characters it covers. */

static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
627 
628 #else
629 
630 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
631 
/* One entry per 8-bit character value. 0x04 marks a decimal digit and 0x08 a
hexadecimal digit, so 0x0c (both bits) appears for 0-9 and 0x08 alone for A-F
and a-f. The comment on each row gives the range of characters it covers; on
every second row it also gives the row's starting value in hexadecimal. */

static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
666 
/* One entry per 8-bit character value; the comment on each row gives the
range of characters it covers. NOTE(review): the entries presumably use the
same bit assignments as the ctypes section of chartables (of which 0x04 =
decimal digit and 0x08 = hexadecimal digit are documented in this file) -
confirm the meaning of the other bits against the chartables definitions. */

static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
700 #endif
701 
702 
703 /* This table is used to check whether auto-possessification is possible
704 between adjacent character-type opcodes. The left-hand (repeated) opcode is
705 used to select the row, and the right-hand opcode is used to select the column.
706 A value of 1 means that auto-possessification is OK. For example, the second
707 value in the first row means that \D+\d can be turned into \D++\d.
708 
709 The Unicode property types (\P and \p) have to be present to fill out the table
710 because of what their opcode values are, but the table values should always be
711 zero because property types are handled separately in the code. The last four
712 columns apply to items that cannot be repeated, so there is no need to have
713 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
714 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
715 
/* Table dimensions. Rows cover the opcodes from FIRST_AUTOTAB_OP up to and
including LAST_AUTOTAB_LEFT_OP (the items that can be the repeated, left-hand
opcode); columns cover FIRST_AUTOTAB_OP up to and including
LAST_AUTOTAB_RIGHT_OP (every item that can appear on the right). */

716 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
717 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
718 
/* Each row is labelled on the right with its left-hand opcode; the header line
inside the table labels the columns with the right-hand opcodes. The \P and \p
rows and columns are all zero. */

719 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
720 /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
721   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
722   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
723   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
724   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
725   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
726   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
727   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
728   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
729   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
730   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
731   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
732   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
733   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
734   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
735   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
736   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
737   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
738 };
739 
740 
741 /* This table is used to check whether auto-possessification is possible
742 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
743 left-hand (repeated) opcode is used to select the row, and the right-hand
744 opcode is used to select the column. The values are as follows:
745 
746   0   Always return FALSE (never auto-possessify)
747   1   Character groups are distinct (possessify if both are OP_PROP)
748   2   Check character categories in the same group (general or particular)
749   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
750 
751   4   Check left general category vs right particular category
752   5   Check right general category vs left particular category
753 
754   6   Left alphanum vs right general category
755   7   Left space vs right general category
756   8   Left word vs right general category
757 
758   9   Right alphanum vs left general category
759  10   Right space vs left general category
760  11   Right word vs left general category
761 
762  12   Left alphanum vs right particular category
763  13   Left space vs right particular category
764  14   Left word vs right particular category
765 
766  15   Right alphanum vs left particular category
767  16   Right space vs left particular category
768  17   Right word vs left particular category
769 */
770 
/* Square table, PT_TABSIZE entries each way. Rows (left-hand property type)
and columns (right-hand property type) appear in the same order, as labelled by
the header line and the per-row comments. The PT_SPACE and PT_PXSPACE rows hold
identical values, as do the SPACE and PXSPACE columns. */

771 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
772 /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
773   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
774   { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
775   { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
776   { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
777   { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
778   { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
779   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
780   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
781   { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
782   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
783   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
784 };
785 
786 /* This table is used to check whether auto-possessification is possible
787 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
788 specifies a general category and the other specifies a particular category. The
789 row is selected by the general category and the column by the particular
790 category. The value is 1 if the particular category is not part of the general
791 category. */
792 
/* Seven rows, one per general category (C, L, M, N, P, S, Z), by thirty
columns, one per particular category, in the order given by the header line.
In each row the zero entries are exactly the particular categories whose
two-letter name starts with that row's letter.
NOTE(review): the row and column order presumably matches the numeric values of
the ucp_* general and particular category constants - confirm against their
definitions. */

793 static const pcre_uint8 catposstab[7][30] = {
794 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
795   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
796   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
797   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
798   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
799   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
800   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
801   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
802 };
803 
804 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
805 a general or particular category. The properties in each row are those
806 that apply to the character set in question. Duplication means that a little
807 unnecessary work is done when checking, but this keeps things much simpler
808 because they can all use the same code. For more details see the comment where
809 this table is used.
810 
811 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
812 "space", but from Perl 5.18 it's included, so both categories are treated the
813 same here. */
814 
/* One row of four ucp_* values for each of ALNUM, SPACE/PXSPACE and WORD. In
every row the first three values have single-letter (general category) names
and the fourth has a two-letter (particular category) name; the per-row
comments note which values are redundant. */

815 static const pcre_uint8 posspropstab[3][4] = {
816   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
817   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
818   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
819 };
820 
821 /* This table is used when converting repeating opcodes into possessified
822 versions as a result of an explicit possessive quantifier such as ++. A zero
823 value means there is no possessified version - in those cases the item in
824 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
825 because all relevant opcodes are less than that. */
826 
/* Indexed by opcode value. The first 32 entries (opcodes 0 - 31) are all zero;
after that, each family of repeat opcodes maps its greedy form to the matching
possessive opcode, while the minimizing, exact-count and already-possessive
forms map to zero.
NOTE(review): the position of each entry has to agree with the numeric values
of the opcodes named in the per-entry comments - confirm against the opcode
definitions before adding or reordering entries. */

827 static const pcre_uint8 opcode_possessify[] = {
828   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
829   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
830 
831   0,                       /* NOTI */
832   OP_POSSTAR, 0,           /* STAR, MINSTAR */
833   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
834   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
835   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
836   0,                       /* EXACT */
837   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
838 
839   OP_POSSTARI, 0,          /* STARI, MINSTARI */
840   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
841   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
842   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
843   0,                       /* EXACTI */
844   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
845 
846   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
847   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
848   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
849   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
850   0,                       /* NOTEXACT */
851   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
852 
853   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
854   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
855   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
856   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
857   0,                       /* NOTEXACTI */
858   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
859 
860   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
861   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
862   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
863   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
864   0,                       /* TYPEEXACT */
865   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
866 
867   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
868   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
869   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
870   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
871   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
872 
873   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
874   0, 0,                    /* REF, REFI */
875   0, 0,                    /* DNREF, DNREFI */
876   0, 0                     /* RECURSE, CALLOUT */
877 };
878 
879 
880 
881 /*************************************************
882 *            Find an error text                  *
883 *************************************************/
884 
885 /* The error texts are now all in one long string, to save on relocations. As
886 some of the text is of unknown length, we can't use a table of offsets.
887 Instead, just count through the strings. This is not a performance issue
888 because it happens only when there has been a compilation error.
889 
890 Argument:   the error number
891 Returns:    pointer to the error string
892 */
893 
894 static const char *
find_error_text(int n)895 find_error_text(int n)
896 {
897 const char *s = error_texts;
898 for (; n > 0; n--)
899   {
900   while (*s++ != CHAR_NULL) {};
901   if (*s == CHAR_NULL) return "Error text not found (please report)";
902   }
903 return s;
904 }
905 
906 
907 
908 /*************************************************
909 *           Expand the workspace                 *
910 *************************************************/
911 
912 /* This function is called during the second compiling phase, if the number of
913 forward references fills the existing workspace, which is originally a block on
914 the stack. A larger block is obtained from malloc() unless the ultimate limit
915 has been reached or the increase will be rather small.
916 
917 Argument: pointer to the compile data block
918 Returns:  0 if all went well, else an error number
919 */
920 
921 static int
expand_workspace(compile_data * cd)922 expand_workspace(compile_data *cd)
923 {
924 pcre_uchar *newspace;
925 int newsize = cd->workspace_size * 2;
926 
927 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
928 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
929     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
930  return ERR72;
931 
932 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
933 if (newspace == NULL) return ERR21;
934 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
935 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
936 if (cd->workspace_size > COMPILE_WORK_SIZE)
937   (PUBL(free))((void *)cd->start_workspace);
938 cd->start_workspace = newspace;
939 cd->workspace_size = newsize;
940 return 0;
941 }
942 
943 
944 
945 /*************************************************
946 *            Check for counted repeat            *
947 *************************************************/
948 
949 /* This function is called when a '{' is encountered in a place where it might
950 start a quantifier. It looks ahead to see if it really is a quantifier or not.
951 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
952 where the ddds are digits.
953 
954 Arguments:
955   p         pointer to the first char after '{'
956 
957 Returns:    TRUE or FALSE
958 */
959 
960 static BOOL
is_counted_repeat(const pcre_uchar * p)961 is_counted_repeat(const pcre_uchar *p)
962 {
963 if (!IS_DIGIT(*p)) return FALSE;
964 p++;
965 while (IS_DIGIT(*p)) p++;
966 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
967 
968 if (*p++ != CHAR_COMMA) return FALSE;
969 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
970 
971 if (!IS_DIGIT(*p)) return FALSE;
972 p++;
973 while (IS_DIGIT(*p)) p++;
974 
975 return (*p == CHAR_RIGHT_CURLY_BRACKET);
976 }
977 
978 
979 
980 /*************************************************
981 *            Handle escapes                      *
982 *************************************************/
983 
984 /* This function is called when a \ has been encountered. It either returns a
985 positive value for a simple escape such as \n, or 0 for a data character which
986 will be placed in chptr. A backreference to group n is returned as negative n.
987 When UTF-8 is enabled, a positive value greater than 255 may be returned in
988 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
989 character of the escape sequence.
990 
991 Arguments:
992   ptrptr         points to the pattern position pointer
993   chptr          points to a returned data character
994   errorcodeptr   points to the errorcode variable
995   bracount       number of previous extracting brackets
996   options        the options bits
997   isclass        TRUE if inside a character class
998 
999 Returns:         zero => a data character
1000                  positive => a special escape sequence
1001                  negative => a back reference
1002                  on error, errorcodeptr is set
1003 */
1004 
1005 static int
check_escape(const pcre_uchar ** ptrptr,pcre_uint32 * chptr,int * errorcodeptr,int bracount,int options,BOOL isclass)1006 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
1007   int bracount, int options, BOOL isclass)
1008 {
1009 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
1010 BOOL utf = (options & PCRE_UTF8) != 0;
1011 const pcre_uchar *ptr = *ptrptr + 1;
1012 pcre_uint32 c;
1013 int escape = 0;
1014 int i;
1015 
1016 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
1017 ptr--;                            /* Set pointer back to the last byte */
1018 
1019 /* If backslash is at the end of the pattern, it's an error. */
1020 
1021 if (c == CHAR_NULL) *errorcodeptr = ERR1;
1022 
1023 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
1024 in a table. A non-zero result is something that can be returned immediately.
1025 Otherwise further processing may be required. */
1026 
1027 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1028 /* Not alphanumeric */
1029 else if (c < CHAR_0 || c > CHAR_z) {}
1030 else if ((i = escapes[c - CHAR_0]) != 0)
1031   { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1032 
1033 #else           /* EBCDIC coding */
1034 /* Not alphanumeric */
1035 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1036 else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1037 #endif
1038 
1039 /* Escapes that need further processing, or are illegal. */
1040 
1041 else
1042   {
1043   const pcre_uchar *oldptr;
1044   BOOL braced, negated, overflow;
1045   int s;
1046 
1047   switch (c)
1048     {
1049     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1050     error. */
1051 
1052     case CHAR_l:
1053     case CHAR_L:
1054     *errorcodeptr = ERR37;
1055     break;
1056 
1057     case CHAR_u:
1058     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1059       {
1060       /* In JavaScript, \u must be followed by four hexadecimal numbers.
1061       Otherwise it is a lowercase u letter. */
1062       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1063         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1064         && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1065         && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1066         {
1067         c = 0;
1068         for (i = 0; i < 4; ++i)
1069           {
1070           register pcre_uint32 cc = *(++ptr);
1071 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1072           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1073           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1074 #else           /* EBCDIC coding */
1075           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1076           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1077 #endif
1078           }
1079 
1080 #if defined COMPILE_PCRE8
1081         if (c > (utf ? 0x10ffffU : 0xffU))
1082 #elif defined COMPILE_PCRE16
1083         if (c > (utf ? 0x10ffffU : 0xffffU))
1084 #elif defined COMPILE_PCRE32
1085         if (utf && c > 0x10ffffU)
1086 #endif
1087           {
1088           *errorcodeptr = ERR76;
1089           }
1090         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1091         }
1092       }
1093     else
1094       *errorcodeptr = ERR37;
1095     break;
1096 
1097     case CHAR_U:
1098     /* In JavaScript, \U is an uppercase U letter. */
1099     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1100     break;
1101 
1102     /* In a character class, \g is just a literal "g". Outside a character
1103     class, \g must be followed by one of a number of specific things:
1104 
1105     (1) A number, either plain or braced. If positive, it is an absolute
1106     backreference. If negative, it is a relative backreference. This is a Perl
1107     5.10 feature.
1108 
1109     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1110     is part of Perl's movement towards a unified syntax for back references. As
1111     this is synonymous with \k{name}, we fudge it up by pretending it really
1112     was \k.
1113 
1114     (3) For Oniguruma compatibility we also support \g followed by a name or a
1115     number either in angle brackets or in single quotes. However, these are
1116     (possibly recursive) subroutine calls, _not_ backreferences. Just return
1117     the ESC_g code (cf \k). */
1118 
1119     case CHAR_g:
1120     if (isclass) break;
1121     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1122       {
1123       escape = ESC_g;
1124       break;
1125       }
1126 
1127     /* Handle the Perl-compatible cases */
1128 
1129     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1130       {
1131       const pcre_uchar *p;
1132       for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1133         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1134       if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1135         {
1136         escape = ESC_k;
1137         break;
1138         }
1139       braced = TRUE;
1140       ptr++;
1141       }
1142     else braced = FALSE;
1143 
1144     if (ptr[1] == CHAR_MINUS)
1145       {
1146       negated = TRUE;
1147       ptr++;
1148       }
1149     else negated = FALSE;
1150 
1151     /* The integer range is limited by the machine's int representation. */
1152     s = 0;
1153     overflow = FALSE;
1154     while (IS_DIGIT(ptr[1]))
1155       {
1156       if (s > INT_MAX / 10 - 1) /* Integer overflow */
1157         {
1158         overflow = TRUE;
1159         break;
1160         }
1161       s = s * 10 + (int)(*(++ptr) - CHAR_0);
1162       }
1163     if (overflow) /* Integer overflow */
1164       {
1165       while (IS_DIGIT(ptr[1]))
1166         ptr++;
1167       *errorcodeptr = ERR61;
1168       break;
1169       }
1170 
1171     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1172       {
1173       *errorcodeptr = ERR57;
1174       break;
1175       }
1176 
1177     if (s == 0)
1178       {
1179       *errorcodeptr = ERR58;
1180       break;
1181       }
1182 
1183     if (negated)
1184       {
1185       if (s > bracount)
1186         {
1187         *errorcodeptr = ERR15;
1188         break;
1189         }
1190       s = bracount - (s - 1);
1191       }
1192 
1193     escape = -s;
1194     break;
1195 
1196     /* The handling of escape sequences consisting of a string of digits
1197     starting with one that is not zero is not straightforward. Perl has changed
1198     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1199     recommended to avoid the ambiguities in the old syntax.
1200 
1201     Outside a character class, the digits are read as a decimal number. If the
1202     number is less than 8 (used to be 10), or if there are that many previous
1203     extracting left brackets, then it is a back reference. Otherwise, up to
1204     three octal digits are read to form an escaped byte. Thus \123 is likely to
1205     be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1206     the octal value is greater than 377, the least significant 8 bits are
1207     taken. \8 and \9 are treated as the literal characters 8 and 9.
1208 
1209     Inside a character class, \ followed by a digit is always either a literal
1210     8 or 9 or an octal number. */
1211 
1212     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1213     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1214 
1215     if (!isclass)
1216       {
1217       oldptr = ptr;
1218       /* The integer range is limited by the machine's int representation. */
1219       s = (int)(c -CHAR_0);
1220       overflow = FALSE;
1221       while (IS_DIGIT(ptr[1]))
1222         {
1223         if (s > INT_MAX / 10 - 1) /* Integer overflow */
1224           {
1225           overflow = TRUE;
1226           break;
1227           }
1228         s = s * 10 + (int)(*(++ptr) - CHAR_0);
1229         }
1230       if (overflow) /* Integer overflow */
1231         {
1232         while (IS_DIGIT(ptr[1]))
1233           ptr++;
1234         *errorcodeptr = ERR61;
1235         break;
1236         }
1237       if (s < 8 || s <= bracount)  /* Check for back reference */
1238         {
1239         escape = -s;
1240         break;
1241         }
1242       ptr = oldptr;      /* Put the pointer back and fall through */
1243       }
1244 
1245     /* Handle a digit following \ when the number is not a back reference. If
1246     the first digit is 8 or 9, Perl used to generate a binary zero byte and
1247     then treat the digit as a following literal. At least by Perl 5.18 this
1248     changed so as not to insert the binary zero. */
1249 
1250     if ((c = *ptr) >= CHAR_8) break;
1251 
1252     /* Fall through with a digit less than 8 */
1253 
1254     /* \0 always starts an octal number, but we may drop through to here with a
1255     larger first octal digit. The original code used just to take the least
1256     significant 8 bits of octal numbers (I think this is what early Perls used
1257     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1258     but no more than 3 octal digits. */
1259 
1260     case CHAR_0:
1261     c -= CHAR_0;
1262     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1263         c = c * 8 + *(++ptr) - CHAR_0;
1264 #ifdef COMPILE_PCRE8
1265     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1266 #endif
1267     break;
1268 
1269     /* \o is a relatively new Perl feature, supporting a more general way of
1270     specifying character codes in octal. The only supported form is \o{ddd}. */
1271 
1272     case CHAR_o:
1273     if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1274     if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
1275       {
1276       ptr += 2;
1277       c = 0;
1278       overflow = FALSE;
1279       while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1280         {
1281         register pcre_uint32 cc = *ptr++;
1282         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1283 #ifdef COMPILE_PCRE32
1284         if (c >= 0x20000000l) { overflow = TRUE; break; }
1285 #endif
1286         c = (c << 3) + cc - CHAR_0 ;
1287 #if defined COMPILE_PCRE8
1288         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1289 #elif defined COMPILE_PCRE16
1290         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1291 #elif defined COMPILE_PCRE32
1292         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1293 #endif
1294         }
1295       if (overflow)
1296         {
1297         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1298         *errorcodeptr = ERR34;
1299         }
1300       else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1301         {
1302         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1303         }
1304       else *errorcodeptr = ERR80;
1305       }
1306     break;
1307 
1308     /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1309     numbers. Otherwise it is a lowercase x letter. */
1310 
1311     case CHAR_x:
1312     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1313       {
1314       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1315         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1316         {
1317         c = 0;
1318         for (i = 0; i < 2; ++i)
1319           {
1320           register pcre_uint32 cc = *(++ptr);
1321 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1322           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1323           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1324 #else           /* EBCDIC coding */
1325           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1326           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1327 #endif
1328           }
1329         }
1330       }    /* End JavaScript handling */
1331 
1332     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1333     greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1334     digits. If not, { used to be treated as a data character. However, Perl
1335     seems to read hex digits up to the first non-such, and ignore the rest, so
1336     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1337     now gives an error. */
1338 
1339     else
1340       {
1341       if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1342         {
1343         ptr += 2;
1344         if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1345           {
1346           *errorcodeptr = ERR86;
1347           break;
1348           }
1349         c = 0;
1350         overflow = FALSE;
1351         while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1352           {
1353           register pcre_uint32 cc = *ptr++;
1354           if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1355 
1356 #ifdef COMPILE_PCRE32
1357           if (c >= 0x10000000l) { overflow = TRUE; break; }
1358 #endif
1359 
1360 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1361           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1362           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1363 #else           /* EBCDIC coding */
1364           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1365           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1366 #endif
1367 
1368 #if defined COMPILE_PCRE8
1369           if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1370 #elif defined COMPILE_PCRE16
1371           if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1372 #elif defined COMPILE_PCRE32
1373           if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1374 #endif
1375           }
1376 
1377         if (overflow)
1378           {
1379           while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1380           *errorcodeptr = ERR34;
1381           }
1382 
1383         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1384           {
1385           if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1386           }
1387 
1388         /* If the sequence of hex digits does not end with '}', give an error.
1389         We used just to recognize this construct and fall through to the normal
1390         \x handling, but nowadays Perl gives an error, which seems much more
1391         sensible, so we do too. */
1392 
1393         else *errorcodeptr = ERR79;
1394         }   /* End of \x{} processing */
1395 
1396       /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1397 
1398       else
1399         {
1400         c = 0;
1401         while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1402           {
1403           pcre_uint32 cc;                          /* Some compilers don't like */
1404           cc = *(++ptr);                           /* ++ in initializers */
1405 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1406           if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1407           c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1408 #else           /* EBCDIC coding */
1409           if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1410           c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1411 #endif
1412           }
1413         }     /* End of \xdd handling */
1414       }       /* End of Perl-style \x handling */
1415     break;
1416 
1417     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1418     An error is given if the byte following \c is not an ASCII character. This
1419     coding is ASCII-specific, but then the whole concept of \cx is
1420     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1421 
1422     case CHAR_c:
1423     c = *(++ptr);
1424     if (c == CHAR_NULL)
1425       {
1426       *errorcodeptr = ERR2;
1427       break;
1428       }
1429 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1430     if (c > 127)  /* Excludes all non-ASCII in either mode */
1431       {
1432       *errorcodeptr = ERR68;
1433       break;
1434       }
1435     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1436     c ^= 0x40;
1437 #else             /* EBCDIC coding */
1438     if (c >= CHAR_a && c <= CHAR_z) c += 64;
1439     if (c == CHAR_QUESTION_MARK)
1440       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1441     else
1442       {
1443       for (i = 0; i < 32; i++)
1444         {
1445         if (c == ebcdic_escape_c[i]) break;
1446         }
1447       if (i < 32) c = i; else *errorcodeptr = ERR68;
1448       }
1449 #endif
1450     break;
1451 
1452     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1453     other alphanumeric following \ is an error if PCRE_EXTRA was set;
1454     otherwise, for Perl compatibility, it is a literal. This code looks a bit
1455     odd, but there used to be some cases other than the default, and there may
1456     be again in future, so I haven't "optimized" it. */
1457 
1458     default:
1459     if ((options & PCRE_EXTRA) != 0) switch(c)
1460       {
1461       default:
1462       *errorcodeptr = ERR3;
1463       break;
1464       }
1465     break;
1466     }
1467   }
1468 
1469 /* Perl supports \N{name} for character names, as well as plain \N for "not
1470 newline". PCRE does not support \N{name}. However, it does support
1471 quantification such as \N{2,3}. */
1472 
1473 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1474      !is_counted_repeat(ptr+2))
1475   *errorcodeptr = ERR37;
1476 
1477 /* If PCRE_UCP is set, we change the values for \d etc. */
1478 
1479 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1480   escape += (ESC_DU - ESC_D);
1481 
1482 /* Set the pointer to the final character before returning. */
1483 
1484 *ptrptr = ptr;
1485 *chptr = c;
1486 return escape;
1487 }
1488 
1489 
1490 
1491 #ifdef SUPPORT_UCP
1492 /*************************************************
1493 *               Handle \P and \p                 *
1494 *************************************************/
1495 
1496 /* This function is called after \P or \p has been encountered, provided that
1497 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1498 pointing at the P or p. On exit, it is pointing at the final character of the
1499 escape sequence.
1500 
Arguments:
1502   ptrptr         points to the pattern position pointer
1503   negptr         points to a boolean that is set TRUE for negation else FALSE
1504   ptypeptr       points to an unsigned int that is set to the type value
1505   pdataptr       points to an unsigned int that is set to the detailed property value
1506   errorcodeptr   points to the error code variable
1507 
1508 Returns:         TRUE if the type value was found, or FALSE for an invalid type
1509 */
1510 
1511 static BOOL
get_ucp(const pcre_uchar ** ptrptr,BOOL * negptr,unsigned int * ptypeptr,unsigned int * pdataptr,int * errorcodeptr)1512 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1513   unsigned int *pdataptr, int *errorcodeptr)
1514 {
1515 pcre_uchar c;
1516 int i, bot, top;
1517 const pcre_uchar *ptr = *ptrptr;
1518 pcre_uchar name[32];
1519 
1520 c = *(++ptr);
1521 if (c == CHAR_NULL) goto ERROR_RETURN;
1522 
1523 *negptr = FALSE;
1524 
1525 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1526 negation. */
1527 
1528 if (c == CHAR_LEFT_CURLY_BRACKET)
1529   {
1530   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1531     {
1532     *negptr = TRUE;
1533     ptr++;
1534     }
1535   for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1536     {
1537     c = *(++ptr);
1538     if (c == CHAR_NULL) goto ERROR_RETURN;
1539     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1540     name[i] = c;
1541     }
1542   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1543   name[i] = 0;
1544   }
1545 
1546 /* Otherwise there is just one following character */
1547 
1548 else
1549   {
1550   name[0] = c;
1551   name[1] = 0;
1552   }
1553 
1554 *ptrptr = ptr;
1555 
1556 /* Search for a recognized property name using binary chop */
1557 
1558 bot = 0;
1559 top = PRIV(utt_size);
1560 
1561 while (bot < top)
1562   {
1563   int r;
1564   i = (bot + top) >> 1;
1565   r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1566   if (r == 0)
1567     {
1568     *ptypeptr = PRIV(utt)[i].type;
1569     *pdataptr = PRIV(utt)[i].value;
1570     return TRUE;
1571     }
1572   if (r > 0) bot = i + 1; else top = i;
1573   }
1574 
1575 *errorcodeptr = ERR47;
1576 *ptrptr = ptr;
1577 return FALSE;
1578 
1579 ERROR_RETURN:
1580 *errorcodeptr = ERR46;
1581 *ptrptr = ptr;
1582 return FALSE;
1583 }
1584 #endif
1585 
1586 
1587 
1588 /*************************************************
1589 *         Read repeat counts                     *
1590 *************************************************/
1591 
1592 /* Read an item of the form {n,m} and return the values. This is called only
1593 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1594 so the syntax is guaranteed to be correct, but we need to check the values.
1595 
1596 Arguments:
1597   p              pointer to first char after '{'
1598   minp           pointer to int for min
1599   maxp           pointer to int for max
1600                  returned as -1 if no max
1601   errorcodeptr   points to error code variable
1602 
1603 Returns:         pointer to '}' on success;
1604                  current ptr on error, with errorcodeptr set non-zero
1605 */
1606 
1607 static const pcre_uchar *
read_repeat_counts(const pcre_uchar * p,int * minp,int * maxp,int * errorcodeptr)1608 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1609 {
1610 int min = 0;
1611 int max = -1;
1612 
1613 while (IS_DIGIT(*p))
1614   {
1615   min = min * 10 + (int)(*p++ - CHAR_0);
1616   if (min > 65535)
1617     {
1618     *errorcodeptr = ERR5;
1619     return p;
1620     }
1621   }
1622 
1623 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1624   {
1625   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1626     {
1627     max = 0;
1628     while(IS_DIGIT(*p))
1629       {
1630       max = max * 10 + (int)(*p++ - CHAR_0);
1631       if (max > 65535)
1632         {
1633         *errorcodeptr = ERR5;
1634         return p;
1635         }
1636       }
1637     if (max < min)
1638       {
1639       *errorcodeptr = ERR4;
1640       return p;
1641       }
1642     }
1643   }
1644 
1645 *minp = min;
1646 *maxp = max;
1647 return p;
1648 }
1649 
1650 
1651 
1652 /*************************************************
1653 *      Find first significant op code            *
1654 *************************************************/
1655 
1656 /* This is called by several functions that scan a compiled expression looking
1657 for a fixed first character, or an anchoring op code etc. It skips over things
1658 that do not influence this. For some calls, it makes sense to skip negative
1659 forward and all backward assertions, and also the \b assertion; for others it
1660 does not.
1661 
1662 Arguments:
1663   code         pointer to the start of the group
1664   skipassert   TRUE if certain assertions are to be skipped
1665 
1666 Returns:       pointer to the first significant opcode
1667 */
1668 
1669 static const pcre_uchar*
first_significant_code(const pcre_uchar * code,BOOL skipassert)1670 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1671 {
1672 for (;;)
1673   {
1674   switch ((int)*code)
1675     {
1676     case OP_ASSERT_NOT:
1677     case OP_ASSERTBACK:
1678     case OP_ASSERTBACK_NOT:
1679     if (!skipassert) return code;
1680     do code += GET(code, 1); while (*code == OP_ALT);
1681     code += PRIV(OP_lengths)[*code];
1682     break;
1683 
1684     case OP_WORD_BOUNDARY:
1685     case OP_NOT_WORD_BOUNDARY:
1686     if (!skipassert) return code;
1687     /* Fall through */
1688 
1689     case OP_CALLOUT:
1690     case OP_CREF:
1691     case OP_DNCREF:
1692     case OP_RREF:
1693     case OP_DNRREF:
1694     case OP_DEF:
1695     code += PRIV(OP_lengths)[*code];
1696     break;
1697 
1698     default:
1699     return code;
1700     }
1701   }
1702 /* Control never reaches here */
1703 }
1704 
1705 
1706 
1707 /*************************************************
1708 *        Find the fixed length of a branch       *
1709 *************************************************/
1710 
1711 /* Scan a branch and compute the fixed length of subject that will match it,
1712 if the length is fixed. This is needed for dealing with backward assertions.
1713 In UTF8 mode, the result is in characters rather than bytes. The branch is
1714 temporarily terminated with OP_END when this function is called.
1715 
1716 This function is called when a backward assertion is encountered, so that if it
1717 fails, the error message can point to the correct place in the pattern.
1718 However, we cannot do this when the assertion contains subroutine calls,
1719 because they can be forward references. We solve this by remembering this case
1720 and doing the check at the end; a flag specifies which mode we are running in.
1721 
1722 Arguments:
1723   code     points to the start of the pattern (the bracket)
1724   utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1725   atend    TRUE if called when the pattern is complete
1726   cd       the "compile data" structure
1727   recurses    chain of recurse_check to catch mutual recursion
1728 
1729 Returns:   the fixed length,
1730              or -1 if there is no fixed length,
1731              or -2 if \C was encountered (in UTF-8 mode only)
1732              or -3 if an OP_RECURSE item was encountered and atend is FALSE
1733              or -4 if an unknown opcode was encountered (internal error)
1734 */
1735 
1736 static int
find_fixedlength(pcre_uchar * code,BOOL utf,BOOL atend,compile_data * cd,recurse_check * recurses)1737 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
1738   recurse_check *recurses)
1739 {
1740 int length = -1;
1741 recurse_check this_recurse;
1742 register int branchlength = 0;
1743 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1744 
1745 /* Scan along the opcodes for this branch. If we get to the end of the
1746 branch, check the length against that of the other branches. */
1747 
1748 for (;;)
1749   {
1750   int d;
1751   pcre_uchar *ce, *cs;
1752   register pcre_uchar op = *cc;
1753 
1754   switch (op)
1755     {
1756     /* We only need to continue for OP_CBRA (normal capturing bracket) and
1757     OP_BRA (normal non-capturing bracket) because the other variants of these
1758     opcodes are all concerned with unlimited repeated groups, which of course
1759     are not of fixed length. */
1760 
1761     case OP_CBRA:
1762     case OP_BRA:
1763     case OP_ONCE:
1764     case OP_ONCE_NC:
1765     case OP_COND:
1766     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
1767       recurses);
1768     if (d < 0) return d;
1769     branchlength += d;
1770     do cc += GET(cc, 1); while (*cc == OP_ALT);
1771     cc += 1 + LINK_SIZE;
1772     break;
1773 
1774     /* Reached end of a branch; if it's a ket it is the end of a nested call.
1775     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1776     an ALT. If it is END it's the end of the outer call. All can be handled by
1777     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1778     because they all imply an unlimited repeat. */
1779 
1780     case OP_ALT:
1781     case OP_KET:
1782     case OP_END:
1783     case OP_ACCEPT:
1784     case OP_ASSERT_ACCEPT:
1785     if (length < 0) length = branchlength;
1786       else if (length != branchlength) return -1;
1787     if (*cc != OP_ALT) return length;
1788     cc += 1 + LINK_SIZE;
1789     branchlength = 0;
1790     break;
1791 
1792     /* A true recursion implies not fixed length, but a subroutine call may
1793     be OK. If the subroutine is a forward reference, we can't deal with
1794     it until the end of the pattern, so return -3. */
1795 
1796     case OP_RECURSE:
1797     if (!atend) return -3;
1798     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1799     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1800     if (cc > cs && cc < ce) return -1;                    /* Recursion */
1801     else   /* Check for mutual recursion */
1802       {
1803       recurse_check *r = recurses;
1804       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
1805       if (r != NULL) return -1;   /* Mutual recursion */
1806       }
1807     this_recurse.prev = recurses;
1808     this_recurse.group = cs;
1809     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1810     if (d < 0) return d;
1811     branchlength += d;
1812     cc += 1 + LINK_SIZE;
1813     break;
1814 
1815     /* Skip over assertive subpatterns */
1816 
1817     case OP_ASSERT:
1818     case OP_ASSERT_NOT:
1819     case OP_ASSERTBACK:
1820     case OP_ASSERTBACK_NOT:
1821     do cc += GET(cc, 1); while (*cc == OP_ALT);
1822     cc += 1 + LINK_SIZE;
1823     break;
1824 
1825     /* Skip over things that don't match chars */
1826 
1827     case OP_MARK:
1828     case OP_PRUNE_ARG:
1829     case OP_SKIP_ARG:
1830     case OP_THEN_ARG:
1831     cc += cc[1] + PRIV(OP_lengths)[*cc];
1832     break;
1833 
1834     case OP_CALLOUT:
1835     case OP_CIRC:
1836     case OP_CIRCM:
1837     case OP_CLOSE:
1838     case OP_COMMIT:
1839     case OP_CREF:
1840     case OP_DEF:
1841     case OP_DNCREF:
1842     case OP_DNRREF:
1843     case OP_DOLL:
1844     case OP_DOLLM:
1845     case OP_EOD:
1846     case OP_EODN:
1847     case OP_FAIL:
1848     case OP_NOT_WORD_BOUNDARY:
1849     case OP_PRUNE:
1850     case OP_REVERSE:
1851     case OP_RREF:
1852     case OP_SET_SOM:
1853     case OP_SKIP:
1854     case OP_SOD:
1855     case OP_SOM:
1856     case OP_THEN:
1857     case OP_WORD_BOUNDARY:
1858     cc += PRIV(OP_lengths)[*cc];
1859     break;
1860 
1861     /* Handle literal characters */
1862 
1863     case OP_CHAR:
1864     case OP_CHARI:
1865     case OP_NOT:
1866     case OP_NOTI:
1867     branchlength++;
1868     cc += 2;
1869 #ifdef SUPPORT_UTF
1870     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1871 #endif
1872     break;
1873 
1874     /* Handle exact repetitions. The count is already in characters, but we
1875     need to skip over a multibyte character in UTF8 mode.  */
1876 
1877     case OP_EXACT:
1878     case OP_EXACTI:
1879     case OP_NOTEXACT:
1880     case OP_NOTEXACTI:
1881     branchlength += (int)GET2(cc,1);
1882     cc += 2 + IMM2_SIZE;
1883 #ifdef SUPPORT_UTF
1884     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1885 #endif
1886     break;
1887 
1888     case OP_TYPEEXACT:
1889     branchlength += GET2(cc,1);
1890     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1891       cc += 2;
1892     cc += 1 + IMM2_SIZE + 1;
1893     break;
1894 
1895     /* Handle single-char matchers */
1896 
1897     case OP_PROP:
1898     case OP_NOTPROP:
1899     cc += 2;
1900     /* Fall through */
1901 
1902     case OP_HSPACE:
1903     case OP_VSPACE:
1904     case OP_NOT_HSPACE:
1905     case OP_NOT_VSPACE:
1906     case OP_NOT_DIGIT:
1907     case OP_DIGIT:
1908     case OP_NOT_WHITESPACE:
1909     case OP_WHITESPACE:
1910     case OP_NOT_WORDCHAR:
1911     case OP_WORDCHAR:
1912     case OP_ANY:
1913     case OP_ALLANY:
1914     branchlength++;
1915     cc++;
1916     break;
1917 
1918     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1919     otherwise \C is coded as OP_ALLANY. */
1920 
1921     case OP_ANYBYTE:
1922     return -2;
1923 
1924     /* Check a class for variable quantification */
1925 
1926     case OP_CLASS:
1927     case OP_NCLASS:
1928 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1929     case OP_XCLASS:
1930     /* The original code caused an unsigned overflow in 64 bit systems,
1931     so now we use a conditional statement. */
1932     if (op == OP_XCLASS)
1933       cc += GET(cc, 1);
1934     else
1935       cc += PRIV(OP_lengths)[OP_CLASS];
1936 #else
1937     cc += PRIV(OP_lengths)[OP_CLASS];
1938 #endif
1939 
1940     switch (*cc)
1941       {
1942       case OP_CRSTAR:
1943       case OP_CRMINSTAR:
1944       case OP_CRPLUS:
1945       case OP_CRMINPLUS:
1946       case OP_CRQUERY:
1947       case OP_CRMINQUERY:
1948       case OP_CRPOSSTAR:
1949       case OP_CRPOSPLUS:
1950       case OP_CRPOSQUERY:
1951       return -1;
1952 
1953       case OP_CRRANGE:
1954       case OP_CRMINRANGE:
1955       case OP_CRPOSRANGE:
1956       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1957       branchlength += (int)GET2(cc,1);
1958       cc += 1 + 2 * IMM2_SIZE;
1959       break;
1960 
1961       default:
1962       branchlength++;
1963       }
1964     break;
1965 
1966     /* Anything else is variable length */
1967 
1968     case OP_ANYNL:
1969     case OP_BRAMINZERO:
1970     case OP_BRAPOS:
1971     case OP_BRAPOSZERO:
1972     case OP_BRAZERO:
1973     case OP_CBRAPOS:
1974     case OP_EXTUNI:
1975     case OP_KETRMAX:
1976     case OP_KETRMIN:
1977     case OP_KETRPOS:
1978     case OP_MINPLUS:
1979     case OP_MINPLUSI:
1980     case OP_MINQUERY:
1981     case OP_MINQUERYI:
1982     case OP_MINSTAR:
1983     case OP_MINSTARI:
1984     case OP_MINUPTO:
1985     case OP_MINUPTOI:
1986     case OP_NOTMINPLUS:
1987     case OP_NOTMINPLUSI:
1988     case OP_NOTMINQUERY:
1989     case OP_NOTMINQUERYI:
1990     case OP_NOTMINSTAR:
1991     case OP_NOTMINSTARI:
1992     case OP_NOTMINUPTO:
1993     case OP_NOTMINUPTOI:
1994     case OP_NOTPLUS:
1995     case OP_NOTPLUSI:
1996     case OP_NOTPOSPLUS:
1997     case OP_NOTPOSPLUSI:
1998     case OP_NOTPOSQUERY:
1999     case OP_NOTPOSQUERYI:
2000     case OP_NOTPOSSTAR:
2001     case OP_NOTPOSSTARI:
2002     case OP_NOTPOSUPTO:
2003     case OP_NOTPOSUPTOI:
2004     case OP_NOTQUERY:
2005     case OP_NOTQUERYI:
2006     case OP_NOTSTAR:
2007     case OP_NOTSTARI:
2008     case OP_NOTUPTO:
2009     case OP_NOTUPTOI:
2010     case OP_PLUS:
2011     case OP_PLUSI:
2012     case OP_POSPLUS:
2013     case OP_POSPLUSI:
2014     case OP_POSQUERY:
2015     case OP_POSQUERYI:
2016     case OP_POSSTAR:
2017     case OP_POSSTARI:
2018     case OP_POSUPTO:
2019     case OP_POSUPTOI:
2020     case OP_QUERY:
2021     case OP_QUERYI:
2022     case OP_REF:
2023     case OP_REFI:
2024     case OP_DNREF:
2025     case OP_DNREFI:
2026     case OP_SBRA:
2027     case OP_SBRAPOS:
2028     case OP_SCBRA:
2029     case OP_SCBRAPOS:
2030     case OP_SCOND:
2031     case OP_SKIPZERO:
2032     case OP_STAR:
2033     case OP_STARI:
2034     case OP_TYPEMINPLUS:
2035     case OP_TYPEMINQUERY:
2036     case OP_TYPEMINSTAR:
2037     case OP_TYPEMINUPTO:
2038     case OP_TYPEPLUS:
2039     case OP_TYPEPOSPLUS:
2040     case OP_TYPEPOSQUERY:
2041     case OP_TYPEPOSSTAR:
2042     case OP_TYPEPOSUPTO:
2043     case OP_TYPEQUERY:
2044     case OP_TYPESTAR:
2045     case OP_TYPEUPTO:
2046     case OP_UPTO:
2047     case OP_UPTOI:
2048     return -1;
2049 
2050     /* Catch unrecognized opcodes so that when new ones are added they
2051     are not forgotten, as has happened in the past. */
2052 
2053     default:
2054     return -4;
2055     }
2056   }
2057 /* Control never gets here */
2058 }
2059 
2060 
2061 
2062 /*************************************************
2063 *    Scan compiled regex for specific bracket    *
2064 *************************************************/
2065 
2066 /* This little function scans through a compiled pattern until it finds a
2067 capturing bracket with the given number, or, if the number is negative, an
2068 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2069 so that it can be called from pcre_study() when finding the minimum matching
2070 length.
2071 
2072 Arguments:
2073   code        points to start of expression
2074   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2075   number      the required bracket number or negative to find a lookbehind
2076 
2077 Returns:      pointer to the opcode for the bracket, or NULL if not found
2078 */
2079 
2080 const pcre_uchar *
PRIV(find_bracket)2081 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2082 {
2083 for (;;)
2084   {
2085   register pcre_uchar c = *code;
2086 
2087   if (c == OP_END) return NULL;
2088 
2089   /* XCLASS is used for classes that cannot be represented just by a bit
2090   map. This includes negated single high-valued characters. The length in
2091   the table is zero; the actual length is stored in the compiled code. */
2092 
2093   if (c == OP_XCLASS) code += GET(code, 1);
2094 
2095   /* Handle recursion */
2096 
2097   else if (c == OP_REVERSE)
2098     {
2099     if (number < 0) return (pcre_uchar *)code;
2100     code += PRIV(OP_lengths)[c];
2101     }
2102 
2103   /* Handle capturing bracket */
2104 
2105   else if (c == OP_CBRA || c == OP_SCBRA ||
2106            c == OP_CBRAPOS || c == OP_SCBRAPOS)
2107     {
2108     int n = (int)GET2(code, 1+LINK_SIZE);
2109     if (n == number) return (pcre_uchar *)code;
2110     code += PRIV(OP_lengths)[c];
2111     }
2112 
2113   /* Otherwise, we can get the item's length from the table, except that for
2114   repeated character types, we have to test for \p and \P, which have an extra
2115   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2116   must add in its length. */
2117 
2118   else
2119     {
2120     switch(c)
2121       {
2122       case OP_TYPESTAR:
2123       case OP_TYPEMINSTAR:
2124       case OP_TYPEPLUS:
2125       case OP_TYPEMINPLUS:
2126       case OP_TYPEQUERY:
2127       case OP_TYPEMINQUERY:
2128       case OP_TYPEPOSSTAR:
2129       case OP_TYPEPOSPLUS:
2130       case OP_TYPEPOSQUERY:
2131       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2132       break;
2133 
2134       case OP_TYPEUPTO:
2135       case OP_TYPEMINUPTO:
2136       case OP_TYPEEXACT:
2137       case OP_TYPEPOSUPTO:
2138       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2139         code += 2;
2140       break;
2141 
2142       case OP_MARK:
2143       case OP_PRUNE_ARG:
2144       case OP_SKIP_ARG:
2145       case OP_THEN_ARG:
2146       code += code[1];
2147       break;
2148       }
2149 
2150     /* Add in the fixed length from the table */
2151 
2152     code += PRIV(OP_lengths)[c];
2153 
2154   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2155   a multi-byte character. The length in the table is a minimum, so we have to
2156   arrange to skip the extra bytes. */
2157 
2158 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2159     if (utf) switch(c)
2160       {
2161       case OP_CHAR:
2162       case OP_CHARI:
2163       case OP_NOT:
2164       case OP_NOTI:
2165       case OP_EXACT:
2166       case OP_EXACTI:
2167       case OP_NOTEXACT:
2168       case OP_NOTEXACTI:
2169       case OP_UPTO:
2170       case OP_UPTOI:
2171       case OP_NOTUPTO:
2172       case OP_NOTUPTOI:
2173       case OP_MINUPTO:
2174       case OP_MINUPTOI:
2175       case OP_NOTMINUPTO:
2176       case OP_NOTMINUPTOI:
2177       case OP_POSUPTO:
2178       case OP_POSUPTOI:
2179       case OP_NOTPOSUPTO:
2180       case OP_NOTPOSUPTOI:
2181       case OP_STAR:
2182       case OP_STARI:
2183       case OP_NOTSTAR:
2184       case OP_NOTSTARI:
2185       case OP_MINSTAR:
2186       case OP_MINSTARI:
2187       case OP_NOTMINSTAR:
2188       case OP_NOTMINSTARI:
2189       case OP_POSSTAR:
2190       case OP_POSSTARI:
2191       case OP_NOTPOSSTAR:
2192       case OP_NOTPOSSTARI:
2193       case OP_PLUS:
2194       case OP_PLUSI:
2195       case OP_NOTPLUS:
2196       case OP_NOTPLUSI:
2197       case OP_MINPLUS:
2198       case OP_MINPLUSI:
2199       case OP_NOTMINPLUS:
2200       case OP_NOTMINPLUSI:
2201       case OP_POSPLUS:
2202       case OP_POSPLUSI:
2203       case OP_NOTPOSPLUS:
2204       case OP_NOTPOSPLUSI:
2205       case OP_QUERY:
2206       case OP_QUERYI:
2207       case OP_NOTQUERY:
2208       case OP_NOTQUERYI:
2209       case OP_MINQUERY:
2210       case OP_MINQUERYI:
2211       case OP_NOTMINQUERY:
2212       case OP_NOTMINQUERYI:
2213       case OP_POSQUERY:
2214       case OP_POSQUERYI:
2215       case OP_NOTPOSQUERY:
2216       case OP_NOTPOSQUERYI:
2217       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2218       break;
2219       }
2220 #else
2221     (void)(utf);  /* Keep compiler happy by referencing function argument */
2222 #endif
2223     }
2224   }
2225 }
2226 
2227 
2228 
2229 /*************************************************
2230 *   Scan compiled regex for recursion reference  *
2231 *************************************************/
2232 
2233 /* This little function scans through a compiled pattern until it finds an
2234 instance of OP_RECURSE.
2235 
2236 Arguments:
2237   code        points to start of expression
2238   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2239 
2240 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2241 */
2242 
2243 static const pcre_uchar *
find_recurse(const pcre_uchar * code,BOOL utf)2244 find_recurse(const pcre_uchar *code, BOOL utf)
2245 {
2246 for (;;)
2247   {
2248   register pcre_uchar c = *code;
2249   if (c == OP_END) return NULL;
2250   if (c == OP_RECURSE) return code;
2251 
2252   /* XCLASS is used for classes that cannot be represented just by a bit
2253   map. This includes negated single high-valued characters. The length in
2254   the table is zero; the actual length is stored in the compiled code. */
2255 
2256   if (c == OP_XCLASS) code += GET(code, 1);
2257 
2258   /* Otherwise, we can get the item's length from the table, except that for
2259   repeated character types, we have to test for \p and \P, which have an extra
2260   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2261   must add in its length. */
2262 
2263   else
2264     {
2265     switch(c)
2266       {
2267       case OP_TYPESTAR:
2268       case OP_TYPEMINSTAR:
2269       case OP_TYPEPLUS:
2270       case OP_TYPEMINPLUS:
2271       case OP_TYPEQUERY:
2272       case OP_TYPEMINQUERY:
2273       case OP_TYPEPOSSTAR:
2274       case OP_TYPEPOSPLUS:
2275       case OP_TYPEPOSQUERY:
2276       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2277       break;
2278 
2279       case OP_TYPEPOSUPTO:
2280       case OP_TYPEUPTO:
2281       case OP_TYPEMINUPTO:
2282       case OP_TYPEEXACT:
2283       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2284         code += 2;
2285       break;
2286 
2287       case OP_MARK:
2288       case OP_PRUNE_ARG:
2289       case OP_SKIP_ARG:
2290       case OP_THEN_ARG:
2291       code += code[1];
2292       break;
2293       }
2294 
2295     /* Add in the fixed length from the table */
2296 
2297     code += PRIV(OP_lengths)[c];
2298 
2299     /* In UTF-8 mode, opcodes that are followed by a character may be followed
2300     by a multi-byte character. The length in the table is a minimum, so we have
2301     to arrange to skip the extra bytes. */
2302 
2303 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2304     if (utf) switch(c)
2305       {
2306       case OP_CHAR:
2307       case OP_CHARI:
2308       case OP_NOT:
2309       case OP_NOTI:
2310       case OP_EXACT:
2311       case OP_EXACTI:
2312       case OP_NOTEXACT:
2313       case OP_NOTEXACTI:
2314       case OP_UPTO:
2315       case OP_UPTOI:
2316       case OP_NOTUPTO:
2317       case OP_NOTUPTOI:
2318       case OP_MINUPTO:
2319       case OP_MINUPTOI:
2320       case OP_NOTMINUPTO:
2321       case OP_NOTMINUPTOI:
2322       case OP_POSUPTO:
2323       case OP_POSUPTOI:
2324       case OP_NOTPOSUPTO:
2325       case OP_NOTPOSUPTOI:
2326       case OP_STAR:
2327       case OP_STARI:
2328       case OP_NOTSTAR:
2329       case OP_NOTSTARI:
2330       case OP_MINSTAR:
2331       case OP_MINSTARI:
2332       case OP_NOTMINSTAR:
2333       case OP_NOTMINSTARI:
2334       case OP_POSSTAR:
2335       case OP_POSSTARI:
2336       case OP_NOTPOSSTAR:
2337       case OP_NOTPOSSTARI:
2338       case OP_PLUS:
2339       case OP_PLUSI:
2340       case OP_NOTPLUS:
2341       case OP_NOTPLUSI:
2342       case OP_MINPLUS:
2343       case OP_MINPLUSI:
2344       case OP_NOTMINPLUS:
2345       case OP_NOTMINPLUSI:
2346       case OP_POSPLUS:
2347       case OP_POSPLUSI:
2348       case OP_NOTPOSPLUS:
2349       case OP_NOTPOSPLUSI:
2350       case OP_QUERY:
2351       case OP_QUERYI:
2352       case OP_NOTQUERY:
2353       case OP_NOTQUERYI:
2354       case OP_MINQUERY:
2355       case OP_MINQUERYI:
2356       case OP_NOTMINQUERY:
2357       case OP_NOTMINQUERYI:
2358       case OP_POSQUERY:
2359       case OP_POSQUERYI:
2360       case OP_NOTPOSQUERY:
2361       case OP_NOTPOSQUERYI:
2362       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2363       break;
2364       }
2365 #else
2366     (void)(utf);  /* Keep compiler happy by referencing function argument */
2367 #endif
2368     }
2369   }
2370 }
2371 
2372 
2373 
2374 /*************************************************
2375 *    Scan compiled branch for non-emptiness      *
2376 *************************************************/
2377 
2378 /* This function scans through a branch of a compiled pattern to see whether it
2379 can match the empty string or not. It is called from could_be_empty()
2380 below and from compile_branch() when checking for an unlimited repeat of a
2381 group that can match nothing. Note that first_significant_code() skips over
2382 backward and negative forward assertions when its final argument is TRUE. If we
2383 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2384 bracket whose current branch will already have been scanned.
2385 
2386 Arguments:
2387   code        points to start of search
2388   endcode     points to where to stop
2389   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2390   cd          contains pointers to tables etc.
2391   recurses    chain of recurse_check to catch mutual recursion
2392 
2393 Returns:      TRUE if what is matched could be empty
2394 */
2395 
2396 static BOOL
could_be_empty_branch(const pcre_uchar * code,const pcre_uchar * endcode,BOOL utf,compile_data * cd,recurse_check * recurses)2397 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2398   BOOL utf, compile_data *cd, recurse_check *recurses)
2399 {
2400 register pcre_uchar c;
2401 recurse_check this_recurse;
2402 
2403 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2404      code < endcode;
2405      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2406   {
2407   const pcre_uchar *ccode;
2408 
2409   c = *code;
2410 
2411   /* Skip over forward assertions; the other assertions are skipped by
2412   first_significant_code() with a TRUE final argument. */
2413 
2414   if (c == OP_ASSERT)
2415     {
2416     do code += GET(code, 1); while (*code == OP_ALT);
2417     c = *code;
2418     continue;
2419     }
2420 
2421   /* For a recursion/subroutine call, if its end has been reached, which
2422   implies a backward reference subroutine call, we can scan it. If it's a
2423   forward reference subroutine call, we can't. To detect forward reference
2424   we have to scan up the list that is kept in the workspace. This function is
2425   called only when doing the real compile, not during the pre-compile that
2426   measures the size of the compiled pattern. */
2427 
2428   if (c == OP_RECURSE)
2429     {
2430     const pcre_uchar *scode = cd->start_code + GET(code, 1);
2431     const pcre_uchar *endgroup = scode;
2432     BOOL empty_branch;
2433 
2434     /* Test for forward reference or uncompleted reference. This is disabled
2435     when called to scan a completed pattern by setting cd->start_workspace to
2436     NULL. */
2437 
2438     if (cd->start_workspace != NULL)
2439       {
2440       const pcre_uchar *tcode;
2441       for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2442         if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2443       if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2444       }
2445 
2446     /* If the reference is to a completed group, we need to detect whether this
2447     is a recursive call, as otherwise there will be an infinite loop. If it is
2448     a recursion, just skip over it. Simple recursions are easily detected. For
2449     mutual recursions we keep a chain on the stack. */
2450 
2451     do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2452     if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2453     else
2454       {
2455       recurse_check *r = recurses;
2456       for (r = recurses; r != NULL; r = r->prev)
2457         if (r->group == scode) break;
2458       if (r != NULL) continue;   /* Mutual recursion */
2459       }
2460 
2461     /* Completed reference; scan the referenced group, remembering it on the
2462     stack chain to detect mutual recursions. */
2463 
2464     empty_branch = FALSE;
2465     this_recurse.prev = recurses;
2466     this_recurse.group = scode;
2467 
2468     do
2469       {
2470       if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2471         {
2472         empty_branch = TRUE;
2473         break;
2474         }
2475       scode += GET(scode, 1);
2476       }
2477     while (*scode == OP_ALT);
2478 
2479     if (!empty_branch) return FALSE;  /* All branches are non-empty */
2480     continue;
2481     }
2482 
2483   /* Groups with zero repeats can of course be empty; skip them. */
2484 
2485   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2486       c == OP_BRAPOSZERO)
2487     {
2488     code += PRIV(OP_lengths)[c];
2489     do code += GET(code, 1); while (*code == OP_ALT);
2490     c = *code;
2491     continue;
2492     }
2493 
2494   /* A nested group that is already marked as "could be empty" can just be
2495   skipped. */
2496 
2497   if (c == OP_SBRA  || c == OP_SBRAPOS ||
2498       c == OP_SCBRA || c == OP_SCBRAPOS)
2499     {
2500     do code += GET(code, 1); while (*code == OP_ALT);
2501     c = *code;
2502     continue;
2503     }
2504 
2505   /* For other groups, scan the branches. */
2506 
2507   if (c == OP_BRA  || c == OP_BRAPOS ||
2508       c == OP_CBRA || c == OP_CBRAPOS ||
2509       c == OP_ONCE || c == OP_ONCE_NC ||
2510       c == OP_COND || c == OP_SCOND)
2511     {
2512     BOOL empty_branch;
2513     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2514 
2515     /* If a conditional group has only one branch, there is a second, implied,
2516     empty branch, so just skip over the conditional, because it could be empty.
2517     Otherwise, scan the individual branches of the group. */
2518 
2519     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2520       code += GET(code, 1);
2521     else
2522       {
2523       empty_branch = FALSE;
2524       do
2525         {
2526         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
2527           recurses)) empty_branch = TRUE;
2528         code += GET(code, 1);
2529         }
2530       while (*code == OP_ALT);
2531       if (!empty_branch) return FALSE;   /* All branches are non-empty */
2532       }
2533 
2534     c = *code;
2535     continue;
2536     }
2537 
2538   /* Handle the other opcodes */
2539 
2540   switch (c)
2541     {
2542     /* Check for quantifiers after a class. XCLASS is used for classes that
2543     cannot be represented just by a bit map. This includes negated single
2544     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2545     actual length is stored in the compiled code, so we must update "code"
2546     here. */
2547 
2548 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2549     case OP_XCLASS:
2550     ccode = code += GET(code, 1);
2551     goto CHECK_CLASS_REPEAT;
2552 #endif
2553 
2554     case OP_CLASS:
2555     case OP_NCLASS:
2556     ccode = code + PRIV(OP_lengths)[OP_CLASS];
2557 
2558 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2559     CHECK_CLASS_REPEAT:
2560 #endif
2561 
2562     switch (*ccode)
2563       {
2564       case OP_CRSTAR:            /* These could be empty; continue */
2565       case OP_CRMINSTAR:
2566       case OP_CRQUERY:
2567       case OP_CRMINQUERY:
2568       case OP_CRPOSSTAR:
2569       case OP_CRPOSQUERY:
2570       break;
2571 
2572       default:                   /* Non-repeat => class must match */
2573       case OP_CRPLUS:            /* These repeats aren't empty */
2574       case OP_CRMINPLUS:
2575       case OP_CRPOSPLUS:
2576       return FALSE;
2577 
2578       case OP_CRRANGE:
2579       case OP_CRMINRANGE:
2580       case OP_CRPOSRANGE:
2581       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2582       break;
2583       }
2584     break;
2585 
2586     /* Opcodes that must match a character */
2587 
2588     case OP_ANY:
2589     case OP_ALLANY:
2590     case OP_ANYBYTE:
2591 
2592     case OP_PROP:
2593     case OP_NOTPROP:
2594     case OP_ANYNL:
2595 
2596     case OP_NOT_HSPACE:
2597     case OP_HSPACE:
2598     case OP_NOT_VSPACE:
2599     case OP_VSPACE:
2600     case OP_EXTUNI:
2601 
2602     case OP_NOT_DIGIT:
2603     case OP_DIGIT:
2604     case OP_NOT_WHITESPACE:
2605     case OP_WHITESPACE:
2606     case OP_NOT_WORDCHAR:
2607     case OP_WORDCHAR:
2608 
2609     case OP_CHAR:
2610     case OP_CHARI:
2611     case OP_NOT:
2612     case OP_NOTI:
2613 
2614     case OP_PLUS:
2615     case OP_PLUSI:
2616     case OP_MINPLUS:
2617     case OP_MINPLUSI:
2618 
2619     case OP_NOTPLUS:
2620     case OP_NOTPLUSI:
2621     case OP_NOTMINPLUS:
2622     case OP_NOTMINPLUSI:
2623 
2624     case OP_POSPLUS:
2625     case OP_POSPLUSI:
2626     case OP_NOTPOSPLUS:
2627     case OP_NOTPOSPLUSI:
2628 
2629     case OP_EXACT:
2630     case OP_EXACTI:
2631     case OP_NOTEXACT:
2632     case OP_NOTEXACTI:
2633 
2634     case OP_TYPEPLUS:
2635     case OP_TYPEMINPLUS:
2636     case OP_TYPEPOSPLUS:
2637     case OP_TYPEEXACT:
2638 
2639     return FALSE;
2640 
2641     /* These are going to continue, as they may be empty, but we have to
2642     fudge the length for the \p and \P cases. */
2643 
2644     case OP_TYPESTAR:
2645     case OP_TYPEMINSTAR:
2646     case OP_TYPEPOSSTAR:
2647     case OP_TYPEQUERY:
2648     case OP_TYPEMINQUERY:
2649     case OP_TYPEPOSQUERY:
2650     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2651     break;
2652 
2653     /* Same for these */
2654 
2655     case OP_TYPEUPTO:
2656     case OP_TYPEMINUPTO:
2657     case OP_TYPEPOSUPTO:
2658     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2659       code += 2;
2660     break;
2661 
2662     /* End of branch */
2663 
2664     case OP_KET:
2665     case OP_KETRMAX:
2666     case OP_KETRMIN:
2667     case OP_KETRPOS:
2668     case OP_ALT:
2669     return TRUE;
2670 
2671     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2672     MINUPTO, and POSUPTO and their caseless and negative versions may be
2673     followed by a multibyte character. */
2674 
2675 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2676     case OP_STAR:
2677     case OP_STARI:
2678     case OP_NOTSTAR:
2679     case OP_NOTSTARI:
2680 
2681     case OP_MINSTAR:
2682     case OP_MINSTARI:
2683     case OP_NOTMINSTAR:
2684     case OP_NOTMINSTARI:
2685 
2686     case OP_POSSTAR:
2687     case OP_POSSTARI:
2688     case OP_NOTPOSSTAR:
2689     case OP_NOTPOSSTARI:
2690 
2691     case OP_QUERY:
2692     case OP_QUERYI:
2693     case OP_NOTQUERY:
2694     case OP_NOTQUERYI:
2695 
2696     case OP_MINQUERY:
2697     case OP_MINQUERYI:
2698     case OP_NOTMINQUERY:
2699     case OP_NOTMINQUERYI:
2700 
2701     case OP_POSQUERY:
2702     case OP_POSQUERYI:
2703     case OP_NOTPOSQUERY:
2704     case OP_NOTPOSQUERYI:
2705 
2706     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2707     break;
2708 
2709     case OP_UPTO:
2710     case OP_UPTOI:
2711     case OP_NOTUPTO:
2712     case OP_NOTUPTOI:
2713 
2714     case OP_MINUPTO:
2715     case OP_MINUPTOI:
2716     case OP_NOTMINUPTO:
2717     case OP_NOTMINUPTOI:
2718 
2719     case OP_POSUPTO:
2720     case OP_POSUPTOI:
2721     case OP_NOTPOSUPTO:
2722     case OP_NOTPOSUPTOI:
2723 
2724     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2725     break;
2726 #endif
2727 
2728     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2729     string. */
2730 
2731     case OP_MARK:
2732     case OP_PRUNE_ARG:
2733     case OP_SKIP_ARG:
2734     case OP_THEN_ARG:
2735     code += code[1];
2736     break;
2737 
2738     /* None of the remaining opcodes are required to match a character. */
2739 
2740     default:
2741     break;
2742     }
2743   }
2744 
2745 return TRUE;
2746 }
2747 
2748 
2749 
2750 /*************************************************
2751 *    Scan compiled regex for non-emptiness       *
2752 *************************************************/
2753 
2754 /* This function is called to check for left recursive calls. We want to check
2755 the current branch of the current pattern to see if it could match the empty
2756 string. If it could, we must look outwards for branches at other levels,
2757 stopping when we pass beyond the bracket which is the subject of the recursion.
2758 This function is called only during the real compile, not during the
2759 pre-compile.
2760 
2761 Arguments:
2762   code        points to start of the recursion
2763   endcode     points to where to stop (current RECURSE item)
2764   bcptr       points to the chain of current (unclosed) branch starts
2765   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2766   cd          pointers to tables etc
2767 
2768 Returns:      TRUE if what is matched could be empty
2769 */
2770 
2771 static BOOL
could_be_empty(const pcre_uchar * code,const pcre_uchar * endcode,branch_chain * bcptr,BOOL utf,compile_data * cd)2772 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2773   branch_chain *bcptr, BOOL utf, compile_data *cd)
2774 {
2775 while (bcptr != NULL && bcptr->current_branch >= code)
2776   {
2777   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2778     return FALSE;
2779   bcptr = bcptr->outer;
2780   }
2781 return TRUE;
2782 }
2783 
2784 
2785 
2786 /*************************************************
2787 *        Base opcode of repeated opcodes         *
2788 *************************************************/
2789 
2790 /* Returns the base opcode for repeated single character type opcodes. If the
2791 opcode is not a repeated character type, it returns with the original value.
2792 
2793 Arguments:  c opcode
2794 Returns:    base opcode for the type
2795 */
2796 
2797 static pcre_uchar
get_repeat_base(pcre_uchar c)2798 get_repeat_base(pcre_uchar c)
2799 {
2800 return (c > OP_TYPEPOSUPTO)? c :
2801        (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2802        (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2803        (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2804        (c >= OP_STARI)?      OP_STARI :
2805                              OP_STAR;
2806 }
2807 
2808 
2809 
2810 #ifdef SUPPORT_UCP
2811 /*************************************************
2812 *        Check a character and a property        *
2813 *************************************************/
2814 
2815 /* This function is called by check_auto_possessive() when a property item
2816 is adjacent to a fixed character.
2817 
2818 Arguments:
2819   c            the character
2820   ptype        the property type
2821   pdata        the data for the type
2822   negated      TRUE if it's a negated property (\P or \p{^)
2823 
2824 Returns:       TRUE if auto-possessifying is OK
2825 */
2826 
2827 static BOOL
check_char_prop(pcre_uint32 c,unsigned int ptype,unsigned int pdata,BOOL negated)2828 check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2829   BOOL negated)
2830 {
2831 const pcre_uint32 *p;
2832 const ucd_record *prop = GET_UCD(c);
2833 
2834 switch(ptype)
2835   {
2836   case PT_LAMP:
2837   return (prop->chartype == ucp_Lu ||
2838           prop->chartype == ucp_Ll ||
2839           prop->chartype == ucp_Lt) == negated;
2840 
2841   case PT_GC:
2842   return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2843 
2844   case PT_PC:
2845   return (pdata == prop->chartype) == negated;
2846 
2847   case PT_SC:
2848   return (pdata == prop->script) == negated;
2849 
2850   /* These are specials */
2851 
2852   case PT_ALNUM:
2853   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2854           PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2855 
2856   /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2857   means that Perl space and POSIX space are now identical. PCRE was changed
2858   at release 8.34. */
2859 
2860   case PT_SPACE:    /* Perl space */
2861   case PT_PXSPACE:  /* POSIX space */
2862   switch(c)
2863     {
2864     HSPACE_CASES:
2865     VSPACE_CASES:
2866     return negated;
2867 
2868     default:
2869     return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2870     }
2871   break;  /* Control never reaches here */
2872 
2873   case PT_WORD:
2874   return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2875           PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2876           c == CHAR_UNDERSCORE) == negated;
2877 
2878   case PT_CLIST:
2879   p = PRIV(ucd_caseless_sets) + prop->caseset;
2880   for (;;)
2881     {
2882     if (c < *p) return !negated;
2883     if (c == *p++) return negated;
2884     }
2885   break;  /* Control never reaches here */
2886   }
2887 
2888 return FALSE;
2889 }
2890 #endif  /* SUPPORT_UCP */
2891 
2892 
2893 
2894 /*************************************************
2895 *        Fill the character property list        *
2896 *************************************************/
2897 
2898 /* Checks whether the code points to an opcode that can take part in auto-
2899 possessification, and if so, fills a list with its properties.
2900 
2901 Arguments:
2902   code        points to start of expression
2903   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2904   fcc         points to case-flipping table
2905   list        points to output list
2906               list[0] will be filled with the opcode
2907               list[1] will be non-zero if this opcode
2908                 can match an empty character string
2909               list[2..7] depends on the opcode
2910 
2911 Returns:      points to the start of the next opcode if *code is accepted
2912               NULL if *code is not accepted
2913 */
2914 
2915 static const pcre_uchar *
get_chr_property_list(const pcre_uchar * code,BOOL utf,const pcre_uint8 * fcc,pcre_uint32 * list)2916 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2917   const pcre_uint8 *fcc, pcre_uint32 *list)
2918 {
2919 pcre_uchar c = *code;
2920 pcre_uchar base;
2921 const pcre_uchar *end;
2922 pcre_uint32 chr;
2923 
2924 #ifdef SUPPORT_UCP
2925 pcre_uint32 *clist_dest;
2926 const pcre_uint32 *clist_src;
2927 #else
2928 ((void)utf); /* Suppress "unused parameter" compiler warning */
2929 #endif
2930 
2931 list[0] = c;
2932 list[1] = FALSE;
2933 code++;
2934 
2935 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2936   {
2937   base = get_repeat_base(c);
2938   c -= (base - OP_STAR);
2939 
2940   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2941     code += IMM2_SIZE;
2942 
2943   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2944 
2945   switch(base)
2946     {
2947     case OP_STAR:
2948     list[0] = OP_CHAR;
2949     break;
2950 
2951     case OP_STARI:
2952     list[0] = OP_CHARI;
2953     break;
2954 
2955     case OP_NOTSTAR:
2956     list[0] = OP_NOT;
2957     break;
2958 
2959     case OP_NOTSTARI:
2960     list[0] = OP_NOTI;
2961     break;
2962 
2963     case OP_TYPESTAR:
2964     list[0] = *code;
2965     code++;
2966     break;
2967     }
2968   c = list[0];
2969   }
2970 
2971 switch(c)
2972   {
2973   case OP_NOT_DIGIT:
2974   case OP_DIGIT:
2975   case OP_NOT_WHITESPACE:
2976   case OP_WHITESPACE:
2977   case OP_NOT_WORDCHAR:
2978   case OP_WORDCHAR:
2979   case OP_ANY:
2980   case OP_ALLANY:
2981   case OP_ANYNL:
2982   case OP_NOT_HSPACE:
2983   case OP_HSPACE:
2984   case OP_NOT_VSPACE:
2985   case OP_VSPACE:
2986   case OP_EXTUNI:
2987   case OP_EODN:
2988   case OP_EOD:
2989   case OP_DOLL:
2990   case OP_DOLLM:
2991   return code;
2992 
2993   case OP_CHAR:
2994   case OP_NOT:
2995   GETCHARINCTEST(chr, code);
2996   list[2] = chr;
2997   list[3] = NOTACHAR;
2998   return code;
2999 
3000   case OP_CHARI:
3001   case OP_NOTI:
3002   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
3003   GETCHARINCTEST(chr, code);
3004   list[2] = chr;
3005 
3006 #ifdef SUPPORT_UCP
3007   if (chr < 128 || (chr < 256 && !utf))
3008     list[3] = fcc[chr];
3009   else
3010     list[3] = UCD_OTHERCASE(chr);
3011 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
3012   list[3] = (chr < 256) ? fcc[chr] : chr;
3013 #else
3014   list[3] = fcc[chr];
3015 #endif
3016 
3017   /* The othercase might be the same value. */
3018 
3019   if (chr == list[3])
3020     list[3] = NOTACHAR;
3021   else
3022     list[4] = NOTACHAR;
3023   return code;
3024 
3025 #ifdef SUPPORT_UCP
3026   case OP_PROP:
3027   case OP_NOTPROP:
3028   if (code[0] != PT_CLIST)
3029     {
3030     list[2] = code[0];
3031     list[3] = code[1];
3032     return code + 2;
3033     }
3034 
3035   /* Convert only if we have enough space. */
3036 
3037   clist_src = PRIV(ucd_caseless_sets) + code[1];
3038   clist_dest = list + 2;
3039   code += 2;
3040 
3041   do {
3042      if (clist_dest >= list + 8)
3043        {
3044        /* Early return if there is not enough space. This should never
3045        happen, since all clists are shorter than 5 character now. */
3046        list[2] = code[0];
3047        list[3] = code[1];
3048        return code;
3049        }
3050      *clist_dest++ = *clist_src;
3051      }
3052   while(*clist_src++ != NOTACHAR);
3053 
3054   /* All characters are stored. The terminating NOTACHAR
3055   is copied form the clist itself. */
3056 
3057   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3058   return code;
3059 #endif
3060 
3061   case OP_NCLASS:
3062   case OP_CLASS:
3063 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3064   case OP_XCLASS:
3065   if (c == OP_XCLASS)
3066     end = code + GET(code, 0) - 1;
3067   else
3068 #endif
3069     end = code + 32 / sizeof(pcre_uchar);
3070 
3071   switch(*end)
3072     {
3073     case OP_CRSTAR:
3074     case OP_CRMINSTAR:
3075     case OP_CRQUERY:
3076     case OP_CRMINQUERY:
3077     case OP_CRPOSSTAR:
3078     case OP_CRPOSQUERY:
3079     list[1] = TRUE;
3080     end++;
3081     break;
3082 
3083     case OP_CRPLUS:
3084     case OP_CRMINPLUS:
3085     case OP_CRPOSPLUS:
3086     end++;
3087     break;
3088 
3089     case OP_CRRANGE:
3090     case OP_CRMINRANGE:
3091     case OP_CRPOSRANGE:
3092     list[1] = (GET2(end, 1) == 0);
3093     end += 1 + 2 * IMM2_SIZE;
3094     break;
3095     }
3096   list[2] = (pcre_uint32)(end - code);
3097   return end;
3098   }
3099 return NULL;    /* Opcode not accepted */
3100 }
3101 
3102 
3103 
3104 /*************************************************
3105 *    Scan further character sets for match       *
3106 *************************************************/
3107 
3108 /* Checks whether the base and the current opcode have a common character, in
3109 which case the base cannot be possessified.
3110 
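Arguments:
  code        points to the byte code
  utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
  cd          static compile data
  base_list   the data list of the base opcode
  base_end    the code pointer that goes with base_list: when the base item
                is a class, base_list[2] is subtracted from it to reach the
                class data
  rec_limit   points to a count of the calls that are still permitted; it is
                decremented on each call, and FALSE is returned once it has
                reached zero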
3116 
3117 Returns:      TRUE if the auto-possessification is possible
3118 */
3119 
3120 static BOOL
compare_opcodes(const pcre_uchar * code,BOOL utf,const compile_data * cd,const pcre_uint32 * base_list,const pcre_uchar * base_end,int * rec_limit)3121 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3122   const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3123 {
3124 pcre_uchar c;
3125 pcre_uint32 list[8];
3126 const pcre_uint32 *chr_ptr;
3127 const pcre_uint32 *ochr_ptr;
3128 const pcre_uint32 *list_ptr;
3129 const pcre_uchar *next_code;
3130 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3131 const pcre_uchar *xclass_flags;
3132 #endif
3133 const pcre_uint8 *class_bitset;
3134 const pcre_uint8 *set1, *set2, *set_end;
3135 pcre_uint32 chr;
3136 BOOL accepted, invert_bits;
3137 BOOL entered_a_group = FALSE;
3138 
3139 if (*rec_limit == 0) return FALSE;
3140 --(*rec_limit);
3141 
3142 /* Note: the base_list[1] contains whether the current opcode has greedy
3143 (represented by a non-zero value) quantifier. This is a different from
3144 other character type lists, which stores here that the character iterator
3145 matches to an empty string (also represented by a non-zero value). */
3146 
3147 for(;;)
3148   {
3149   /* All operations move the code pointer forward.
3150   Therefore infinite recursions are not possible. */
3151 
3152   c = *code;
3153 
3154   /* Skip over callouts */
3155 
3156   if (c == OP_CALLOUT)
3157     {
3158     code += PRIV(OP_lengths)[c];
3159     continue;
3160     }
3161 
3162   if (c == OP_ALT)
3163     {
3164     do code += GET(code, 1); while (*code == OP_ALT);
3165     c = *code;
3166     }
3167 
3168   switch(c)
3169     {
3170     case OP_END:
3171     case OP_KETRPOS:
3172     /* TRUE only in greedy case. The non-greedy case could be replaced by
3173     an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3174     uses more memory, which we cannot get at this stage.) */
3175 
3176     return base_list[1] != 0;
3177 
3178     case OP_KET:
3179     /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3180     it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3181     cannot be converted to a possessive form. */
3182 
3183     if (base_list[1] == 0) return FALSE;
3184 
3185     switch(*(code - GET(code, 1)))
3186       {
3187       case OP_ASSERT:
3188       case OP_ASSERT_NOT:
3189       case OP_ASSERTBACK:
3190       case OP_ASSERTBACK_NOT:
3191       case OP_ONCE:
3192       case OP_ONCE_NC:
3193       /* Atomic sub-patterns and assertions can always auto-possessify their
3194       last iterator. However, if the group was entered as a result of checking
3195       a previous iterator, this is not possible. */
3196 
3197       return !entered_a_group;
3198       }
3199 
3200     code += PRIV(OP_lengths)[c];
3201     continue;
3202 
3203     case OP_ONCE:
3204     case OP_ONCE_NC:
3205     case OP_BRA:
3206     case OP_CBRA:
3207     next_code = code + GET(code, 1);
3208     code += PRIV(OP_lengths)[c];
3209 
3210     while (*next_code == OP_ALT)
3211       {
3212       if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3213         return FALSE;
3214       code = next_code + 1 + LINK_SIZE;
3215       next_code += GET(next_code, 1);
3216       }
3217 
3218     entered_a_group = TRUE;
3219     continue;
3220 
3221     case OP_BRAZERO:
3222     case OP_BRAMINZERO:
3223 
3224     next_code = code + 1;
3225     if (*next_code != OP_BRA && *next_code != OP_CBRA
3226         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3227 
3228     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3229 
3230     /* The bracket content will be checked by the
3231     OP_BRA/OP_CBRA case above. */
3232     next_code += 1 + LINK_SIZE;
3233     if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3234       return FALSE;
3235 
3236     code += PRIV(OP_lengths)[c];
3237     continue;
3238 
3239     default:
3240     break;
3241     }
3242 
3243   /* Check for a supported opcode, and load its properties. */
3244 
3245   code = get_chr_property_list(code, utf, cd->fcc, list);
3246   if (code == NULL) return FALSE;    /* Unsupported */
3247 
3248   /* If either opcode is a small character list, set pointers for comparing
3249   characters from that list with another list, or with a property. */
3250 
3251   if (base_list[0] == OP_CHAR)
3252     {
3253     chr_ptr = base_list + 2;
3254     list_ptr = list;
3255     }
3256   else if (list[0] == OP_CHAR)
3257     {
3258     chr_ptr = list + 2;
3259     list_ptr = base_list;
3260     }
3261 
3262   /* Character bitsets can also be compared to certain opcodes. */
3263 
3264   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3265 #ifdef COMPILE_PCRE8
3266       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3267       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3268 #endif
3269       )
3270     {
3271 #ifdef COMPILE_PCRE8
3272     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3273 #else
3274     if (base_list[0] == OP_CLASS)
3275 #endif
3276       {
3277       set1 = (pcre_uint8 *)(base_end - base_list[2]);
3278       list_ptr = list;
3279       }
3280     else
3281       {
3282       set1 = (pcre_uint8 *)(code - list[2]);
3283       list_ptr = base_list;
3284       }
3285 
3286     invert_bits = FALSE;
3287     switch(list_ptr[0])
3288       {
3289       case OP_CLASS:
3290       case OP_NCLASS:
3291       set2 = (pcre_uint8 *)
3292         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3293       break;
3294 
3295 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3296       case OP_XCLASS:
3297       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3298       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3299       if ((*xclass_flags & XCL_MAP) == 0)
3300         {
3301         /* No bits are set for characters < 256. */
3302         if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0;
3303         /* Might be an empty repeat. */
3304         continue;
3305         }
3306       set2 = (pcre_uint8 *)(xclass_flags + 1);
3307       break;
3308 #endif
3309 
3310       case OP_NOT_DIGIT:
3311       invert_bits = TRUE;
3312       /* Fall through */
3313       case OP_DIGIT:
3314       set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3315       break;
3316 
3317       case OP_NOT_WHITESPACE:
3318       invert_bits = TRUE;
3319       /* Fall through */
3320       case OP_WHITESPACE:
3321       set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3322       break;
3323 
3324       case OP_NOT_WORDCHAR:
3325       invert_bits = TRUE;
3326       /* Fall through */
3327       case OP_WORDCHAR:
3328       set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3329       break;
3330 
3331       default:
3332       return FALSE;
3333       }
3334 
3335     /* Because the sets are unaligned, we need
3336     to perform byte comparison here. */
3337     set_end = set1 + 32;
3338     if (invert_bits)
3339       {
3340       do
3341         {
3342         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3343         }
3344       while (set1 < set_end);
3345       }
3346     else
3347       {
3348       do
3349         {
3350         if ((*set1++ & *set2++) != 0) return FALSE;
3351         }
3352       while (set1 < set_end);
3353       }
3354 
3355     if (list[1] == 0) return TRUE;
3356     /* Might be an empty repeat. */
3357     continue;
3358     }
3359 
3360   /* Some property combinations also acceptable. Unicode property opcodes are
3361   processed specially; the rest can be handled with a lookup table. */
3362 
3363   else
3364     {
3365     pcre_uint32 leftop, rightop;
3366 
3367     leftop = base_list[0];
3368     rightop = list[0];
3369 
3370 #ifdef SUPPORT_UCP
3371     accepted = FALSE; /* Always set in non-unicode case. */
3372     if (leftop == OP_PROP || leftop == OP_NOTPROP)
3373       {
3374       if (rightop == OP_EOD)
3375         accepted = TRUE;
3376       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3377         {
3378         int n;
3379         const pcre_uint8 *p;
3380         BOOL same = leftop == rightop;
3381         BOOL lisprop = leftop == OP_PROP;
3382         BOOL risprop = rightop == OP_PROP;
3383         BOOL bothprop = lisprop && risprop;
3384 
3385         /* There's a table that specifies how each combination is to be
3386         processed:
3387           0   Always return FALSE (never auto-possessify)
3388           1   Character groups are distinct (possessify if both are OP_PROP)
3389           2   Check character categories in the same group (general or particular)
3390           3   Return TRUE if the two opcodes are not the same
3391           ... see comments below
3392         */
3393 
3394         n = propposstab[base_list[2]][list[2]];
3395         switch(n)
3396           {
3397           case 0: break;
3398           case 1: accepted = bothprop; break;
3399           case 2: accepted = (base_list[3] == list[3]) != same; break;
3400           case 3: accepted = !same; break;
3401 
3402           case 4:  /* Left general category, right particular category */
3403           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3404           break;
3405 
3406           case 5:  /* Right general category, left particular category */
3407           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3408           break;
3409 
3410           /* This code is logically tricky. Think hard before fiddling with it.
3411           The posspropstab table has four entries per row. Each row relates to
3412           one of PCRE's special properties such as ALNUM or SPACE or WORD.
3413           Only WORD actually needs all four entries, but using repeats for the
3414           others means they can all use the same code below.
3415 
3416           The first two entries in each row are Unicode general categories, and
3417           apply always, because all the characters they include are part of the
3418           PCRE character set. The third and fourth entries are a general and a
3419           particular category, respectively, that include one or more relevant
3420           characters. One or the other is used, depending on whether the check
3421           is for a general or a particular category. However, in both cases the
3422           category contains more characters than the specials that are defined
3423           for the property being tested against. Therefore, it cannot be used
3424           in a NOTPROP case.
3425 
3426           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3427           Underscore is covered by ucp_P or ucp_Po. */
3428 
3429           case 6:  /* Left alphanum vs right general category */
3430           case 7:  /* Left space vs right general category */
3431           case 8:  /* Left word vs right general category */
3432           p = posspropstab[n-6];
3433           accepted = risprop && lisprop ==
3434             (list[3] != p[0] &&
3435              list[3] != p[1] &&
3436             (list[3] != p[2] || !lisprop));
3437           break;
3438 
3439           case 9:   /* Right alphanum vs left general category */
3440           case 10:  /* Right space vs left general category */
3441           case 11:  /* Right word vs left general category */
3442           p = posspropstab[n-9];
3443           accepted = lisprop && risprop ==
3444             (base_list[3] != p[0] &&
3445              base_list[3] != p[1] &&
3446             (base_list[3] != p[2] || !risprop));
3447           break;
3448 
3449           case 12:  /* Left alphanum vs right particular category */
3450           case 13:  /* Left space vs right particular category */
3451           case 14:  /* Left word vs right particular category */
3452           p = posspropstab[n-12];
3453           accepted = risprop && lisprop ==
3454             (catposstab[p[0]][list[3]] &&
3455              catposstab[p[1]][list[3]] &&
3456             (list[3] != p[3] || !lisprop));
3457           break;
3458 
3459           case 15:  /* Right alphanum vs left particular category */
3460           case 16:  /* Right space vs left particular category */
3461           case 17:  /* Right word vs left particular category */
3462           p = posspropstab[n-15];
3463           accepted = lisprop && risprop ==
3464             (catposstab[p[0]][base_list[3]] &&
3465              catposstab[p[1]][base_list[3]] &&
3466             (base_list[3] != p[3] || !risprop));
3467           break;
3468           }
3469         }
3470       }
3471 
3472     else
3473 #endif  /* SUPPORT_UCP */
3474 
3475     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3476            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3477            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3478 
3479     if (!accepted) return FALSE;
3480 
3481     if (list[1] == 0) return TRUE;
3482     /* Might be an empty repeat. */
3483     continue;
3484     }
3485 
3486   /* Control reaches here only if one of the items is a small character list.
3487   All characters are checked against the other side. */
3488 
3489   do
3490     {
3491     chr = *chr_ptr;
3492 
3493     switch(list_ptr[0])
3494       {
3495       case OP_CHAR:
3496       ochr_ptr = list_ptr + 2;
3497       do
3498         {
3499         if (chr == *ochr_ptr) return FALSE;
3500         ochr_ptr++;
3501         }
3502       while(*ochr_ptr != NOTACHAR);
3503       break;
3504 
3505       case OP_NOT:
3506       ochr_ptr = list_ptr + 2;
3507       do
3508         {
3509         if (chr == *ochr_ptr)
3510           break;
3511         ochr_ptr++;
3512         }
3513       while(*ochr_ptr != NOTACHAR);
3514       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3515       break;
3516 
3517       /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3518       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3519 
3520       case OP_DIGIT:
3521       if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3522       break;
3523 
3524       case OP_NOT_DIGIT:
3525       if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3526       break;
3527 
3528       case OP_WHITESPACE:
3529       if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3530       break;
3531 
3532       case OP_NOT_WHITESPACE:
3533       if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3534       break;
3535 
3536       case OP_WORDCHAR:
3537       if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3538       break;
3539 
3540       case OP_NOT_WORDCHAR:
3541       if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3542       break;
3543 
3544       case OP_HSPACE:
3545       switch(chr)
3546         {
3547         HSPACE_CASES: return FALSE;
3548         default: break;
3549         }
3550       break;
3551 
3552       case OP_NOT_HSPACE:
3553       switch(chr)
3554         {
3555         HSPACE_CASES: break;
3556         default: return FALSE;
3557         }
3558       break;
3559 
3560       case OP_ANYNL:
3561       case OP_VSPACE:
3562       switch(chr)
3563         {
3564         VSPACE_CASES: return FALSE;
3565         default: break;
3566         }
3567       break;
3568 
3569       case OP_NOT_VSPACE:
3570       switch(chr)
3571         {
3572         VSPACE_CASES: break;
3573         default: return FALSE;
3574         }
3575       break;
3576 
3577       case OP_DOLL:
3578       case OP_EODN:
3579       switch (chr)
3580         {
3581         case CHAR_CR:
3582         case CHAR_LF:
3583         case CHAR_VT:
3584         case CHAR_FF:
3585         case CHAR_NEL:
3586 #ifndef EBCDIC
3587         case 0x2028:
3588         case 0x2029:
3589 #endif  /* Not EBCDIC */
3590         return FALSE;
3591         }
3592       break;
3593 
3594       case OP_EOD:    /* Can always possessify before \z */
3595       break;
3596 
3597 #ifdef SUPPORT_UCP
3598       case OP_PROP:
3599       case OP_NOTPROP:
3600       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3601             list_ptr[0] == OP_NOTPROP))
3602         return FALSE;
3603       break;
3604 #endif
3605 
3606       case OP_NCLASS:
3607       if (chr > 255) return FALSE;
3608       /* Fall through */
3609 
3610       case OP_CLASS:
3611       if (chr > 255) break;
3612       class_bitset = (pcre_uint8 *)
3613         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3614       if ((class_bitset[chr >> 3] & (1U << (chr & 7))) != 0) return FALSE;
3615       break;
3616 
3617 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3618       case OP_XCLASS:
3619       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3620           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3621       break;
3622 #endif
3623 
3624       default:
3625       return FALSE;
3626       }
3627 
3628     chr_ptr++;
3629     }
3630   while(*chr_ptr != NOTACHAR);
3631 
3632   /* At least one character must be matched from this opcode. */
3633 
3634   if (list[1] == 0) return TRUE;
3635   }
3636 
/* Control never reaches here. There used to be a fail-safe return FALSE; here,
3638 but some compilers complain about an unreachable statement. */
3639 
3640 }
3641 
3642 
3643 
3644 /*************************************************
3645 *    Scan compiled regex for auto-possession     *
3646 *************************************************/
3647 
3648 /* Replaces single character iterations with their possessive alternatives
3649 if appropriate. This function modifies the compiled opcode!
3650 
3651 Arguments:
3652   code        points to start of the byte code
3653   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3654   cd          static compile data
3655 
3656 Returns:      nothing
3657 */
3658 
static void
auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
{
register pcre_uchar c;        /* Current opcode (possibly rebased, see below) */
const pcre_uchar *end;        /* Value returned by get_chr_property_list() */
pcre_uchar *repeat_opcode;    /* Where a class's repeat opcode is stored */
pcre_uint32 list[8];          /* Filled in by get_chr_property_list() */
int rec_limit;                /* Passed by address to compare_opcodes() */

/* Walk the compiled code one item at a time until OP_END is found (or an
out-of-range opcode is seen). */

for (;;)
  {
  c = *code;

  /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
  it may compile without complaining, but may get into a loop here if the code
  pointer points to a bad value. This is, of course a documented possibility,
  when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
  just give up on this optimization. */

  if (c >= OP_TABLE_LENGTH) return;

  /* Single-item repeats: every opcode from OP_STAR to OP_TYPEPOSUPTO. */

  if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
    {
    /* Rebase the opcode so that it is expressed relative to OP_STAR; the
    tests and the switch below can then use the plain OP_STAR..OP_MINUPTO
    names whichever repeat group the opcode came from. */

    c -= get_repeat_base(c) - OP_STAR;

    /* Only rebased opcodes up to OP_MINUPTO are candidates; for anything
    else end is NULL and the item is left alone. */

    end = (c <= OP_MINUPTO) ?
      get_chr_property_list(code, utf, cd->fcc, list) : NULL;

    /* list[1] is 1 for STAR, PLUS, QUERY and UPTO, and 0 for the other
    (MIN...) forms. */

    list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;

    rec_limit = 1000;
    if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
      {
      /* The stored opcode is changed in place by adding the distance from
      the rebased opcode to the matching possessive opcode. The plain and the
      MIN form of each repeat both map to the same POS opcode. */

      switch(c)
        {
        case OP_STAR:
        *code += OP_POSSTAR - OP_STAR;
        break;

        case OP_MINSTAR:
        *code += OP_POSSTAR - OP_MINSTAR;
        break;

        case OP_PLUS:
        *code += OP_POSPLUS - OP_PLUS;
        break;

        case OP_MINPLUS:
        *code += OP_POSPLUS - OP_MINPLUS;
        break;

        case OP_QUERY:
        *code += OP_POSQUERY - OP_QUERY;
        break;

        case OP_MINQUERY:
        *code += OP_POSQUERY - OP_MINQUERY;
        break;

        case OP_UPTO:
        *code += OP_POSUPTO - OP_UPTO;
        break;

        case OP_MINUPTO:
        *code += OP_POSUPTO - OP_MINUPTO;
        break;
        }
      }

    /* Reload the stored (not rebased, possibly just modified) opcode for the
    skipping logic below. */

    c = *code;
    }

  /* Character classes: the repeat opcode, if there is one, follows the class
  data rather than preceding it. */

  else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
    {
    /* For OP_XCLASS the position is given by the value stored at code + 1;
    for the other two it follows the opcode and 32 bytes of data. */

#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
    if (c == OP_XCLASS)
      repeat_opcode = code + GET(code, 1);
    else
#endif
      repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));

    c = *repeat_opcode;
    if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
      {
      /* end must not be NULL. */
      end = get_chr_property_list(code, utf, cd->fcc, list);

      /* list[1] is 1 when the repeat opcode value is even, 0 when odd. */

      list[1] = (c & 1) == 0;

      rec_limit = 1000;
      if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
        {
        /* Overwrite the repeat opcode with its possessive equivalent. */

        switch (c)
          {
          case OP_CRSTAR:
          case OP_CRMINSTAR:
          *repeat_opcode = OP_CRPOSSTAR;
          break;

          case OP_CRPLUS:
          case OP_CRMINPLUS:
          *repeat_opcode = OP_CRPOSPLUS;
          break;

          case OP_CRQUERY:
          case OP_CRMINQUERY:
          *repeat_opcode = OP_CRPOSQUERY;
          break;

          case OP_CRRANGE:
          case OP_CRMINRANGE:
          *repeat_opcode = OP_CRPOSRANGE;
          break;
          }
        }
      }

    /* c held the repeat opcode; restore the class opcode itself. */

    c = *code;
    }

  /* Skip any variable-length part of the current item that the fixed length
  table (applied after this switch) does not account for. */

  switch(c)
    {
    case OP_END:
    return;

    case OP_TYPESTAR:
    case OP_TYPEMINSTAR:
    case OP_TYPEPLUS:
    case OP_TYPEMINPLUS:
    case OP_TYPEQUERY:
    case OP_TYPEMINQUERY:
    case OP_TYPEPOSSTAR:
    case OP_TYPEPOSPLUS:
    case OP_TYPEPOSQUERY:
    /* A property type is followed by two more code units. */
    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
    break;

    case OP_TYPEUPTO:
    case OP_TYPEMINUPTO:
    case OP_TYPEEXACT:
    case OP_TYPEPOSUPTO:
    /* As above, but the type follows an IMM2_SIZE count. */
    if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
      code += 2;
    break;

#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
    case OP_XCLASS:
    code += GET(code, 1);
    break;
#endif

    case OP_MARK:
    case OP_PRUNE_ARG:
    case OP_SKIP_ARG:
    case OP_THEN_ARG:
    /* code[1] is added on top of the fixed length for these items. */
    code += code[1];
    break;
    }

  /* Add in the fixed length from the table */

  code += PRIV(OP_lengths)[c];

  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
  a multi-byte character. The length in the table is a minimum, so we have to
  arrange to skip the extra bytes. */

#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
  if (utf) switch(c)
    {
    case OP_CHAR:
    case OP_CHARI:
    case OP_NOT:
    case OP_NOTI:
    case OP_STAR:
    case OP_MINSTAR:
    case OP_PLUS:
    case OP_MINPLUS:
    case OP_QUERY:
    case OP_MINQUERY:
    case OP_UPTO:
    case OP_MINUPTO:
    case OP_EXACT:
    case OP_POSSTAR:
    case OP_POSPLUS:
    case OP_POSQUERY:
    case OP_POSUPTO:
    case OP_STARI:
    case OP_MINSTARI:
    case OP_PLUSI:
    case OP_MINPLUSI:
    case OP_QUERYI:
    case OP_MINQUERYI:
    case OP_UPTOI:
    case OP_MINUPTOI:
    case OP_EXACTI:
    case OP_POSSTARI:
    case OP_POSPLUSI:
    case OP_POSQUERYI:
    case OP_POSUPTOI:
    case OP_NOTSTAR:
    case OP_NOTMINSTAR:
    case OP_NOTPLUS:
    case OP_NOTMINPLUS:
    case OP_NOTQUERY:
    case OP_NOTMINQUERY:
    case OP_NOTUPTO:
    case OP_NOTMINUPTO:
    case OP_NOTEXACT:
    case OP_NOTPOSSTAR:
    case OP_NOTPOSPLUS:
    case OP_NOTPOSQUERY:
    case OP_NOTPOSUPTO:
    case OP_NOTSTARI:
    case OP_NOTMINSTARI:
    case OP_NOTPLUSI:
    case OP_NOTMINPLUSI:
    case OP_NOTQUERYI:
    case OP_NOTMINQUERYI:
    case OP_NOTUPTOI:
    case OP_NOTMINUPTOI:
    case OP_NOTEXACTI:
    case OP_NOTPOSSTARI:
    case OP_NOTPOSPLUSI:
    case OP_NOTPOSQUERYI:
    case OP_NOTPOSUPTOI:
    /* code has already been advanced by the table length, so code[-1] is the
    last code unit that the table length covered. */
    if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
    break;
    }
#else
  (void)(utf);  /* Keep compiler happy by referencing function argument */
#endif
  }
}
3888 
3889 
3890 
3891 /*************************************************
3892 *           Check for POSIX class syntax         *
3893 *************************************************/
3894 
3895 /* This function is called when the sequence "[:" or "[." or "[=" is
3896 encountered in a character class. It checks whether this is followed by a
3897 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3898 reach an unescaped ']' without the special preceding character, return FALSE.
3899 
3900 Originally, this function only recognized a sequence of letters between the
3901 terminators, but it seems that Perl recognizes any sequence of characters,
3902 though of course unknown POSIX names are subsequently rejected. Perl gives an
3903 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3904 didn't consider this to be a POSIX class. Likewise for [:1234:].
3905 
3906 The problem in trying to be exactly like Perl is in the handling of escapes. We
3907 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3908 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3909 below handles the special cases \\ and \], but does not try to do any other
3910 escape processing. This makes it different from Perl for cases such as
3911 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
3912 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
3913 when Perl does, I think.
3914 
3915 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3916 It seems that the appearance of a nested POSIX class supersedes an apparent
3917 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3918 a digit.
3919 
3920 In Perl, unescaped square brackets may also appear as part of class names. For
3921 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3922 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3923 seem right at all. PCRE does not allow closing square brackets in POSIX class
3924 names.
3925 
3926 Arguments:
3927   ptr      pointer to the initial [
3928   endptr   where to return the end pointer
3929 
3930 Returns:   TRUE or FALSE
3931 */
3932 
3933 static BOOL
check_posix_syntax(const pcre_uchar * ptr,const pcre_uchar ** endptr)3934 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3935 {
3936 pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
3937 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3938 for (++ptr; *ptr != CHAR_NULL; ptr++)
3939   {
3940   if (*ptr == CHAR_BACKSLASH &&
3941       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
3942        ptr[1] == CHAR_BACKSLASH))
3943     ptr++;
3944   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
3945             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3946   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3947     {
3948     *endptr = ptr;
3949     return TRUE;
3950     }
3951   }
3952 return FALSE;
3953 }
3954 
3955 
3956 
3957 
3958 /*************************************************
3959 *          Check POSIX class name                *
3960 *************************************************/
3961 
3962 /* This function is called to check the name given in a POSIX-style class entry
3963 such as [:alnum:].
3964 
3965 Arguments:
3966   ptr        points to the first letter
3967   len        the length of the name
3968 
3969 Returns:     a value representing the name, or -1 if unknown
3970 */
3971 
3972 static int
check_posix_name(const pcre_uchar * ptr,int len)3973 check_posix_name(const pcre_uchar *ptr, int len)
3974 {
3975 const char *pn = posix_names;
3976 register int yield = 0;
3977 while (posix_name_lengths[yield] != 0)
3978   {
3979   if (len == posix_name_lengths[yield] &&
3980     STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3981   pn += posix_name_lengths[yield] + 1;
3982   yield++;
3983   }
3984 return -1;
3985 }
3986 
3987 
3988 /*************************************************
3989 *    Adjust OP_RECURSE items in repeated group   *
3990 *************************************************/
3991 
3992 /* OP_RECURSE items contain an offset from the start of the regex to the group
3993 that is referenced. This means that groups can be replicated for fixed
3994 repetition simply by copying (because the recursion is allowed to refer to
3995 earlier groups that are outside the current group). However, when a group is
3996 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3997 inserted before it, after it has been compiled. This means that any OP_RECURSE
3998 items within it that refer to the group itself or any contained groups have to
have their offsets adjusted. That is one of the jobs of this function. Before it
4000 is called, the partially compiled regex must be temporarily terminated with
4001 OP_END.
4002 
4003 This function has been extended to cope with forward references for recursions
4004 and subroutine calls. It must check the list of such references for the
4005 group we are dealing with. If it finds that one of the recursions in the
4006 current group is on this list, it does not adjust the value in the reference
4007 (which is a group number). After the group has been scanned, all the offsets in
4008 the forward reference list for the group are adjusted.
4009 
4010 Arguments:
4011   group      points to the start of the group
4012   adjust     the amount by which the group is to be moved
4013   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
4014   cd         contains pointers to tables etc.
4015   save_hwm_offset   the hwm forward reference offset at the start of the group
4016 
4017 Returns:     nothing
4018 */
4019 
4020 static void
adjust_recurse(pcre_uchar * group,int adjust,BOOL utf,compile_data * cd,size_t save_hwm_offset)4021 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4022   size_t save_hwm_offset)
4023 {
4024 int offset;
4025 pcre_uchar *hc;
4026 pcre_uchar *ptr = group;
4027 
4028 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4029   {
4030   for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4031        hc += LINK_SIZE)
4032     {
4033     offset = (int)GET(hc, 0);
4034     if (cd->start_code + offset == ptr + 1) break;
4035     }
4036 
4037   /* If we have not found this recursion on the forward reference list, adjust
4038   the recursion's offset if it's after the start of this group. */
4039 
4040   if (hc >= cd->hwm)
4041     {
4042     offset = (int)GET(ptr, 1);
4043     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
4044     }
4045 
4046   ptr += 1 + LINK_SIZE;
4047   }
4048 
4049 /* Now adjust all forward reference offsets for the group. */
4050 
4051 for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4052      hc += LINK_SIZE)
4053   {
4054   offset = (int)GET(hc, 0);
4055   PUT(hc, 0, offset + adjust);
4056   }
4057 }
4058 
4059 
4060 
4061 /*************************************************
4062 *        Insert an automatic callout point       *
4063 *************************************************/
4064 
4065 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
4066 callout points before each pattern item.
4067 
4068 Arguments:
4069   code           current code pointer
4070   ptr            current pattern pointer
4071   cd             pointers to tables etc
4072 
4073 Returns:         new code pointer
4074 */
4075 
4076 static pcre_uchar *
auto_callout(pcre_uchar * code,const pcre_uchar * ptr,compile_data * cd)4077 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4078 {
4079 *code++ = OP_CALLOUT;
4080 *code++ = 255;
4081 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
4082 PUT(code, LINK_SIZE, 0);                       /* Default length */
4083 return code + 2 * LINK_SIZE;
4084 }
4085 
4086 
4087 
4088 /*************************************************
4089 *         Complete a callout item                *
4090 *************************************************/
4091 
4092 /* A callout item contains the length of the next item in the pattern, which
4093 we can't fill in till after we have reached the relevant point. This is used
4094 for both automatic and manual callouts.
4095 
4096 Arguments:
4097   previous_callout   points to previous callout item
4098   ptr                current pattern pointer
4099   cd                 pointers to tables etc
4100 
4101 Returns:             nothing
4102 */
4103 
4104 static void
complete_callout(pcre_uchar * previous_callout,const pcre_uchar * ptr,compile_data * cd)4105 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4106 {
4107 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4108 PUT(previous_callout, 2 + LINK_SIZE, length);
4109 }
4110 
4111 
4112 
4113 #ifdef SUPPORT_UCP
4114 /*************************************************
4115 *           Get othercase range                  *
4116 *************************************************/
4117 
4118 /* This function is passed the start and end of a class range, in UTF-8 mode
4119 with UCP support. It searches up the characters, looking for ranges of
4120 characters in the "other" case. Each call returns the next one, updating the
4121 start address. A character with multiple other cases is returned on its own
4122 with a special return value.
4123 
4124 Arguments:
4125   cptr        points to starting character value; updated
4126   d           end value
4127   ocptr       where to put start of othercase range
4128   odptr       where to put end of othercase range
4129 
4130 Yield:        -1 when no more
4131                0 when a range is returned
4132               >0 the CASESET offset for char with multiple other cases
4133                 in this case, ocptr contains the original
4134 */
4135 
4136 static int
get_othercase_range(pcre_uint32 * cptr,pcre_uint32 d,pcre_uint32 * ocptr,pcre_uint32 * odptr)4137 get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
4138   pcre_uint32 *odptr)
4139 {
4140 pcre_uint32 c, othercase, next;
4141 unsigned int co;
4142 
4143 /* Find the first character that has an other case. If it has multiple other
4144 cases, return its case offset value. */
4145 
4146 for (c = *cptr; c <= d; c++)
4147   {
4148   if ((co = UCD_CASESET(c)) != 0)
4149     {
4150     *ocptr = c++;   /* Character that has the set */
4151     *cptr = c;      /* Rest of input range */
4152     return (int)co;
4153     }
4154   if ((othercase = UCD_OTHERCASE(c)) != c) break;
4155   }
4156 
4157 if (c > d) return -1;  /* Reached end of range */
4158 
4159 /* Found a character that has a single other case. Search for the end of the
4160 range, which is either the end of the input range, or a character that has zero
4161 or more than one other cases. */
4162 
4163 *ocptr = othercase;
4164 next = othercase + 1;
4165 
4166 for (++c; c <= d; c++)
4167   {
4168   if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4169   next++;
4170   }
4171 
4172 *odptr = next - 1;     /* End of othercase range */
4173 *cptr = c;             /* Rest of input range */
4174 return 0;
4175 }
4176 #endif  /* SUPPORT_UCP */
4177 
4178 
4179 
4180 /*************************************************
4181 *        Add a character or range to a class     *
4182 *************************************************/
4183 
4184 /* This function packages up the logic of adding a character or range of
4185 characters to a class. The character values in the arguments will be within the
4186 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4187 mutually recursive with the function immediately below.
4188 
4189 Arguments:
4190   classbits     the bit map for characters < 256
4191   uchardptr     points to the pointer for extra data
4192   options       the options word
4193   cd            contains pointers to tables etc.
4194   start         start of range character
4195   end           end of range character
4196 
4197 Returns:        the number of < 256 characters added
4198                 the pointer to extra data is updated
4199 */
4200 
static int
add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
  compile_data *cd, pcre_uint32 start, pcre_uint32 end)
{
pcre_uint32 c;
/* Upper limit for the bitmap loops below: the end of the range, capped at
0xff. */
pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
int n8 = 0;   /* Count of bitmap bits set by this call (the return value) */

/* These casts have no effect at run time; they presumably exist so that the
names count as referenced in build configurations whose conditional code does
not otherwise use them. */

((void)uchardptr);
((void)propposstab);
((void)catposstab);
((void)posspropstab);

/* If caseless matching is required, scan the range and process alternate
cases. In Unicode, there are 8-bit characters that have alternate cases that
are greater than 255 and vice-versa. Sometimes we can just extend the original
range. */

if ((options & PCRE_CASELESS) != 0)
  {
#ifdef SUPPORT_UCP
  if ((options & PCRE_UTF8) != 0)
    {
    int rc;
    pcre_uint32 oc, od;

    options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
    c = start;

    /* Each call yields the next other-case item for the range c..end and
    advances c; a negative return ends the loop. */

    while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
      {
      /* Handle a single character that has more than one other case. */

      if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
        PRIV(ucd_caseless_sets) + rc, oc);

      /* Do nothing if the other case range is within the original range. */

      else if (oc >= start && od <= end) continue;

      /* Extend the original range if there is overlap, noting that if oc < c, we
      can't have od > end because a subrange is always shorter than the basic
      range. Otherwise, use a recursive call to add the additional range. */

      else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
      else if (od > end && oc <= end + 1)
        {
        end = od;       /* Extend upwards */
        /* Keep the bitmap limit in step with the enlarged range. */
        if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
        }
      else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
      }
    }
  else
#endif  /* SUPPORT_UCP */

  /* Not UTF-mode, or no UCP */

  /* For each character of the range that fits in the bitmap, set the bit
  selected by the cd->fcc table entry for that character. */

  for (c = start; c <= classbits_end; c++)
    {
    SETBIT(classbits, cd->fcc[c]);
    n8++;
    }
  }

/* Now handle the original range. Adjust the final value according to the bit
length - this means that the same lists of (e.g.) horizontal spaces can be used
in all cases. */

#if defined COMPILE_PCRE8
#ifdef SUPPORT_UTF
  if ((options & PCRE_UTF8) == 0)
#endif
  if (end > 0xff) end = 0xff;

#elif defined COMPILE_PCRE16
#ifdef SUPPORT_UTF
  if ((options & PCRE_UTF16) == 0)
#endif
  if (end > 0xffff) end = 0xffff;

#endif /* COMPILE_PCRE[8|16] */

/* Use the bitmap for characters < 256. Otherwise use extra data.*/

for (c = start; c <= classbits_end; c++)
  {
  /* Regardless of start, c will always be <= 255. */
  SETBIT(classbits, c);
  n8++;
  }

#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
/* Everything up to 0xff has been dealt with by the bitmap loop above, so only
the part of the range from 0x100 upwards remains. */

if (start <= 0xff) start = 0xff + 1;

if (end >= start)
  {
  pcre_uchar *uchardata = *uchardptr;
#ifdef SUPPORT_UTF
  if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
    {
    /* The values are written by PRIV(ord2utf), whose return value is used to
    advance the data pointer. */

    if (start < end)
      {
      *uchardata++ = XCL_RANGE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      uchardata += PRIV(ord2utf)(end, uchardata);
      }
    else if (start == end)
      {
      *uchardata++ = XCL_SINGLE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      }
    }
  else
#endif  /* SUPPORT_UTF */

  /* Without UTF support, character values are constrained by the bit length,
  and can only be > 256 for 16-bit and 32-bit libraries. */

#ifdef COMPILE_PCRE8
    {}   /* Empty statement: nothing to add in the 8-bit library */
#else
  if (start < end)
    {
    *uchardata++ = XCL_RANGE;
    *uchardata++ = start;
    *uchardata++ = end;
    }
  else if (start == end)
    {
    *uchardata++ = XCL_SINGLE;
    *uchardata++ = start;
    }
#endif

  *uchardptr = uchardata;   /* Update extra data pointer */
  }
#endif /* SUPPORT_UTF || !COMPILE_PCRE8 */

return n8;    /* Number of 8-bit characters */
}
4342 
4343 
4344 
4345 
4346 /*************************************************
4347 *        Add a list of characters to a class     *
4348 *************************************************/
4349 
4350 /* This function is used for adding a list of case-equivalent characters to a
4351 class, and also for adding a list of horizontal or vertical whitespace. If the
4352 list is in order (which it should be), ranges of characters are detected and
4353 handled appropriately. This function is mutually recursive with the function
4354 above.
4355 
4356 Arguments:
4357   classbits     the bit map for characters < 256
4358   uchardptr     points to the pointer for extra data
4359   options       the options word
4360   cd            contains pointers to tables etc.
4361   p             points to row of 32-bit values, terminated by NOTACHAR
4362   except        character to omit; this is used when adding lists of
4363                   case-equivalent characters to avoid including the one we
4364                   already know about
4365 
4366 Returns:        the number of < 256 characters added
4367                 the pointer to extra data is updated
4368 */
4369 
4370 static int
add_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p,unsigned int except)4371 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4372   compile_data *cd, const pcre_uint32 *p, unsigned int except)
4373 {
4374 int n8 = 0;
4375 while (p[0] < NOTACHAR)
4376   {
4377   int n = 0;
4378   if (p[0] != except)
4379     {
4380     while(p[n+1] == p[0] + n + 1) n++;
4381     n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4382     }
4383   p += n + 1;
4384   }
4385 return n8;
4386 }
4387 
4388 
4389 
4390 /*************************************************
4391 *    Add characters not in a list to a class     *
4392 *************************************************/
4393 
4394 /* This function is used for adding the complement of a list of horizontal or
4395 vertical whitespace to a class. The list must be in order.
4396 
4397 Arguments:
4398   classbits     the bit map for characters < 256
4399   uchardptr     points to the pointer for extra data
4400   options       the options word
4401   cd            contains pointers to tables etc.
4402   p             points to row of 32-bit values, terminated by NOTACHAR
4403 
4404 Returns:        the number of < 256 characters added
4405                 the pointer to extra data is updated
4406 */
4407 
4408 static int
add_not_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p)4409 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4410   int options, compile_data *cd, const pcre_uint32 *p)
4411 {
4412 BOOL utf = (options & PCRE_UTF8) != 0;
4413 int n8 = 0;
4414 if (p[0] > 0)
4415   n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4416 while (p[0] < NOTACHAR)
4417   {
4418   while (p[1] == p[0] + 1) p++;
4419   n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4420     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4421   p++;
4422   }
4423 return n8;
4424 }
4425 
4426 
4427 
4428 /*************************************************
4429 *           Compile one branch                   *
4430 *************************************************/
4431 
/* Scan the pattern, compiling it into a vector. If the options are
changed during the branch, the pointer is used to change the external options
bits. This function is used during the pre-compile phase when we are trying
to find out the amount of memory needed, as well as during the real compile
phase. The value of lengthptr distinguishes the two phases.
4437 
4438 Arguments:
4439   optionsptr        pointer to the option bits
4440   codeptr           points to the pointer to the current code point
4441   ptrptr            points to the current pattern pointer
4442   errorcodeptr      points to error code variable
4443   firstcharptr      place to put the first required character
4444   firstcharflagsptr place to put the first character flags, or a negative number
4445   reqcharptr        place to put the last required character
4446   reqcharflagsptr   place to put the last required character flags, or a negative number
4447   bcptr             points to current branch chain
4448   cond_depth        conditional nesting depth
4449   cd                contains pointers to tables etc.
4450   lengthptr         NULL during the real compile phase
4451                     points to length accumulator during pre-compile phase
4452 
4453 Returns:            TRUE on success
4454                     FALSE, with *errorcodeptr set non-zero on error
4455 */
4456 
4457 static BOOL
compile_branch(int * optionsptr,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,pcre_uint32 * firstcharptr,pcre_int32 * firstcharflagsptr,pcre_uint32 * reqcharptr,pcre_int32 * reqcharflagsptr,branch_chain * bcptr,int cond_depth,compile_data * cd,int * lengthptr)4458 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4459   const pcre_uchar **ptrptr, int *errorcodeptr,
4460   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4461   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4462   branch_chain *bcptr, int cond_depth,
4463   compile_data *cd, int *lengthptr)
4464 {
4465 int repeat_type, op_type;
4466 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
4467 int bravalue = 0;
4468 int greedy_default, greedy_non_default;
4469 pcre_uint32 firstchar, reqchar;
4470 pcre_int32 firstcharflags, reqcharflags;
4471 pcre_uint32 zeroreqchar, zerofirstchar;
4472 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4473 pcre_int32 req_caseopt, reqvary, tempreqvary;
4474 int options = *optionsptr;               /* May change dynamically */
4475 int after_manual_callout = 0;
4476 int length_prevgroup = 0;
4477 register pcre_uint32 c;
4478 int escape;
4479 register pcre_uchar *code = *codeptr;
4480 pcre_uchar *last_code = code;
4481 pcre_uchar *orig_code = code;
4482 pcre_uchar *tempcode;
4483 BOOL inescq = FALSE;
4484 BOOL groupsetfirstchar = FALSE;
4485 const pcre_uchar *ptr = *ptrptr;
4486 const pcre_uchar *tempptr;
4487 const pcre_uchar *nestptr = NULL;
4488 pcre_uchar *previous = NULL;
4489 pcre_uchar *previous_callout = NULL;
4490 size_t item_hwm_offset = 0;
4491 pcre_uint8 classbits[32];
4492 
4493 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4494 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4495 dynamically as we process the pattern. */
4496 
4497 #ifdef SUPPORT_UTF
4498 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4499 BOOL utf = (options & PCRE_UTF8) != 0;
4500 #ifndef COMPILE_PCRE32
4501 pcre_uchar utf_chars[6];
4502 #endif
4503 #else
4504 BOOL utf = FALSE;
4505 #endif
4506 
4507 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4508 class_uchardata always so that it can be passed to add_to_class() always,
4509 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4510 alternative calls for the different cases. */
4511 
4512 pcre_uchar *class_uchardata;
4513 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4514 BOOL xclass;
4515 pcre_uchar *class_uchardata_base;
4516 #endif
4517 
4518 #ifdef PCRE_DEBUG
4519 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4520 #endif
4521 
4522 /* Set up the default and non-default settings for greediness */
4523 
4524 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4525 greedy_non_default = greedy_default ^ 1;
4526 
4527 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4528 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4529 matches a non-fixed char first char; reqchar just remains unset if we never
4530 find one.
4531 
When we hit a repeat whose minimum is zero, we may have to adjust these values
to take the zero repeat into account. This is implemented by setting them to
zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
item types that can be repeated set these backoff variables appropriately. */
4536 
4537 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4538 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4539 
4540 /* The variable req_caseopt contains either the REQ_CASELESS value
4541 or zero, according to the current setting of the caseless flag. The
4542 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4543 firstchar or reqchar variables to record the case status of the
4544 value. This is used only for ASCII characters. */
4545 
4546 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4547 
4548 /* Switch on next character until the end of the branch */
4549 
4550 for (;; ptr++)
4551   {
4552   BOOL negate_class;
4553   BOOL should_flip_negation;
4554   BOOL possessive_quantifier;
4555   BOOL is_quantifier;
4556   BOOL is_recurse;
4557   BOOL reset_bracount;
4558   int class_has_8bitchar;
4559   int class_one_char;
4560 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4561   BOOL xclass_has_prop;
4562 #endif
4563   int newoptions;
4564   int recno;
4565   int refsign;
4566   int skipbytes;
4567   pcre_uint32 subreqchar, subfirstchar;
4568   pcre_int32 subreqcharflags, subfirstcharflags;
4569   int terminator;
4570   unsigned int mclength;
4571   unsigned int tempbracount;
4572   pcre_uint32 ec;
4573   pcre_uchar mcbuffer[8];
4574 
4575   /* Come here to restart the loop without advancing the pointer. */
4576 
4577   REDO_LOOP:
4578 
4579   /* Get next character in the pattern */
4580 
4581   c = *ptr;
4582 
4583   /* If we are at the end of a nested substitution, revert to the outer level
4584   string. Nesting only happens one level deep. */
4585 
4586   if (c == CHAR_NULL && nestptr != NULL)
4587     {
4588     ptr = nestptr;
4589     nestptr = NULL;
4590     c = *ptr;
4591     }
4592 
4593   /* If we are in the pre-compile phase, accumulate the length used for the
4594   previous cycle of this loop. */
4595 
4596   if (lengthptr != NULL)
4597     {
4598 #ifdef PCRE_DEBUG
4599     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
4600 #endif
4601     if (code > cd->start_workspace + cd->workspace_size -
4602         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
4603       {
4604       *errorcodeptr = (code >= cd->start_workspace + cd->workspace_size)?
4605         ERR52 : ERR87;
4606       goto FAILED;
4607       }
4608 
4609     /* There is at least one situation where code goes backwards: this is the
4610     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4611     the class is simply eliminated. However, it is created first, so we have to
4612     allow memory for it. Therefore, don't ever reduce the length at this point.
4613     */
4614 
4615     if (code < last_code) code = last_code;
4616 
4617     /* Paranoid check for integer overflow */
4618 
4619     if (OFLOW_MAX - *lengthptr < code - last_code)
4620       {
4621       *errorcodeptr = ERR20;
4622       goto FAILED;
4623       }
4624 
4625     *lengthptr += (int)(code - last_code);
4626     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4627       (int)(code - last_code), c, c));
4628 
4629     /* If "previous" is set and it is not at the start of the work space, move
4630     it back to there, in order to avoid filling up the work space. Otherwise,
4631     if "previous" is NULL, reset the current code pointer to the start. */
4632 
4633     if (previous != NULL)
4634       {
4635       if (previous > orig_code)
4636         {
4637         memmove(orig_code, previous, IN_UCHARS(code - previous));
4638         code -= previous - orig_code;
4639         previous = orig_code;
4640         }
4641       }
4642     else code = orig_code;
4643 
4644     /* Remember where this code item starts so we can pick up the length
4645     next time round. */
4646 
4647     last_code = code;
4648     }
4649 
4650   /* In the real compile phase, just check the workspace used by the forward
4651   reference list. */
4652 
4653   else if (cd->hwm > cd->start_workspace + cd->workspace_size)
4654     {
4655     *errorcodeptr = ERR52;
4656     goto FAILED;
4657     }
4658 
4659   /* If in \Q...\E, check for the end; if not, we have a literal. Otherwise an
4660   isolated \E is ignored. */
4661 
4662   if (c != CHAR_NULL)
4663     {
4664     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4665       {
4666       inescq = FALSE;
4667       ptr++;
4668       continue;
4669       }
4670     else if (inescq)
4671       {
4672       if (previous_callout != NULL)
4673         {
4674         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
4675           complete_callout(previous_callout, ptr, cd);
4676         previous_callout = NULL;
4677         }
4678       if ((options & PCRE_AUTO_CALLOUT) != 0)
4679         {
4680         previous_callout = code;
4681         code = auto_callout(code, ptr, cd);
4682         }
4683       goto NORMAL_CHAR;
4684       }
4685 
4686     /* Check for the start of a \Q...\E sequence. We must do this here rather
4687     than later in case it is immediately followed by \E, which turns it into a
4688     "do nothing" sequence. */
4689 
4690     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4691       {
4692       inescq = TRUE;
4693       ptr++;
4694       continue;
4695       }
4696     }
4697 
4698   /* In extended mode, skip white space and comments. */
4699 
4700   if ((options & PCRE_EXTENDED) != 0)
4701     {
4702     const pcre_uchar *wscptr = ptr;
4703     while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4704     if (c == CHAR_NUMBER_SIGN)
4705       {
4706       ptr++;
4707       while (*ptr != CHAR_NULL)
4708         {
4709         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4710           {                          /* IS_NEWLINE sets cd->nllen. */
4711           ptr += cd->nllen;
4712           break;
4713           }
4714         ptr++;
4715 #ifdef SUPPORT_UTF
4716         if (utf) FORWARDCHAR(ptr);
4717 #endif
4718         }
4719       }
4720 
4721     /* If we skipped any characters, restart the loop. Otherwise, we didn't see
4722     a comment. */
4723 
4724     if (ptr > wscptr) goto REDO_LOOP;
4725     }
4726 
4727   /* Skip over (?# comments. We need to do this here because we want to know if
4728   the next thing is a quantifier, and these comments may come between an item
4729   and its quantifier. */
4730 
4731   if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
4732       ptr[2] == CHAR_NUMBER_SIGN)
4733     {
4734     ptr += 3;
4735     while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4736     if (*ptr == CHAR_NULL)
4737       {
4738       *errorcodeptr = ERR18;
4739       goto FAILED;
4740       }
4741     continue;
4742     }
4743 
4744   /* See if the next thing is a quantifier. */
4745 
4746   is_quantifier =
4747     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4748     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4749 
4750   /* Fill in length of a previous callout, except when the next thing is a
4751   quantifier or when processing a property substitution string in UCP mode. */
4752 
4753   if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4754        after_manual_callout-- <= 0)
4755     {
4756     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4757       complete_callout(previous_callout, ptr, cd);
4758     previous_callout = NULL;
4759     }
4760 
4761   /* Create auto callout, except for quantifiers, or while processing property
4762   strings that are substituted for \w etc in UCP mode. */
4763 
4764   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4765     {
4766     previous_callout = code;
4767     code = auto_callout(code, ptr, cd);
4768     }
4769 
4770   /* Process the next pattern item. */
4771 
4772   switch(c)
4773     {
4774     /* ===================================================================*/
4775     case CHAR_NULL:                /* The branch terminates at string end */
4776     case CHAR_VERTICAL_LINE:       /* or | or ) */
4777     case CHAR_RIGHT_PARENTHESIS:
4778     *firstcharptr = firstchar;
4779     *firstcharflagsptr = firstcharflags;
4780     *reqcharptr = reqchar;
4781     *reqcharflagsptr = reqcharflags;
4782     *codeptr = code;
4783     *ptrptr = ptr;
4784     if (lengthptr != NULL)
4785       {
4786       if (OFLOW_MAX - *lengthptr < code - last_code)
4787         {
4788         *errorcodeptr = ERR20;
4789         goto FAILED;
4790         }
4791       *lengthptr += (int)(code - last_code);   /* To include callout length */
4792       DPRINTF((">> end branch\n"));
4793       }
4794     return TRUE;
4795 
4796 
4797     /* ===================================================================*/
4798     /* Handle single-character metacharacters. In multiline mode, ^ disables
4799     the setting of any following char as a first character. */
4800 
4801     case CHAR_CIRCUMFLEX_ACCENT:
4802     previous = NULL;
4803     if ((options & PCRE_MULTILINE) != 0)
4804       {
4805       if (firstcharflags == REQ_UNSET)
4806         zerofirstcharflags = firstcharflags = REQ_NONE;
4807       *code++ = OP_CIRCM;
4808       }
4809     else *code++ = OP_CIRC;
4810     break;
4811 
4812     case CHAR_DOLLAR_SIGN:
4813     previous = NULL;
4814     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4815     break;
4816 
4817     /* There can never be a first char if '.' is first, whatever happens about
4818     repeats. The value of reqchar doesn't change either. */
4819 
4820     case CHAR_DOT:
4821     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4822     zerofirstchar = firstchar;
4823     zerofirstcharflags = firstcharflags;
4824     zeroreqchar = reqchar;
4825     zeroreqcharflags = reqcharflags;
4826     previous = code;
4827     item_hwm_offset = cd->hwm - cd->start_workspace;
4828     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4829     break;
4830 
4831 
4832     /* ===================================================================*/
4833     /* Character classes. If the included characters are all < 256, we build a
4834     32-byte bitmap of the permitted characters, except in the special case
4835     where there is only one such character. For negated classes, we build the
4836     map as usual, then invert it at the end. However, we use a different opcode
4837     so that data characters > 255 can be handled correctly.
4838 
    If the class contains characters outside the 0-255 range, a different
    opcode is compiled. It may optionally have a bit map for characters < 256,
    but those above are explicitly listed afterwards. A flag byte tells
    whether the bitmap is present, and whether this is a negated class or not.
4843 
4844     In JavaScript compatibility mode, an isolated ']' causes an error. In
4845     default (Perl) mode, it is treated as a data character. */
4846 
4847     case CHAR_RIGHT_SQUARE_BRACKET:
4848     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4849       {
4850       *errorcodeptr = ERR64;
4851       goto FAILED;
4852       }
4853     goto NORMAL_CHAR;
4854 
4855     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4856     used for "start of word" and "end of word". As these are otherwise illegal
4857     sequences, we don't break anything by recognizing them. They are replaced
4858     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4859     erroneous and are handled by the normal code below. */
4860 
4861     case CHAR_LEFT_SQUARE_BRACKET:
4862     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4863       {
4864       nestptr = ptr + 7;
4865       ptr = sub_start_of_word;
4866       goto REDO_LOOP;
4867       }
4868 
4869     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4870       {
4871       nestptr = ptr + 7;
4872       ptr = sub_end_of_word;
4873       goto REDO_LOOP;
4874       }
4875 
4876     /* Handle a real character class. */
4877 
4878     previous = code;
4879     item_hwm_offset = cd->hwm - cd->start_workspace;
4880 
4881     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4882     they are encountered at the top level, so we'll do that too. */
4883 
4884     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4885          ptr[1] == CHAR_EQUALS_SIGN) &&
4886         check_posix_syntax(ptr, &tempptr))
4887       {
4888       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4889       goto FAILED;
4890       }
4891 
4892     /* If the first character is '^', set the negation flag and skip it. Also,
4893     if the first few characters (either before or after ^) are \Q\E or \E we
4894     skip them too. This makes for compatibility with Perl. */
4895 
4896     negate_class = FALSE;
4897     for (;;)
4898       {
4899       c = *(++ptr);
4900       if (c == CHAR_BACKSLASH)
4901         {
4902         if (ptr[1] == CHAR_E)
4903           ptr++;
4904         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4905           ptr += 3;
4906         else
4907           break;
4908         }
4909       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4910         negate_class = TRUE;
4911       else break;
4912       }
4913 
4914     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4915     an initial ']' is taken as a data character -- the code below handles
4916     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4917     [^] must match any character, so generate OP_ALLANY. */
4918 
4919     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4920         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4921       {
4922       *code++ = negate_class? OP_ALLANY : OP_FAIL;
4923       if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4924       zerofirstchar = firstchar;
4925       zerofirstcharflags = firstcharflags;
4926       break;
4927       }
4928 
4929     /* If a class contains a negative special such as \S, we need to flip the
4930     negation flag at the end, so that support for characters > 255 works
4931     correctly (they are all included in the class). */
4932 
4933     should_flip_negation = FALSE;
4934 
4935     /* Extended class (xclass) will be used when characters > 255
4936     might match. */
4937 
4938 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4939     xclass = FALSE;
4940     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4941     class_uchardata_base = class_uchardata;   /* Save the start */
4942 #endif
4943 
4944     /* For optimization purposes, we track some properties of the class:
4945     class_has_8bitchar will be non-zero if the class contains at least one <
4946     256 character; class_one_char will be 1 if the class contains just one
4947     character; xclass_has_prop will be TRUE if unicode property checks
4948     are present in the class. */
4949 
4950     class_has_8bitchar = 0;
4951     class_one_char = 0;
4952 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4953     xclass_has_prop = FALSE;
4954 #endif
4955 
4956     /* Initialize the 32-char bit map to all zeros. We build the map in a
4957     temporary bit of memory, in case the class contains fewer than two
4958     8-bit characters because in that case the compiled code doesn't use the bit
4959     map. */
4960 
4961     memset(classbits, 0, 32 * sizeof(pcre_uint8));
4962 
4963     /* Process characters until ] is reached. By writing this as a "do" it
4964     means that an initial ] is taken as a data character. At the start of the
4965     loop, c contains the first byte of the character. */
4966 
4967     if (c != CHAR_NULL) do
4968       {
4969       const pcre_uchar *oldptr;
4970 
4971 #ifdef SUPPORT_UTF
4972       if (utf && HAS_EXTRALEN(c))
4973         {                           /* Braces are required because the */
4974         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
4975         }
4976 #endif
4977 
4978 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4979       /* In the pre-compile phase, accumulate the length of any extra
4980       data and reset the pointer. This is so that very large classes that
4981       contain a zillion > 255 characters no longer overwrite the work space
4982       (which is on the stack). We have to remember that there was XCLASS data,
4983       however. */
4984 
4985       if (class_uchardata > class_uchardata_base) xclass = TRUE;
4986 
4987       if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4988         {
4989         *lengthptr += (int)(class_uchardata - class_uchardata_base);
4990         class_uchardata = class_uchardata_base;
4991         }
4992 #endif
4993 
4994       /* Inside \Q...\E everything is literal except \E */
4995 
4996       if (inescq)
4997         {
4998         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
4999           {
5000           inescq = FALSE;                   /* Reset literal state */
5001           ptr++;                            /* Skip the 'E' */
5002           continue;                         /* Carry on with next */
5003           }
5004         goto CHECK_RANGE;                   /* Could be range if \E follows */
5005         }
5006 
5007       /* Handle POSIX class names. Perl allows a negation extension of the
5008       form [:^name:]. A square bracket that doesn't match the syntax is
5009       treated as a literal. We also recognize the POSIX constructions
5010       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
5011       5.6 and 5.8 do. */
5012 
5013       if (c == CHAR_LEFT_SQUARE_BRACKET &&
5014           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5015            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
5016         {
5017         BOOL local_negate = FALSE;
5018         int posix_class, taboffset, tabopt;
5019         register const pcre_uint8 *cbits = cd->cbits;
5020         pcre_uint8 pbits[32];
5021 
5022         if (ptr[1] != CHAR_COLON)
5023           {
5024           *errorcodeptr = ERR31;
5025           goto FAILED;
5026           }
5027 
5028         ptr += 2;
5029         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
5030           {
5031           local_negate = TRUE;
5032           should_flip_negation = TRUE;  /* Note negative special */
5033           ptr++;
5034           }
5035 
5036         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
5037         if (posix_class < 0)
5038           {
5039           *errorcodeptr = ERR30;
5040           goto FAILED;
5041           }
5042 
5043         /* If matching is caseless, upper and lower are converted to
5044         alpha. This relies on the fact that the class table starts with
5045         alpha, lower, upper as the first 3 entries. */
5046 
5047         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
5048           posix_class = 0;
5049 
5050         /* When PCRE_UCP is set, some of the POSIX classes are converted to
5051         different escape sequences that use Unicode properties \p or \P. Others
5052         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5053         directly. */
5054 
5055 #ifdef SUPPORT_UCP
5056         if ((options & PCRE_UCP) != 0)
5057           {
5058           unsigned int ptype = 0;
5059           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5060 
5061           /* The posix_substitutes table specifies which POSIX classes can be
5062           converted to \p or \P items. */
5063 
5064           if (posix_substitutes[pc] != NULL)
5065             {
5066             nestptr = tempptr + 1;
5067             ptr = posix_substitutes[pc] - 1;
5068             continue;
5069             }
5070 
5071           /* There are three other classes that generate special property calls
5072           that are recognized only in an XCLASS. */
5073 
5074           else switch(posix_class)
5075             {
5076             case PC_GRAPH:
5077             ptype = PT_PXGRAPH;
5078             /* Fall through */
5079             case PC_PRINT:
5080             if (ptype == 0) ptype = PT_PXPRINT;
5081             /* Fall through */
5082             case PC_PUNCT:
5083             if (ptype == 0) ptype = PT_PXPUNCT;
5084             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5085             *class_uchardata++ = ptype;
5086             *class_uchardata++ = 0;
5087             xclass_has_prop = TRUE;
5088             ptr = tempptr + 1;
5089             continue;
5090 
5091             /* For the other POSIX classes (ascii, cntrl, xdigit) we are going
5092             to fall through to the non-UCP case and build a bit map for
5093             characters with code points less than 256. If we are in a negated
5094             POSIX class, characters with code points greater than 255 must
5095             either all match or all not match. In the special case where we
5096             have not yet generated any xclass data, and this is the final item
5097             in the overall class, we need do nothing: later on, the opcode
5098             OP_NCLASS will be used to indicate that characters greater than 255
5099             are acceptable. If we have already seen an xclass item or one may
5100             follow (we have to assume that it might if this is not the end of
5101             the class), explicitly list all wide codepoints, which will then
5102             either not match or match, depending on whether the class is or is
5103             not negated. */
5104 
5105             default:
5106             if (local_negate &&
5107                 (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
5108               {
5109               *class_uchardata++ = XCL_RANGE;
5110               class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5111               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5112               }
5113             break;
5114             }
5115           }
5116 #endif
5117         /* In the non-UCP case, or when UCP makes no difference, we build the
5118         bit map for the POSIX class in a chunk of local store because we may be
5119         adding and subtracting from it, and we don't want to subtract bits that
5120         may be in the main map already. At the end we or the result into the
5121         bit map that is being built. */
5122 
5123         posix_class *= 3;
5124 
5125         /* Copy in the first table (always present) */
5126 
5127         memcpy(pbits, cbits + posix_class_maps[posix_class],
5128           32 * sizeof(pcre_uint8));
5129 
5130         /* If there is a second table, add or remove it as required. */
5131 
5132         taboffset = posix_class_maps[posix_class + 1];
5133         tabopt = posix_class_maps[posix_class + 2];
5134 
5135         if (taboffset >= 0)
5136           {
5137           if (tabopt >= 0)
5138             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5139           else
5140             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5141           }
5142 
5143         /* Now see if we need to remove any special characters. An option
5144         value of 1 removes vertical space and 2 removes underscore. */
5145 
5146         if (tabopt < 0) tabopt = -tabopt;
5147         if (tabopt == 1) pbits[1] &= ~0x3c;
5148           else if (tabopt == 2) pbits[11] &= 0x7f;
5149 
5150         /* Add the POSIX table or its complement into the main table that is
5151         being built and we are done. */
5152 
5153         if (local_negate)
5154           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5155         else
5156           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5157 
5158         ptr = tempptr + 1;
5159         /* Every class contains at least one < 256 character. */
5160         class_has_8bitchar = 1;
5161         /* Every class contains at least two characters. */
5162         class_one_char = 2;
5163         continue;    /* End of POSIX syntax handling */
5164         }
5165 
5166       /* Backslash may introduce a single character, or it may introduce one
5167       of the specials, which just set a flag. The sequence \b is a special
5168       case. Inside a class (and only there) it is treated as backspace. We
5169       assume that other escapes have more than one character in them, so
5170       speculatively set both class_has_8bitchar and class_one_char bigger
5171       than one. Unrecognized escapes fall through and are either treated
5172       as literal characters (by default), or are faulted if
5173       PCRE_EXTRA is set. */
5174 
5175       if (c == CHAR_BACKSLASH)
5176         {
5177         escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5178           TRUE);
5179         if (*errorcodeptr != 0) goto FAILED;
5180         if (escape == 0) c = ec;
5181         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5182         else if (escape == ESC_N)          /* \N is not supported in a class */
5183           {
5184           *errorcodeptr = ERR71;
5185           goto FAILED;
5186           }
5187         else if (escape == ESC_Q)            /* Handle start of quoted string */
5188           {
5189           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5190             {
5191             ptr += 2; /* avoid empty string */
5192             }
5193           else inescq = TRUE;
5194           continue;
5195           }
5196         else if (escape == ESC_E) continue;  /* Ignore orphan \E */
5197 
5198         else
5199           {
5200           register const pcre_uint8 *cbits = cd->cbits;
5201           /* Every class contains at least two < 256 characters. */
5202           class_has_8bitchar++;
5203           /* Every class contains at least two characters. */
5204           class_one_char += 2;
5205 
5206           switch (escape)
5207             {
5208 #ifdef SUPPORT_UCP
5209             case ESC_du:     /* These are the values given for \d etc */
5210             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
5211             case ESC_wu:     /* escape sequence with an appropriate \p */
5212             case ESC_WU:     /* or \P to test Unicode properties instead */
5213             case ESC_su:     /* of the default ASCII testing. */
5214             case ESC_SU:
5215             nestptr = ptr;
5216             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
5217             class_has_8bitchar--;                /* Undo! */
5218             continue;
5219 #endif
5220             case ESC_d:
5221             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5222             continue;
5223 
5224             case ESC_D:
5225             should_flip_negation = TRUE;
5226             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5227             continue;
5228 
5229             case ESC_w:
5230             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5231             continue;
5232 
5233             case ESC_W:
5234             should_flip_negation = TRUE;
5235             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5236             continue;
5237 
5238             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5239             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5240             previously set by something earlier in the character class.
5241             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5242             we could just adjust the appropriate bit. From PCRE 8.34 we no
5243             longer treat \s and \S specially. */
5244 
5245             case ESC_s:
5246             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5247             continue;
5248 
5249             case ESC_S:
5250             should_flip_negation = TRUE;
5251             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5252             continue;
5253 
5254             /* The rest apply in both UCP and non-UCP cases. */
5255 
5256             case ESC_h:
5257             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5258               PRIV(hspace_list), NOTACHAR);
5259             continue;
5260 
5261             case ESC_H:
5262             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5263               cd, PRIV(hspace_list));
5264             continue;
5265 
5266             case ESC_v:
5267             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5268               PRIV(vspace_list), NOTACHAR);
5269             continue;
5270 
5271             case ESC_V:
5272             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5273               cd, PRIV(vspace_list));
5274             continue;
5275 
5276             case ESC_p:
5277             case ESC_P:
5278 #ifdef SUPPORT_UCP
5279               {
5280               BOOL negated;
5281               unsigned int ptype = 0, pdata = 0;
5282               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5283                 goto FAILED;
5284               *class_uchardata++ = ((escape == ESC_p) != negated)?
5285                 XCL_PROP : XCL_NOTPROP;
5286               *class_uchardata++ = ptype;
5287               *class_uchardata++ = pdata;
5288               xclass_has_prop = TRUE;
5289               class_has_8bitchar--;                /* Undo! */
5290               continue;
5291               }
5292 #else
5293             *errorcodeptr = ERR45;
5294             goto FAILED;
5295 #endif
5296             /* Unrecognized escapes are faulted if PCRE is running in its
5297             strict mode. By default, for compatibility with Perl, they are
5298             treated as literals. */
5299 
5300             default:
5301             if ((options & PCRE_EXTRA) != 0)
5302               {
5303               *errorcodeptr = ERR7;
5304               goto FAILED;
5305               }
5306             class_has_8bitchar--;    /* Undo the speculative increase. */
5307             class_one_char -= 2;     /* Undo the speculative increase. */
5308             c = *ptr;                /* Get the final character and fall through */
5309             break;
5310             }
5311           }
5312 
5313         /* Fall through if the escape just defined a single character (c >= 0).
5314         This may be greater than 256. */
5315 
5316         escape = 0;
5317 
5318         }   /* End of backslash handling */
5319 
5320       /* A character may be followed by '-' to form a range. However, Perl does
5321       not permit ']' to be the end of the range. A '-' character at the end is
5322       treated as a literal. Perl ignores orphaned \E sequences entirely. The
5323       code for handling \Q and \E is messy. */
5324 
5325       CHECK_RANGE:
5326       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5327         {
5328         inescq = FALSE;
5329         ptr += 2;
5330         }
5331       oldptr = ptr;
5332 
5333       /* Remember if \r or \n were explicitly used */
5334 
5335       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5336 
5337       /* Check for range */
5338 
5339       if (!inescq && ptr[1] == CHAR_MINUS)
5340         {
5341         pcre_uint32 d;
5342         ptr += 2;
5343         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5344 
5345         /* If we hit \Q (not followed by \E) at this point, go into escaped
5346         mode. */
5347 
5348         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5349           {
5350           ptr += 2;
5351           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5352             { ptr += 2; continue; }
5353           inescq = TRUE;
5354           break;
5355           }
5356 
5357         /* Minus (hyphen) at the end of a class is treated as a literal, so put
5358         back the pointer and jump to handle the character that preceded it. */
5359 
5360         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5361           {
5362           ptr = oldptr;
5363           goto CLASS_SINGLE_CHARACTER;
5364           }
5365 
5366         /* Otherwise, we have a potential range; pick up the next character */
5367 
5368 #ifdef SUPPORT_UTF
5369         if (utf)
5370           {                           /* Braces are required because the */
5371           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
5372           }
5373         else
5374 #endif
5375         d = *ptr;  /* Not UTF-8 mode */
5376 
5377         /* The second part of a range can be a single-character escape
5378         sequence, but not any of the other escapes. Perl treats a hyphen as a
5379         literal in such circumstances. However, in Perl's warning mode, a
5380         warning is given, so PCRE now faults it as it is almost certainly a
5381         mistake on the user's part. */
5382 
5383         if (!inescq)
5384           {
5385           if (d == CHAR_BACKSLASH)
5386             {
5387             int descape;
5388             descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5389             if (*errorcodeptr != 0) goto FAILED;
5390 
5391             /* 0 means a character was put into d; \b is backspace; any other
5392             special causes an error. */
5393 
5394             if (descape != 0)
5395               {
5396               if (descape == ESC_b) d = CHAR_BS; else
5397                 {
5398                 *errorcodeptr = ERR83;
5399                 goto FAILED;
5400                 }
5401               }
5402             }
5403 
5404           /* A hyphen followed by a POSIX class is treated in the same way. */
5405 
5406           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5407                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5408                     ptr[1] == CHAR_EQUALS_SIGN) &&
5409                    check_posix_syntax(ptr, &tempptr))
5410             {
5411             *errorcodeptr = ERR83;
5412             goto FAILED;
5413             }
5414           }
5415 
5416         /* Check that the two values are in the correct order. Optimize
5417         one-character ranges. */
5418 
5419         if (d < c)
5420           {
5421           *errorcodeptr = ERR8;
5422           goto FAILED;
5423           }
5424         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
5425 
5426         /* We have found a character range, so single character optimizations
5427         cannot be done anymore. Any value greater than 1 indicates that there
5428         is more than one character. */
5429 
5430         class_one_char = 2;
5431 
5432         /* Remember an explicit \r or \n, and add the range to the class. */
5433 
5434         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5435 
5436         class_has_8bitchar +=
5437           add_to_class(classbits, &class_uchardata, options, cd, c, d);
5438 
5439         continue;   /* Go get the next char in the class */
5440         }
5441 
5442       /* Handle a single character - we can get here for a normal non-escape
5443       char, or after \ that introduces a single character or for an apparent
5444       range that isn't. Only the value 1 matters for class_one_char, so don't
5445       increase it if it is already 2 or more ... just in case there's a class
5446       with a zillion characters in it. */
5447 
5448       CLASS_SINGLE_CHARACTER:
5449       if (class_one_char < 2) class_one_char++;
5450 
5451       /* If xclass_has_prop is false and class_one_char is 1, we have the first
5452       single character in the class, and there have been no prior ranges, or
5453       XCLASS items generated by escapes. If this is the final character in the
5454       class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5455       if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5456       can cause firstchar to be set. Otherwise, there can be no first char if
5457       this item is first, whatever repeat count may follow. In the case of
5458       reqchar, save the previous value for reinstating. */
5459 
5460       if (!inescq &&
5461 #ifdef SUPPORT_UCP
5462           !xclass_has_prop &&
5463 #endif
5464           class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5465         {
5466         ptr++;
5467         zeroreqchar = reqchar;
5468         zeroreqcharflags = reqcharflags;
5469 
5470         if (negate_class)
5471           {
5472 #ifdef SUPPORT_UCP
5473           int d;
5474 #endif
5475           if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5476           zerofirstchar = firstchar;
5477           zerofirstcharflags = firstcharflags;
5478 
5479           /* For caseless UTF-8 mode when UCP support is available, check
5480           whether this character has more than one other case. If so, generate
5481           a special OP_NOTPROP item instead of OP_NOTI. */
5482 
5483 #ifdef SUPPORT_UCP
5484           if (utf && (options & PCRE_CASELESS) != 0 &&
5485               (d = UCD_CASESET(c)) != 0)
5486             {
5487             *code++ = OP_NOTPROP;
5488             *code++ = PT_CLIST;
5489             *code++ = d;
5490             }
5491           else
5492 #endif
5493           /* Char has only one other case, or UCP not available */
5494 
5495             {
5496             *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5497 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5498             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5499               code += PRIV(ord2utf)(c, code);
5500             else
5501 #endif
5502               *code++ = c;
5503             }
5504 
5505           /* We are finished with this character class */
5506 
5507           goto END_CLASS;
5508           }
5509 
5510         /* For a single, positive character, get the value into mcbuffer, and
5511         then we can handle this with the normal one-character code. */
5512 
5513 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5514         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5515           mclength = PRIV(ord2utf)(c, mcbuffer);
5516         else
5517 #endif
5518           {
5519           mcbuffer[0] = c;
5520           mclength = 1;
5521           }
5522         goto ONE_CHAR;
5523         }       /* End of 1-char optimization */
5524 
5525       /* There is more than one character in the class, or an XCLASS item
5526       has been generated. Add this character to the class. */
5527 
5528       class_has_8bitchar +=
5529         add_to_class(classbits, &class_uchardata, options, cd, c, c);
5530       }
5531 
5532     /* Loop until ']' reached. This "while" is the end of the "do" far above.
5533     If we are at the end of an internal nested string, revert to the outer
5534     string. */
5535 
5536     while (((c = *(++ptr)) != CHAR_NULL ||
5537            (nestptr != NULL &&
5538              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5539            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5540 
5541     /* Check for missing terminating ']' */
5542 
5543     if (c == CHAR_NULL)
5544       {
5545       *errorcodeptr = ERR6;
5546       goto FAILED;
5547       }
5548 
5549     /* We will need an XCLASS if data has been placed in class_uchardata. In
5550     the second phase this is a sufficient test. However, in the pre-compile
5551     phase, class_uchardata gets emptied to prevent workspace overflow, so
5552     only if the very last character in the class needs XCLASS will it contain
5553     anything at this point. For this reason, xclass gets set TRUE above when
5554     class_uchardata is emptied, and that's why this code is the way it is here
5555     instead of just doing a test on class_uchardata below. */
5556 
5557 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5558     if (class_uchardata > class_uchardata_base) xclass = TRUE;
5559 #endif
5560 
5561     /* If this is the first thing in the branch, there can be no first char
5562     setting, whatever the repeat count. Any reqchar setting must remain
5563     unchanged after any kind of repeat. */
5564 
5565     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5566     zerofirstchar = firstchar;
5567     zerofirstcharflags = firstcharflags;
5568     zeroreqchar = reqchar;
5569     zeroreqcharflags = reqcharflags;
5570 
5571     /* If there are characters with values > 255, we have to compile an
5572     extended class, with its own opcode, unless there was a negated special
5573     such as \S in the class, and PCRE_UCP is not set, because in that case all
5574     characters > 255 are in the class, so any that were explicitly given as
5575     well can be ignored. If (when there are explicit characters > 255 that must
5576     be listed) there are no characters < 256, we can omit the bitmap in the
5577     actual compiled code. */
5578 
5579 #ifdef SUPPORT_UTF
5580     if (xclass && (xclass_has_prop || !should_flip_negation ||
5581         (options & PCRE_UCP) != 0))
5582 #elif !defined COMPILE_PCRE8
5583     if (xclass && (xclass_has_prop || !should_flip_negation))
5584 #endif
5585 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5586       {
5587       /* For non-UCP wide characters, in a non-negative class containing \S or
5588       similar (should_flip_negation is set), all characters greater than 255
5589       must be in the class. */
5590 
5591       if (
5592 #if defined COMPILE_PCRE8
5593            utf &&
5594 #endif
5595            should_flip_negation && !negate_class && (options & PCRE_UCP) == 0)
5596         {
5597         *class_uchardata++ = XCL_RANGE;
5598         if (utf)   /* Will always be utf in the 8-bit library */
5599           {
5600           class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5601           class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5602           }
5603         else       /* Can only happen for the 16-bit & 32-bit libraries */
5604           {
5605 #if defined COMPILE_PCRE16
5606           *class_uchardata++ = 0x100;
5607           *class_uchardata++ = 0xffffu;
5608 #elif defined COMPILE_PCRE32
5609           *class_uchardata++ = 0x100;
5610           *class_uchardata++ = 0xffffffffu;
5611 #endif
5612           }
5613         }
5614 
5615       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
5616       *code++ = OP_XCLASS;
5617       code += LINK_SIZE;
5618       *code = negate_class? XCL_NOT:0;
5619       if (xclass_has_prop) *code |= XCL_HASPROP;
5620 
5621       /* If the map is required, move up the extra data to make room for it;
5622       otherwise just move the code pointer to the end of the extra data. */
5623 
5624       if (class_has_8bitchar > 0)
5625         {
5626         *code++ |= XCL_MAP;
5627         memmove(code + (32 / sizeof(pcre_uchar)), code,
5628           IN_UCHARS(class_uchardata - code));
5629         if (negate_class && !xclass_has_prop)
5630           for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5631         memcpy(code, classbits, 32);
5632         code = class_uchardata + (32 / sizeof(pcre_uchar));
5633         }
5634       else code = class_uchardata;
5635 
5636       /* Now fill in the complete length of the item */
5637 
5638       PUT(previous, 1, (int)(code - previous));
5639       break;   /* End of class handling */
5640       }
5641 
5642     /* Even though any XCLASS list is now discarded, we must allow for
5643     its memory. */
5644 
5645     if (lengthptr != NULL)
5646       *lengthptr += (int)(class_uchardata - class_uchardata_base);
5647 #endif
5648 
5649     /* If there are no characters > 255, or they are all to be included or
5650     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5651     whole class was negated and whether there were negative specials such as \S
5652     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5653     negating it if necessary. */
5654 
5655     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5656     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
5657       {
5658       if (negate_class)
5659         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5660       memcpy(code, classbits, 32);
5661       }
5662     code += 32 / sizeof(pcre_uchar);
5663 
5664     END_CLASS:
5665     break;
5666 
5667 
5668     /* ===================================================================*/
5669     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5670     has been tested above. */
5671 
5672     case CHAR_LEFT_CURLY_BRACKET:
5673     if (!is_quantifier) goto NORMAL_CHAR;
5674     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5675     if (*errorcodeptr != 0) goto FAILED;
5676     goto REPEAT;
5677 
5678     case CHAR_ASTERISK:
5679     repeat_min = 0;
5680     repeat_max = -1;
5681     goto REPEAT;
5682 
5683     case CHAR_PLUS:
5684     repeat_min = 1;
5685     repeat_max = -1;
5686     goto REPEAT;
5687 
5688     case CHAR_QUESTION_MARK:
5689     repeat_min = 0;
5690     repeat_max = 1;
5691 
5692     REPEAT:
5693     if (previous == NULL)
5694       {
5695       *errorcodeptr = ERR9;
5696       goto FAILED;
5697       }
5698 
5699     if (repeat_min == 0)
5700       {
5701       firstchar = zerofirstchar;    /* Adjust for zero repeat */
5702       firstcharflags = zerofirstcharflags;
5703       reqchar = zeroreqchar;        /* Ditto */
5704       reqcharflags = zeroreqcharflags;
5705       }
5706 
5707     /* Remember whether this is a variable length repeat */
5708 
5709     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5710 
5711     op_type = 0;                    /* Default single-char op codes */
5712     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
5713 
5714     /* Save start of previous item, in case we have to move it up in order to
5715     insert something before it. */
5716 
5717     tempcode = previous;
5718 
5719     /* Before checking for a possessive quantifier, we must skip over
5720     whitespace and comments in extended mode because Perl allows white space at
5721     this point. */
5722 
5723     if ((options & PCRE_EXTENDED) != 0)
5724       {
5725       const pcre_uchar *p = ptr + 1;
5726       for (;;)
5727         {
5728         while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5729         if (*p != CHAR_NUMBER_SIGN) break;
5730         p++;
5731         while (*p != CHAR_NULL)
5732           {
5733           if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5734             {                        /* IS_NEWLINE sets cd->nllen. */
5735             p += cd->nllen;
5736             break;
5737             }
5738           p++;
5739 #ifdef SUPPORT_UTF
5740           if (utf) FORWARDCHAR(p);
5741 #endif
5742           }           /* Loop for comment characters */
5743         }             /* Loop for multiple comments */
5744       ptr = p - 1;    /* Character before the next significant one. */
5745       }
5746 
5747     /* We also need to skip over (?# comments, which are not dependent on
5748     extended mode. */
5749 
5750     if (ptr[1] == CHAR_LEFT_PARENTHESIS && ptr[2] == CHAR_QUESTION_MARK &&
5751         ptr[3] == CHAR_NUMBER_SIGN)
5752       {
5753       ptr += 4;
5754       while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5755       if (*ptr == CHAR_NULL)
5756         {
5757         *errorcodeptr = ERR18;
5758         goto FAILED;
5759         }
5760       }
5761 
5762     /* If the next character is '+', we have a possessive quantifier. This
5763     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5764     If the next character is '?' this is a minimizing repeat, by default,
5765     but if PCRE_UNGREEDY is set, it works the other way round. We change the
5766     repeat type to the non-default. */
5767 
5768     if (ptr[1] == CHAR_PLUS)
5769       {
5770       repeat_type = 0;                  /* Force greedy */
5771       possessive_quantifier = TRUE;
5772       ptr++;
5773       }
5774     else if (ptr[1] == CHAR_QUESTION_MARK)
5775       {
5776       repeat_type = greedy_non_default;
5777       ptr++;
5778       }
5779     else repeat_type = greedy_default;
5780 
5781     /* If previous was a recursion call, wrap it in atomic brackets so that
5782     previous becomes the atomic group. All recursions were so wrapped in the
5783     past, but it no longer happens for non-repeated recursions. In fact, the
5784     repeated ones could be re-implemented independently so as not to need this,
5785     but for the moment we rely on the code for repeating groups. */
5786 
5787     if (*previous == OP_RECURSE)
5788       {
5789       memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5790       *previous = OP_ONCE;
5791       PUT(previous, 1, 2 + 2*LINK_SIZE);
5792       previous[2 + 2*LINK_SIZE] = OP_KET;
5793       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5794       code += 2 + 2 * LINK_SIZE;
5795       length_prevgroup = 3 + 3*LINK_SIZE;
5796 
5797       /* When actually compiling, we need to check whether this was a forward
5798       reference, and if so, adjust the offset. */
5799 
5800       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5801         {
5802         int offset = GET(cd->hwm, -LINK_SIZE);
5803         if (offset == previous + 1 - cd->start_code)
5804           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5805         }
5806       }
5807 
5808     /* Now handle repetition for the different types of item. */
5809 
5810     /* If previous was a character or negated character match, abolish the item
5811     and generate a repeat item instead. If a char item has a minimum of more
5812     than one, ensure that it is set in reqchar - it might not be if a sequence
5813     such as x{3} is the first thing in a branch because the x will have gone
5814     into firstchar instead.  */
5815 
5816     if (*previous == OP_CHAR || *previous == OP_CHARI
5817         || *previous == OP_NOT || *previous == OP_NOTI)
5818       {
5819       switch (*previous)
5820         {
5821         default: /* Make compiler happy. */
5822         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
5823         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5824         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
5825         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
5826         }
5827 
5828       /* Deal with UTF characters that take up more than one character. It's
5829       easier to write this out separately than try to macrify it. Use c to
5830       hold the length of the character in bytes, plus UTF_LENGTH to flag that
5831       it's a length rather than a small character. */
5832 
5833 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5834       if (utf && NOT_FIRSTCHAR(code[-1]))
5835         {
5836         pcre_uchar *lastchar = code - 1;
5837         BACKCHAR(lastchar);
5838         c = (int)(code - lastchar);     /* Length of UTF-8 character */
5839         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5840         c |= UTF_LENGTH;                /* Flag c as a length */
5841         }
5842       else
5843 #endif /* SUPPORT_UTF */
5844 
5845       /* Handle the case of a single character - either with no UTF support, or
5846       with UTF disabled, or for a single character UTF character. */
5847         {
5848         c = code[-1];
5849         if (*previous <= OP_CHARI && repeat_min > 1)
5850           {
5851           reqchar = c;
5852           reqcharflags = req_caseopt | cd->req_varyopt;
5853           }
5854         }
5855 
5856       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5857       }
5858 
5859     /* If previous was a character type match (\d or similar), abolish it and
5860     create a suitable repeat item. The code is shared with single-character
5861     repeats by setting op_type to add a suitable offset into repeat_type. Note
5862     that the Unicode property types will be present only when SUPPORT_UCP is
5863     defined, but we don't wrap the little bits of code here because it just
5864     makes it horribly messy. */
5865 
5866     else if (*previous < OP_EODN)
5867       {
5868       pcre_uchar *oldcode;
5869       int prop_type, prop_value;
5870       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
5871       c = *previous;
5872 
5873       OUTPUT_SINGLE_REPEAT:
5874       if (*previous == OP_PROP || *previous == OP_NOTPROP)
5875         {
5876         prop_type = previous[1];
5877         prop_value = previous[2];
5878         }
5879       else prop_type = prop_value = -1;
5880 
5881       oldcode = code;
5882       code = previous;                  /* Usually overwrite previous item */
5883 
5884       /* If the maximum is zero then the minimum must also be zero; Perl allows
5885       this case, so we do too - by simply omitting the item altogether. */
5886 
5887       if (repeat_max == 0) goto END_REPEAT;
5888 
5889       /* Combine the op_type with the repeat_type */
5890 
5891       repeat_type += op_type;
5892 
5893       /* A minimum of zero is handled either as the special case * or ?, or as
5894       an UPTO, with the maximum given. */
5895 
5896       if (repeat_min == 0)
5897         {
5898         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5899           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5900         else
5901           {
5902           *code++ = OP_UPTO + repeat_type;
5903           PUT2INC(code, 0, repeat_max);
5904           }
5905         }
5906 
5907       /* A repeat minimum of 1 is optimized into some special cases. If the
5908       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5909       left in place and, if the maximum is greater than 1, we use OP_UPTO with
5910       one less than the maximum. */
5911 
5912       else if (repeat_min == 1)
5913         {
5914         if (repeat_max == -1)
5915           *code++ = OP_PLUS + repeat_type;
5916         else
5917           {
5918           code = oldcode;                 /* leave previous item in place */
5919           if (repeat_max == 1) goto END_REPEAT;
5920           *code++ = OP_UPTO + repeat_type;
5921           PUT2INC(code, 0, repeat_max - 1);
5922           }
5923         }
5924 
5925       /* The case {n,n} is just an EXACT, while the general case {n,m} is
5926       handled as an EXACT followed by an UPTO. */
5927 
5928       else
5929         {
5930         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
5931         PUT2INC(code, 0, repeat_min);
5932 
5933         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5934         we have to insert the character for the previous code. For a repeated
5935         Unicode property match, there are two extra bytes that define the
5936         required property. In UTF-8 mode, long characters have their length in
5937         c, with the UTF_LENGTH bit as a flag. */
5938 
5939         if (repeat_max < 0)
5940           {
5941 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5942           if (utf && (c & UTF_LENGTH) != 0)
5943             {
5944             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5945             code += c & 7;
5946             }
5947           else
5948 #endif
5949             {
5950             *code++ = c;
5951             if (prop_type >= 0)
5952               {
5953               *code++ = prop_type;
5954               *code++ = prop_value;
5955               }
5956             }
5957           *code++ = OP_STAR + repeat_type;
5958           }
5959 
5960         /* Else insert an UPTO if the max is greater than the min, again
5961         preceded by the character, for the previously inserted code. If the
5962         UPTO is just for 1 instance, we can use QUERY instead. */
5963 
5964         else if (repeat_max != repeat_min)
5965           {
5966 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5967           if (utf && (c & UTF_LENGTH) != 0)
5968             {
5969             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5970             code += c & 7;
5971             }
5972           else
5973 #endif
5974           *code++ = c;
5975           if (prop_type >= 0)
5976             {
5977             *code++ = prop_type;
5978             *code++ = prop_value;
5979             }
5980           repeat_max -= repeat_min;
5981 
5982           if (repeat_max == 1)
5983             {
5984             *code++ = OP_QUERY + repeat_type;
5985             }
5986           else
5987             {
5988             *code++ = OP_UPTO + repeat_type;
5989             PUT2INC(code, 0, repeat_max);
5990             }
5991           }
5992         }
5993 
5994       /* The character or character type itself comes last in all cases. */
5995 
5996 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5997       if (utf && (c & UTF_LENGTH) != 0)
5998         {
5999         memcpy(code, utf_chars, IN_UCHARS(c & 7));
6000         code += c & 7;
6001         }
6002       else
6003 #endif
6004       *code++ = c;
6005 
6006       /* For a repeated Unicode property match, there are two extra bytes that
6007       define the required property. */
6008 
6009 #ifdef SUPPORT_UCP
6010       if (prop_type >= 0)
6011         {
6012         *code++ = prop_type;
6013         *code++ = prop_value;
6014         }
6015 #endif
6016       }
6017 
6018     /* If previous was a character class or a back reference, we put the repeat
6019     stuff after it, but just skip the item if the repeat was {0,0}. */
6020 
6021     else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
6022 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6023              *previous == OP_XCLASS ||
6024 #endif
6025              *previous == OP_REF   || *previous == OP_REFI ||
6026              *previous == OP_DNREF || *previous == OP_DNREFI)
6027       {
6028       if (repeat_max == 0)
6029         {
6030         code = previous;
6031         goto END_REPEAT;
6032         }
6033 
6034       if (repeat_min == 0 && repeat_max == -1)
6035         *code++ = OP_CRSTAR + repeat_type;
6036       else if (repeat_min == 1 && repeat_max == -1)
6037         *code++ = OP_CRPLUS + repeat_type;
6038       else if (repeat_min == 0 && repeat_max == 1)
6039         *code++ = OP_CRQUERY + repeat_type;
6040       else
6041         {
6042         *code++ = OP_CRRANGE + repeat_type;
6043         PUT2INC(code, 0, repeat_min);
6044         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
6045         PUT2INC(code, 0, repeat_max);
6046         }
6047       }
6048 
6049     /* If previous was a bracket group, we may have to replicate it in certain
6050     cases. Note that at this point we can encounter only the "basic" bracket
6051     opcodes such as BRA and CBRA, as this is the place where they get converted
6052     into the more special varieties such as BRAPOS and SBRA. A test for >=
6053     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
6054     ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
6055     Originally, PCRE did not allow repetition of assertions, but now it does,
6056     for Perl compatibility. */
6057 
6058     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
6059       {
6060       register int i;
6061       int len = (int)(code - previous);
6062       size_t base_hwm_offset = item_hwm_offset;
6063       pcre_uchar *bralink = NULL;
6064       pcre_uchar *brazeroptr = NULL;
6065 
6066       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
6067       we just ignore the repeat. */
6068 
6069       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
6070         goto END_REPEAT;
6071 
6072       /* There is no sense in actually repeating assertions. The only potential
6073       use of repetition is in cases when the assertion is optional. Therefore,
6074       if the minimum is greater than zero, just ignore the repeat. If the
6075       maximum is not zero or one, set it to 1. */
6076 
6077       if (*previous < OP_ONCE)    /* Assertion */
6078         {
6079         if (repeat_min > 0) goto END_REPEAT;
6080         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
6081         }
6082 
6083       /* The case of a zero minimum is special because of the need to stick
6084       OP_BRAZERO in front of it, and because the group appears once in the
6085       data, whereas in other cases it appears the minimum number of times. For
6086       this reason, it is simplest to treat this case separately, as otherwise
6087       the code gets far too messy. There are several special subcases when the
6088       minimum is zero. */
6089 
6090       if (repeat_min == 0)
6091         {
6092         /* If the maximum is also zero, we used to just omit the group from the
6093         output altogether, like this:
6094 
6095         ** if (repeat_max == 0)
6096         **   {
6097         **   code = previous;
6098         **   goto END_REPEAT;
6099         **   }
6100 
6101         However, that fails when a group or a subgroup within it is referenced
6102         as a subroutine from elsewhere in the pattern, so now we stick in
6103         OP_SKIPZERO in front of it so that it is skipped on execution. As we
6104         don't have a list of which groups are referenced, we cannot do this
6105         selectively.
6106 
6107         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
6108         and do no more at this point. However, we do need to adjust any
6109         OP_RECURSE calls inside the group that refer to the group itself or any
6110         internal or forward referenced group, because the offset is from the
6111         start of the whole regex. Temporarily terminate the pattern while doing
6112         this. */
6113 
6114         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
6115           {
6116           *code = OP_END;
6117           adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
6118           memmove(previous + 1, previous, IN_UCHARS(len));
6119           code++;
6120           if (repeat_max == 0)
6121             {
6122             *previous++ = OP_SKIPZERO;
6123             goto END_REPEAT;
6124             }
6125           brazeroptr = previous;    /* Save for possessive optimizing */
6126           *previous++ = OP_BRAZERO + repeat_type;
6127           }
6128 
6129         /* If the maximum is greater than 1 and limited, we have to replicate
6130         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6131         The first one has to be handled carefully because it's the original
6132         copy, which has to be moved up. The remainder can be handled by code
6133         that is common with the non-zero minimum case below. We have to
6134         adjust the value of repeat_max, since one less copy is required. Once
6135         again, we may have to adjust any OP_RECURSE calls inside the group. */
6136 
6137         else
6138           {
6139           int offset;
6140           *code = OP_END;
6141           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
6142           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6143           code += 2 + LINK_SIZE;
6144           *previous++ = OP_BRAZERO + repeat_type;
6145           *previous++ = OP_BRA;
6146 
6147           /* We chain together the bracket offset fields that have to be
6148           filled in later when the ends of the brackets are reached. */
6149 
6150           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
6151           bralink = previous;
6152           PUTINC(previous, 0, offset);
6153           }
6154 
6155         repeat_max--;
6156         }
6157 
6158       /* If the minimum is greater than zero, replicate the group as many
6159       times as necessary, and adjust the maximum to the number of subsequent
6160       copies that we need. If we set a first char from the group, and didn't
6161       set a required char, copy the latter from the former. If there are any
6162       forward reference subroutine calls in the group, there will be entries on
6163       the workspace list; replicate these with an appropriate increment. */
6164 
6165       else
6166         {
6167         if (repeat_min > 1)
6168           {
6169           /* In the pre-compile phase, we don't actually do the replication. We
6170           just adjust the length as if we had. Do some paranoid checks for
6171           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6172           integer type when available, otherwise double. */
6173 
6174           if (lengthptr != NULL)
6175             {
6176             int delta = (repeat_min - 1)*length_prevgroup;
6177             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6178                   (INT64_OR_DOUBLE)length_prevgroup >
6179                     (INT64_OR_DOUBLE)INT_MAX ||
6180                 OFLOW_MAX - *lengthptr < delta)
6181               {
6182               *errorcodeptr = ERR20;
6183               goto FAILED;
6184               }
6185             *lengthptr += delta;
6186             }
6187 
6188           /* This is compiling for real. If there is a set first byte for
6189           the group, and we have not yet set a "required byte", set it. Make
6190           sure there is enough workspace for copying forward references before
6191           doing the copy. */
6192 
6193           else
6194             {
6195             if (groupsetfirstchar && reqcharflags < 0)
6196               {
6197               reqchar = firstchar;
6198               reqcharflags = firstcharflags;
6199               }
6200 
6201             for (i = 1; i < repeat_min; i++)
6202               {
6203               pcre_uchar *hc;
6204               size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6205               memcpy(code, previous, IN_UCHARS(len));
6206 
6207               while (cd->hwm > cd->start_workspace + cd->workspace_size -
6208                      WORK_SIZE_SAFETY_MARGIN -
6209                      (this_hwm_offset - base_hwm_offset))
6210                 {
6211                 *errorcodeptr = expand_workspace(cd);
6212                 if (*errorcodeptr != 0) goto FAILED;
6213                 }
6214 
6215               for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6216                    hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6217                    hc += LINK_SIZE)
6218                 {
6219                 PUT(cd->hwm, 0, GET(hc, 0) + len);
6220                 cd->hwm += LINK_SIZE;
6221                 }
6222               base_hwm_offset = this_hwm_offset;
6223               code += len;
6224               }
6225             }
6226           }
6227 
6228         if (repeat_max > 0) repeat_max -= repeat_min;
6229         }
6230 
6231       /* This code is common to both the zero and non-zero minimum cases. If
6232       the maximum is limited, it replicates the group in a nested fashion,
6233       remembering the bracket starts on a stack. In the case of a zero minimum,
6234       the first one was set up above. In all cases the repeat_max now specifies
6235       the number of additional copies needed. Again, we must remember to
6236       replicate entries on the forward reference list. */
6237 
6238       if (repeat_max >= 0)
6239         {
6240         /* In the pre-compile phase, we don't actually do the replication. We
6241         just adjust the length as if we had. For each repetition we must add 1
6242         to the length for BRAZERO and for all but the last repetition we must
6243         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6244         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6245         a 64-bit integer type when available, otherwise double. */
6246 
6247         if (lengthptr != NULL && repeat_max > 0)
6248           {
6249           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6250                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
6251           if ((INT64_OR_DOUBLE)repeat_max *
6252                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6253                   > (INT64_OR_DOUBLE)INT_MAX ||
6254               OFLOW_MAX - *lengthptr < delta)
6255             {
6256             *errorcodeptr = ERR20;
6257             goto FAILED;
6258             }
6259           *lengthptr += delta;
6260           }
6261 
6262         /* This is compiling for real */
6263 
6264         else for (i = repeat_max - 1; i >= 0; i--)
6265           {
6266           pcre_uchar *hc;
6267           size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6268 
6269           *code++ = OP_BRAZERO + repeat_type;
6270 
6271           /* All but the final copy start a new nesting, maintaining the
6272           chain of brackets outstanding. */
6273 
6274           if (i != 0)
6275             {
6276             int offset;
6277             *code++ = OP_BRA;
6278             offset = (bralink == NULL)? 0 : (int)(code - bralink);
6279             bralink = code;
6280             PUTINC(code, 0, offset);
6281             }
6282 
6283           memcpy(code, previous, IN_UCHARS(len));
6284 
6285           /* Ensure there is enough workspace for forward references before
6286           copying them. */
6287 
6288           while (cd->hwm > cd->start_workspace + cd->workspace_size -
6289                  WORK_SIZE_SAFETY_MARGIN -
6290                  (this_hwm_offset - base_hwm_offset))
6291             {
6292             *errorcodeptr = expand_workspace(cd);
6293             if (*errorcodeptr != 0) goto FAILED;
6294             }
6295 
6296           for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6297                hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6298                hc += LINK_SIZE)
6299             {
6300             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6301             cd->hwm += LINK_SIZE;
6302             }
6303           base_hwm_offset = this_hwm_offset;
6304           code += len;
6305           }
6306 
6307         /* Now chain through the pending brackets, and fill in their length
6308         fields (which are holding the chain links pro tem). */
6309 
6310         while (bralink != NULL)
6311           {
6312           int oldlinkoffset;
6313           int offset = (int)(code - bralink + 1);
6314           pcre_uchar *bra = code - offset;
6315           oldlinkoffset = GET(bra, 1);
6316           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6317           *code++ = OP_KET;
6318           PUTINC(code, 0, offset);
6319           PUT(bra, 1, offset);
6320           }
6321         }
6322 
6323       /* If the maximum is unlimited, set a repeater in the final copy. For
6324       ONCE brackets, that's all we need to do. However, possessively repeated
6325       ONCE brackets can be converted into non-capturing brackets, as the
6326       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6327       deal with possessive ONCEs specially.
6328 
6329       Otherwise, when we are doing the actual compile phase, check to see
6330       whether this group is one that could match an empty string. If so,
6331       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6332       that runtime checking can be done. [This check is also applied to ONCE
6333       groups at runtime, but in a different way.]
6334 
6335       Then, if the quantifier was possessive and the bracket is not a
6336       conditional, we convert the BRA code to the POS form, and the KET code to
6337       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6338       subpattern at both the start and at the end.) The use of special opcodes
6339       makes it possible to reduce greatly the stack usage in pcre_exec(). If
6340       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6341 
6342       Then, if the minimum number of matches is 1 or 0, cancel the possessive
6343       flag so that the default action below, of wrapping everything inside
6344       atomic brackets, does not happen. When the minimum is greater than 1,
6345       there will be earlier copies of the group, and so we still have to wrap
6346       the whole thing. */
6347 
6348       else
6349         {
6350         pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6351         pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6352 
6353         /* Convert possessive ONCE brackets to non-capturing */
6354 
6355         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6356             possessive_quantifier) *bracode = OP_BRA;
6357 
6358         /* For non-possessive ONCE brackets, all we need to do is to
6359         set the KET. */
6360 
6361         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6362           *ketcode = OP_KETRMAX + repeat_type;
6363 
6364         /* Handle non-ONCE brackets and possessive ONCEs (which have been
6365         converted to non-capturing above). */
6366 
6367         else
6368           {
6369           /* In the compile phase, check for empty string matching. */
6370 
6371           if (lengthptr == NULL)
6372             {
6373             pcre_uchar *scode = bracode;
6374             do
6375               {
6376               if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6377                 {
6378                 *bracode += OP_SBRA - OP_BRA;
6379                 break;
6380                 }
6381               scode += GET(scode, 1);
6382               }
6383             while (*scode == OP_ALT);
6384             }
6385 
6386           /* A conditional group with only one branch has an implicit empty
6387           alternative branch. */
6388 
6389           if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6390             *bracode = OP_SCOND;
6391 
6392           /* Handle possessive quantifiers. */
6393 
6394           if (possessive_quantifier)
6395             {
6396             /* For COND brackets, we wrap the whole thing in a possessively
6397             repeated non-capturing bracket, because we have not invented POS
6398             versions of the COND opcodes. Because we are moving code along, we
6399             must ensure that any pending recursive references are updated. */
6400 
6401             if (*bracode == OP_COND || *bracode == OP_SCOND)
6402               {
6403               int nlen = (int)(code - bracode);
6404               *code = OP_END;
6405               adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6406               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6407               code += 1 + LINK_SIZE;
6408               nlen += 1 + LINK_SIZE;
6409               *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6410               *code++ = OP_KETRPOS;
6411               PUTINC(code, 0, nlen);
6412               PUT(bracode, 1, nlen);
6413               }
6414 
6415             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6416 
6417             else
6418               {
6419               *bracode += 1;              /* Switch to xxxPOS opcodes */
6420               *ketcode = OP_KETRPOS;
6421               }
6422 
6423             /* If the minimum is zero, mark it as possessive, then unset the
6424             possessive flag when the minimum is 0 or 1. */
6425 
6426             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6427             if (repeat_min < 2) possessive_quantifier = FALSE;
6428             }
6429 
6430           /* Non-possessive quantifier */
6431 
6432           else *ketcode = OP_KETRMAX + repeat_type;
6433           }
6434         }
6435       }
6436 
6437     /* If previous is OP_FAIL, it was generated by an empty class [] in
6438     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6439     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6440     error above. We can just ignore the repeat in the JS case. */
6441 
6442     else if (*previous == OP_FAIL) goto END_REPEAT;
6443 
6444     /* Else there's some kind of shambles */
6445 
6446     else
6447       {
6448       *errorcodeptr = ERR11;
6449       goto FAILED;
6450       }
6451 
6452     /* If the character following a repeat is '+', possessive_quantifier is
6453     TRUE. For some opcodes, there are special alternative opcodes for this
6454     case. For anything else, we wrap the entire repeated item inside OP_ONCE
6455     brackets. Logically, the '+' notation is just syntactic sugar, taken from
6456     Sun's Java package, but the special opcodes can optimize it.
6457 
6458     Some (but not all) possessively repeated subpatterns have already been
6459     completely handled in the code just above. For them, possessive_quantifier
6460     is always FALSE at this stage. Note that the repeated item starts at
6461     tempcode, not at previous, which might be the first part of a string whose
6462     (former) last char we repeated. */
6463 
6464     if (possessive_quantifier)
6465       {
6466       int len;
6467 
6468       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6469       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6470       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6471       remains is greater than zero, there's a further opcode that can be
6472       handled. If not, do nothing, leaving the EXACT alone. */
6473 
6474       switch(*tempcode)
6475         {
6476         case OP_TYPEEXACT:
6477         tempcode += PRIV(OP_lengths)[*tempcode] +
6478           ((tempcode[1 + IMM2_SIZE] == OP_PROP
6479           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6480         break;
6481 
6482         /* CHAR opcodes are used for exacts whose count is 1. */
6483 
6484         case OP_CHAR:
6485         case OP_CHARI:
6486         case OP_NOT:
6487         case OP_NOTI:
6488         case OP_EXACT:
6489         case OP_EXACTI:
6490         case OP_NOTEXACT:
6491         case OP_NOTEXACTI:
6492         tempcode += PRIV(OP_lengths)[*tempcode];
6493 #ifdef SUPPORT_UTF
6494         if (utf && HAS_EXTRALEN(tempcode[-1]))
6495           tempcode += GET_EXTRALEN(tempcode[-1]);
6496 #endif
6497         break;
6498 
6499         /* For the class opcodes, the repeat operator appears at the end;
6500         adjust tempcode to point to it. */
6501 
6502         case OP_CLASS:
6503         case OP_NCLASS:
6504         tempcode += 1 + 32/sizeof(pcre_uchar);
6505         break;
6506 
6507 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6508         case OP_XCLASS:
6509         tempcode += GET(tempcode, 1);
6510         break;
6511 #endif
6512         }
6513 
6514       /* If tempcode is equal to code (which points to the end of the repeated
6515       item), it means we have skipped an EXACT item but there is no following
6516       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6517       all other cases, tempcode will be pointing to the repeat opcode, and will
6518       be less than code, so the value of len will be greater than 0. */
6519 
6520       len = (int)(code - tempcode);
6521       if (len > 0)
6522         {
6523         unsigned int repcode = *tempcode;
6524 
6525         /* There is a table for possessifying opcodes, all of which are less
6526         than OP_CALLOUT. A zero entry means there is no possessified version.
6527         */
6528 
6529         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6530           *tempcode = opcode_possessify[repcode];
6531 
6532         /* For opcodes without a special possessified version, wrap the item in
6533         ONCE brackets. Because we are moving code along, we must ensure that any
6534         pending recursive references are updated. */
6535 
6536         else
6537           {
6538           *code = OP_END;
6539           adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6540           memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6541           code += 1 + LINK_SIZE;
6542           len += 1 + LINK_SIZE;
6543           tempcode[0] = OP_ONCE;
6544           *code++ = OP_KET;
6545           PUTINC(code, 0, len);
6546           PUT(tempcode, 1, len);
6547           }
6548         }
6549 
6550 #ifdef NEVER
6551       if (len > 0) switch (*tempcode)
6552         {
6553         case OP_STAR:  *tempcode = OP_POSSTAR; break;
6554         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
6555         case OP_QUERY: *tempcode = OP_POSQUERY; break;
6556         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
6557 
6558         case OP_STARI:  *tempcode = OP_POSSTARI; break;
6559         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
6560         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6561         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
6562 
6563         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
6564         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
6565         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6566         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
6567 
6568         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
6569         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
6570         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6571         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
6572 
6573         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
6574         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
6575         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6576         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6577 
6578         case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6579         case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6580         case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6581         case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6582 
6583         /* Because we are moving code along, we must ensure that any
6584         pending recursive references are updated. */
6585 
6586         default:
6587         *code = OP_END;
6588         adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6589         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6590         code += 1 + LINK_SIZE;
6591         len += 1 + LINK_SIZE;
6592         tempcode[0] = OP_ONCE;
6593         *code++ = OP_KET;
6594         PUTINC(code, 0, len);
6595         PUT(tempcode, 1, len);
6596         break;
6597         }
6598 #endif
6599       }
6600 
6601     /* In all cases we no longer have a previous item. We also set the
6602     "follows varying string" flag for subsequently encountered reqchars if
6603     it isn't already set and we have just passed a varying length item. */
6604 
6605     END_REPEAT:
6606     previous = NULL;
6607     cd->req_varyopt |= reqvary;
6608     break;
6609 
6610 
6611     /* ===================================================================*/
6612     /* Start of nested parenthesized sub-expression, or comment or lookahead or
6613     lookbehind or option setting or condition or all the other extended
6614     parenthesis forms.  */
6615 
6616     case CHAR_LEFT_PARENTHESIS:
6617     ptr++;
6618 
6619     /* Now deal with various "verbs" that can be introduced by '*'. */
6620 
6621     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6622          || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6623       {
6624       int i, namelen;
6625       int arglen = 0;
6626       const char *vn = verbnames;
6627       const pcre_uchar *name = ptr + 1;
6628       const pcre_uchar *arg = NULL;
6629       previous = NULL;
6630       ptr++;
6631       while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6632       namelen = (int)(ptr - name);
6633 
6634       /* It appears that Perl allows any characters whatsoever, other than
6635       a closing parenthesis, to appear in arguments, so we no longer insist on
6636       letters, digits, and underscores. */
6637 
6638       if (*ptr == CHAR_COLON)
6639         {
6640         arg = ++ptr;
6641         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6642         arglen = (int)(ptr - arg);
6643         if ((unsigned int)arglen > MAX_MARK)
6644           {
6645           *errorcodeptr = ERR75;
6646           goto FAILED;
6647           }
6648         }
6649 
6650       if (*ptr != CHAR_RIGHT_PARENTHESIS)
6651         {
6652         *errorcodeptr = ERR60;
6653         goto FAILED;
6654         }
6655 
6656       /* Scan the table of verb names */
6657 
6658       for (i = 0; i < verbcount; i++)
6659         {
6660         if (namelen == verbs[i].len &&
6661             STRNCMP_UC_C8(name, vn, namelen) == 0)
6662           {
6663           int setverb;
6664 
6665           /* Check for open captures before ACCEPT and convert it to
6666           ASSERT_ACCEPT if in an assertion. */
6667 
6668           if (verbs[i].op == OP_ACCEPT)
6669             {
6670             open_capitem *oc;
6671             if (arglen != 0)
6672               {
6673               *errorcodeptr = ERR59;
6674               goto FAILED;
6675               }
6676             cd->had_accept = TRUE;
6677             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6678               {
6679               if (lengthptr != NULL)
6680                 {
6681 #ifdef COMPILE_PCRE8
6682                 *lengthptr += 1 + IMM2_SIZE;
6683 #elif defined COMPILE_PCRE16
6684                 *lengthptr += 2 + IMM2_SIZE;
6685 #elif defined COMPILE_PCRE32
6686                 *lengthptr += 4 + IMM2_SIZE;
6687 #endif
6688                 }
6689               else
6690                 {
6691                 *code++ = OP_CLOSE;
6692                 PUT2INC(code, 0, oc->number);
6693                 }
6694               }
6695             setverb = *code++ =
6696               (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6697 
6698             /* Do not set firstchar after *ACCEPT */
6699             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6700             }
6701 
6702           /* Handle other cases with/without an argument */
6703 
6704           else if (arglen == 0)
6705             {
6706             if (verbs[i].op < 0)   /* Argument is mandatory */
6707               {
6708               *errorcodeptr = ERR66;
6709               goto FAILED;
6710               }
6711             setverb = *code++ = verbs[i].op;
6712             }
6713 
6714           else
6715             {
6716             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
6717               {
6718               *errorcodeptr = ERR59;
6719               goto FAILED;
6720               }
6721             setverb = *code++ = verbs[i].op_arg;
6722             if (lengthptr != NULL)    /* In pass 1 just add in the length */
6723               {                       /* to avoid potential workspace */
6724               *lengthptr += arglen;   /* overflow. */
6725               *code++ = 0;
6726               }
6727             else
6728               {
6729               *code++ = arglen;
6730               memcpy(code, arg, IN_UCHARS(arglen));
6731               code += arglen;
6732               }
6733             *code++ = 0;
6734             }
6735 
6736           switch (setverb)
6737             {
6738             case OP_THEN:
6739             case OP_THEN_ARG:
6740             cd->external_flags |= PCRE_HASTHEN;
6741             break;
6742 
6743             case OP_PRUNE:
6744             case OP_PRUNE_ARG:
6745             case OP_SKIP:
6746             case OP_SKIP_ARG:
6747             cd->had_pruneorskip = TRUE;
6748             break;
6749             }
6750 
6751           break;  /* Found verb, exit loop */
6752           }
6753 
6754         vn += verbs[i].len + 1;
6755         }
6756 
6757       if (i < verbcount) continue;    /* Successfully handled a verb */
6758       *errorcodeptr = ERR60;          /* Verb not recognized */
6759       goto FAILED;
6760       }
6761 
6762     /* Initialize for "real" parentheses */
6763 
6764     newoptions = options;
6765     skipbytes = 0;
6766     bravalue = OP_CBRA;
6767     item_hwm_offset = cd->hwm - cd->start_workspace;
6768     reset_bracount = FALSE;
6769 
6770     /* Deal with the extended parentheses; all are introduced by '?', and the
6771     appearance of any of them means that this is not a capturing group. */
6772 
6773     if (*ptr == CHAR_QUESTION_MARK)
6774       {
6775       int i, set, unset, namelen;
6776       int *optset;
6777       const pcre_uchar *name;
6778       pcre_uchar *slot;
6779 
6780       switch (*(++ptr))
6781         {
6782         /* ------------------------------------------------------------ */
6783         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
6784         reset_bracount = TRUE;
6785         cd->dupgroups = TRUE;     /* Record (?| encountered */
6786         /* Fall through */
6787 
6788         /* ------------------------------------------------------------ */
6789         case CHAR_COLON:          /* Non-capturing bracket */
6790         bravalue = OP_BRA;
6791         ptr++;
6792         break;
6793 
6794 
6795         /* ------------------------------------------------------------ */
6796         case CHAR_LEFT_PARENTHESIS:
6797         bravalue = OP_COND;       /* Conditional group */
6798         tempptr = ptr;
6799 
6800         /* A condition can be an assertion, a number (referring to a numbered
6801         group's having been set), a name (referring to a named group), or 'R',
6802         referring to recursion. R<digits> and R&name are also permitted for
6803         recursion tests.
6804 
6805         There are several syntaxes for testing a named group: (?(name)) is
6806         used by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
6807 
6808         There is one unfortunate ambiguity, caused by history. 'R' can be the
6809         recursive thing or the name 'R' (and similarly for 'R' followed by
6810         digits). We look for a name first; if not found, we try the other case.
6811 
6812         For compatibility with auto-callouts, we allow a callout to be
6813         specified before a condition that is an assertion. First, check for the
6814         syntax of a callout; if found, adjust the temporary pointer that is
6815         used to check for an assertion condition. That's all that is needed! */
6816 
6817         if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6818           {
6819           for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6820           if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6821             tempptr += i + 1;
6822 
6823           /* tempptr should now be pointing to the opening parenthesis of the
6824           assertion condition. */
6825 
6826           if (*tempptr != CHAR_LEFT_PARENTHESIS)
6827             {
6828             *errorcodeptr = ERR28;
6829             goto FAILED;
6830             }
6831           }
6832 
6833         /* For conditions that are assertions, check the syntax, and then exit
6834         the switch. This will take control down to where bracketed groups,
6835         including assertions, are processed. */
6836 
6837         if (tempptr[1] == CHAR_QUESTION_MARK &&
6838               (tempptr[2] == CHAR_EQUALS_SIGN ||
6839                tempptr[2] == CHAR_EXCLAMATION_MARK ||
6840                  (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6841                    (tempptr[3] == CHAR_EQUALS_SIGN ||
6842                     tempptr[3] == CHAR_EXCLAMATION_MARK))))
6843           {
6844           cd->iscondassert = TRUE;
6845           break;
6846           }
6847 
6848         /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6849         need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6850 
6851         code[1+LINK_SIZE] = OP_CREF;
6852         skipbytes = 1+IMM2_SIZE;
6853         refsign = -1;     /* => not a number */
6854         namelen = -1;     /* => not a name; must set to avoid warning */
6855         name = NULL;      /* Always set to avoid warning */
6856         recno = 0;        /* Always set to avoid warning */
6857 
6858         /* Check for a test for recursion in a named group. */
6859 
6860         ptr++;
6861         if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6862           {
6863           terminator = -1;
6864           ptr += 2;
6865           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
6866           }
6867 
6868         /* Check for a test for a named group's having been set, using the Perl
6869         syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6870         syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6871 
6872         else if (*ptr == CHAR_LESS_THAN_SIGN)
6873           {
6874           terminator = CHAR_GREATER_THAN_SIGN;
6875           ptr++;
6876           }
6877         else if (*ptr == CHAR_APOSTROPHE)
6878           {
6879           terminator = CHAR_APOSTROPHE;
6880           ptr++;
6881           }
6882         else
6883           {
6884           terminator = CHAR_NULL;
6885           if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6886             else if (IS_DIGIT(*ptr)) refsign = 0;
6887           }
6888 
6889         /* Handle a number */
6890 
6891         if (refsign >= 0)
6892           {
6893           while (IS_DIGIT(*ptr))
6894             {
6895             if (recno > INT_MAX / 10 - 1)  /* Integer overflow */
6896               {
6897               while (IS_DIGIT(*ptr)) ptr++;
6898               *errorcodeptr = ERR61;
6899               goto FAILED;
6900               }
6901             recno = recno * 10 + (int)(*ptr - CHAR_0);
6902             ptr++;
6903             }
6904           }
6905 
6906         /* Otherwise we expect to read a name; anything else is an error. When
6907         a name is one of a number of duplicates, a different opcode is used and
6908         it needs more memory. Unfortunately we cannot tell whether a name is a
6909         duplicate in the first pass, so we have to allow for more memory. */
6910 
6911         else
6912           {
6913           if (IS_DIGIT(*ptr))
6914             {
6915             *errorcodeptr = ERR84;
6916             goto FAILED;
6917             }
6918           if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6919             {
6920             *errorcodeptr = ERR28;   /* Assertion expected */
6921             goto FAILED;
6922             }
6923           name = ptr++;
6924           while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6925             {
6926             ptr++;
6927             }
6928           namelen = (int)(ptr - name);
6929           if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6930           }
6931 
6932         /* Check the terminator */
6933 
6934         if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6935             *ptr++ != CHAR_RIGHT_PARENTHESIS)
6936           {
6937           ptr--;                  /* Error offset */
6938           *errorcodeptr = ERR26;  /* Malformed number or name */
6939           goto FAILED;
6940           }
6941 
6942         /* Do no further checking in the pre-compile phase. */
6943 
6944         if (lengthptr != NULL) break;
6945 
6946         /* In the real compile we do the work of looking for the actual
6947         reference. If refsign is not negative, it means we have a number in
6948         recno. */
6949 
6950         if (refsign >= 0)
6951           {
6952           if (recno <= 0)
6953             {
6954             *errorcodeptr = ERR35;
6955             goto FAILED;
6956             }
6957           if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6958             cd->bracount - recno + 1 : recno + cd->bracount;
6959           if (recno <= 0 || recno > cd->final_bracount)
6960             {
6961             *errorcodeptr = ERR15;
6962             goto FAILED;
6963             }
6964           PUT2(code, 2+LINK_SIZE, recno);
6965           if (recno > cd->top_backref) cd->top_backref = recno;
6966           break;
6967           }
6968 
6969         /* Otherwise look for the name. */
6970 
6971         slot = cd->name_table;
6972         for (i = 0; i < cd->names_found; i++)
6973           {
6974           if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6975             slot[IMM2_SIZE+namelen] == 0) break;
6976           slot += cd->name_entry_size;
6977           }
6978 
6979         /* Found the named subpattern. If the name is duplicated, add one to
6980         the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6981         appropriate data values. Otherwise, just insert the unique subpattern
6982         number. */
6983 
6984         if (i < cd->names_found)
6985           {
6986           int offset = i++;
6987           int count = 1;
6988           recno = GET2(slot, 0);   /* Number from first found */
6989           if (recno > cd->top_backref) cd->top_backref = recno;
6990           for (; i < cd->names_found; i++)
6991             {
6992             slot += cd->name_entry_size;
6993             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6994               (slot+IMM2_SIZE)[namelen] != 0) break;
6995             count++;
6996             }
6997 
6998           if (count > 1)
6999             {
7000             PUT2(code, 2+LINK_SIZE, offset);
7001             PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
7002             skipbytes += IMM2_SIZE;
7003             code[1+LINK_SIZE]++;
7004             }
7005           else  /* Not a duplicated name */
7006             {
7007             PUT2(code, 2+LINK_SIZE, recno);
7008             }
7009           }
7010 
7011         /* If terminator == CHAR_NULL it means that the name followed directly
7012         after the opening parenthesis [e.g. (?(abc)...] and in this case there
7013         are some further alternatives to try. For the cases where terminator !=
7014         CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
7015         we have now checked all the possibilities, so give an error. */
7016 
7017         else if (terminator != CHAR_NULL)
7018           {
7019           *errorcodeptr = ERR15;
7020           goto FAILED;
7021           }
7022 
7023         /* Check for (?(R) for recursion. Allow digits after R to specify a
7024         specific group number. */
7025 
7026         else if (*name == CHAR_R)
7027           {
7028           recno = 0;
7029           for (i = 1; i < namelen; i++)
7030             {
7031             if (!IS_DIGIT(name[i]))
7032               {
7033               *errorcodeptr = ERR15;
7034               goto FAILED;
7035               }
7036             if (recno > INT_MAX / 10 - 1)   /* Integer overflow */
7037               {
7038               *errorcodeptr = ERR61;
7039               goto FAILED;
7040               }
7041             recno = recno * 10 + name[i] - CHAR_0;
7042             }
7043           if (recno == 0) recno = RREF_ANY;
7044           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
7045           PUT2(code, 2+LINK_SIZE, recno);
7046           }
7047 
7048         /* Similarly, check for the (?(DEFINE) "condition", which is always
7049         false. */
7050 
7051         else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
7052           {
7053           code[1+LINK_SIZE] = OP_DEF;
7054           skipbytes = 1;
7055           }
7056 
7057         /* Reference to an unidentified subpattern. */
7058 
7059         else
7060           {
7061           *errorcodeptr = ERR15;
7062           goto FAILED;
7063           }
7064         break;
7065 
7066 
7067         /* ------------------------------------------------------------ */
7068         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
7069         bravalue = OP_ASSERT;
7070         cd->assert_depth += 1;
7071         ptr++;
7072         break;
7073 
7074         /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
7075         thing to do, but Perl allows all assertions to be quantified, and when
7076         they contain capturing parentheses there may be a potential use for
7077         this feature. Not that that applies to a quantified (?!) but we allow
7078         it for uniformity. */
7079 
7080         /* ------------------------------------------------------------ */
7081         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
7082         ptr++;
7083         if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
7084              ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
7085             (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
7086           {
7087           *code++ = OP_FAIL;
7088           previous = NULL;
7089           continue;
7090           }
7091         bravalue = OP_ASSERT_NOT;
7092         cd->assert_depth += 1;
7093         break;
7094 
7095 
7096         /* ------------------------------------------------------------ */
7097         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
7098         switch (ptr[1])
7099           {
7100           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
7101           bravalue = OP_ASSERTBACK;
7102           cd->assert_depth += 1;
7103           ptr += 2;
7104           break;
7105 
7106           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
7107           bravalue = OP_ASSERTBACK_NOT;
7108           cd->assert_depth += 1;
7109           ptr += 2;
7110           break;
7111 
7112           default:                /* Could be name define, else bad */
7113           if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
7114             goto DEFINE_NAME;
7115           ptr++;                  /* Correct offset for error */
7116           *errorcodeptr = ERR24;
7117           goto FAILED;
7118           }
7119         break;
7120 
7121 
7122         /* ------------------------------------------------------------ */
7123         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
7124         bravalue = OP_ONCE;
7125         ptr++;
7126         break;
7127 
7128 
7129         /* ------------------------------------------------------------ */
7130         case CHAR_C:                 /* Callout - may be followed by digits; */
7131         previous_callout = code;     /* Save for later completion */
7132         after_manual_callout = 1;    /* Skip one item before completing */
7133         *code++ = OP_CALLOUT;
7134           {
7135           int n = 0;
7136           ptr++;
7137           while(IS_DIGIT(*ptr))
7138             {
7139             n = n * 10 + *ptr++ - CHAR_0;
7140             if (n > 255)
7141               {
7142               *errorcodeptr = ERR38;
7143               goto FAILED;
7144               }
7145             }
7146           if (*ptr != CHAR_RIGHT_PARENTHESIS)
7147             {
7148             *errorcodeptr = ERR39;
7149             goto FAILED;
7150             }
7151           *code++ = n;
7152           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
7153           PUT(code, LINK_SIZE, 0);                          /* Default length */
7154           code += 2 * LINK_SIZE;
7155           }
7156         previous = NULL;
7157         continue;
7158 
7159 
7160         /* ------------------------------------------------------------ */
7161         case CHAR_P:              /* Python-style named subpattern handling */
7162         if (*(++ptr) == CHAR_EQUALS_SIGN ||
7163             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
7164           {
7165           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7166           terminator = CHAR_RIGHT_PARENTHESIS;
7167           goto NAMED_REF_OR_RECURSE;
7168           }
7169         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
7170           {
7171           *errorcodeptr = ERR41;
7172           goto FAILED;
7173           }
7174         /* Fall through to handle (?P< as (?< is handled */
7175 
7176 
7177         /* ------------------------------------------------------------ */
7178         DEFINE_NAME:    /* Come here from (?< handling */
7179         case CHAR_APOSTROPHE:
7180         terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7181           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7182         name = ++ptr;
7183         if (IS_DIGIT(*ptr))
7184           {
7185           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7186           goto FAILED;
7187           }
7188         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7189         namelen = (int)(ptr - name);
7190 
7191         /* In the pre-compile phase, do a syntax check, remember the longest
7192         name, and then remember the group in a vector, expanding it if
7193         necessary. Duplicates for the same number are skipped; other duplicates
7194         are checked for validity. In the actual compile, there is nothing to
7195         do. */
7196 
7197         if (lengthptr != NULL)
7198           {
7199           named_group *ng;
7200           pcre_uint32 number = cd->bracount + 1;
7201 
7202           if (*ptr != (pcre_uchar)terminator)
7203             {
7204             *errorcodeptr = ERR42;
7205             goto FAILED;
7206             }
7207 
7208           if (cd->names_found >= MAX_NAME_COUNT)
7209             {
7210             *errorcodeptr = ERR49;
7211             goto FAILED;
7212             }
7213 
7214           if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
7215             {
7216             cd->name_entry_size = namelen + IMM2_SIZE + 1;
7217             if (namelen > MAX_NAME_SIZE)
7218               {
7219               *errorcodeptr = ERR48;
7220               goto FAILED;
7221               }
7222             }
7223 
7224           /* Scan the list to check for duplicates. For duplicate names, if the
7225           number is the same, break the loop, which causes the name to be
7226           discarded; otherwise, if DUPNAMES is not set, give an error.
7227           If it is set, allow the name with a different number, but continue
7228           scanning in case this is a duplicate with the same number. For
7229           non-duplicate names, give an error if the number is duplicated. */
7230 
7231           ng = cd->named_groups;
7232           for (i = 0; i < cd->names_found; i++, ng++)
7233             {
7234             if (namelen == ng->length &&
7235                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7236               {
7237               if (ng->number == number) break;
7238               if ((options & PCRE_DUPNAMES) == 0)
7239                 {
7240                 *errorcodeptr = ERR43;
7241                 goto FAILED;
7242                 }
7243               cd->dupnames = TRUE;  /* Duplicate names exist */
7244               }
7245             else if (ng->number == number)
7246               {
7247               *errorcodeptr = ERR65;
7248               goto FAILED;
7249               }
7250             }
7251 
7252           if (i >= cd->names_found)     /* Not a duplicate with same number */
7253             {
7254             /* Increase the list size if necessary */
7255 
7256             if (cd->names_found >= cd->named_group_list_size)
7257               {
7258               int newsize = cd->named_group_list_size * 2;
7259               named_group *newspace = (PUBL(malloc))
7260                 (newsize * sizeof(named_group));
7261 
7262               if (newspace == NULL)
7263                 {
7264                 *errorcodeptr = ERR21;
7265                 goto FAILED;
7266                 }
7267 
7268               memcpy(newspace, cd->named_groups,
7269                 cd->named_group_list_size * sizeof(named_group));
7270               if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7271                 (PUBL(free))((void *)cd->named_groups);
7272               cd->named_groups = newspace;
7273               cd->named_group_list_size = newsize;
7274               }
7275 
7276             cd->named_groups[cd->names_found].name = name;
7277             cd->named_groups[cd->names_found].length = namelen;
7278             cd->named_groups[cd->names_found].number = number;
7279             cd->names_found++;
7280             }
7281           }
7282 
7283         ptr++;                    /* Move past > or ' in both passes. */
7284         goto NUMBERED_GROUP;
7285 
7286 
7287         /* ------------------------------------------------------------ */
7288         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
7289         terminator = CHAR_RIGHT_PARENTHESIS;
7290         is_recurse = TRUE;
7291         /* Fall through */
7292 
7293         /* We come here from the Python syntax above that handles both
7294         references (?P=name) and recursion (?P>name), as well as falling
7295         through from the Perl recursion syntax (?&name). We also come here from
7296         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7297         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7298 
7299         NAMED_REF_OR_RECURSE:
7300         name = ++ptr;
7301         if (IS_DIGIT(*ptr))
7302           {
7303           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7304           goto FAILED;
7305           }
7306         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7307         namelen = (int)(ptr - name);
7308 
7309         /* In the pre-compile phase, do a syntax check. We used to just set
7310         a dummy reference number, because it was not used in the first pass.
7311         However, with the change of recursive back references to be atomic,
7312         we have to look for the number so that this state can be identified, as
7313         otherwise the incorrect length is computed. If it's not a backwards
7314         reference, the dummy number will do. */
7315 
7316         if (lengthptr != NULL)
7317           {
7318           named_group *ng;
7319           recno = 0;
7320 
7321           if (namelen == 0)
7322             {
7323             *errorcodeptr = ERR62;
7324             goto FAILED;
7325             }
7326           if (*ptr != (pcre_uchar)terminator)
7327             {
7328             *errorcodeptr = ERR42;
7329             goto FAILED;
7330             }
7331           if (namelen > MAX_NAME_SIZE)
7332             {
7333             *errorcodeptr = ERR48;
7334             goto FAILED;
7335             }
7336 
7337           /* Count named back references. */
7338 
7339           if (!is_recurse) cd->namedrefcount++;
7340 
7341           /* We have to allow for a named reference to a duplicated name (this
7342           cannot be determined until the second pass). This needs an extra
7343           16-bit data item. */
7344 
7345           *lengthptr += IMM2_SIZE;
7346 
7347           /* If this is a forward reference and we are within a (?|...) group,
7348           the reference may end up as the number of a group which we are
7349           currently inside, that is, it could be a recursive reference. In the
7350           real compile this will be picked up and the reference wrapped with
7351           OP_ONCE to make it atomic, so we must allow space in case this occurs. */
7352 
7353           /* In fact, this can happen for a non-forward reference because
7354           another group with the same number might be created later. This
7355           issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7356           only mode, we finesse the bug by allowing more memory always. */
7357 
7358           *lengthptr += 4 + 4*LINK_SIZE;
7359 
7360           /* It is even worse than that. The current reference may be to an
7361           existing named group with a different number (so apparently not
7362           recursive) but which later on is also attached to a group with the
7363           current number. This can only happen if (?| has been previously
7364           encountered. In that case, we allow yet more memory, just in case.
7365           (Again, this is fixed "properly" in PCRE2.) */
7366 
7367           if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
7368 
7369           /* Otherwise, check for recursion here. The name table does not exist
7370           in the first pass; instead we must scan the list of names encountered
7371           so far in order to get the number. If the name is not found, leave
7372           the value of recno as 0 for a forward reference. */
7373 
7374           /* This patch (removing "else") fixes a problem when a reference is
7375           to multiple identically named nested groups from within the nest.
7376           Once again, it is not the "proper" fix, and it results in an
7377           over-allocation of memory. */
7378 
7379           /* else */
7380             {
7381             ng = cd->named_groups;
7382             for (i = 0; i < cd->names_found; i++, ng++)
7383               {
7384               if (namelen == ng->length &&
7385                   STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7386                 {
7387                 open_capitem *oc;
7388                 recno = ng->number;
7389                 if (is_recurse) break;
7390                 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7391                   {
7392                   if (oc->number == recno)
7393                     {
7394                     oc->flag = TRUE;
7395                     break;
7396                     }
7397                   }
7398                 }
7399               }
7400             }
7401           }
7402 
7403         /* In the real compile, search the name table. We check the name
7404         first, and then check that we have reached the end of the name in the
7405         table. That way, if the name is longer than any in the table, the
7406         comparison will fail without reading beyond the table entry. */
7407 
7408         else
7409           {
7410           slot = cd->name_table;
7411           for (i = 0; i < cd->names_found; i++)
7412             {
7413             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
7414                 slot[IMM2_SIZE+namelen] == 0)
7415               break;
7416             slot += cd->name_entry_size;
7417             }
7418 
7419           if (i < cd->names_found)
7420             {
7421             recno = GET2(slot, 0);
7422             }
7423           else
7424             {
7425             *errorcodeptr = ERR15;
7426             goto FAILED;
7427             }
7428           }
7429 
7430         /* In both phases, for recursions, we can now go to the code that
7431         handles numerical recursion. */
7432 
7433         if (is_recurse) goto HANDLE_RECURSION;
7434 
7435         /* In the second pass we must see if the name is duplicated. If so, we
7436         generate a different opcode. */
7437 
7438         if (lengthptr == NULL && cd->dupnames)
7439           {
7440           int count = 1;
7441           unsigned int index = i;
7442           pcre_uchar *cslot = slot + cd->name_entry_size;
7443 
7444           for (i++; i < cd->names_found; i++)
7445             {
7446             if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7447             count++;
7448             cslot += cd->name_entry_size;
7449             }
7450 
7451           if (count > 1)
7452             {
7453             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7454             previous = code;
7455             item_hwm_offset = cd->hwm - cd->start_workspace;
7456             *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7457             PUT2INC(code, 0, index);
7458             PUT2INC(code, 0, count);
7459 
7460             /* Process each potentially referenced group. */
7461 
7462             for (; slot < cslot; slot += cd->name_entry_size)
7463               {
7464               open_capitem *oc;
7465               recno = GET2(slot, 0);
7466               cd->backref_map |= (recno < 32)? (1U << recno) : 1;
7467               if (recno > cd->top_backref) cd->top_backref = recno;
7468 
7469               /* Check to see if this back reference is recursive, that is, it
7470               is inside the group that it references. A flag is set so that the
7471               group can be made atomic. */
7472 
7473               for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7474                 {
7475                 if (oc->number == recno)
7476                   {
7477                   oc->flag = TRUE;
7478                   break;
7479                   }
7480                 }
7481               }
7482 
7483             continue;  /* End of back ref handling */
7484             }
7485           }
7486 
7487         /* First pass, or a non-duplicated name. */
7488 
7489         goto HANDLE_REFERENCE;
7490 
7491 
7492         /* ------------------------------------------------------------ */
7493         case CHAR_R:              /* Recursion, same as (?0) */
7494         recno = 0;
7495         if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7496           {
7497           *errorcodeptr = ERR29;
7498           goto FAILED;
7499           }
7500         goto HANDLE_RECURSION;
7501 
7502 
7503         /* ------------------------------------------------------------ */
7504         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
7505         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7506         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7507           {
7508           const pcre_uchar *called;
7509           terminator = CHAR_RIGHT_PARENTHESIS;
7510 
7511           /* Come here from the \g<...> and \g'...' code (Oniguruma
7512           compatibility). However, the syntax has been checked to ensure that
7513           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7514           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7515           ever be taken. */
7516 
7517           HANDLE_NUMERICAL_RECURSION:
7518 
7519           if ((refsign = *ptr) == CHAR_PLUS)
7520             {
7521             ptr++;
7522             if (!IS_DIGIT(*ptr))
7523               {
7524               *errorcodeptr = ERR63;
7525               goto FAILED;
7526               }
7527             }
7528           else if (refsign == CHAR_MINUS)
7529             {
7530             if (!IS_DIGIT(ptr[1]))
7531               goto OTHER_CHAR_AFTER_QUERY;
7532             ptr++;
7533             }
7534 
7535           recno = 0;
7536           while(IS_DIGIT(*ptr))
7537             {
7538             if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7539               {
7540               while (IS_DIGIT(*ptr)) ptr++;
7541               *errorcodeptr = ERR61;
7542               goto FAILED;
7543               }
7544             recno = recno * 10 + *ptr++ - CHAR_0;
7545             }
7546 
7547           if (*ptr != (pcre_uchar)terminator)
7548             {
7549             *errorcodeptr = ERR29;
7550             goto FAILED;
7551             }
7552 
7553           if (refsign == CHAR_MINUS)
7554             {
7555             if (recno == 0)
7556               {
7557               *errorcodeptr = ERR58;
7558               goto FAILED;
7559               }
7560             recno = cd->bracount - recno + 1;
7561             if (recno <= 0)
7562               {
7563               *errorcodeptr = ERR15;
7564               goto FAILED;
7565               }
7566             }
7567           else if (refsign == CHAR_PLUS)
7568             {
7569             if (recno == 0)
7570               {
7571               *errorcodeptr = ERR58;
7572               goto FAILED;
7573               }
7574             recno += cd->bracount;
7575             }
7576 
7577           /* Come here from code above that handles a named recursion */
7578 
7579           HANDLE_RECURSION:
7580 
7581           previous = code;
7582           item_hwm_offset = cd->hwm - cd->start_workspace;
7583           called = cd->start_code;
7584 
7585           /* When we are actually compiling, find the bracket that is being
7586           referenced. Temporarily end the regex in case it doesn't exist before
7587           this point. If we end up with a forward reference, first check that
7588           the bracket does occur later so we can give the error (and position)
7589           now. Then remember this forward reference in the workspace so it can
7590           be filled in at the end. */
7591 
7592           if (lengthptr == NULL)
7593             {
7594             *code = OP_END;
7595             if (recno != 0)
7596               called = PRIV(find_bracket)(cd->start_code, utf, recno);
7597 
7598             /* Forward reference */
7599 
7600             if (called == NULL)
7601               {
7602               if (recno > cd->final_bracount)
7603                 {
7604                 *errorcodeptr = ERR15;
7605                 goto FAILED;
7606                 }
7607 
7608               /* Fudge the value of "called" so that when it is inserted as an
7609               offset below, what is actually inserted is the reference number
7610               of the group. Then remember the forward reference. */
7611 
7612               called = cd->start_code + recno;
7613               if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7614                   WORK_SIZE_SAFETY_MARGIN)
7615                 {
7616                 *errorcodeptr = expand_workspace(cd);
7617                 if (*errorcodeptr != 0) goto FAILED;
7618                 }
7619               PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7620               }
7621 
7622             /* If not a forward reference, and the subpattern is still open,
7623             this is a recursive call. We check to see if this is a left
7624             recursion that could loop for ever, and diagnose that case. We
7625             must not, however, do this check if we are in a conditional
7626             subpattern because the condition might be testing for recursion in
7627             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7628             Forever loops are also detected at runtime, so those that occur in
7629             conditional subpatterns will be picked up then. */
7630 
7631             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7632                      could_be_empty(called, code, bcptr, utf, cd))
7633               {
7634               *errorcodeptr = ERR40;
7635               goto FAILED;
7636               }
7637             }
7638 
7639           /* Insert the recursion/subroutine item. It does not have a set first
7640           character (relevant if it is repeated, because it will then be
7641           wrapped with ONCE brackets). */
7642 
7643           *code = OP_RECURSE;
7644           PUT(code, 1, (int)(called - cd->start_code));
7645           code += 1 + LINK_SIZE;
7646           groupsetfirstchar = FALSE;
7647           }
7648 
7649         /* Can't determine a first byte now */
7650 
7651         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7652         zerofirstchar = firstchar;
7653         zerofirstcharflags = firstcharflags;
7654         continue;
7655 
7656 
7657         /* ------------------------------------------------------------ */
7658         default:              /* Other characters: check option setting */
7659         OTHER_CHAR_AFTER_QUERY:
7660         set = unset = 0;
7661         optset = &set;
7662 
7663         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
7664           {
7665           switch (*ptr++)
7666             {
7667             case CHAR_MINUS: optset = &unset; break;
7668 
7669             case CHAR_J:    /* Record that it changed in the external options */
7670             *optset |= PCRE_DUPNAMES;
7671             cd->external_flags |= PCRE_JCHANGED;
7672             break;
7673 
7674             case CHAR_i: *optset |= PCRE_CASELESS; break;
7675             case CHAR_m: *optset |= PCRE_MULTILINE; break;
7676             case CHAR_s: *optset |= PCRE_DOTALL; break;
7677             case CHAR_x: *optset |= PCRE_EXTENDED; break;
7678             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7679             case CHAR_X: *optset |= PCRE_EXTRA; break;
7680 
7681             default:  *errorcodeptr = ERR12;
7682                       ptr--;    /* Correct the offset */
7683                       goto FAILED;
7684             }
7685           }
7686 
7687         /* Set up the changed option bits, but don't change anything yet. */
7688 
7689         newoptions = (options | set) & (~unset);
7690 
7691         /* If the options ended with ')' this is not the start of a nested
7692         group with option changes, so the options change at this level.
7693         If we are not at the pattern start, reset the greedy defaults and the
7694         case value for firstchar and reqchar. */
7695 
7696         if (*ptr == CHAR_RIGHT_PARENTHESIS)
7697           {
7698           greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7699           greedy_non_default = greedy_default ^ 1;
7700           req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7701 
7702           /* Change options at this level, and pass them back for use
7703           in subsequent branches. */
7704 
7705           *optionsptr = options = newoptions;
7706           previous = NULL;       /* This item can't be repeated */
7707           continue;              /* It is complete */
7708           }
7709 
7710         /* If the options ended with ':' we are heading into a nested group
7711         with possible change of options. Such groups are non-capturing and are
7712         not assertions of any kind. All we need to do is skip over the ':';
7713         the newoptions value is handled below. */
7714 
7715         bravalue = OP_BRA;
7716         ptr++;
7717         }     /* End of switch for character following (? */
7718       }       /* End of (? handling */
7719 
7720     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7721     is set, all unadorned brackets become non-capturing and behave like (?:...)
7722     brackets. */
7723 
7724     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7725       {
7726       bravalue = OP_BRA;
7727       }
7728 
7729     /* Else we have a capturing group. */
7730 
7731     else
7732       {
7733       NUMBERED_GROUP:
7734       cd->bracount += 1;
7735       PUT2(code, 1+LINK_SIZE, cd->bracount);
7736       skipbytes = IMM2_SIZE;
7737       }
7738 
7739     /* Process nested bracketed regex. First check for parentheses nested too
7740     deeply. */
7741 
7742     if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7743       {
7744       *errorcodeptr = ERR82;
7745       goto FAILED;
7746       }
7747 
7748     /* All assertions used not to be repeatable, but this was changed for Perl
7749     compatibility. All kinds can now be repeated except for assertions that are
7750     conditions (Perl also forbids these to be repeated). We copy code into a
7751     non-register variable (tempcode) in order to be able to pass its address
7752     because some compilers complain otherwise. At the start of a conditional
7753     group whose condition is an assertion, cd->iscondassert is set. We unset it
7754     here so as to allow assertions later in the group to be quantified. */
7755 
7756     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7757         cd->iscondassert)
7758       {
7759       previous = NULL;
7760       cd->iscondassert = FALSE;
7761       }
7762     else
7763       {
7764       previous = code;
7765       item_hwm_offset = cd->hwm - cd->start_workspace;
7766       }
7767 
7768     *code = bravalue;
7769     tempcode = code;
7770     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
7771     tempbracount = cd->bracount;          /* Save value before bracket */
7772     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
7773 
7774     if (!compile_regex(
7775          newoptions,                      /* The complete new option state */
7776          &tempcode,                       /* Where to put code (updated) */
7777          &ptr,                            /* Input pointer (updated) */
7778          errorcodeptr,                    /* Where to put an error message */
7779          (bravalue == OP_ASSERTBACK ||
7780           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7781          reset_bracount,                  /* True if (?| group */
7782          skipbytes,                       /* Skip over bracket number */
7783          cond_depth +
7784            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
7785          &subfirstchar,                   /* For possible first char */
7786          &subfirstcharflags,
7787          &subreqchar,                     /* For possible last char */
7788          &subreqcharflags,
7789          bcptr,                           /* Current branch chain */
7790          cd,                              /* Tables block */
7791          (lengthptr == NULL)? NULL :      /* Actual compile phase */
7792            &length_prevgroup              /* Pre-compile phase */
7793          ))
7794       goto FAILED;
7795 
7796     cd->parens_depth -= 1;
7797 
7798     /* If this was an atomic group and there are no capturing groups within it,
7799     generate OP_ONCE_NC instead of OP_ONCE. */
7800 
7801     if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7802       *code = OP_ONCE_NC;
7803 
7804     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7805       cd->assert_depth -= 1;
7806 
7807     /* At the end of compiling, code is still pointing to the start of the
7808     group, while tempcode has been updated to point past the end of the group.
7809     The pattern pointer (ptr) is on the bracket.
7810 
7811     If this is a conditional bracket, check that there are no more than
7812     two branches in the group, or just one if it's a DEFINE group. We do this
7813     in the real compile phase, not in the pre-pass, where the whole group may
7814     not be available. */
7815 
7816     if (bravalue == OP_COND && lengthptr == NULL)
7817       {
7818       pcre_uchar *tc = code;
7819       int condcount = 0;
7820 
7821       do {
7822          condcount++;
7823          tc += GET(tc,1);
7824          }
7825       while (*tc != OP_KET);
7826 
7827       /* A DEFINE group is never obeyed inline (the "condition" is always
7828       false). It must have only one branch. */
7829 
7830       if (code[LINK_SIZE+1] == OP_DEF)
7831         {
7832         if (condcount > 1)
7833           {
7834           *errorcodeptr = ERR54;
7835           goto FAILED;
7836           }
7837         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
7838         }
7839 
7840       /* A "normal" conditional group. If there is just one branch, we must not
7841       make use of its firstchar or reqchar, because this is equivalent to an
7842       empty second branch. */
7843 
7844       else
7845         {
7846         if (condcount > 2)
7847           {
7848           *errorcodeptr = ERR27;
7849           goto FAILED;
7850           }
7851         if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7852         }
7853       }
7854 
7855     /* Error if hit end of pattern */
7856 
7857     if (*ptr != CHAR_RIGHT_PARENTHESIS)
7858       {
7859       *errorcodeptr = ERR14;
7860       goto FAILED;
7861       }
7862 
7863     /* In the pre-compile phase, update the length by the length of the group,
7864     less the brackets at either end. Then reduce the compiled code to just a
7865     set of non-capturing brackets so that it doesn't use much memory if it is
7866     duplicated by a quantifier.*/
7867 
7868     if (lengthptr != NULL)
7869       {
7870       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7871         {
7872         *errorcodeptr = ERR20;
7873         goto FAILED;
7874         }
7875       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7876       code++;   /* This already contains bravalue */
7877       PUTINC(code, 0, 1 + LINK_SIZE);
7878       *code++ = OP_KET;
7879       PUTINC(code, 0, 1 + LINK_SIZE);
7880       break;    /* No need to waste time with special character handling */
7881       }
7882 
7883     /* Otherwise update the main code pointer to the end of the group. */
7884 
7885     code = tempcode;
7886 
7887     /* For a DEFINE group, required and first character settings are not
7888     relevant. */
7889 
7890     if (bravalue == OP_DEF) break;
7891 
7892     /* Handle updating of the required and first characters for other types of
7893     group. Update for normal brackets of all kinds, and conditions with two
7894     branches (see code above). If the bracket is followed by a quantifier with
7895     zero repeat, we have to back off. Hence the definition of zeroreqchar and
7896     zerofirstchar outside the main loop so that they can be accessed for the
7897     back off. */
7898 
7899     zeroreqchar = reqchar;
7900     zeroreqcharflags = reqcharflags;
7901     zerofirstchar = firstchar;
7902     zerofirstcharflags = firstcharflags;
7903     groupsetfirstchar = FALSE;
7904 
7905     if (bravalue >= OP_ONCE)
7906       {
7907       /* If we have not yet set a firstchar in this branch, take it from the
7908       subpattern, remembering that it was set here so that a repeat of more
7909       than one can replicate it as reqchar if necessary. If the subpattern has
7910       no firstchar, set "none" for the whole branch. In both cases, a zero
7911       repeat forces firstchar to "none". */
7912 
7913       if (firstcharflags == REQ_UNSET)
7914         {
7915         if (subfirstcharflags >= 0)
7916           {
7917           firstchar = subfirstchar;
7918           firstcharflags = subfirstcharflags;
7919           groupsetfirstchar = TRUE;
7920           }
7921         else firstcharflags = REQ_NONE;
7922         zerofirstcharflags = REQ_NONE;
7923         }
7924 
7925       /* If firstchar was previously set, convert the subpattern's firstchar
7926       into reqchar if there wasn't one, using the vary flag that was in
7927       existence beforehand. */
7928 
7929       else if (subfirstcharflags >= 0 && subreqcharflags < 0)
7930         {
7931         subreqchar = subfirstchar;
7932         subreqcharflags = subfirstcharflags | tempreqvary;
7933         }
7934 
7935       /* If the subpattern set a required byte (or set a first byte that isn't
7936       really the first byte - see above), set it. */
7937 
7938       if (subreqcharflags >= 0)
7939         {
7940         reqchar = subreqchar;
7941         reqcharflags = subreqcharflags;
7942         }
7943       }
7944 
7945     /* For a forward assertion, we take the reqchar, if set, provided that the
7946     group has also set a first char. This can be helpful if the pattern that
7947     follows the assertion doesn't set a different char. For example, it's
7948     useful for /(?=abcde).+/. We can't set firstchar for an assertion, however
7949     because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7950     the "real" "a" would then become a reqchar instead of a firstchar. This is
7951     overcome by a scan at the end if there's no firstchar, looking for an
7952     asserted first char. */
7953 
7954     else if (bravalue == OP_ASSERT && subreqcharflags >= 0 &&
7955              subfirstcharflags >= 0)
7956       {
7957       reqchar = subreqchar;
7958       reqcharflags = subreqcharflags;
7959       }
7960     break;     /* End of processing '(' */
7961 
7962 
7963     /* ===================================================================*/
7964     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7965     are arranged to be the negation of the corresponding OP_values in the
7966     default case when PCRE_UCP is not set. For the back references, the values
7967     are negative the reference number. Only back references and those types
7968     that consume a character may be repeated. We can test for values between
7969     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7970     ever created. */
7971 
7972     case CHAR_BACKSLASH:
7973     tempptr = ptr;
7974     escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7975     if (*errorcodeptr != 0) goto FAILED;
7976 
7977     if (escape == 0)                  /* The escape coded a single character */
7978       c = ec;
7979     else
7980       {
7981       /* For metasequences that actually match a character, we disable the
7982       setting of a first character if it hasn't already been set. */
7983 
7984       if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7985         firstcharflags = REQ_NONE;
7986 
7987       /* Set values to reset to if this is followed by a zero repeat. */
7988 
7989       zerofirstchar = firstchar;
7990       zerofirstcharflags = firstcharflags;
7991       zeroreqchar = reqchar;
7992       zeroreqcharflags = reqcharflags;
7993 
7994       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7995       is a subroutine call by number (Oniguruma syntax). In fact, the value
7996       ESC_g is returned only for these cases. So we don't need to check for <
7997       or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7998       -n, and for the Perl syntax \g{name} the result is ESC_k (as
7999       that is a synonym for a named back reference). */
8000 
8001       if (escape == ESC_g)
8002         {
8003         const pcre_uchar *p;
8004         pcre_uint32 cf;
8005 
8006         item_hwm_offset = cd->hwm - cd->start_workspace;   /* Normally this is set when '(' is read */
8007         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
8008           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
8009 
8010         /* These two statements stop the compiler from warning about possibly
8011         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
8012         fact, because we do the check for a number below, the paths that
8013         would actually be in error are never taken. */
8014 
8015         skipbytes = 0;
8016         reset_bracount = FALSE;
8017 
8018         /* If it's not a signed or unsigned number, treat it as a name. */
8019 
8020         cf = ptr[1];
8021         if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
8022           {
8023           is_recurse = TRUE;
8024           goto NAMED_REF_OR_RECURSE;
8025           }
8026 
8027         /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
8028         or a digit. */
8029 
8030         p = ptr + 2;
8031         while (IS_DIGIT(*p)) p++;
8032         if (*p != (pcre_uchar)terminator)
8033           {
8034           *errorcodeptr = ERR57;
8035           goto FAILED;
8036           }
8037         ptr++;
8038         goto HANDLE_NUMERICAL_RECURSION;
8039         }
8040 
8041       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
8042       We also support \k{name} (.NET syntax).  */
8043 
8044       if (escape == ESC_k)
8045         {
8046         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
8047           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
8048           {
8049           *errorcodeptr = ERR69;
8050           goto FAILED;
8051           }
8052         is_recurse = FALSE;
8053         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
8054           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
8055           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
8056         goto NAMED_REF_OR_RECURSE;
8057         }
8058 
8059       /* Back references are handled specially; must disable firstchar if
8060       not set to cope with cases like (?=(\w+))\1: which would otherwise set
8061       ':' later. */
8062 
8063       if (escape < 0)
8064         {
8065         open_capitem *oc;
8066         recno = -escape;
8067 
8068         /* Come here from named backref handling when the reference is to a
8069         single group (i.e. not to a duplicated name). */
8070 
8071         HANDLE_REFERENCE:
8072         if (firstcharflags == REQ_UNSET) zerofirstcharflags = firstcharflags = REQ_NONE;
8073         previous = code;
8074         item_hwm_offset = cd->hwm - cd->start_workspace;
8075         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
8076         PUT2INC(code, 0, recno);
8077         cd->backref_map |= (recno < 32)? (1U << recno) : 1;
8078         if (recno > cd->top_backref) cd->top_backref = recno;
8079 
8080         /* Check to see if this back reference is recursive, that is, it
8081         is inside the group that it references. A flag is set so that the
8082         group can be made atomic. */
8083 
8084         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
8085           {
8086           if (oc->number == recno)
8087             {
8088             oc->flag = TRUE;
8089             break;
8090             }
8091           }
8092         }
8093 
8094       /* So are Unicode property matches, if supported. */
8095 
8096 #ifdef SUPPORT_UCP
8097       else if (escape == ESC_P || escape == ESC_p)
8098         {
8099         BOOL negated;
8100         unsigned int ptype = 0, pdata = 0;
8101         if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8102           goto FAILED;
8103         previous = code;
8104         item_hwm_offset = cd->hwm - cd->start_workspace;
8105         *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8106         *code++ = ptype;
8107         *code++ = pdata;
8108         }
8109 #else
8110 
8111       /* If Unicode properties are not supported, \X, \P, and \p are not
8112       allowed. */
8113 
8114       else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
8115         {
8116         *errorcodeptr = ERR45;
8117         goto FAILED;
8118         }
8119 #endif
8120 
8121       /* For the rest (including \X when Unicode properties are supported), we
8122       can obtain the OP value by negating the escape value in the default
8123       situation when PCRE_UCP is not set. When it *is* set, we substitute
8124       Unicode property tests. Note that \b and \B do a one-character
8125       lookbehind, and \A also behaves as if it does. */
8126 
8127       else
8128         {
8129         if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
8130              cd->max_lookbehind == 0)
8131           cd->max_lookbehind = 1;
8132 #ifdef SUPPORT_UCP
8133         if (escape >= ESC_DU && escape <= ESC_wu)
8134           {
8135           nestptr = ptr + 1;                   /* Where to resume */
8136           ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
8137           }
8138         else
8139 #endif
8140         /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
8141         so that it works in DFA mode and in lookbehinds. */
8142 
8143           {
8144           previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
8145           item_hwm_offset = cd->hwm - cd->start_workspace;
8146           *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
8147           }
8148         }
8149       continue;
8150       }
8151 
8152     /* We have a data character whose value is in c. In UTF-8 mode it may have
8153     a value > 127. We set its representation in the length/buffer, and then
8154     handle it as a data character. */
8155 
8156 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
8157     if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
8158       mclength = PRIV(ord2utf)(c, mcbuffer);
8159     else
8160 #endif
8161 
8162      {
8163      mcbuffer[0] = c;
8164      mclength = 1;
8165      }
8166     goto ONE_CHAR;
8167 
8168 
8169     /* ===================================================================*/
8170     /* Handle a literal character. It is guaranteed not to be whitespace or #
8171     when the extended flag is set. If we are in a UTF mode, it may be a
8172     multi-unit literal character. */
8173 
8174     default:
8175     NORMAL_CHAR:
8176     mclength = 1;
8177     mcbuffer[0] = c;
8178 
8179 #ifdef SUPPORT_UTF
8180     if (utf && HAS_EXTRALEN(c))
8181       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
8182 #endif
8183 
8184     /* At this point we have the character's bytes in mcbuffer, and the length
8185     in mclength. When not in UTF-8 mode, the length is always 1. */
8186 
8187     ONE_CHAR:
8188     previous = code;
8189     item_hwm_offset = cd->hwm - cd->start_workspace;
8190 
8191     /* For caseless UTF-8 mode when UCP support is available, check whether
8192     this character has more than one other case. If so, generate a special
8193     OP_PROP item instead of OP_CHARI. */
8194 
8195 #ifdef SUPPORT_UCP
8196     if (utf && (options & PCRE_CASELESS) != 0)
8197       {
8198       GETCHAR(c, mcbuffer);
8199       if ((c = UCD_CASESET(c)) != 0)
8200         {
8201         *code++ = OP_PROP;
8202         *code++ = PT_CLIST;
8203         *code++ = c;
8204         if (firstcharflags == REQ_UNSET)
8205           firstcharflags = zerofirstcharflags = REQ_NONE;
8206         break;
8207         }
8208       }
8209 #endif
8210 
8211     /* Caseful matches, or not one of the multicase characters. */
8212 
8213     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8214     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
8215 
8216     /* Remember if \r or \n were seen */
8217 
8218     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8219       cd->external_flags |= PCRE_HASCRORLF;
8220 
8221     /* Set the first and required bytes appropriately. If no previous first
8222     byte, set it from this character, but revert to none on a zero repeat.
8223     Otherwise, leave the firstchar value alone, and don't change it on a zero
8224     repeat. */
8225 
8226     if (firstcharflags == REQ_UNSET)
8227       {
8228       zerofirstcharflags = REQ_NONE;
8229       zeroreqchar = reqchar;
8230       zeroreqcharflags = reqcharflags;
8231 
8232       /* If the character is more than one byte long, we can set firstchar
8233       only if it is not to be matched caselessly. */
8234 
8235       if (mclength == 1 || req_caseopt == 0)
8236         {
8237         firstchar = mcbuffer[0];
8238         firstcharflags = req_caseopt;
8239 
8240         if (mclength != 1)
8241           {
8242           reqchar = code[-1];
8243           reqcharflags = cd->req_varyopt;
8244           }
8245         }
8246       else firstcharflags = reqcharflags = REQ_NONE;
8247       }
8248 
8249     /* firstchar was previously set; we can set reqchar only if the length is
8250     1 or the matching is caseful. */
8251 
8252     else
8253       {
8254       zerofirstchar = firstchar;
8255       zerofirstcharflags = firstcharflags;
8256       zeroreqchar = reqchar;
8257       zeroreqcharflags = reqcharflags;
8258       if (mclength == 1 || req_caseopt == 0)
8259         {
8260         reqchar = code[-1];
8261         reqcharflags = req_caseopt | cd->req_varyopt;
8262         }
8263       }
8264 
8265     break;            /* End of literal character handling */
8266     }
8267   }                   /* end of big loop */
8268 
8269 
8270 /* Control never reaches here by falling through, only by a goto for all the
8271 error states. Pass back the position in the pattern so that it can be displayed
8272 to the user for diagnosing the error. */
8273 
8274 FAILED:
8275 *ptrptr = ptr;
8276 return FALSE;
8277 }
8278 
8279 
8280 
8281 /*************************************************
8282 *     Compile sequence of alternatives           *
8283 *************************************************/
8284 
8285 /* On entry, ptr is pointing past the bracket character, but on return it
8286 points to the closing bracket, or vertical bar, or end of string. The code
8287 variable is pointing at the byte into which the BRA operator has been stored.
8288 This function is used during the pre-compile phase when we are trying to find
8289 out the amount of memory needed, as well as during the real compile phase. The
8290 value of lengthptr distinguishes the two phases.
8291 
8292 Arguments:
8293   options           option bits, including any changes for this subpattern
8294   codeptr           -> the address of the current code pointer
8295   ptrptr            -> the address of the current pattern pointer
8296   errorcodeptr      -> pointer to error code variable
8297   lookbehind        TRUE if this is a lookbehind assertion
8298   reset_bracount    TRUE to reset the count for each branch
8299   skipbytes         skip this many bytes at start (for brackets and OP_COND)
8300   cond_depth        depth of nesting for conditional subpatterns
8301   firstcharptr      place to put the first required character
8302   firstcharflagsptr place to put the first character flags, or a negative number
8303   reqcharptr        place to put the last required character
8304   reqcharflagsptr   place to put the last required character flags, or a negative number
8305   bcptr             pointer to the chain of currently open branches
8306   cd                points to the data block with tables pointers etc.
8307   lengthptr         NULL during the real compile phase
8308                     points to length accumulator during pre-compile phase
8309 
8310 Returns:            TRUE on success
8311 */
8312 
8313 static BOOL
compile_regex(int options,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,BOOL lookbehind,BOOL reset_bracount,int skipbytes,int cond_depth,pcre_uint32 * firstcharptr,pcre_int32 * firstcharflagsptr,pcre_uint32 * reqcharptr,pcre_int32 * reqcharflagsptr,branch_chain * bcptr,compile_data * cd,int * lengthptr)8314 compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
8315   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
8316   int cond_depth,
8317   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
8318   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
8319   branch_chain *bcptr, compile_data *cd, int *lengthptr)
8320 {
8321 const pcre_uchar *ptr = *ptrptr;
8322 pcre_uchar *code = *codeptr;
8323 pcre_uchar *last_branch = code;
8324 pcre_uchar *start_bracket = code;
8325 pcre_uchar *reverse_count = NULL;
8326 open_capitem capitem;
8327 int capnumber = 0;
8328 pcre_uint32 firstchar, reqchar;
8329 pcre_int32 firstcharflags, reqcharflags;
8330 pcre_uint32 branchfirstchar, branchreqchar;
8331 pcre_int32 branchfirstcharflags, branchreqcharflags;
8332 int length;
8333 unsigned int orig_bracount;
8334 unsigned int max_bracount;
8335 branch_chain bc;
8336 size_t save_hwm_offset;
8337 
8338 /* If set, call the external function that checks for stack availability. */
8339 
8340 if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
8341   {
8342   *errorcodeptr= ERR85;
8343   return FALSE;
8344   }
8345 
8346 /* Miscellaneous initialization */
8347 
8348 bc.outer = bcptr;
8349 bc.current_branch = code;
8350 
8351 firstchar = reqchar = 0;
8352 firstcharflags = reqcharflags = REQ_UNSET;
8353 
8354 save_hwm_offset = cd->hwm - cd->start_workspace;
8355 
8356 /* Accumulate the length for use in the pre-compile phase. Start with the
8357 length of the BRA and KET and any extra bytes that are required at the
8358 beginning. We accumulate in a local variable to save frequent testing of
8359 lengthptr for NULL. We cannot do this by looking at the value of code at the
8360 start and end of each alternative, because compiled items are discarded during
8361 the pre-compile phase so that the work space is not exceeded. */
8362 
8363 length = 2 + 2*LINK_SIZE + skipbytes;
8364 
8365 /* WARNING: If the above line is changed for any reason, you must also change
8366 the code that abstracts option settings at the start of the pattern and makes
8367 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
8368 pre-compile phase to find out whether anything has yet been compiled or not. */
8369 
8370 /* If this is a capturing subpattern, add to the chain of open capturing items
8371 so that we can detect them if (*ACCEPT) is encountered. This is also used to
8372 detect groups that contain recursive back references to themselves. Note that
8373 only OP_CBRA need be tested here; changing this opcode to one of its variants,
8374 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
8375 
8376 if (*code == OP_CBRA)
8377   {
8378   capnumber = GET2(code, 1 + LINK_SIZE);
8379   capitem.number = capnumber;
8380   capitem.next = cd->open_caps;
8381   capitem.flag = FALSE;
8382   cd->open_caps = &capitem;
8383   }
8384 
8385 /* Offset is set zero to mark that this bracket is still open */
8386 
8387 PUT(code, 1, 0);
8388 code += 1 + LINK_SIZE + skipbytes;
8389 
8390 /* Loop for each alternative branch */
8391 
8392 orig_bracount = max_bracount = cd->bracount;
8393 for (;;)
8394   {
8395   /* For a (?| group, reset the capturing bracket count so that each branch
8396   uses the same numbers. */
8397 
8398   if (reset_bracount) cd->bracount = orig_bracount;
8399 
8400   /* Set up dummy OP_REVERSE if lookbehind assertion */
8401 
8402   if (lookbehind)
8403     {
8404     *code++ = OP_REVERSE;
8405     reverse_count = code;
8406     PUTINC(code, 0, 0);
8407     length += 1 + LINK_SIZE;
8408     }
8409 
8410   /* Now compile the branch; in the pre-compile phase its length gets added
8411   into the length. */
8412 
8413   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
8414         &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
8415         cond_depth, cd, (lengthptr == NULL)? NULL : &length))
8416     {
8417     *ptrptr = ptr;
8418     return FALSE;
8419     }
8420 
8421   /* Keep the highest bracket count in case (?| was used and some branch
8422   has fewer than the rest. */
8423 
8424   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
8425 
8426   /* In the real compile phase, there is some post-processing to be done. */
8427 
8428   if (lengthptr == NULL)
8429     {
8430     /* If this is the first branch, the firstchar and reqchar values for the
8431     branch become the values for the regex. */
8432 
8433     if (*last_branch != OP_ALT)
8434       {
8435       firstchar = branchfirstchar;
8436       firstcharflags = branchfirstcharflags;
8437       reqchar = branchreqchar;
8438       reqcharflags = branchreqcharflags;
8439       }
8440 
8441     /* If this is not the first branch, the first char and reqchar have to
8442     match the values from all the previous branches, except that if the
8443     previous value for reqchar didn't have REQ_VARY set, it can still match,
8444     and we set REQ_VARY for the regex. */
8445 
8446     else
8447       {
8448       /* If we previously had a firstchar, but it doesn't match the new branch,
8449       we have to abandon the firstchar for the regex, but if there was
8450       previously no reqchar, it takes on the value of the old firstchar. */
8451 
8452       if (firstcharflags >= 0 &&
8453           (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
8454         {
8455         if (reqcharflags < 0)
8456           {
8457           reqchar = firstchar;
8458           reqcharflags = firstcharflags;
8459           }
8460         firstcharflags = REQ_NONE;
8461         }
8462 
8463       /* If we (now or from before) have no firstchar, a firstchar from the
8464       branch becomes a reqchar if there isn't a branch reqchar. */
8465 
8466       if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
8467         {
8468         branchreqchar = branchfirstchar;
8469         branchreqcharflags = branchfirstcharflags;
8470         }
8471 
8472       /* Now ensure that the reqchars match */
8473 
8474       if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
8475           reqchar != branchreqchar)
8476         reqcharflags = REQ_NONE;
8477       else
8478         {
8479         reqchar = branchreqchar;
8480         reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
8481         }
8482       }
8483 
8484     /* If lookbehind, check that this branch matches a fixed-length string, and
8485     put the length into the OP_REVERSE item. Temporarily mark the end of the
8486     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
8487     because there may be forward references that we can't check here. Set a
8488     flag to cause another lookbehind check at the end. Why not do it all at the
8489     end? Because common, erroneous checks are picked up here and the offset of
8490     the problem can be shown. */
8491 
8492     if (lookbehind)
8493       {
8494       int fixed_length;
8495       *code = OP_END;
8496       fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
8497         FALSE, cd, NULL);
8498       DPRINTF(("fixed length = %d\n", fixed_length));
8499       if (fixed_length == -3)
8500         {
8501         cd->check_lookbehind = TRUE;
8502         }
8503       else if (fixed_length < 0)
8504         {
8505         *errorcodeptr = (fixed_length == -2)? ERR36 :
8506                         (fixed_length == -4)? ERR70: ERR25;
8507         *ptrptr = ptr;
8508         return FALSE;
8509         }
8510       else
8511         {
8512         if (fixed_length > cd->max_lookbehind)
8513           cd->max_lookbehind = fixed_length;
8514         PUT(reverse_count, 0, fixed_length);
8515         }
8516       }
8517     }
8518 
8519   /* Reached end of expression, either ')' or end of pattern. In the real
8520   compile phase, go back through the alternative branches and reverse the chain
8521   of offsets, with the field in the BRA item now becoming an offset to the
8522   first alternative. If there are no alternatives, it points to the end of the
8523   group. The length in the terminating ket is always the length of the whole
8524   bracketed item. Return leaving the pointer at the terminating char. */
8525 
8526   if (*ptr != CHAR_VERTICAL_LINE)
8527     {
8528     if (lengthptr == NULL)
8529       {
8530       int branch_length = (int)(code - last_branch);
8531       do
8532         {
8533         int prev_length = GET(last_branch, 1);
8534         PUT(last_branch, 1, branch_length);
8535         branch_length = prev_length;
8536         last_branch -= branch_length;
8537         }
8538       while (branch_length > 0);
8539       }
8540 
8541     /* Fill in the ket */
8542 
8543     *code = OP_KET;
8544     PUT(code, 1, (int)(code - start_bracket));
8545     code += 1 + LINK_SIZE;
8546 
8547     /* If it was a capturing subpattern, check to see if it contained any
8548     recursive back references. If so, we must wrap it in atomic brackets.
8549     Because we are moving code along, we must ensure that any pending recursive
8550     references are updated. In any event, remove the block from the chain. */
8551 
8552     if (capnumber > 0)
8553       {
8554       if (cd->open_caps->flag)
8555         {
8556         *code = OP_END;
8557         adjust_recurse(start_bracket, 1 + LINK_SIZE,
8558           (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
8559         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8560           IN_UCHARS(code - start_bracket));
8561         *start_bracket = OP_ONCE;
8562         code += 1 + LINK_SIZE;
8563         PUT(start_bracket, 1, (int)(code - start_bracket));
8564         *code = OP_KET;
8565         PUT(code, 1, (int)(code - start_bracket));
8566         code += 1 + LINK_SIZE;
8567         length += 2 + 2*LINK_SIZE;
8568         }
8569       cd->open_caps = cd->open_caps->next;
8570       }
8571 
8572     /* Retain the highest bracket number, in case resetting was used. */
8573 
8574     cd->bracount = max_bracount;
8575 
8576     /* Set values to pass back */
8577 
8578     *codeptr = code;
8579     *ptrptr = ptr;
8580     *firstcharptr = firstchar;
8581     *firstcharflagsptr = firstcharflags;
8582     *reqcharptr = reqchar;
8583     *reqcharflagsptr = reqcharflags;
8584     if (lengthptr != NULL)
8585       {
8586       if (OFLOW_MAX - *lengthptr < length)
8587         {
8588         *errorcodeptr = ERR20;
8589         return FALSE;
8590         }
8591       *lengthptr += length;
8592       }
8593     return TRUE;
8594     }
8595 
8596   /* Another branch follows. In the pre-compile phase, we can move the code
8597   pointer back to where it was for the start of the first branch. (That is,
8598   pretend that each branch is the only one.)
8599 
8600   In the real compile phase, insert an ALT node. Its length field points back
8601   to the previous branch while the bracket remains open. At the end the chain
8602   is reversed. It's done like this so that the start of the bracket has a
8603   zero offset until it is closed, making it possible to detect recursion. */
8604 
8605   if (lengthptr != NULL)
8606     {
8607     code = *codeptr + 1 + LINK_SIZE + skipbytes;
8608     length += 1 + LINK_SIZE;
8609     }
8610   else
8611     {
8612     *code = OP_ALT;
8613     PUT(code, 1, (int)(code - last_branch));
8614     bc.current_branch = last_branch = code;
8615     code += 1 + LINK_SIZE;
8616     }
8617 
8618   ptr++;
8619   }
8620 /* Control never reaches here */
8621 }
8622 
8623 
8624 
8625 
8626 /*************************************************
8627 *          Check for anchored expression         *
8628 *************************************************/
8629 
8630 /* Try to find out if this is an anchored regular expression. Consider each
8631 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8632 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8633 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8634 be found, because ^ generates OP_CIRCM in that mode.
8635 
8636 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8637 This is the code for \G, which means "match at start of match position, taking
8638 into account the match offset".
8639 
8640 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8641 because that will try the rest of the pattern at all possible matching points,
8642 so there is no point trying again.... er ....
8643 
8644 .... except when the .* appears inside capturing parentheses, and there is a
8645 subsequent back reference to those parentheses. We haven't enough information
8646 to catch that case precisely.
8647 
8648 At first, the best we could do was to detect when .* was in capturing brackets
8649 and the highest back reference was greater than or equal to that level.
8650 However, by keeping a bitmap of the first 31 back references, we can catch some
8651 of the more common cases more precisely.
8652 
8653 ... A second exception is when the .* appears inside an atomic group, because
8654 this prevents the number of characters it matches from being adjusted.
8655 
8656 Arguments:
8657   code           points to start of expression (the bracket)
8658   bracket_map    a bitmap of which brackets we are inside while testing; this
8659                   handles up to substring 31; after that we just have to take
8660                   the less precise approach
8661   cd             points to the compile data block
8662   atomcount      atomic group level
8663 
8664 Returns:     TRUE or FALSE
8665 */
8666 
8667 static BOOL
is_anchored(register const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount)8668 is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8669   compile_data *cd, int atomcount)
8670 {
8671 do {
8672    const pcre_uchar *scode = first_significant_code(
8673      code + PRIV(OP_lengths)[*code], FALSE);
8674    register int op = *scode;
8675 
8676    /* Non-capturing brackets */
8677 
8678    if (op == OP_BRA  || op == OP_BRAPOS ||
8679        op == OP_SBRA || op == OP_SBRAPOS)
8680      {
8681      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8682      }
8683 
8684    /* Capturing brackets */
8685 
8686    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8687             op == OP_SCBRA || op == OP_SCBRAPOS)
8688      {
8689      int n = GET2(scode, 1+LINK_SIZE);
8690      int new_map = bracket_map | ((n < 32)? (1U << n) : 1);
8691      if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8692      }
8693 
8694    /* Positive forward assertion */
8695 
8696    else if (op == OP_ASSERT)
8697      {
8698      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8699      }
8700 
8701    /* Condition; not anchored if no second branch */
8702 
8703    else if (op == OP_COND)
8704      {
8705      if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8706      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8707      }
8708 
8709    /* Atomic groups */
8710 
8711    else if (op == OP_ONCE || op == OP_ONCE_NC)
8712      {
8713      if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8714        return FALSE;
8715      }
8716 
8717    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8718    it isn't in brackets that are or may be referenced or inside an atomic
8719    group. */
8720 
8721    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8722              op == OP_TYPEPOSSTAR))
8723      {
8724      if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
8725          atomcount > 0 || cd->had_pruneorskip)
8726        return FALSE;
8727      }
8728 
8729    /* Check for explicit anchoring */
8730 
8731    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8732 
8733    code += GET(code, 1);
8734    }
8735 while (*code == OP_ALT);   /* Loop for each alternative */
8736 return TRUE;
8737 }
8738 
8739 
8740 
8741 /*************************************************
8742 *         Check for starting with ^ or .*        *
8743 *************************************************/
8744 
8745 /* This is called to find out if every branch starts with ^ or .* so that
8746 "first char" processing can be done to speed things up in multiline
8747 matching and for non-DOTALL patterns that start with .* (which must start at
8748 the beginning or after \n). As in the case of is_anchored() (see above), we
8749 have to take account of back references to capturing brackets that contain .*
8750 because in that case we can't make the assumption. Also, the appearance of .*
8751 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8752 or *SKIP does not count, because once again the assumption no longer holds.
8753 
8754 Arguments:
8755   code           points to start of expression (the bracket)
8756   bracket_map    a bitmap of which brackets we are inside while testing; this
8757                   handles up to substring 31; after that we just have to take
8758                   the less precise approach
8759   cd             points to the compile data
8760   atomcount      atomic group level
8761   inassert       TRUE if in an assertion
8762 
8763 Returns:         TRUE or FALSE
8764 */
8765 
8766 static BOOL
is_startline(const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount,BOOL inassert)8767 is_startline(const pcre_uchar *code, unsigned int bracket_map,
8768   compile_data *cd, int atomcount, BOOL inassert)
8769 {
8770 do {
8771    const pcre_uchar *scode = first_significant_code(
8772      code + PRIV(OP_lengths)[*code], FALSE);
8773    register int op = *scode;
8774 
8775    /* If we are at the start of a conditional assertion group, *both* the
8776    conditional assertion *and* what follows the condition must satisfy the test
8777    for start of line. Other kinds of condition fail. Note that there may be an
8778    auto-callout at the start of a condition. */
8779 
8780    if (op == OP_COND)
8781      {
8782      scode += 1 + LINK_SIZE;
8783      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8784      switch (*scode)
8785        {
8786        case OP_CREF:
8787        case OP_DNCREF:
8788        case OP_RREF:
8789        case OP_DNRREF:
8790        case OP_DEF:
8791        case OP_FAIL:
8792        return FALSE;
8793 
8794        default:     /* Assertion */
8795        if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
8796        do scode += GET(scode, 1); while (*scode == OP_ALT);
8797        scode += 1 + LINK_SIZE;
8798        break;
8799        }
8800      scode = first_significant_code(scode, FALSE);
8801      op = *scode;
8802      }
8803 
8804    /* Non-capturing brackets */
8805 
8806    if (op == OP_BRA  || op == OP_BRAPOS ||
8807        op == OP_SBRA || op == OP_SBRAPOS)
8808      {
8809      if (!is_startline(scode, bracket_map, cd, atomcount, inassert)) return FALSE;
8810      }
8811 
8812    /* Capturing brackets */
8813 
8814    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8815             op == OP_SCBRA || op == OP_SCBRAPOS)
8816      {
8817      int n = GET2(scode, 1+LINK_SIZE);
8818      int new_map = bracket_map | ((n < 32)? (1U << n) : 1);
8819      if (!is_startline(scode, new_map, cd, atomcount, inassert)) return FALSE;
8820      }
8821 
8822    /* Positive forward assertions */
8823 
8824    else if (op == OP_ASSERT)
8825      {
8826      if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
8827      }
8828 
8829    /* Atomic brackets */
8830 
8831    else if (op == OP_ONCE || op == OP_ONCE_NC)
8832      {
8833      if (!is_startline(scode, bracket_map, cd, atomcount + 1, inassert)) return FALSE;
8834      }
8835 
8836    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8837    brackets that may be referenced or an assertion, as long as the pattern does
8838    not contain *PRUNE or *SKIP, because these break the feature. Consider, for
8839    example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e.
8840    not at the start of a line. */
8841 
8842    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8843      {
8844      if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
8845          atomcount > 0 || cd->had_pruneorskip || inassert)
8846        return FALSE;
8847      }
8848 
8849    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8850    in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8851    because the number of characters matched by .* cannot be adjusted inside
8852    them. */
8853 
8854    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8855 
8856    /* Move on to the next alternative */
8857 
8858    code += GET(code, 1);
8859    }
8860 while (*code == OP_ALT);  /* Loop for each alternative */
8861 return TRUE;
8862 }
8863 
8864 
8865 
8866 /*************************************************
8867 *       Check for asserted fixed first char      *
8868 *************************************************/
8869 
8870 /* During compilation, the "first char" settings from forward assertions are
8871 discarded, because they can cause conflicts with actual literals that follow.
8872 However, if we end up without a first char setting for an unanchored pattern,
8873 it is worth scanning the regex to see if there is an initial asserted first
8874 char. If all branches start with the same asserted char, or with a
8875 non-conditional bracket all of whose alternatives start with the same asserted
8876 char (recurse ad lib), then we return that char, with the flags set to zero or
8877 REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8878 
8879 Arguments:
8880   code       points to start of expression (the bracket)
8881   flags      points to the first char flags, or to REQ_NONE
8882   inassert   TRUE if in an assertion
8883 
8884 Returns:     the fixed first char, or 0 with REQ_NONE in flags
8885 */
8886 
8887 static pcre_uint32
find_firstassertedchar(const pcre_uchar * code,pcre_int32 * flags,BOOL inassert)8888 find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8889   BOOL inassert)
8890 {
8891 register pcre_uint32 c = 0;
8892 int cflags = REQ_NONE;
8893 
8894 *flags = REQ_NONE;
8895 do {
8896    pcre_uint32 d;
8897    int dflags;
8898    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8899              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8900    const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8901      TRUE);
8902    register pcre_uchar op = *scode;
8903 
8904    switch(op)
8905      {
8906      default:
8907      return 0;
8908 
8909      case OP_BRA:
8910      case OP_BRAPOS:
8911      case OP_CBRA:
8912      case OP_SCBRA:
8913      case OP_CBRAPOS:
8914      case OP_SCBRAPOS:
8915      case OP_ASSERT:
8916      case OP_ONCE:
8917      case OP_ONCE_NC:
8918      d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8919      if (dflags < 0)
8920        return 0;
8921      if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
8922      break;
8923 
8924      case OP_EXACT:
8925      scode += IMM2_SIZE;
8926      /* Fall through */
8927 
8928      case OP_CHAR:
8929      case OP_PLUS:
8930      case OP_MINPLUS:
8931      case OP_POSPLUS:
8932      if (!inassert) return 0;
8933      if (cflags < 0) { c = scode[1]; cflags = 0; }
8934        else if (c != scode[1]) return 0;
8935      break;
8936 
8937      case OP_EXACTI:
8938      scode += IMM2_SIZE;
8939      /* Fall through */
8940 
8941      case OP_CHARI:
8942      case OP_PLUSI:
8943      case OP_MINPLUSI:
8944      case OP_POSPLUSI:
8945      if (!inassert) return 0;
8946      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8947        else if (c != scode[1]) return 0;
8948      break;
8949      }
8950 
8951    code += GET(code, 1);
8952    }
8953 while (*code == OP_ALT);
8954 
8955 *flags = cflags;
8956 return c;
8957 }
8958 
8959 
8960 
8961 /*************************************************
8962 *     Add an entry to the name/number table      *
8963 *************************************************/
8964 
8965 /* This function is called between compiling passes to add an entry to the
8966 name/number table, maintaining alphabetical order. Checking for permitted
8967 and forbidden duplicates has already been done.
8968 
8969 Arguments:
8970   cd           the compile data block
8971   name         the name to add
8972   length       the length of the name
8973   groupno      the group number
8974 
8975 Returns:       nothing
8976 */
8977 
8978 static void
add_name(compile_data * cd,const pcre_uchar * name,int length,unsigned int groupno)8979 add_name(compile_data *cd, const pcre_uchar *name, int length,
8980   unsigned int groupno)
8981 {
8982 int i;
8983 pcre_uchar *slot = cd->name_table;
8984 
8985 for (i = 0; i < cd->names_found; i++)
8986   {
8987   int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8988   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8989     crc = -1; /* Current name is a substring */
8990 
8991   /* Make space in the table and break the loop for an earlier name. For a
8992   duplicate or later name, carry on. We do this for duplicates so that in the
8993   simple case (when ?(| is not used) they are in order of their numbers. In all
8994   cases they are in the order in which they appear in the pattern. */
8995 
8996   if (crc < 0)
8997     {
8998     memmove(slot + cd->name_entry_size, slot,
8999       IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
9000     break;
9001     }
9002 
9003   /* Continue the loop for a later or duplicate name */
9004 
9005   slot += cd->name_entry_size;
9006   }
9007 
9008 PUT2(slot, 0, groupno);
9009 memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
9010 slot[IMM2_SIZE + length] = 0;
9011 cd->names_found++;
9012 }
9013 
9014 
9015 
9016 /*************************************************
9017 *        Compile a Regular Expression            *
9018 *************************************************/
9019 
9020 /* This function takes a string and returns a pointer to a block of store
9021 holding a compiled version of the expression. The original API for this
9022 function had no error code return variable; it is retained for backwards
9023 compatibility. The new function is given a new name.
9024 
9025 Arguments:
9026   pattern       the regular expression
9027   options       various option bits
9028   errorcodeptr  pointer to error code variable (pcre_compile2() only)
9029                   can be NULL if you don't want a code value
9030   errorptr      pointer to pointer to error text
9031   erroroffset   ptr offset in pattern where error was detected
9032   tables        pointer to character tables or NULL
9033 
9034 Returns:        pointer to compiled data block, or NULL on error,
9035                 with errorptr and erroroffset set
9036 */
9037 
/* Each library width gets its own complete definition of the original compile
entry point. In every case the function simply calls the corresponding
"compile2" function, passing NULL as the error code pointer and handing on all
the other arguments unchanged. */

#if defined COMPILE_PCRE8

PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE16

PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE32

PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#endif
9060 
9061 
9062 #if defined COMPILE_PCRE8
9063 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile2(const char * pattern,int options,int * errorcodeptr,const char ** errorptr,int * erroroffset,const unsigned char * tables)9064 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
9065   const char **errorptr, int *erroroffset, const unsigned char *tables)
9066 #elif defined COMPILE_PCRE16
9067 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
9068 pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
9069   const char **errorptr, int *erroroffset, const unsigned char *tables)
9070 #elif defined COMPILE_PCRE32
9071 PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
9072 pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
9073   const char **errorptr, int *erroroffset, const unsigned char *tables)
9074 #endif
9075 {
9076 REAL_PCRE *re;
9077 int length = 1;  /* For final END opcode */
9078 pcre_int32 firstcharflags, reqcharflags;
9079 pcre_uint32 firstchar, reqchar;
9080 pcre_uint32 limit_match = PCRE_UINT32_MAX;
9081 pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
9082 int newline;
9083 int errorcode = 0;
9084 int skipatstart = 0;
9085 BOOL utf;
9086 BOOL never_utf = FALSE;
9087 size_t size;
9088 pcre_uchar *code;
9089 const pcre_uchar *codestart;
9090 const pcre_uchar *ptr;
9091 compile_data compile_block;
9092 compile_data *cd = &compile_block;
9093 
9094 /* This space is used for "compiling" into during the first phase, when we are
9095 computing the amount of memory that is needed. Compiled items are thrown away
9096 as soon as possible, so that a fairly large buffer should be sufficient for
9097 this purpose. The same space is used in the second phase for remembering where
9098 to fill in forward references to subpatterns. That may overflow, in which case
9099 new memory is obtained from malloc(). */
9100 
9101 pcre_uchar cworkspace[COMPILE_WORK_SIZE];
9102 
9103 /* This vector is used for remembering name groups during the pre-compile. In a
9104 similar way to cworkspace, it can be expanded using malloc() if necessary. */
9105 
9106 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9107 
9108 /* Set this early so that early errors get offset 0. */
9109 
9110 ptr = (const pcre_uchar *)pattern;
9111 
9112 /* We can't pass back an error message if errorptr is NULL; I guess the best we
9113 can do is just return NULL, but we can set a code value if there is a code
9114 pointer. */
9115 
9116 if (errorptr == NULL)
9117   {
9118   if (errorcodeptr != NULL) *errorcodeptr = 99;
9119   return NULL;
9120   }
9121 
9122 *errorptr = NULL;
9123 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
9124 
9125 /* However, we can give a message for this error */
9126 
9127 if (erroroffset == NULL)
9128   {
9129   errorcode = ERR16;
9130   goto PCRE_EARLY_ERROR_RETURN2;
9131   }
9132 
9133 *erroroffset = 0;
9134 
9135 /* Set up pointers to the individual character tables */
9136 
9137 if (tables == NULL) tables = PRIV(default_tables);
9138 cd->lcc = tables + lcc_offset;
9139 cd->fcc = tables + fcc_offset;
9140 cd->cbits = tables + cbits_offset;
9141 cd->ctypes = tables + ctypes_offset;
9142 
9143 /* Check that all undefined public option bits are zero */
9144 
9145 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
9146   {
9147   errorcode = ERR17;
9148   goto PCRE_EARLY_ERROR_RETURN;
9149   }
9150 
9151 /* If PCRE_NEVER_UTF is set, remember it. */
9152 
9153 if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
9154 
9155 /* Check for global one-time settings at the start of the pattern, and remember
9156 the offset for later. */
9157 
9158 cd->external_flags = 0;   /* Initialize here for LIMIT_MATCH/RECURSION */
9159 
9160 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9161        ptr[skipatstart+1] == CHAR_ASTERISK)
9162   {
9163   int newnl = 0;
9164   int newbsr = 0;
9165 
9166 /* For completeness and backward compatibility, (*UTFn) is supported in the
9167 relevant libraries, but (*UTF) is generic and always supported. Note that
9168 PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
9169 
9170 #ifdef COMPILE_PCRE8
9171   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
9172     { skipatstart += 7; options |= PCRE_UTF8; continue; }
9173 #endif
9174 #ifdef COMPILE_PCRE16
9175   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
9176     { skipatstart += 8; options |= PCRE_UTF16; continue; }
9177 #endif
9178 #ifdef COMPILE_PCRE32
9179   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
9180     { skipatstart += 8; options |= PCRE_UTF32; continue; }
9181 #endif
9182 
9183   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
9184     { skipatstart += 6; options |= PCRE_UTF8; continue; }
9185   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
9186     { skipatstart += 6; options |= PCRE_UCP; continue; }
9187   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
9188     { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
9189   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
9190     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
9191 
9192   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
9193     {
9194     pcre_uint32 c = 0;
9195     int p = skipatstart + 14;
9196     while (isdigit(ptr[p]))
9197       {
9198       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow */
9199       c = c*10 + ptr[p++] - CHAR_0;
9200       }
9201     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9202     if (c < limit_match)
9203       {
9204       limit_match = c;
9205       cd->external_flags |= PCRE_MLSET;
9206       }
9207     skipatstart = p;
9208     continue;
9209     }
9210 
9211   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
9212     {
9213     pcre_uint32 c = 0;
9214     int p = skipatstart + 18;
9215     while (isdigit(ptr[p]))
9216       {
9217       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow check */
9218       c = c*10 + ptr[p++] - CHAR_0;
9219       }
9220     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9221     if (c < limit_recursion)
9222       {
9223       limit_recursion = c;
9224       cd->external_flags |= PCRE_RLSET;
9225       }
9226     skipatstart = p;
9227     continue;
9228     }
9229 
9230   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
9231     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
9232   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
9233     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
9234   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
9235     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9236   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
9237     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
9238   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
9239     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
9240 
9241   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
9242     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
9243   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
9244     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
9245 
9246   if (newnl != 0)
9247     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
9248   else if (newbsr != 0)
9249     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
9250   else break;
9251   }
9252 
9253 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
9254 utf = (options & PCRE_UTF8) != 0;
9255 if (utf && never_utf)
9256   {
9257   errorcode = ERR78;
9258   goto PCRE_EARLY_ERROR_RETURN2;
9259   }
9260 
9261 /* Can't support UTF unless PCRE has been compiled to include the code. The
9262 return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9263 release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9264 not used here. */
9265 
9266 #ifdef SUPPORT_UTF
9267 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
9268      (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
9269   {
9270 #if defined COMPILE_PCRE8
9271   errorcode = ERR44;
9272 #elif defined COMPILE_PCRE16
9273   errorcode = ERR74;
9274 #elif defined COMPILE_PCRE32
9275   errorcode = ERR77;
9276 #endif
9277   goto PCRE_EARLY_ERROR_RETURN2;
9278   }
9279 #else
9280 if (utf)
9281   {
9282   errorcode = ERR32;
9283   goto PCRE_EARLY_ERROR_RETURN;
9284   }
9285 #endif
9286 
9287 /* Can't support UCP unless PCRE has been compiled to include the code. */
9288 
9289 #ifndef SUPPORT_UCP
9290 if ((options & PCRE_UCP) != 0)
9291   {
9292   errorcode = ERR67;
9293   goto PCRE_EARLY_ERROR_RETURN;
9294   }
9295 #endif
9296 
9297 /* Check validity of \R options. */
9298 
9299 if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9300      (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9301   {
9302   errorcode = ERR56;
9303   goto PCRE_EARLY_ERROR_RETURN;
9304   }
9305 
9306 /* Handle different types of newline. The three bits give seven cases. The
9307 current code allows for fixed one- or two-byte sequences, plus "any" and
9308 "anycrlf". */
9309 
9310 switch (options & PCRE_NEWLINE_BITS)
9311   {
9312   case 0: newline = NEWLINE; break;   /* Build-time default */
9313   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9314   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9315   case PCRE_NEWLINE_CR+
9316        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9317   case PCRE_NEWLINE_ANY: newline = -1; break;
9318   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9319   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9320   }
9321 
9322 if (newline == -2)
9323   {
9324   cd->nltype = NLTYPE_ANYCRLF;
9325   }
9326 else if (newline < 0)
9327   {
9328   cd->nltype = NLTYPE_ANY;
9329   }
9330 else
9331   {
9332   cd->nltype = NLTYPE_FIXED;
9333   if (newline > 255)
9334     {
9335     cd->nllen = 2;
9336     cd->nl[0] = (newline >> 8) & 255;
9337     cd->nl[1] = newline & 255;
9338     }
9339   else
9340     {
9341     cd->nllen = 1;
9342     cd->nl[0] = newline;
9343     }
9344   }
9345 
9346 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9347 references to help in deciding whether (.*) can be treated as anchored or not.
9348 */
9349 
9350 cd->top_backref = 0;
9351 cd->backref_map = 0;
9352 
9353 /* Reflect pattern for debugging output */
9354 
9355 DPRINTF(("------------------------------------------------------------------\n"));
9356 #ifdef PCRE_DEBUG
9357 print_puchar(stdout, (PCRE_PUCHAR)pattern);
9358 #endif
9359 DPRINTF(("\n"));
9360 
9361 /* Pretend to compile the pattern while actually just accumulating the length
9362 of memory required. This behaviour is triggered by passing a non-NULL final
9363 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9364 to compile parts of the pattern into; the compiled code is discarded when it is
9365 no longer needed, so hopefully this workspace will never overflow, though there
9366 is a test for its doing so. */
9367 
9368 cd->bracount = cd->final_bracount = 0;
9369 cd->names_found = 0;
9370 cd->name_entry_size = 0;
9371 cd->name_table = NULL;
9372 cd->dupnames = FALSE;
9373 cd->dupgroups = FALSE;
9374 cd->namedrefcount = 0;
9375 cd->start_code = cworkspace;
9376 cd->hwm = cworkspace;
9377 cd->iscondassert = FALSE;
9378 cd->start_workspace = cworkspace;
9379 cd->workspace_size = COMPILE_WORK_SIZE;
9380 cd->named_groups = named_groups;
9381 cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9382 cd->start_pattern = (const pcre_uchar *)pattern;
9383 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9384 cd->req_varyopt = 0;
9385 cd->parens_depth = 0;
9386 cd->assert_depth = 0;
9387 cd->max_lookbehind = 0;
9388 cd->external_options = options;
9389 cd->open_caps = NULL;
9390 
9391 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9392 don't need to look at the result of the function here. The initial options have
9393 been put into the cd block so that they can be changed if an option setting is
9394 found within the regex right at the beginning. Bringing initial option settings
9395 outside can help speed up starting point checks. */
9396 
9397 ptr += skipatstart;
9398 code = cworkspace;
9399 *code = OP_BRA;
9400 
9401 (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9402   FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9403   cd, &length);
9404 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9405 
9406 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9407   (int)(cd->hwm - cworkspace)));
9408 
9409 if (length > MAX_PATTERN_SIZE)
9410   {
9411   errorcode = ERR20;
9412   goto PCRE_EARLY_ERROR_RETURN;
9413   }
9414 
9415 /* Compute the size of the data block for storing the compiled pattern. Integer
9416 overflow should no longer be possible because nowadays we limit the maximum
9417 value of cd->names_found and cd->name_entry_size. */
9418 
9419 size = sizeof(REAL_PCRE) +
9420   (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9421 
9422 /* Get the memory. */
9423 
9424 re = (REAL_PCRE *)(PUBL(malloc))(size);
9425 if (re == NULL)
9426   {
9427   errorcode = ERR21;
9428   goto PCRE_EARLY_ERROR_RETURN;
9429   }
9430 
9431 /* Put in the magic number, and save the sizes, initial options, internal
9432 flags, and character table pointer. NULL is used for the default character
9433 tables. The nullpad field is at the end; it's there to help in the case when a
9434 regex compiled on a system with 4-byte pointers is run on another with 8-byte
9435 pointers. */
9436 
9437 re->magic_number = MAGIC_NUMBER;
9438 re->size = (int)size;
9439 re->options = cd->external_options;
9440 re->flags = cd->external_flags;
9441 re->limit_match = limit_match;
9442 re->limit_recursion = limit_recursion;
9443 re->first_char = 0;
9444 re->req_char = 0;
9445 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9446 re->name_entry_size = cd->name_entry_size;
9447 re->name_count = cd->names_found;
9448 re->ref_count = 0;
9449 re->tables = (tables == PRIV(default_tables))? NULL : tables;
9450 re->nullpad = NULL;
9451 #ifdef COMPILE_PCRE32
9452 re->dummy = 0;
9453 #else
9454 re->dummy1 = re->dummy2 = re->dummy3 = 0;
9455 #endif
9456 
9457 /* The starting points of the name/number translation table and of the code are
9458 passed around in the compile data block. The start/end pattern and initial
9459 options are already set from the pre-compile phase, as is the name_entry_size
9460 field. Reset the bracket count and the names_found field. Also reset the hwm
9461 field; this time it's used for remembering forward references to subpatterns.
9462 */
9463 
9464 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9465 cd->parens_depth = 0;
9466 cd->assert_depth = 0;
9467 cd->bracount = 0;
9468 cd->max_lookbehind = 0;
9469 cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9470 codestart = cd->name_table + re->name_entry_size * re->name_count;
9471 cd->start_code = codestart;
9472 cd->hwm = (pcre_uchar *)(cd->start_workspace);
9473 cd->iscondassert = FALSE;
9474 cd->req_varyopt = 0;
9475 cd->had_accept = FALSE;
9476 cd->had_pruneorskip = FALSE;
9477 cd->check_lookbehind = FALSE;
9478 cd->open_caps = NULL;
9479 
9480 /* If any named groups were found, create the name/number table from the list
9481 created in the first pass. */
9482 
9483 if (cd->names_found > 0)
9484   {
9485   int i = cd->names_found;
9486   named_group *ng = cd->named_groups;
9487   cd->names_found = 0;
9488   for (; i > 0; i--, ng++)
9489     add_name(cd, ng->name, ng->length, ng->number);
9490   if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9491     (PUBL(free))((void *)cd->named_groups);
9492   }
9493 
9494 /* Set up a starting, non-extracting bracket, then compile the expression. On
9495 error, errorcode will be set non-zero, so we don't need to look at the result
9496 of the function here. */
9497 
9498 ptr = (const pcre_uchar *)pattern + skipatstart;
9499 code = (pcre_uchar *)codestart;
9500 *code = OP_BRA;
9501 (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9502   &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9503 re->top_bracket = cd->bracount;
9504 re->top_backref = cd->top_backref;
9505 re->max_lookbehind = cd->max_lookbehind;
9506 re->flags = cd->external_flags | PCRE_MODE;
9507 
9508 if (cd->had_accept)
9509   {
9510   reqchar = 0;              /* Must disable after (*ACCEPT) */
9511   reqcharflags = REQ_NONE;
9512   }
9513 
9514 /* If not reached end of pattern on success, there's an excess bracket. */
9515 
9516 if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
9517 
9518 /* Fill in the terminating state and check for disastrous overflow, but
9519 if debugging, leave the test till after things are printed out. */
9520 
9521 *code++ = OP_END;
9522 
9523 #ifndef PCRE_DEBUG
9524 if (code - codestart > length) errorcode = ERR23;
9525 #endif
9526 
9527 #ifdef SUPPORT_VALGRIND
9528 /* If the estimated length exceeds the really used length, mark the extra
9529 allocated memory as unaddressable, so that any out-of-bound reads can be
9530 detected. */
9531 VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9532 #endif
9533 
9534 /* Fill in any forward references that are required. There may be repeated
9535 references; optimize for them, as searching a large regex takes time. */
9536 
9537 if (cd->hwm > cd->start_workspace)
9538   {
9539   int prev_recno = -1;
9540   const pcre_uchar *groupptr = NULL;
9541   while (errorcode == 0 && cd->hwm > cd->start_workspace)
9542     {
9543     int offset, recno;
9544     cd->hwm -= LINK_SIZE;
9545     offset = GET(cd->hwm, 0);
9546 
9547     /* Check that the hwm handling hasn't gone wrong. This whole area is
9548     rewritten in PCRE2 because there are some obscure cases. */
9549 
9550     if (offset == 0 || codestart[offset-1] != OP_RECURSE)
9551       {
9552       errorcode = ERR10;
9553       break;
9554       }
9555 
9556     recno = GET(codestart, offset);
9557     if (recno != prev_recno)
9558       {
9559       groupptr = PRIV(find_bracket)(codestart, utf, recno);
9560       prev_recno = recno;
9561       }
9562     if (groupptr == NULL) errorcode = ERR53;
9563       else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9564     }
9565   }
9566 
9567 /* If the workspace had to be expanded, free the new memory. Set the pointer to
9568 NULL to indicate that forward references have been filled in. */
9569 
9570 if (cd->workspace_size > COMPILE_WORK_SIZE)
9571   (PUBL(free))((void *)cd->start_workspace);
9572 cd->start_workspace = NULL;
9573 
9574 /* Give an error if there's a back reference to a non-existent capturing
9575 subpattern. */
9576 
9577 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9578 
9579 /* Unless disabled, check whether any single character iterators can be
9580 auto-possessified. The function overwrites the appropriate opcode values, so
9581 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9582 used in this code because at least one compiler gives a warning about loss of
9583 "const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9584 function call. */
9585 
9586 if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
9587   {
9588   pcre_uchar *temp = (pcre_uchar *)codestart;
9589   auto_possessify(temp, utf, cd);
9590   }
9591 
9592 /* If there were any lookbehind assertions that contained OP_RECURSE
9593 (recursions or subroutine calls), a flag is set for them to be checked here,
9594 because they may contain forward references. Actual recursions cannot be fixed
9595 length, but subroutine calls can. It is done like this so that those without
9596 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
9597 exceptional ones forgo this. We scan the pattern to check that they are fixed
9598 length, and set their lengths. */
9599 
9600 if (errorcode == 0 && cd->check_lookbehind)
9601   {
9602   pcre_uchar *cc = (pcre_uchar *)codestart;
9603 
9604   /* Loop, searching for OP_REVERSE items, and process those that do not have
9605   their length set. (Actually, it will also re-process any that have a length
9606   of zero, but that is a pathological case, and it does no harm.) When we find
9607   one, we temporarily terminate the branch it is in while we scan it. */
9608 
9609   for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9610        cc != NULL;
9611        cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9612     {
9613     if (GET(cc, 1) == 0)
9614       {
9615       int fixed_length;
9616       pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9617       int end_op = *be;
9618       *be = OP_END;
9619       fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9620         cd, NULL);
9621       *be = end_op;
9622       DPRINTF(("fixed length = %d\n", fixed_length));
9623       if (fixed_length < 0)
9624         {
9625         errorcode = (fixed_length == -2)? ERR36 :
9626                     (fixed_length == -4)? ERR70 : ERR25;
9627         break;
9628         }
9629       if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9630       PUT(cc, 1, fixed_length);
9631       }
9632     cc += 1 + LINK_SIZE;
9633     }
9634   }
9635 
9636 /* Failed to compile, or error while post-processing */
9637 
9638 if (errorcode != 0)
9639   {
9640   (PUBL(free))(re);
9641   PCRE_EARLY_ERROR_RETURN:
9642   *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9643   PCRE_EARLY_ERROR_RETURN2:
9644   *errorptr = find_error_text(errorcode);
9645   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9646   return NULL;
9647   }
9648 
9649 /* If the anchored option was not passed, set the flag if we can determine that
9650 the pattern is anchored by virtue of ^ characters or \A or anything else, such
9651 as starting with non-atomic .* when DOTALL is set and there are no occurrences
9652 of *PRUNE or *SKIP.
9653 
9654 Otherwise, if we know what the first byte has to be, save it, because that
9655 speeds up unanchored matches no end. If not, see if we can set the
9656 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9657 start with ^, and also when all branches start with non-atomic .* for
9658 non-DOTALL matches when *PRUNE and *SKIP are not present. */
9659 
9660 if ((re->options & PCRE_ANCHORED) == 0)
9661   {
9662   if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9663   else
9664     {
9665     if (firstcharflags < 0)
9666       firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9667     if (firstcharflags >= 0)   /* Remove caseless flag for non-caseable chars */
9668       {
9669 #if defined COMPILE_PCRE8
9670       re->first_char = firstchar & 0xff;
9671 #elif defined COMPILE_PCRE16
9672       re->first_char = firstchar & 0xffff;
9673 #elif defined COMPILE_PCRE32
9674       re->first_char = firstchar;
9675 #endif
9676       if ((firstcharflags & REQ_CASELESS) != 0)
9677         {
9678 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9679         /* We ignore non-ASCII first chars in 8 bit mode. */
9680         if (utf)
9681           {
9682           if (re->first_char < 128)
9683             {
9684             if (cd->fcc[re->first_char] != re->first_char)
9685               re->flags |= PCRE_FCH_CASELESS;
9686             }
9687           else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9688             re->flags |= PCRE_FCH_CASELESS;
9689           }
9690         else
9691 #endif
9692         if (MAX_255(re->first_char)
9693             && cd->fcc[re->first_char] != re->first_char)
9694           re->flags |= PCRE_FCH_CASELESS;
9695         }
9696 
9697       re->flags |= PCRE_FIRSTSET;
9698       }
9699 
9700     else if (is_startline(codestart, 0, cd, 0, FALSE)) re->flags |= PCRE_STARTLINE;
9701     }
9702   }
9703 
9704 /* For an anchored pattern, we use the "required byte" only if it follows a
9705 variable length item in the regex. Remove the caseless flag for non-caseable
9706 bytes. */
9707 
9708 if (reqcharflags >= 0 &&
9709      ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9710   {
9711 #if defined COMPILE_PCRE8
9712   re->req_char = reqchar & 0xff;
9713 #elif defined COMPILE_PCRE16
9714   re->req_char = reqchar & 0xffff;
9715 #elif defined COMPILE_PCRE32
9716   re->req_char = reqchar;
9717 #endif
9718   if ((reqcharflags & REQ_CASELESS) != 0)
9719     {
9720 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9721     /* We ignore non-ASCII required chars in 8 bit mode. */
9722     if (utf)
9723       {
9724       if (re->req_char < 128)
9725         {
9726         if (cd->fcc[re->req_char] != re->req_char)
9727           re->flags |= PCRE_RCH_CASELESS;
9728         }
9729       else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9730         re->flags |= PCRE_RCH_CASELESS;
9731       }
9732     else
9733 #endif
9734     if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9735       re->flags |= PCRE_RCH_CASELESS;
9736     }
9737 
9738   re->flags |= PCRE_REQCHSET;
9739   }
9740 
9741 /* Print out the compiled data if debugging is enabled. This is never the
9742 case when building a production library. */
9743 
9744 #ifdef PCRE_DEBUG
9745 printf("Length = %d top_bracket = %d top_backref = %d\n",
9746   length, re->top_bracket, re->top_backref);
9747 
9748 printf("Options=%08x\n", re->options);
9749 
9750 if ((re->flags & PCRE_FIRSTSET) != 0)
9751   {
9752   pcre_uchar ch = re->first_char;
9753   const char *caseless =
9754     ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9755   if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9756     else printf("First char = \\x%02x%s\n", ch, caseless);
9757   }
9758 
9759 if ((re->flags & PCRE_REQCHSET) != 0)
9760   {
9761   pcre_uchar ch = re->req_char;
9762   const char *caseless =
9763     ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9764   if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9765     else printf("Req char = \\x%02x%s\n", ch, caseless);
9766   }
9767 
9768 #if defined COMPILE_PCRE8
9769 pcre_printint((pcre *)re, stdout, TRUE);
9770 #elif defined COMPILE_PCRE16
9771 pcre16_printint((pcre *)re, stdout, TRUE);
9772 #elif defined COMPILE_PCRE32
9773 pcre32_printint((pcre *)re, stdout, TRUE);
9774 #endif
9775 
9776 /* This check is done here in the debugging case so that the code that
9777 was compiled can be seen. */
9778 
9779 if (code - codestart > length)
9780   {
9781   (PUBL(free))(re);
9782   *errorptr = find_error_text(ERR23);
9783   *erroroffset = ptr - (pcre_uchar *)pattern;
9784   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9785   return NULL;
9786   }
9787 #endif   /* PCRE_DEBUG */
9788 
9789 /* Check for a pattern that can match an empty string, so that this information
9790 can be provided to applications. */
9791 
9792 do
9793   {
9794   if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9795     {
9796     re->flags |= PCRE_MATCH_EMPTY;
9797     break;
9798     }
9799   codestart += GET(codestart, 1);
9800   }
9801 while (*codestart == OP_ALT);
9802 
9803 #if defined COMPILE_PCRE8
9804 return (pcre *)re;
9805 #elif defined COMPILE_PCRE16
9806 return (pcre16 *)re;
9807 #elif defined COMPILE_PCRE32
9808 return (pcre32 *)re;
9809 #endif
9810 }
9811 
9812 /* End of pcre_compile.c */
9813