1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9            Copyright (c) 1997-2018 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17 
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21 
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43 
44 /* %ExternalCopyright% */
45 
46 #ifdef HAVE_CONFIG_H
47 #include "config.h"
48 #endif
49 
/* These three names are defined before pcre_internal.h is included below.
NOTE(review): presumably macros in that header expand NLBLOCK, PSSTART and
PSEND to reach the compile-time data block and its pattern start/end fields -
confirm against pcre_internal.h. */
50 #define NLBLOCK cd             /* Block containing newline information */
51 #define PSSTART start_pattern  /* Field containing pattern start */
52 #define PSEND   end_pattern    /* Field containing pattern end */
53 
54 #include "pcre_internal.h"
55 
56 
57 /* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
58 is also used by pcretest. PCRE_DEBUG is not defined when building a production
59 library. We do not need to select pcre16_printint.c specially, because the
60 COMPILE_PCREx macro will already be appropriately set. */
61 
62 #ifdef PCRE_DEBUG
63 /* pcre_printint.c should not include any headers */
64 #define PCRE_INCLUDED
65 #include "pcre_printint.c"
66 #undef PCRE_INCLUDED
67 #endif
68 
69 
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
bitmap a, i.e. bit (b & 7) of element b/8. Both arguments and the complete
expansion are parenthesized, so the macro can be given expression arguments
(for example SETBIT(map + 32, c)) and can be used inside a larger expression.
The mask is built from an unsigned 1 so that no signed shift is involved. Note
that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1U << ((b)&7)))
73 
74 /* Maximum length value to check against when making sure that the integer that
75 holds the compiled pattern length does not overflow. We make it a bit less than
76 INT_MAX to allow for adding in group terminating bytes, so that we don't have
77 to check them every time. The headroom left below INT_MAX is 20. */
78 
79 #define OFLOW_MAX (INT_MAX - 20)
80 
81 /* Definitions to allow mutual recursion */
82 
/* Forward declaration only (file-local, returns int). The parameters are not
named here; see the function's definition for their meaning. */
83 static int
84   add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
85     const pcre_uint32 *, unsigned int);
86 
/* Forward declaration only (file-local, returns BOOL). The parameters are not
named here; see the function's definition for their meaning. */
87 static BOOL
88   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
89     pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
90     compile_data *, int *);
91 
92 
93 
94 /*************************************************
95 *      Code parameters and static tables         *
96 *************************************************/
97 
98 /* This value specifies the size of stack workspace that is used during the
99 first pre-compile phase that determines how much memory is required. The regex
100 is partly compiled into this space, but the compiled parts are discarded as
101 soon as they can be, so that hopefully there will never be an overrun. The code
102 does, however, check for an overrun. The largest amount I've seen used is 218,
103 so this number is very generous.
104 
105 The same workspace is used during the second, actual compile phase for
106 remembering forward references to groups so that they can be filled in at the
107 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
108 is 4 there is plenty of room for most patterns. However, the memory can get
109 filled up by repetitions of forward references, for example patterns like
110 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
111 that the workspace is expanded using malloc() in this situation. The value
112 below is therefore a minimum, and we put a maximum on it for safety. The
113 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
114 kicks in at the same number of forward references in all cases. */
115 
116 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)             /* Minimum (initial) workspace size */
117 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)  /* Safety limit on the expanded workspace */
118 
119 /* This value determines the size of the initial vector that is used for
120 remembering named groups during the pre-compile. It is allocated on the stack,
121 but if it is too small, it is expanded using malloc(), in a similar way to the
122 workspace. The value is the number of slots in the list. */
123 
124 #define NAMED_GROUP_LIST_SIZE  20   /* Slots, not bytes */
125 
126 /* The overrun tests check for a slightly smaller size so that they detect the
127 overrun before it actually does run off the end of the data block. */
128 
129 #define WORK_SIZE_SAFETY_MARGIN (100)
130 
/* Private flag bits that accompany the firstchar and reqchar values. */

#define REQ_CASELESS    0x01            /* Indicates caselessness */
#define REQ_VARY        0x02            /* Reqchar followed non-literal item */

/* Negative values for the firstchar and reqchar flags */

#define REQ_NONE        (-1)
#define REQ_UNSET       (-2)
138 
/* Repeated character flags. The constant is a long; the suffix is written as
an upper-case L because a lower-case l is easily misread as the digit 1. */

#define UTF_LENGTH     0x10000000L      /* The char contains its length. */
142 
143 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
144 are simple data values; negative values are for special things like \d and so
145 on. Zero means further processing is needed (for things like \x), or the escape
146 is invalid. */
147 
148 #ifndef EBCDIC
149 
150 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
151 in UTF-8 mode. */
152 
/* The first entry corresponds to '0' and the last to 'z'; the comment at the
end of each row names the characters that the row's entries stand for. */
153 static const short int escapes[] = {
154      0,                       0,                          /* 0 1 */
155      0,                       0,                          /* 2 3 */
156      0,                       0,                          /* 4 5 */
157      0,                       0,                          /* 6 7 */
158      0,                       0,                          /* 8 9 */
159      CHAR_COLON,              CHAR_SEMICOLON,             /* : ; */
160      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* < = */
161      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* > ? */
162      CHAR_COMMERCIAL_AT,      -ESC_A,                     /* @ A */
163      -ESC_B,                  -ESC_C,                     /* B C */
164      -ESC_D,                  -ESC_E,                     /* D E */
165      0,                       -ESC_G,                     /* F G */
166      -ESC_H,                  0,                          /* H I */
167      0,                       -ESC_K,                     /* J K */
168      0,                       0,                          /* L M */
169      -ESC_N,                  0,                          /* N O */
170      -ESC_P,                  -ESC_Q,                     /* P Q */
171      -ESC_R,                  -ESC_S,                     /* R S */
172      0,                       0,                          /* T U */
173      -ESC_V,                  -ESC_W,                     /* V W */
174      -ESC_X,                  0,                          /* X Y */
175      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* Z [ */
176      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* \ ] */
177      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* ^ _ */
178      CHAR_GRAVE_ACCENT,       ESC_a,                      /* ` a */
179      -ESC_b,                  0,                          /* b c */
180      -ESC_d,                  ESC_e,                      /* d e */
181      ESC_f,                   0,                          /* f g */
182      -ESC_h,                  0,                          /* h i */
183      0,                       -ESC_k,                     /* j k */
184      0,                       0,                          /* l m */
185      ESC_n,                   0,                          /* n o */
186      -ESC_p,                  0,                          /* p q */
187      ESC_r,                   -ESC_s,                     /* r s */
188      ESC_tee,                 0,                          /* t u */
189      -ESC_v,                  -ESC_w,                     /* v w */
190      0,                       0,                          /* x y */
191      -ESC_z                                               /* z */
192 };
193 
194 #else
195 
196 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
197 
/* Eight entries per row. The label at the start of each row is the hexadecimal
code point of the row's first entry, so the table starts at code point 0x48 and
ends at 0xFF. */
198 static const short int escapes[] = {
199 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
200 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
201 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
202 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
203 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
204 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
205 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
206 /*  80 */     0, ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
207 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
208 /*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
209 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
210 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
211 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
212 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
213 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
214 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
215 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
216 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
217 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
218 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
219 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
220 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
221 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
222 };
223 
/* We also need a table of characters that may follow \c in an EBCDIC
environment for characters 0-31. The string holds 32 characters ('@', 'A'-'Z',
'[', '\', ']', '^', '_') plus the terminating NUL. It is a fixed lookup table,
so it is declared const to keep it out of writable data and to make accidental
modification a compile-time error. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
228 
229 #endif
230 
231 
232 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
233 searched linearly. Put all the names into a single string, in order to reduce
234 the number of relocations when a shared library is dynamically linked. The
235 string is built from string macros so that it works in UTF-8 mode on EBCDIC
236 platforms. */
237 
/* Describes one entry of the verbs[] table: the length of the verb's name and
the opcodes to use when the verb appears without and with an argument. */
238 typedef struct verbitem {
239   int   len;                 /* Length of verb name */
240   int   op;                  /* Op when no arg, or -1 if arg mandatory */
241   int   op_arg;              /* Op when arg present, or -1 if not allowed */
242 } verbitem;
243 
/* All the verb names in one string; the verbs[] table gives the length of each
name in the same order. NOTE(review): each STRING_xxx0 macro presumably expands
to the name followed by a NUL (they are defined outside this file) - confirm.
The last name needs no such suffix because the literal supplies its own
terminator. */
244 static const char verbnames[] =
245   "\0"                       /* Empty name is a shorthand for MARK */
246   STRING_MARK0
247   STRING_ACCEPT0
248   STRING_COMMIT0
249   STRING_F0
250   STRING_FAIL0
251   STRING_PRUNE0
252   STRING_SKIP0
253   STRING_THEN;
254 
/* One entry per name in verbnames, in the same order: { name length, opcode
without an argument, opcode with an argument }, where -1 means "not available
in this form" as described for verbitem. */
255 static const verbitem verbs[] = {
256   { 0, -1,        OP_MARK },            /* (empty name) */
257   { 4, -1,        OP_MARK },            /* MARK */
258   { 6, OP_ACCEPT, -1 },                 /* ACCEPT */
259   { 6, OP_COMMIT, -1 },                 /* COMMIT */
260   { 1, OP_FAIL,   -1 },                 /* F */
261   { 4, OP_FAIL,   -1 },                 /* FAIL */
262   { 5, OP_PRUNE,  OP_PRUNE_ARG },       /* PRUNE */
263   { 4, OP_SKIP,   OP_SKIP_ARG  },       /* SKIP */
264   { 4, OP_THEN,   OP_THEN_ARG  }        /* THEN */
265 };
266 
267 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
268 
269 
270 /* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
271 another regex library. */
272 
/* NUL-terminated pattern text spelling  \b(?=\w)  */
273 static const pcre_uchar sub_start_of_word[] = {
274   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
275   CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
276 
/* NUL-terminated pattern text spelling  \b(?<=\w)  */
277 static const pcre_uchar sub_end_of_word[] = {
278   CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
279   CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
280   CHAR_RIGHT_PARENTHESIS, '\0' };
281 
282 
283 /* Tables of names of POSIX character classes and their lengths. The names are
284 now all in a single string, to reduce the number of relocations when a shared
285 library is dynamically loaded. The list of lengths is terminated by a zero
286 length entry. The first three must be alpha, lower, upper, as this is assumed
287 for handling case independence. The indices for graph, print, and punct are
288 needed, so identify them. */
289 
/* The class names in one string, in the order alpha, lower, upper, alnum,
ascii, blank, cntrl, digit, graph, print, punct, space, word, xdigit. */
290 static const char posix_names[] =
291   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
292   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
293   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
294   STRING_word0  STRING_xdigit;
295 
/* Length of each name in posix_names, in the same order (twelve names of 5
characters, then "word" and "xdigit"), followed by the terminating 0. */
296 static const pcre_uint8 posix_name_lengths[] = {
297   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
298 
/* Positions of graph, print and punct in the lists above, counting from 0. */
299 #define PC_GRAPH  8
300 #define PC_PRINT  9
301 #define PC_PUNCT 10
302 
303 
304 /* Table of class bit maps for each POSIX class. Each class is formed from a
305 base map, with an optional addition or removal of another map. Then, for some
306 classes, there is some additional tweaking: for [:blank:] the vertical space
307 characters are removed, and for [:alpha:] and [:alnum:] the underscore
308 character is removed. The triples in the table consist of the base map offset,
309 second map offset or -1 if no second map, and a non-negative value for map
310 addition or a negative value for map subtraction (if there are two maps). The
311 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
312 remove vertical space characters, 2 => remove underscore. */
313 
/* Three ints per class, with the classes in the same order as posix_names. */
314 static const int posix_class_maps[] = {
  /* base map, second map (-1 = none), add (>= 0) or subtract (< 0) + tweak */
315   cbit_word,  cbit_digit, -2,             /* alpha */
316   cbit_lower, -1,          0,             /* lower */
317   cbit_upper, -1,          0,             /* upper */
318   cbit_word,  -1,          2,             /* alnum - word without underscore */
319   cbit_print, cbit_cntrl,  0,             /* ascii */
320   cbit_space, -1,          1,             /* blank - a GNU extension */
321   cbit_cntrl, -1,          0,             /* cntrl */
322   cbit_digit, -1,          0,             /* digit */
323   cbit_graph, -1,          0,             /* graph */
324   cbit_print, -1,          0,             /* print */
325   cbit_punct, -1,          0,             /* punct */
326   cbit_space, -1,          0,             /* space */
327   cbit_word,  -1,          0,             /* word - a Perl extension */
328   cbit_xdigit,-1,          0              /* xdigit */
329 };
330 
331 /* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
332 Unicode property escapes. */
333 
334 #ifdef SUPPORT_UCP
/* Each array below is NUL-terminated pattern text; the comment in front of it
shows the text it spells. */

/* \P{Nd} */
335 static const pcre_uchar string_PNd[]  = {
336   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
337   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Nd} */
338 static const pcre_uchar string_pNd[]  = {
339   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
340   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Xsp} */
341 static const pcre_uchar string_PXsp[] = {
342   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
343   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Xsp} */
344 static const pcre_uchar string_pXsp[] = {
345   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
346   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Xwd} */
347 static const pcre_uchar string_PXwd[] = {
348   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
349   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Xwd} */
350 static const pcre_uchar string_pXwd[] = {
351   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
352   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
353 
354 static const pcre_uchar *substitutes[] = {
355   string_PNd,           /* \D */
356   string_pNd,           /* \d */
357   string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
358   string_pXsp,          /* \s */   /* space and POSIX space are the same. */
359   string_PXwd,          /* \W */
360   string_pXwd           /* \w */
361 };
362 
363 /* The POSIX class substitutes must be in the order of the POSIX class names,
364 defined above, and there are both positive and negative cases. NULL means no
365 general substitute of a Unicode property escape (\p or \P). However, for some
366 POSIX classes (e.g. graph, print, punct) a special property code is compiled
367 directly. */
368 
/* Each array below is NUL-terminated pattern text; the comment in front of it
shows the text it spells. */

/* \p{L} */
369 static const pcre_uchar string_pL[] =   {
370   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
371   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Ll} */
372 static const pcre_uchar string_pLl[] =  {
373   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
374   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Lu} */
375 static const pcre_uchar string_pLu[] =  {
376   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
377   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Xan} */
378 static const pcre_uchar string_pXan[] = {
379   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
380   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \h */
381 static const pcre_uchar string_h[] =    {
382   CHAR_BACKSLASH, CHAR_h, '\0' };
/* \p{Xps} */
383 static const pcre_uchar string_pXps[] = {
384   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
385   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{L} */
386 static const pcre_uchar string_PL[] =   {
387   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
388   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Ll} */
389 static const pcre_uchar string_PLl[] =  {
390   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
391   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Lu} */
392 static const pcre_uchar string_PLu[] =  {
393   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
394   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Xan} */
395 static const pcre_uchar string_PXan[] = {
396   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
397   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \H */
398 static const pcre_uchar string_H[] =    {
399   CHAR_BACKSLASH, CHAR_H, '\0' };
/* \P{Xps} */
400 static const pcre_uchar string_PXps[] = {
401   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
402   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
403 
404 static const pcre_uchar *posix_substitutes[] = {
405   string_pL,            /* alpha */
406   string_pLl,           /* lower */
407   string_pLu,           /* upper */
408   string_pXan,          /* alnum */
409   NULL,                 /* ascii */
410   string_h,             /* blank */
411   NULL,                 /* cntrl */
412   string_pNd,           /* digit */
413   NULL,                 /* graph */
414   NULL,                 /* print */
415   NULL,                 /* punct */
416   string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
417   string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
418   NULL,                 /* xdigit */
419   /* Negated cases */
420   string_PL,            /* ^alpha */
421   string_PLl,           /* ^lower */
422   string_PLu,           /* ^upper */
423   string_PXan,          /* ^alnum */
424   NULL,                 /* ^ascii */
425   string_H,             /* ^blank */
426   NULL,                 /* ^cntrl */
427   string_PNd,           /* ^digit */
428   NULL,                 /* ^graph */
429   NULL,                 /* ^print */
430   NULL,                 /* ^punct */
431   string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
432   string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
433   NULL                  /* ^xdigit */
434 };
/* Number of entries in posix_substitutes (positive plus negated cases). */
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(posix_substitutes[0]))
436 #endif
437 
/* Two-level stringification. STRING() turns the spelling of its argument into
a string literal; XSTRING() lets its argument be macro-expanded first, so that
XSTRING(SOME_MACRO) gives the text of the macro's value, not its name. */

#define STRING(text)    #text
#define XSTRING(macro)  STRING(macro)
440 
441 /* The texts of compile-time error messages. These are "char *" because they
442 are passed to the outside world. Do not ever re-use any error number, because
443 they are documented. Always add a new error instead. Messages marked DEAD below
444 are no longer used. This used to be a table of strings, but in order to reduce
445 the number of relocations needed when a shared library is loaded dynamically,
446 it is now one long string. We cannot use a table of offsets, because the
447 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
448 simply count through to the one we want - this isn't a performance issue
449 because these strings are used only when there is a compilation error.
450 
451 Each substring ends with \0 to insert a null character. This includes the final
452 substring, so that the whole string ends with \0\0, which can be detected when
453 counting through. */
454 
/* All the messages in one string, each ending with an explicit \0. Message n
is reached by counting through n substrings from the start; the numeric marker
comments that appear every five entries give the number of the message that
follows them. */
455 static const char error_texts[] =
456   "no error\0"
457   "\\ at end of pattern\0"
458   "\\c at end of pattern\0"
459   "unrecognized character follows \\\0"
460   "numbers out of order in {} quantifier\0"
461   /* 5 */
462   "number too big in {} quantifier\0"
463   "missing terminating ] for character class\0"
464   "invalid escape sequence in character class\0"
465   "range out of order in character class\0"
466   "nothing to repeat\0"
467   /* 10 */
468   "internal error: invalid forward reference offset\0"
469   "internal error: unexpected repeat\0"
470   "unrecognized character after (? or (?-\0"
471   "POSIX named classes are supported only within a class\0"
472   "missing )\0"
473   /* 15 */
474   "reference to non-existent subpattern\0"
475   "erroffset passed as NULL\0"
476   "unknown option bit(s) set\0"
477   "missing ) after comment\0"
478   "parentheses nested too deeply\0"  /** DEAD **/
479   /* 20 */
480   "regular expression is too large\0"
481   "failed to get memory\0"
482   "unmatched parentheses\0"
483   "internal error: code overflow\0"
484   "unrecognized character after (?<\0"
485   /* 25 */
486   "lookbehind assertion is not fixed length\0"
487   "malformed number or name after (?(\0"
488   "conditional group contains more than two branches\0"
489   "assertion expected after (?( or (?(?C)\0"
490   "(?R or (?[+-]digits must be followed by )\0"
491   /* 30 */
492   "unknown POSIX class name\0"
493   "POSIX collating elements are not supported\0"
494   "this version of PCRE is compiled without UTF support\0"
495   "spare error\0"  /** DEAD **/
496   "character value in \\x{} or \\o{} is too large\0"
497   /* 35 */
498   "invalid condition (?(0)\0"
499   "\\C not allowed in lookbehind assertion\0"
500   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
501   "number after (?C is > 255\0"
502   "closing ) for (?C expected\0"
503   /* 40 */
504   "recursive call could loop indefinitely\0"
505   "unrecognized character after (?P\0"
506   "syntax error in subpattern name (missing terminator)\0"
507   "two named subpatterns have the same name\0"
508   "invalid UTF-8 string\0"
509   /* 45 */
510   "support for \\P, \\p, and \\X has not been compiled\0"
511   "malformed \\P or \\p sequence\0"
512   "unknown property name after \\P or \\p\0"
513   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
514   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
515   /* 50 */
516   "repeated subpattern is too long\0"    /** DEAD **/
517   "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
518   "internal error: overran compiling workspace\0"
519   "internal error: previously-checked referenced subpattern not found\0"
520   "DEFINE group contains more than one branch\0"
521   /* 55 */
522   "repeating a DEFINE group is not allowed\0"  /** DEAD **/
523   "inconsistent NEWLINE options\0"
524   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
525   "a numbered reference must not be zero\0"
526   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
527   /* 60 */
528   "(*VERB) not recognized or malformed\0"
529   "number is too big\0"
530   "subpattern name expected\0"
531   "digit expected after (?+\0"
532   "] is an invalid data character in JavaScript compatibility mode\0"
533   /* 65 */
534   "different names for subpatterns of the same number are not allowed\0"
535   "(*MARK) must have an argument\0"
536   "this version of PCRE is not compiled with Unicode property support\0"
537 #ifndef EBCDIC
538   "\\c must be followed by an ASCII character\0"
539 #else
540   "\\c must be followed by a letter or one of [\\]^_?\0"
541 #endif
542   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
543   /* 70 */
544   "internal error: unknown opcode in find_fixedlength()\0"
545   "\\N is not supported in a class\0"
546   "too many forward references\0"
547   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
548   "invalid UTF-16 string\0"
549   /* 75 */
550   "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
551   "character value in \\u.... sequence is too large\0"
552   "invalid UTF-32 string\0"
553   "setting UTF is disabled by the application\0"
554   "non-hex character in \\x{} (closing brace missing?)\0"
555   /* 80 */
556   "non-octal character in \\o{} (closing brace missing?)\0"
557   "missing opening brace after \\o\0"
558   "parentheses are too deeply nested\0"
559   "invalid range in character class\0"
560   "group name must start with a non-digit\0"
561   /* 85 */
562   "parentheses are too deeply nested (stack check)\0"
563   "digits missing in \\x{} or \\o{}\0"
564   "regular expression is too complicated\0"
  /* The string literal's own terminating NUL follows the final explicit \0,
  which gives the \0\0 end marker. */
565   ;
566 
567 /* Table to identify digits and hex digits. This is used when compiling
568 patterns. Note that the tables in chartables are dependent on the locale, and
569 may mark arbitrary characters as digits - but the PCRE compiling code expects
570 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
571 a private table here. It costs 256 bytes, but it is a lot faster than doing
572 character value tests (at least in some simple cases I timed), and in some
573 applications one wants PCRE to compile efficiently as well as match
574 efficiently.
575 
576 For convenience, we use the same bit definitions as in chartables:
577 
578   0x04   decimal digit
579   0x08   hexadecimal digit
580 
581 Then we can use ctype_digit and ctype_xdigit in the code. */
582 
/* Decimal digits are recognized with a plain range test instead of a table
lookup: that is much faster than a memory read and gives simpler code (the
compiler turns it into a subtraction and unsigned comparison). The argument is
evaluated twice. */

#define IS_DIGIT(x) (CHAR_0 <= (x) && (x) <= CHAR_9)
588 
589 #ifndef EBCDIC
590 
591 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
592 UTF-8 mode. */
593 
/* 256 entries, one per character value, eight per row. 0x0c (0x04 | 0x08)
marks a decimal digit, which is also a hexadecimal digit; 0x08 marks the
hex-only digits A-F and a-f; every other entry is 0x00. */
594 static const pcre_uint8 digitab[] =
595   {
596   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
597   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
598   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
599   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
600   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
601   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
602   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
603   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
604   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
605   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
606   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
607   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
608   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
609   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
610   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
611   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
612   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
613   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
614   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
615   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
616   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
617   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
618   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
619   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
620   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
621   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
622   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
623   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
624   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
625   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
626   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
627   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
628 
629 #else
630 
631 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
632 
/* 256 entries, one per character value, eight per row, using the same bit
values as the non-EBCDIC table: 0x0c (0x04 | 0x08) for a decimal digit and 0x08
for a hex-only digit. The digits occupy the row labelled F0, and the hex
letters the rows labelled 80 (a-f) and C0 (A-F). The right-hand number in every
second row comment is that row's first code point in hexadecimal. */
633 static const pcre_uint8 digitab[] =
634   {
635   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
636   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
637   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
638   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
639   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
640   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
641   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
642   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
643   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
644   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
645   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
646   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
647   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
648   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
649   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
650   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
651   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
652   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
653   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
654   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
655   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
656   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
657   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
658   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
659   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
660   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
661   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
662   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
663   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
664   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
665   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
666   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
667 
/* 256 entries, one per EBCDIC character value, eight per row. The 0x04 and
0x08 bits agree with digitab (decimal digit and hexadecimal digit).
NOTE(review): the remaining bits (0x01, 0x02, 0x10, 0x80) presumably follow the
ctype_* flag definitions used with the chartables character-type table -
confirm against pcre_internal.h. */
668 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
669   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
670   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
671   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
672   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
673   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
674   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
675   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
676   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
677   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
678   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
679   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
680   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
681   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
682   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
683   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
684   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
685   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
686   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
687   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
688   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
689   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
690   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
691   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
692   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
693   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
694   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
695   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
696   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
697   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
698   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
699   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
700   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
701 #endif
702 
703 
704 /* This table is used to check whether auto-possessification is possible
705 between adjacent character-type opcodes. The left-hand (repeated) opcode is
706 used to select the row, and the right-hand opcode is used to select the column.
707 A value of 1 means that auto-possessification is OK. For example, the second
708 value in the first row means that \D+\d can be turned into \D++\d.
709 
710 The Unicode property types (\P and \p) have to be present to fill out the table
711 because of what their opcode values are, but the table values should always be
712 zero because property types are handled separately in the code. The last four
713 columns apply to items that cannot be repeated, so there is no need to have
714 rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
715 *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
716 
/* Table dimensions. Rows cover the opcodes FIRST_AUTOTAB_OP up to and
including LAST_AUTOTAB_LEFT_OP; columns cover FIRST_AUTOTAB_OP up to and
including LAST_AUTOTAB_RIGHT_OP, so there can be more columns than rows. */

717 #define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
718 #define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
719 
/* As written the initializer supplies 17 rows of 21 values. The comment at the
end of each row names the left-hand (repeated) item; the header line names the
right-hand item for each column.
NOTE(review): presumably both indexes are (opcode - FIRST_AUTOTAB_OP), so the
row and column order must follow the opcode numbering - confirm before
reordering anything. */

720 static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
721 /* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
722   { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
723   { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
724   { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
725   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
726   { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
727   { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
728   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
729   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
730   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
731   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
732   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
733   { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
734   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
735   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
736   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
737   { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
738   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
739 };
740 
741 
742 /* This table is used to check whether auto-possessification is possible
743 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
744 left-hand (repeated) opcode is used to select the row, and the right-hand
745 opcode is used to select the column. The values are as follows:
746 
747   0   Always return FALSE (never auto-possessify)
748   1   Character groups are distinct (possessify if both are OP_PROP)
749   2   Check character categories in the same group (general or particular)
750   3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
751 
752   4   Check left general category vs right particular category
753   5   Check right general category vs left particular category
754 
755   6   Left alphanum vs right general category
756   7   Left space vs right general category
757   8   Left word vs right general category
758 
759   9   Right alphanum vs left general category
760  10   Right space vs left general category
761  11   Right word vs left general category
762 
763  12   Left alphanum vs right particular category
764  13   Left space vs right particular category
765  14   Left word vs right particular category
766 
767  15   Right alphanum vs left particular category
768  16   Right space vs left particular category
769  17   Right word vs left particular category
770 */
771 
/* Square table of action codes in the range 0-17. The initializer supplies 11
rows of 11 values, with rows and columns in the same order (PT_ANY first,
PT_UCNC last): the comment at the end of each row names the left-hand property
type and the header line names the right-hand one. The PT_SPACE and PT_PXSPACE
rows are identical, as are the SPACE and PXSPACE columns. */

772 static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
773 /* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
774   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
775   { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
776   { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
777   { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
778   { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
779   { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
780   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
781   { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
782   { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
783   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
784   { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
785 };
786 
787 /* This table is used to check whether auto-possessification is possible
788 between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
789 specifies a general category and the other specifies a particular category. The
790 row is selected by the general category and the column by the particular
791 category. The value is 1 if the particular category is not part of the general
792 category. */
793 
/* Seven rows, one per general category (C, L, M, N, P, S, Z, as named at the
end of each row), by thirty columns, one per particular category in the order
given on the header line. Each row holds 0 in exactly the columns whose
two-letter name starts with that row's letter, and 1 everywhere else. */

794 static const pcre_uint8 catposstab[7][30] = {
795 /* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
796   { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
797   { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
798   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
799   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
800   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
801   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
802   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
803 };
804 
805 /* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
806 a general or particular category. The properties in each row are those
807 that apply to the character set in question. Duplication means that a little
808 unnecessary work is done when checking, but this keeps things much simpler
809 because they can all use the same code. For more details see the comment where
810 this table is used.
811 
812 Note: SPACE and PXSPACE used to be different because Perl excluded VT from
813 "space", but from Perl 5.18 it's included, so both categories are treated the
814 same here. */
815 
/* Three rows of four ucp_ property codes. Row 0 is for ALNUM, row 1 is shared
by SPACE and PXSPACE, and row 2 is for WORD. In every row the first three
entries are single-letter (general) codes and the fourth is a two-letter
(particular) code; entries marked redundant only pad the row to four values. */

816 static const pcre_uint8 posspropstab[3][4] = {
817   { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
818   { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
819   { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
820 };
821 
822 /* This table is used when converting repeating opcodes into possessified
823 versions as a result of an explicit possessive quantifier such as ++. A zero
824 value means there is no possessified version - in those cases the item in
825 question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
826 because all relevant opcodes are less than that. */
827 
/* One entry per opcode, laid out in opcode order: the first two rows cover
opcodes 0-31 and the remaining entries are labelled with the opcode names they
stand for. In each repeat family only the greedy opcode has a non-zero entry
(its OP_...POS... counterpart); the minimizing, exact and already-possessive
forms are all 0. The position of an entry is what gives it meaning, so entries
must not be added, removed or reordered independently of the opcode list. */

828 static const pcre_uint8 opcode_possessify[] = {
829   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
830   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
831 
832   0,                       /* NOTI */
833   OP_POSSTAR, 0,           /* STAR, MINSTAR */
834   OP_POSPLUS, 0,           /* PLUS, MINPLUS */
835   OP_POSQUERY, 0,          /* QUERY, MINQUERY */
836   OP_POSUPTO, 0,           /* UPTO, MINUPTO */
837   0,                       /* EXACT */
838   0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
839 
840   OP_POSSTARI, 0,          /* STARI, MINSTARI */
841   OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
842   OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
843   OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
844   0,                       /* EXACTI */
845   0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
846 
847   OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
848   OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
849   OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
850   OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
851   0,                       /* NOTEXACT */
852   0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
853 
854   OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
855   OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
856   OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
857   OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
858   0,                       /* NOTEXACTI */
859   0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
860 
861   OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
862   OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
863   OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
864   OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
865   0,                       /* TYPEEXACT */
866   0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
867 
868   OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
869   OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
870   OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
871   OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
872   0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
873 
874   0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
875   0, 0,                    /* REF, REFI */
876   0, 0,                    /* DNREF, DNREFI */
877   0, 0                     /* RECURSE, CALLOUT */
878 };
879 
880 
881 
882 /*************************************************
883 *            Find an error text                  *
884 *************************************************/
885 
886 /* The error texts are now all in one long string, to save on relocations. As
887 some of the text is of unknown length, we can't use a table of offsets.
888 Instead, just count through the strings. This is not a performance issue
889 because it happens only when there has been a compilation error.
890 
891 Argument:   the error number
892 Returns:    pointer to the error string
893 */
894 
895 static const char *
find_error_text(int n)896 find_error_text(int n)
897 {
898 const char *s = error_texts;
899 for (; n > 0; n--)
900   {
901   while (*s++ != CHAR_NULL) {};
902   if (*s == CHAR_NULL) return "Error text not found (please report)";
903   }
904 return s;
905 }
906 
907 
908 
909 /*************************************************
910 *           Expand the workspace                 *
911 *************************************************/
912 
913 /* This function is called during the second compiling phase, if the number of
914 forward references fills the existing workspace, which is originally a block on
915 the stack. A larger block is obtained from malloc() unless the ultimate limit
916 has been reached or the increase will be rather small.
917 
918 Argument: pointer to the compile data block
919 Returns:  0 if all went well, else an error number
920 */
921 
922 static int
expand_workspace(compile_data * cd)923 expand_workspace(compile_data *cd)
924 {
925 pcre_uchar *newspace;
926 int newsize = cd->workspace_size * 2;
927 
928 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
929 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
930     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
931  return ERR72;
932 
933 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
934 if (newspace == NULL) return ERR21;
935 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
936 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
937 if (cd->workspace_size > COMPILE_WORK_SIZE)
938   (PUBL(free))((void *)cd->start_workspace);
939 cd->start_workspace = newspace;
940 cd->workspace_size = newsize;
941 return 0;
942 }
943 
944 
945 
946 /*************************************************
947 *            Check for counted repeat            *
948 *************************************************/
949 
950 /* This function is called when a '{' is encountered in a place where it might
951 start a quantifier. It looks ahead to see if it really is a quantifier or not.
952 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
953 where the ddds are digits.
954 
955 Arguments:
956   p         pointer to the first char after '{'
957 
958 Returns:    TRUE or FALSE
959 */
960 
961 static BOOL
is_counted_repeat(const pcre_uchar * p)962 is_counted_repeat(const pcre_uchar *p)
963 {
964 if (!IS_DIGIT(*p)) return FALSE;
965 p++;
966 while (IS_DIGIT(*p)) p++;
967 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
968 
969 if (*p++ != CHAR_COMMA) return FALSE;
970 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
971 
972 if (!IS_DIGIT(*p)) return FALSE;
973 p++;
974 while (IS_DIGIT(*p)) p++;
975 
976 return (*p == CHAR_RIGHT_CURLY_BRACKET);
977 }
978 
979 
980 
981 /*************************************************
982 *            Handle escapes                      *
983 *************************************************/
984 
985 /* This function is called when a \ has been encountered. It either returns a
986 positive value for a simple escape such as \d, or 0 for a data character which
987 will be placed in chptr. A backreference to group n is returned as negative n.
988 When UTF-8 is enabled, a positive value greater than 255 may be returned in
989 chptr. On entry, ptr is pointing at the \. On exit, it is on the final
990 character of the escape sequence.
991 
992 Arguments:
993   ptrptr         points to the pattern position pointer
994   chptr          points to a returned data character
995   errorcodeptr   points to the errorcode variable
996   bracount       number of previous extracting brackets
997   options        the options bits
998   isclass        TRUE if inside a character class
999 
1000 Returns:         zero => a data character
1001                  positive => a special escape sequence
1002                  negative => a back reference
1003                  on error, errorcodeptr is set
1004 */
1005 
1006 static int
check_escape(const pcre_uchar ** ptrptr,pcre_uint32 * chptr,int * errorcodeptr,int bracount,int options,BOOL isclass)1007 check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
1008   int bracount, int options, BOOL isclass)
1009 {
1010 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
1011 BOOL utf = (options & PCRE_UTF8) != 0;
1012 const pcre_uchar *ptr = *ptrptr + 1;
1013 pcre_uint32 c;
1014 int escape = 0;
1015 int i;
1016 
1017 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
1018 ptr--;                            /* Set pointer back to the last byte */
1019 
1020 /* If backslash is at the end of the pattern, it's an error. */
1021 
1022 if (c == CHAR_NULL) *errorcodeptr = ERR1;
1023 
1024 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
1025 in a table. A non-zero result is something that can be returned immediately.
1026 Otherwise further processing may be required. */
1027 
1028 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1029 /* Not alphanumeric */
1030 else if (c < CHAR_0 || c > CHAR_z) {}
1031 else if ((i = escapes[c - CHAR_0]) != 0)
1032   { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1033 
1034 #else           /* EBCDIC coding */
1035 /* Not alphanumeric */
1036 else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
1037 else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
1038 #endif
1039 
1040 /* Escapes that need further processing, or are illegal. */
1041 
1042 else
1043   {
1044   const pcre_uchar *oldptr;
1045   BOOL braced, negated, overflow;
1046   int s;
1047 
1048   switch (c)
1049     {
1050     /* A number of Perl escapes are not handled by PCRE. We give an explicit
1051     error. */
1052 
1053     case CHAR_l:
1054     case CHAR_L:
1055     *errorcodeptr = ERR37;
1056     break;
1057 
1058     case CHAR_u:
1059     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1060       {
1061       /* In JavaScript, \u must be followed by four hexadecimal numbers.
1062       Otherwise it is a lowercase u letter. */
1063       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1064         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
1065         && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
1066         && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
1067         {
1068         c = 0;
1069         for (i = 0; i < 4; ++i)
1070           {
1071           register pcre_uint32 cc = *(++ptr);
1072 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1073           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1074           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1075 #else           /* EBCDIC coding */
1076           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1077           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1078 #endif
1079           }
1080 
1081 #if defined COMPILE_PCRE8
1082         if (c > (utf ? 0x10ffffU : 0xffU))
1083 #elif defined COMPILE_PCRE16
1084         if (c > (utf ? 0x10ffffU : 0xffffU))
1085 #elif defined COMPILE_PCRE32
1086         if (utf && c > 0x10ffffU)
1087 #endif
1088           {
1089           *errorcodeptr = ERR76;
1090           }
1091         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1092         }
1093       }
1094     else
1095       *errorcodeptr = ERR37;
1096     break;
1097 
1098     case CHAR_U:
1099     /* In JavaScript, \U is an uppercase U letter. */
1100     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
1101     break;
1102 
1103     /* In a character class, \g is just a literal "g". Outside a character
1104     class, \g must be followed by one of a number of specific things:
1105 
1106     (1) A number, either plain or braced. If positive, it is an absolute
1107     backreference. If negative, it is a relative backreference. This is a Perl
1108     5.10 feature.
1109 
1110     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1111     is part of Perl's movement towards a unified syntax for back references. As
1112     this is synonymous with \k{name}, we fudge it up by pretending it really
1113     was \k.
1114 
1115     (3) For Oniguruma compatibility we also support \g followed by a name or a
1116     number either in angle brackets or in single quotes. However, these are
1117     (possibly recursive) subroutine calls, _not_ backreferences. Just return
1118     the ESC_g code (cf \k). */
1119 
1120     case CHAR_g:
1121     if (isclass) break;
1122     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1123       {
1124       escape = ESC_g;
1125       break;
1126       }
1127 
1128     /* Handle the Perl-compatible cases */
1129 
1130     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1131       {
1132       const pcre_uchar *p;
1133       for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1134         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1135       if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1136         {
1137         escape = ESC_k;
1138         break;
1139         }
1140       braced = TRUE;
1141       ptr++;
1142       }
1143     else braced = FALSE;
1144 
1145     if (ptr[1] == CHAR_MINUS)
1146       {
1147       negated = TRUE;
1148       ptr++;
1149       }
1150     else negated = FALSE;
1151 
1152     /* The integer range is limited by the machine's int representation. */
1153     s = 0;
1154     overflow = FALSE;
1155     while (IS_DIGIT(ptr[1]))
1156       {
1157       if (s > INT_MAX / 10 - 1) /* Integer overflow */
1158         {
1159         overflow = TRUE;
1160         break;
1161         }
1162       s = s * 10 + (int)(*(++ptr) - CHAR_0);
1163       }
1164     if (overflow) /* Integer overflow */
1165       {
1166       while (IS_DIGIT(ptr[1]))
1167         ptr++;
1168       *errorcodeptr = ERR61;
1169       break;
1170       }
1171 
1172     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1173       {
1174       *errorcodeptr = ERR57;
1175       break;
1176       }
1177 
1178     if (s == 0)
1179       {
1180       *errorcodeptr = ERR58;
1181       break;
1182       }
1183 
1184     if (negated)
1185       {
1186       if (s > bracount)
1187         {
1188         *errorcodeptr = ERR15;
1189         break;
1190         }
1191       s = bracount - (s - 1);
1192       }
1193 
1194     escape = -s;
1195     break;
1196 
1197     /* The handling of escape sequences consisting of a string of digits
1198     starting with one that is not zero is not straightforward. Perl has changed
1199     over the years. Nowadays \g{} for backreferences and \o{} for octal are
1200     recommended to avoid the ambiguities in the old syntax.
1201 
1202     Outside a character class, the digits are read as a decimal number. If the
1203     number is less than 8 (used to be 10), or if there are that many previous
1204     extracting left brackets, then it is a back reference. Otherwise, up to
1205     three octal digits are read to form an escaped byte. Thus \123 is likely to
1206     be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
1207     the octal value is greater than 377, the least significant 8 bits are
1208     taken. \8 and \9 are treated as the literal characters 8 and 9.
1209 
1210     Inside a character class, \ followed by a digit is always either a literal
1211     8 or 9 or an octal number. */
1212 
1213     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1214     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1215 
1216     if (!isclass)
1217       {
1218       oldptr = ptr;
1219       /* The integer range is limited by the machine's int representation. */
1220       s = (int)(c -CHAR_0);
1221       overflow = FALSE;
1222       while (IS_DIGIT(ptr[1]))
1223         {
1224         if (s > INT_MAX / 10 - 1) /* Integer overflow */
1225           {
1226           overflow = TRUE;
1227           break;
1228           }
1229         s = s * 10 + (int)(*(++ptr) - CHAR_0);
1230         }
1231       if (overflow) /* Integer overflow */
1232         {
1233         while (IS_DIGIT(ptr[1]))
1234           ptr++;
1235         *errorcodeptr = ERR61;
1236         break;
1237         }
1238       if (s < 8 || s <= bracount)  /* Check for back reference */
1239         {
1240         escape = -s;
1241         break;
1242         }
1243       ptr = oldptr;      /* Put the pointer back and fall through */
1244       }
1245 
1246     /* Handle a digit following \ when the number is not a back reference. If
1247     the first digit is 8 or 9, Perl used to generate a binary zero byte and
1248     then treat the digit as a following literal. At least by Perl 5.18 this
1249     changed so as not to insert the binary zero. */
1250 
1251     if ((c = *ptr) >= CHAR_8) break;
1252 
1253     /* Fall through with a digit less than 8 */
1254 
1255     /* \0 always starts an octal number, but we may drop through to here with a
1256     larger first octal digit. The original code used just to take the least
1257     significant 8 bits of octal numbers (I think this is what early Perls used
1258     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1259     but no more than 3 octal digits. */
1260 
1261     case CHAR_0:
1262     c -= CHAR_0;
1263     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1264         c = c * 8 + *(++ptr) - CHAR_0;
1265 #ifdef COMPILE_PCRE8
1266     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1267 #endif
1268     break;
1269 
1270     /* \o is a relatively new Perl feature, supporting a more general way of
1271     specifying character codes in octal. The only supported form is \o{ddd}. */
1272 
1273     case CHAR_o:
1274     if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
1275     if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
1276       {
1277       ptr += 2;
1278       c = 0;
1279       overflow = FALSE;
1280       while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
1281         {
1282         register pcre_uint32 cc = *ptr++;
1283         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1284 #ifdef COMPILE_PCRE32
1285         if (c >= 0x20000000l) { overflow = TRUE; break; }
1286 #endif
1287         c = (c << 3) + cc - CHAR_0 ;
1288 #if defined COMPILE_PCRE8
1289         if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1290 #elif defined COMPILE_PCRE16
1291         if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1292 #elif defined COMPILE_PCRE32
1293         if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1294 #endif
1295         }
1296       if (overflow)
1297         {
1298         while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1299         *errorcodeptr = ERR34;
1300         }
1301       else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1302         {
1303         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1304         }
1305       else *errorcodeptr = ERR80;
1306       }
1307     break;
1308 
1309     /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
1310     numbers. Otherwise it is a lowercase x letter. */
1311 
1312     case CHAR_x:
1313     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1314       {
1315       if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1316         && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1317         {
1318         c = 0;
1319         for (i = 0; i < 2; ++i)
1320           {
1321           register pcre_uint32 cc = *(++ptr);
1322 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1323           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1324           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1325 #else           /* EBCDIC coding */
1326           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1327           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1328 #endif
1329           }
1330         }
1331       }    /* End JavaScript handling */
1332 
1333     /* Handle \x in Perl's style. \x{ddd} is a character number which can be
1334     greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
1335     digits. If not, { used to be treated as a data character. However, Perl
1336     seems to read hex digits up to the first non-such, and ignore the rest, so
1337     that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
1338     now gives an error. */
1339 
1340     else
1341       {
1342       if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1343         {
1344         ptr += 2;
1345         if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1346           {
1347           *errorcodeptr = ERR86;
1348           break;
1349           }
1350         c = 0;
1351         overflow = FALSE;
1352         while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
1353           {
1354           register pcre_uint32 cc = *ptr++;
1355           if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1356 
1357 #ifdef COMPILE_PCRE32
1358           if (c >= 0x10000000l) { overflow = TRUE; break; }
1359 #endif
1360 
1361 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1362           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1363           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1364 #else           /* EBCDIC coding */
1365           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1366           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1367 #endif
1368 
1369 #if defined COMPILE_PCRE8
1370           if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1371 #elif defined COMPILE_PCRE16
1372           if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1373 #elif defined COMPILE_PCRE32
1374           if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1375 #endif
1376           }
1377 
1378         if (overflow)
1379           {
1380           while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
1381           *errorcodeptr = ERR34;
1382           }
1383 
1384         else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
1385           {
1386           if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1387           }
1388 
1389         /* If the sequence of hex digits does not end with '}', give an error.
1390         We used just to recognize this construct and fall through to the normal
1391         \x handling, but nowadays Perl gives an error, which seems much more
1392         sensible, so we do too. */
1393 
1394         else *errorcodeptr = ERR79;
1395         }   /* End of \x{} processing */
1396 
1397       /* Read a single-byte hex-defined char (up to two hex digits after \x) */
1398 
1399       else
1400         {
1401         c = 0;
1402         while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1403           {
1404           pcre_uint32 cc;                          /* Some compilers don't like */
1405           cc = *(++ptr);                           /* ++ in initializers */
1406 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1407           if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1408           c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1409 #else           /* EBCDIC coding */
1410           if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1411           c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1412 #endif
1413           }
1414         }     /* End of \xdd handling */
1415       }       /* End of Perl-style \x handling */
1416     break;
1417 
1418     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1419     An error is given if the byte following \c is not an ASCII character. This
1420     coding is ASCII-specific, but then the whole concept of \cx is
1421     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1422 
1423     case CHAR_c:
1424     c = *(++ptr);
1425     if (c == CHAR_NULL)
1426       {
1427       *errorcodeptr = ERR2;
1428       break;
1429       }
1430 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1431     if (c > 127)  /* Excludes all non-ASCII in either mode */
1432       {
1433       *errorcodeptr = ERR68;
1434       break;
1435       }
1436     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1437     c ^= 0x40;
1438 #else             /* EBCDIC coding */
1439     if (c >= CHAR_a && c <= CHAR_z) c += 64;
1440     if (c == CHAR_QUESTION_MARK)
1441       c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
1442     else
1443       {
1444       for (i = 0; i < 32; i++)
1445         {
1446         if (c == ebcdic_escape_c[i]) break;
1447         }
1448       if (i < 32) c = i; else *errorcodeptr = ERR68;
1449       }
1450 #endif
1451     break;
1452 
1453     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1454     other alphanumeric following \ is an error if PCRE_EXTRA was set;
1455     otherwise, for Perl compatibility, it is a literal. This code looks a bit
1456     odd, but there used to be some cases other than the default, and there may
1457     be again in future, so I haven't "optimized" it. */
1458 
1459     default:
1460     if ((options & PCRE_EXTRA) != 0) switch(c)
1461       {
1462       default:
1463       *errorcodeptr = ERR3;
1464       break;
1465       }
1466     break;
1467     }
1468   }
1469 
1470 /* Perl supports \N{name} for character names, as well as plain \N for "not
1471 newline". PCRE does not support \N{name}. However, it does support
1472 quantification such as \N{2,3}. */
1473 
1474 if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1475      !is_counted_repeat(ptr+2))
1476   *errorcodeptr = ERR37;
1477 
1478 /* If PCRE_UCP is set, we change the values for \d etc. */
1479 
1480 if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
1481   escape += (ESC_DU - ESC_D);
1482 
1483 /* Set the pointer to the final character before returning. */
1484 
1485 *ptrptr = ptr;
1486 *chptr = c;
1487 return escape;
1488 }
1489 
1490 
1491 
1492 #ifdef SUPPORT_UCP
1493 /*************************************************
1494 *               Handle \P and \p                 *
1495 *************************************************/
1496 
1497 /* This function is called after \P or \p has been encountered, provided that
1498 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1499 pointing at the P or p. On exit, it is pointing at the final character of the
1500 escape sequence.
1501 
1502 Argument:
1503   ptrptr         points to the pattern position pointer
1504   negptr         points to a boolean that is set TRUE for negation else FALSE
1505   ptypeptr       points to an unsigned int that is set to the type value
1506   pdataptr       points to an unsigned int that is set to the detailed property value
1507   errorcodeptr   points to the error code variable
1508 
1509 Returns:         TRUE if the type value was found, or FALSE for an invalid type
1510 */
1511 
1512 static BOOL
get_ucp(const pcre_uchar ** ptrptr,BOOL * negptr,unsigned int * ptypeptr,unsigned int * pdataptr,int * errorcodeptr)1513 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1514   unsigned int *pdataptr, int *errorcodeptr)
1515 {
1516 pcre_uchar c;
1517 int i, bot, top;
1518 const pcre_uchar *ptr = *ptrptr;
1519 pcre_uchar name[32];
1520 
1521 c = *(++ptr);
1522 if (c == CHAR_NULL) goto ERROR_RETURN;
1523 
1524 *negptr = FALSE;
1525 
1526 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1527 negation. */
1528 
1529 if (c == CHAR_LEFT_CURLY_BRACKET)
1530   {
1531   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1532     {
1533     *negptr = TRUE;
1534     ptr++;
1535     }
1536   for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1537     {
1538     c = *(++ptr);
1539     if (c == CHAR_NULL) goto ERROR_RETURN;
1540     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1541     name[i] = c;
1542     }
1543   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1544   name[i] = 0;
1545   }
1546 
1547 /* Otherwise there is just one following character */
1548 
1549 else
1550   {
1551   name[0] = c;
1552   name[1] = 0;
1553   }
1554 
1555 *ptrptr = ptr;
1556 
1557 /* Search for a recognized property name using binary chop */
1558 
1559 bot = 0;
1560 top = PRIV(utt_size);
1561 
1562 while (bot < top)
1563   {
1564   int r;
1565   i = (bot + top) >> 1;
1566   r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1567   if (r == 0)
1568     {
1569     *ptypeptr = PRIV(utt)[i].type;
1570     *pdataptr = PRIV(utt)[i].value;
1571     return TRUE;
1572     }
1573   if (r > 0) bot = i + 1; else top = i;
1574   }
1575 
1576 *errorcodeptr = ERR47;
1577 *ptrptr = ptr;
1578 return FALSE;
1579 
1580 ERROR_RETURN:
1581 *errorcodeptr = ERR46;
1582 *ptrptr = ptr;
1583 return FALSE;
1584 }
1585 #endif
1586 
1587 
1588 
1589 /*************************************************
1590 *         Read repeat counts                     *
1591 *************************************************/
1592 
1593 /* Read an item of the form {n,m} and return the values. This is called only
1594 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1595 so the syntax is guaranteed to be correct, but we need to check the values.
1596 
1597 Arguments:
1598   p              pointer to first char after '{'
1599   minp           pointer to int for min
1600   maxp           pointer to int for max
1601                  returned as -1 if no max
1602   errorcodeptr   points to error code variable
1603 
1604 Returns:         pointer to '}' on success;
1605                  current ptr on error, with errorcodeptr set non-zero
1606 */
1607 
1608 static const pcre_uchar *
read_repeat_counts(const pcre_uchar * p,int * minp,int * maxp,int * errorcodeptr)1609 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1610 {
1611 int min = 0;
1612 int max = -1;
1613 
1614 while (IS_DIGIT(*p))
1615   {
1616   min = min * 10 + (int)(*p++ - CHAR_0);
1617   if (min > 65535)
1618     {
1619     *errorcodeptr = ERR5;
1620     return p;
1621     }
1622   }
1623 
1624 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1625   {
1626   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1627     {
1628     max = 0;
1629     while(IS_DIGIT(*p))
1630       {
1631       max = max * 10 + (int)(*p++ - CHAR_0);
1632       if (max > 65535)
1633         {
1634         *errorcodeptr = ERR5;
1635         return p;
1636         }
1637       }
1638     if (max < min)
1639       {
1640       *errorcodeptr = ERR4;
1641       return p;
1642       }
1643     }
1644   }
1645 
1646 *minp = min;
1647 *maxp = max;
1648 return p;
1649 }
1650 
1651 
1652 
1653 /*************************************************
1654 *      Find first significant op code            *
1655 *************************************************/
1656 
1657 /* This is called by several functions that scan a compiled expression looking
1658 for a fixed first character, or an anchoring op code etc. It skips over things
1659 that do not influence this. For some calls, it makes sense to skip negative
1660 forward and all backward assertions, and also the \b assertion; for others it
1661 does not.
1662 
1663 Arguments:
1664   code         pointer to the start of the group
1665   skipassert   TRUE if certain assertions are to be skipped
1666 
1667 Returns:       pointer to the first significant opcode
1668 */
1669 
1670 static const pcre_uchar*
first_significant_code(const pcre_uchar * code,BOOL skipassert)1671 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1672 {
1673 for (;;)
1674   {
1675   switch ((int)*code)
1676     {
1677     case OP_ASSERT_NOT:
1678     case OP_ASSERTBACK:
1679     case OP_ASSERTBACK_NOT:
1680     if (!skipassert) return code;
1681     do code += GET(code, 1); while (*code == OP_ALT);
1682     code += PRIV(OP_lengths)[*code];
1683     break;
1684 
1685     case OP_WORD_BOUNDARY:
1686     case OP_NOT_WORD_BOUNDARY:
1687     if (!skipassert) return code;
1688     /* Fall through */
1689 
1690     case OP_CALLOUT:
1691     case OP_CREF:
1692     case OP_DNCREF:
1693     case OP_RREF:
1694     case OP_DNRREF:
1695     case OP_DEF:
1696     code += PRIV(OP_lengths)[*code];
1697     break;
1698 
1699     default:
1700     return code;
1701     }
1702   }
1703 /* Control never reaches here */
1704 }
1705 
1706 
1707 
1708 /*************************************************
1709 *        Find the fixed length of a branch       *
1710 *************************************************/
1711 
1712 /* Scan a branch and compute the fixed length of subject that will match it,
1713 if the length is fixed. This is needed for dealing with backward assertions.
1714 In UTF8 mode, the result is in characters rather than bytes. The branch is
1715 temporarily terminated with OP_END when this function is called.
1716 
1717 This function is called when a backward assertion is encountered, so that if it
1718 fails, the error message can point to the correct place in the pattern.
1719 However, we cannot do this when the assertion contains subroutine calls,
1720 because they can be forward references. We solve this by remembering this case
1721 and doing the check at the end; a flag specifies which mode we are running in.
1722 
1723 Arguments:
1724   code     points to the start of the pattern (the bracket)
1725   utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1726   atend    TRUE if called when the pattern is complete
1727   cd       the "compile data" structure
1728   recurses    chain of recurse_check to catch mutual recursion
1729 
1730 Returns:   the fixed length,
1731              or -1 if there is no fixed length,
1732              or -2 if \C was encountered (in UTF-8 mode only)
1733              or -3 if an OP_RECURSE item was encountered and atend is FALSE
1734              or -4 if an unknown opcode was encountered (internal error)
1735 */
1736 
1737 static int
find_fixedlength(pcre_uchar * code,BOOL utf,BOOL atend,compile_data * cd,recurse_check * recurses)1738 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
1739   recurse_check *recurses)
1740 {
1741 int length = -1;
1742 recurse_check this_recurse;
1743 register int branchlength = 0;
1744 register pcre_uchar *cc = code + 1 + LINK_SIZE;
1745 
1746 /* Scan along the opcodes for this branch. If we get to the end of the
1747 branch, check the length against that of the other branches. */
1748 
1749 for (;;)
1750   {
1751   int d;
1752   pcre_uchar *ce, *cs;
1753   register pcre_uchar op = *cc;
1754 
1755   switch (op)
1756     {
1757     /* We only need to continue for OP_CBRA (normal capturing bracket) and
1758     OP_BRA (normal non-capturing bracket) because the other variants of these
1759     opcodes are all concerned with unlimited repeated groups, which of course
1760     are not of fixed length. */
1761 
1762     case OP_CBRA:
1763     case OP_BRA:
1764     case OP_ONCE:
1765     case OP_ONCE_NC:
1766     case OP_COND:
1767     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
1768       recurses);
1769     if (d < 0) return d;
1770     branchlength += d;
1771     do cc += GET(cc, 1); while (*cc == OP_ALT);
1772     cc += 1 + LINK_SIZE;
1773     break;
1774 
1775     /* Reached end of a branch; if it's a ket it is the end of a nested call.
1776     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1777     an ALT. If it is END it's the end of the outer call. All can be handled by
1778     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1779     because they all imply an unlimited repeat. */
1780 
1781     case OP_ALT:
1782     case OP_KET:
1783     case OP_END:
1784     case OP_ACCEPT:
1785     case OP_ASSERT_ACCEPT:
1786     if (length < 0) length = branchlength;
1787       else if (length != branchlength) return -1;
1788     if (*cc != OP_ALT) return length;
1789     cc += 1 + LINK_SIZE;
1790     branchlength = 0;
1791     break;
1792 
1793     /* A true recursion implies not fixed length, but a subroutine call may
1794     be OK. If the subroutine is a forward reference, we can't deal with
1795     it until the end of the pattern, so return -3. */
1796 
1797     case OP_RECURSE:
1798     if (!atend) return -3;
1799     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1800     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1801     if (cc > cs && cc < ce) return -1;                    /* Recursion */
1802     else   /* Check for mutual recursion */
1803       {
1804       recurse_check *r = recurses;
1805       for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
1806       if (r != NULL) return -1;   /* Mutual recursion */
1807       }
1808     this_recurse.prev = recurses;
1809     this_recurse.group = cs;
1810     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1811     if (d < 0) return d;
1812     branchlength += d;
1813     cc += 1 + LINK_SIZE;
1814     break;
1815 
1816     /* Skip over assertive subpatterns */
1817 
1818     case OP_ASSERT:
1819     case OP_ASSERT_NOT:
1820     case OP_ASSERTBACK:
1821     case OP_ASSERTBACK_NOT:
1822     do cc += GET(cc, 1); while (*cc == OP_ALT);
1823     cc += 1 + LINK_SIZE;
1824     break;
1825 
1826     /* Skip over things that don't match chars */
1827 
1828     case OP_MARK:
1829     case OP_PRUNE_ARG:
1830     case OP_SKIP_ARG:
1831     case OP_THEN_ARG:
1832     cc += cc[1] + PRIV(OP_lengths)[*cc];
1833     break;
1834 
1835     case OP_CALLOUT:
1836     case OP_CIRC:
1837     case OP_CIRCM:
1838     case OP_CLOSE:
1839     case OP_COMMIT:
1840     case OP_CREF:
1841     case OP_DEF:
1842     case OP_DNCREF:
1843     case OP_DNRREF:
1844     case OP_DOLL:
1845     case OP_DOLLM:
1846     case OP_EOD:
1847     case OP_EODN:
1848     case OP_FAIL:
1849     case OP_NOT_WORD_BOUNDARY:
1850     case OP_PRUNE:
1851     case OP_REVERSE:
1852     case OP_RREF:
1853     case OP_SET_SOM:
1854     case OP_SKIP:
1855     case OP_SOD:
1856     case OP_SOM:
1857     case OP_THEN:
1858     case OP_WORD_BOUNDARY:
1859     cc += PRIV(OP_lengths)[*cc];
1860     break;
1861 
1862     /* Handle literal characters */
1863 
1864     case OP_CHAR:
1865     case OP_CHARI:
1866     case OP_NOT:
1867     case OP_NOTI:
1868     branchlength++;
1869     cc += 2;
1870 #ifdef SUPPORT_UTF
1871     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1872 #endif
1873     break;
1874 
1875     /* Handle exact repetitions. The count is already in characters, but we
1876     need to skip over a multibyte character in UTF8 mode.  */
1877 
1878     case OP_EXACT:
1879     case OP_EXACTI:
1880     case OP_NOTEXACT:
1881     case OP_NOTEXACTI:
1882     branchlength += (int)GET2(cc,1);
1883     cc += 2 + IMM2_SIZE;
1884 #ifdef SUPPORT_UTF
1885     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1886 #endif
1887     break;
1888 
1889     case OP_TYPEEXACT:
1890     branchlength += GET2(cc,1);
1891     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1892       cc += 2;
1893     cc += 1 + IMM2_SIZE + 1;
1894     break;
1895 
1896     /* Handle single-char matchers */
1897 
1898     case OP_PROP:
1899     case OP_NOTPROP:
1900     cc += 2;
1901     /* Fall through */
1902 
1903     case OP_HSPACE:
1904     case OP_VSPACE:
1905     case OP_NOT_HSPACE:
1906     case OP_NOT_VSPACE:
1907     case OP_NOT_DIGIT:
1908     case OP_DIGIT:
1909     case OP_NOT_WHITESPACE:
1910     case OP_WHITESPACE:
1911     case OP_NOT_WORDCHAR:
1912     case OP_WORDCHAR:
1913     case OP_ANY:
1914     case OP_ALLANY:
1915     branchlength++;
1916     cc++;
1917     break;
1918 
1919     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1920     otherwise \C is coded as OP_ALLANY. */
1921 
1922     case OP_ANYBYTE:
1923     return -2;
1924 
1925     /* Check a class for variable quantification */
1926 
1927     case OP_CLASS:
1928     case OP_NCLASS:
1929 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1930     case OP_XCLASS:
1931     /* The original code caused an unsigned overflow in 64 bit systems,
1932     so now we use a conditional statement. */
1933     if (op == OP_XCLASS)
1934       cc += GET(cc, 1);
1935     else
1936       cc += PRIV(OP_lengths)[OP_CLASS];
1937 #else
1938     cc += PRIV(OP_lengths)[OP_CLASS];
1939 #endif
1940 
1941     switch (*cc)
1942       {
1943       case OP_CRSTAR:
1944       case OP_CRMINSTAR:
1945       case OP_CRPLUS:
1946       case OP_CRMINPLUS:
1947       case OP_CRQUERY:
1948       case OP_CRMINQUERY:
1949       case OP_CRPOSSTAR:
1950       case OP_CRPOSPLUS:
1951       case OP_CRPOSQUERY:
1952       return -1;
1953 
1954       case OP_CRRANGE:
1955       case OP_CRMINRANGE:
1956       case OP_CRPOSRANGE:
1957       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1958       branchlength += (int)GET2(cc,1);
1959       cc += 1 + 2 * IMM2_SIZE;
1960       break;
1961 
1962       default:
1963       branchlength++;
1964       }
1965     break;
1966 
1967     /* Anything else is variable length */
1968 
1969     case OP_ANYNL:
1970     case OP_BRAMINZERO:
1971     case OP_BRAPOS:
1972     case OP_BRAPOSZERO:
1973     case OP_BRAZERO:
1974     case OP_CBRAPOS:
1975     case OP_EXTUNI:
1976     case OP_KETRMAX:
1977     case OP_KETRMIN:
1978     case OP_KETRPOS:
1979     case OP_MINPLUS:
1980     case OP_MINPLUSI:
1981     case OP_MINQUERY:
1982     case OP_MINQUERYI:
1983     case OP_MINSTAR:
1984     case OP_MINSTARI:
1985     case OP_MINUPTO:
1986     case OP_MINUPTOI:
1987     case OP_NOTMINPLUS:
1988     case OP_NOTMINPLUSI:
1989     case OP_NOTMINQUERY:
1990     case OP_NOTMINQUERYI:
1991     case OP_NOTMINSTAR:
1992     case OP_NOTMINSTARI:
1993     case OP_NOTMINUPTO:
1994     case OP_NOTMINUPTOI:
1995     case OP_NOTPLUS:
1996     case OP_NOTPLUSI:
1997     case OP_NOTPOSPLUS:
1998     case OP_NOTPOSPLUSI:
1999     case OP_NOTPOSQUERY:
2000     case OP_NOTPOSQUERYI:
2001     case OP_NOTPOSSTAR:
2002     case OP_NOTPOSSTARI:
2003     case OP_NOTPOSUPTO:
2004     case OP_NOTPOSUPTOI:
2005     case OP_NOTQUERY:
2006     case OP_NOTQUERYI:
2007     case OP_NOTSTAR:
2008     case OP_NOTSTARI:
2009     case OP_NOTUPTO:
2010     case OP_NOTUPTOI:
2011     case OP_PLUS:
2012     case OP_PLUSI:
2013     case OP_POSPLUS:
2014     case OP_POSPLUSI:
2015     case OP_POSQUERY:
2016     case OP_POSQUERYI:
2017     case OP_POSSTAR:
2018     case OP_POSSTARI:
2019     case OP_POSUPTO:
2020     case OP_POSUPTOI:
2021     case OP_QUERY:
2022     case OP_QUERYI:
2023     case OP_REF:
2024     case OP_REFI:
2025     case OP_DNREF:
2026     case OP_DNREFI:
2027     case OP_SBRA:
2028     case OP_SBRAPOS:
2029     case OP_SCBRA:
2030     case OP_SCBRAPOS:
2031     case OP_SCOND:
2032     case OP_SKIPZERO:
2033     case OP_STAR:
2034     case OP_STARI:
2035     case OP_TYPEMINPLUS:
2036     case OP_TYPEMINQUERY:
2037     case OP_TYPEMINSTAR:
2038     case OP_TYPEMINUPTO:
2039     case OP_TYPEPLUS:
2040     case OP_TYPEPOSPLUS:
2041     case OP_TYPEPOSQUERY:
2042     case OP_TYPEPOSSTAR:
2043     case OP_TYPEPOSUPTO:
2044     case OP_TYPEQUERY:
2045     case OP_TYPESTAR:
2046     case OP_TYPEUPTO:
2047     case OP_UPTO:
2048     case OP_UPTOI:
2049     return -1;
2050 
2051     /* Catch unrecognized opcodes so that when new ones are added they
2052     are not forgotten, as has happened in the past. */
2053 
2054     default:
2055     return -4;
2056     }
2057   }
2058 /* Control never gets here */
2059 }
2060 
2061 
2062 
2063 /*************************************************
2064 *    Scan compiled regex for specific bracket    *
2065 *************************************************/
2066 
2067 /* This little function scans through a compiled pattern until it finds a
2068 capturing bracket with the given number, or, if the number is negative, an
2069 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2070 so that it can be called from pcre_study() when finding the minimum matching
2071 length.
2072 
2073 Arguments:
2074   code        points to start of expression
2075   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2076   number      the required bracket number or negative to find a lookbehind
2077 
2078 Returns:      pointer to the opcode for the bracket, or NULL if not found
2079 */
2080 
2081 const pcre_uchar *
PRIV(find_bracket)2082 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2083 {
2084 for (;;)
2085   {
2086   register pcre_uchar c = *code;
2087 
2088   if (c == OP_END) return NULL;
2089 
2090   /* XCLASS is used for classes that cannot be represented just by a bit
2091   map. This includes negated single high-valued characters. The length in
2092   the table is zero; the actual length is stored in the compiled code. */
2093 
2094   if (c == OP_XCLASS) code += GET(code, 1);
2095 
2096   /* Handle recursion */
2097 
2098   else if (c == OP_REVERSE)
2099     {
2100     if (number < 0) return (pcre_uchar *)code;
2101     code += PRIV(OP_lengths)[c];
2102     }
2103 
2104   /* Handle capturing bracket */
2105 
2106   else if (c == OP_CBRA || c == OP_SCBRA ||
2107            c == OP_CBRAPOS || c == OP_SCBRAPOS)
2108     {
2109     int n = (int)GET2(code, 1+LINK_SIZE);
2110     if (n == number) return (pcre_uchar *)code;
2111     code += PRIV(OP_lengths)[c];
2112     }
2113 
2114   /* Otherwise, we can get the item's length from the table, except that for
2115   repeated character types, we have to test for \p and \P, which have an extra
2116   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2117   must add in its length. */
2118 
2119   else
2120     {
2121     switch(c)
2122       {
2123       case OP_TYPESTAR:
2124       case OP_TYPEMINSTAR:
2125       case OP_TYPEPLUS:
2126       case OP_TYPEMINPLUS:
2127       case OP_TYPEQUERY:
2128       case OP_TYPEMINQUERY:
2129       case OP_TYPEPOSSTAR:
2130       case OP_TYPEPOSPLUS:
2131       case OP_TYPEPOSQUERY:
2132       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2133       break;
2134 
2135       case OP_TYPEUPTO:
2136       case OP_TYPEMINUPTO:
2137       case OP_TYPEEXACT:
2138       case OP_TYPEPOSUPTO:
2139       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2140         code += 2;
2141       break;
2142 
2143       case OP_MARK:
2144       case OP_PRUNE_ARG:
2145       case OP_SKIP_ARG:
2146       case OP_THEN_ARG:
2147       code += code[1];
2148       break;
2149       }
2150 
2151     /* Add in the fixed length from the table */
2152 
2153     code += PRIV(OP_lengths)[c];
2154 
2155   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2156   a multi-byte character. The length in the table is a minimum, so we have to
2157   arrange to skip the extra bytes. */
2158 
2159 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2160     if (utf) switch(c)
2161       {
2162       case OP_CHAR:
2163       case OP_CHARI:
2164       case OP_NOT:
2165       case OP_NOTI:
2166       case OP_EXACT:
2167       case OP_EXACTI:
2168       case OP_NOTEXACT:
2169       case OP_NOTEXACTI:
2170       case OP_UPTO:
2171       case OP_UPTOI:
2172       case OP_NOTUPTO:
2173       case OP_NOTUPTOI:
2174       case OP_MINUPTO:
2175       case OP_MINUPTOI:
2176       case OP_NOTMINUPTO:
2177       case OP_NOTMINUPTOI:
2178       case OP_POSUPTO:
2179       case OP_POSUPTOI:
2180       case OP_NOTPOSUPTO:
2181       case OP_NOTPOSUPTOI:
2182       case OP_STAR:
2183       case OP_STARI:
2184       case OP_NOTSTAR:
2185       case OP_NOTSTARI:
2186       case OP_MINSTAR:
2187       case OP_MINSTARI:
2188       case OP_NOTMINSTAR:
2189       case OP_NOTMINSTARI:
2190       case OP_POSSTAR:
2191       case OP_POSSTARI:
2192       case OP_NOTPOSSTAR:
2193       case OP_NOTPOSSTARI:
2194       case OP_PLUS:
2195       case OP_PLUSI:
2196       case OP_NOTPLUS:
2197       case OP_NOTPLUSI:
2198       case OP_MINPLUS:
2199       case OP_MINPLUSI:
2200       case OP_NOTMINPLUS:
2201       case OP_NOTMINPLUSI:
2202       case OP_POSPLUS:
2203       case OP_POSPLUSI:
2204       case OP_NOTPOSPLUS:
2205       case OP_NOTPOSPLUSI:
2206       case OP_QUERY:
2207       case OP_QUERYI:
2208       case OP_NOTQUERY:
2209       case OP_NOTQUERYI:
2210       case OP_MINQUERY:
2211       case OP_MINQUERYI:
2212       case OP_NOTMINQUERY:
2213       case OP_NOTMINQUERYI:
2214       case OP_POSQUERY:
2215       case OP_POSQUERYI:
2216       case OP_NOTPOSQUERY:
2217       case OP_NOTPOSQUERYI:
2218       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2219       break;
2220       }
2221 #else
2222     (void)(utf);  /* Keep compiler happy by referencing function argument */
2223 #endif
2224     }
2225   }
2226 }
2227 
2228 
2229 
2230 /*************************************************
2231 *   Scan compiled regex for recursion reference  *
2232 *************************************************/
2233 
2234 /* This little function scans through a compiled pattern until it finds an
2235 instance of OP_RECURSE.
2236 
2237 Arguments:
2238   code        points to start of expression
2239   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2240 
2241 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2242 */
2243 
2244 static const pcre_uchar *
find_recurse(const pcre_uchar * code,BOOL utf)2245 find_recurse(const pcre_uchar *code, BOOL utf)
2246 {
2247 for (;;)
2248   {
2249   register pcre_uchar c = *code;
2250   if (c == OP_END) return NULL;
2251   if (c == OP_RECURSE) return code;
2252 
2253   /* XCLASS is used for classes that cannot be represented just by a bit
2254   map. This includes negated single high-valued characters. The length in
2255   the table is zero; the actual length is stored in the compiled code. */
2256 
2257   if (c == OP_XCLASS) code += GET(code, 1);
2258 
2259   /* Otherwise, we can get the item's length from the table, except that for
2260   repeated character types, we have to test for \p and \P, which have an extra
2261   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2262   must add in its length. */
2263 
2264   else
2265     {
2266     switch(c)
2267       {
2268       case OP_TYPESTAR:
2269       case OP_TYPEMINSTAR:
2270       case OP_TYPEPLUS:
2271       case OP_TYPEMINPLUS:
2272       case OP_TYPEQUERY:
2273       case OP_TYPEMINQUERY:
2274       case OP_TYPEPOSSTAR:
2275       case OP_TYPEPOSPLUS:
2276       case OP_TYPEPOSQUERY:
2277       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2278       break;
2279 
2280       case OP_TYPEPOSUPTO:
2281       case OP_TYPEUPTO:
2282       case OP_TYPEMINUPTO:
2283       case OP_TYPEEXACT:
2284       if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2285         code += 2;
2286       break;
2287 
2288       case OP_MARK:
2289       case OP_PRUNE_ARG:
2290       case OP_SKIP_ARG:
2291       case OP_THEN_ARG:
2292       code += code[1];
2293       break;
2294       }
2295 
2296     /* Add in the fixed length from the table */
2297 
2298     code += PRIV(OP_lengths)[c];
2299 
2300     /* In UTF-8 mode, opcodes that are followed by a character may be followed
2301     by a multi-byte character. The length in the table is a minimum, so we have
2302     to arrange to skip the extra bytes. */
2303 
2304 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2305     if (utf) switch(c)
2306       {
2307       case OP_CHAR:
2308       case OP_CHARI:
2309       case OP_NOT:
2310       case OP_NOTI:
2311       case OP_EXACT:
2312       case OP_EXACTI:
2313       case OP_NOTEXACT:
2314       case OP_NOTEXACTI:
2315       case OP_UPTO:
2316       case OP_UPTOI:
2317       case OP_NOTUPTO:
2318       case OP_NOTUPTOI:
2319       case OP_MINUPTO:
2320       case OP_MINUPTOI:
2321       case OP_NOTMINUPTO:
2322       case OP_NOTMINUPTOI:
2323       case OP_POSUPTO:
2324       case OP_POSUPTOI:
2325       case OP_NOTPOSUPTO:
2326       case OP_NOTPOSUPTOI:
2327       case OP_STAR:
2328       case OP_STARI:
2329       case OP_NOTSTAR:
2330       case OP_NOTSTARI:
2331       case OP_MINSTAR:
2332       case OP_MINSTARI:
2333       case OP_NOTMINSTAR:
2334       case OP_NOTMINSTARI:
2335       case OP_POSSTAR:
2336       case OP_POSSTARI:
2337       case OP_NOTPOSSTAR:
2338       case OP_NOTPOSSTARI:
2339       case OP_PLUS:
2340       case OP_PLUSI:
2341       case OP_NOTPLUS:
2342       case OP_NOTPLUSI:
2343       case OP_MINPLUS:
2344       case OP_MINPLUSI:
2345       case OP_NOTMINPLUS:
2346       case OP_NOTMINPLUSI:
2347       case OP_POSPLUS:
2348       case OP_POSPLUSI:
2349       case OP_NOTPOSPLUS:
2350       case OP_NOTPOSPLUSI:
2351       case OP_QUERY:
2352       case OP_QUERYI:
2353       case OP_NOTQUERY:
2354       case OP_NOTQUERYI:
2355       case OP_MINQUERY:
2356       case OP_MINQUERYI:
2357       case OP_NOTMINQUERY:
2358       case OP_NOTMINQUERYI:
2359       case OP_POSQUERY:
2360       case OP_POSQUERYI:
2361       case OP_NOTPOSQUERY:
2362       case OP_NOTPOSQUERYI:
2363       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2364       break;
2365       }
2366 #else
2367     (void)(utf);  /* Keep compiler happy by referencing function argument */
2368 #endif
2369     }
2370   }
2371 }
2372 
2373 
2374 
2375 /*************************************************
2376 *    Scan compiled branch for non-emptiness      *
2377 *************************************************/
2378 
2379 /* This function scans through a branch of a compiled pattern to see whether it
2380 can match the empty string or not. It is called from could_be_empty()
2381 below and from compile_branch() when checking for an unlimited repeat of a
2382 group that can match nothing. Note that first_significant_code() skips over
2383 backward and negative forward assertions when its final argument is TRUE. If we
2384 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2385 bracket whose current branch will already have been scanned.
2386 
2387 Arguments:
2388   code        points to start of search
2389   endcode     points to where to stop
2390   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2391   cd          contains pointers to tables etc.
2392   recurses    chain of recurse_check to catch mutual recursion
2393 
2394 Returns:      TRUE if what is matched could be empty
2395 */
2396 
2397 static BOOL
could_be_empty_branch(const pcre_uchar * code,const pcre_uchar * endcode,BOOL utf,compile_data * cd,recurse_check * recurses)2398 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2399   BOOL utf, compile_data *cd, recurse_check *recurses)
2400 {
2401 register pcre_uchar c;
2402 recurse_check this_recurse;
2403 
2404 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2405      code < endcode;
2406      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2407   {
2408   const pcre_uchar *ccode;
2409 
2410   c = *code;
2411 
2412   /* Skip over forward assertions; the other assertions are skipped by
2413   first_significant_code() with a TRUE final argument. */
2414 
2415   if (c == OP_ASSERT)
2416     {
2417     do code += GET(code, 1); while (*code == OP_ALT);
2418     c = *code;
2419     continue;
2420     }
2421 
2422   /* For a recursion/subroutine call, if its end has been reached, which
2423   implies a backward reference subroutine call, we can scan it. If it's a
2424   forward reference subroutine call, we can't. To detect forward reference
2425   we have to scan up the list that is kept in the workspace. This function is
2426   called only when doing the real compile, not during the pre-compile that
2427   measures the size of the compiled pattern. */
2428 
2429   if (c == OP_RECURSE)
2430     {
2431     const pcre_uchar *scode = cd->start_code + GET(code, 1);
2432     const pcre_uchar *endgroup = scode;
2433     BOOL empty_branch;
2434 
2435     /* Test for forward reference or uncompleted reference. This is disabled
2436     when called to scan a completed pattern by setting cd->start_workspace to
2437     NULL. */
2438 
2439     if (cd->start_workspace != NULL)
2440       {
2441       const pcre_uchar *tcode;
2442       for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2443         if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2444       if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2445       }
2446 
2447     /* If the reference is to a completed group, we need to detect whether this
2448     is a recursive call, as otherwise there will be an infinite loop. If it is
2449     a recursion, just skip over it. Simple recursions are easily detected. For
2450     mutual recursions we keep a chain on the stack. */
2451 
2452     do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2453     if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2454     else
2455       {
2456       recurse_check *r = recurses;
2457       for (r = recurses; r != NULL; r = r->prev)
2458         if (r->group == scode) break;
2459       if (r != NULL) continue;   /* Mutual recursion */
2460       }
2461 
2462     /* Completed reference; scan the referenced group, remembering it on the
2463     stack chain to detect mutual recursions. */
2464 
2465     empty_branch = FALSE;
2466     this_recurse.prev = recurses;
2467     this_recurse.group = scode;
2468 
2469     do
2470       {
2471       if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2472         {
2473         empty_branch = TRUE;
2474         break;
2475         }
2476       scode += GET(scode, 1);
2477       }
2478     while (*scode == OP_ALT);
2479 
2480     if (!empty_branch) return FALSE;  /* All branches are non-empty */
2481     continue;
2482     }
2483 
2484   /* Groups with zero repeats can of course be empty; skip them. */
2485 
2486   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2487       c == OP_BRAPOSZERO)
2488     {
2489     code += PRIV(OP_lengths)[c];
2490     do code += GET(code, 1); while (*code == OP_ALT);
2491     c = *code;
2492     continue;
2493     }
2494 
2495   /* A nested group that is already marked as "could be empty" can just be
2496   skipped. */
2497 
2498   if (c == OP_SBRA  || c == OP_SBRAPOS ||
2499       c == OP_SCBRA || c == OP_SCBRAPOS)
2500     {
2501     do code += GET(code, 1); while (*code == OP_ALT);
2502     c = *code;
2503     continue;
2504     }
2505 
2506   /* For other groups, scan the branches. */
2507 
2508   if (c == OP_BRA  || c == OP_BRAPOS ||
2509       c == OP_CBRA || c == OP_CBRAPOS ||
2510       c == OP_ONCE || c == OP_ONCE_NC ||
2511       c == OP_COND || c == OP_SCOND)
2512     {
2513     BOOL empty_branch;
2514     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2515 
2516     /* If a conditional group has only one branch, there is a second, implied,
2517     empty branch, so just skip over the conditional, because it could be empty.
2518     Otherwise, scan the individual branches of the group. */
2519 
2520     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2521       code += GET(code, 1);
2522     else
2523       {
2524       empty_branch = FALSE;
2525       do
2526         {
2527         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
2528           recurses)) empty_branch = TRUE;
2529         code += GET(code, 1);
2530         }
2531       while (*code == OP_ALT);
2532       if (!empty_branch) return FALSE;   /* All branches are non-empty */
2533       }
2534 
2535     c = *code;
2536     continue;
2537     }
2538 
2539   /* Handle the other opcodes */
2540 
2541   switch (c)
2542     {
2543     /* Check for quantifiers after a class. XCLASS is used for classes that
2544     cannot be represented just by a bit map. This includes negated single
2545     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2546     actual length is stored in the compiled code, so we must update "code"
2547     here. */
2548 
2549 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2550     case OP_XCLASS:
2551     ccode = code += GET(code, 1);
2552     goto CHECK_CLASS_REPEAT;
2553 #endif
2554 
2555     case OP_CLASS:
2556     case OP_NCLASS:
2557     ccode = code + PRIV(OP_lengths)[OP_CLASS];
2558 
2559 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2560     CHECK_CLASS_REPEAT:
2561 #endif
2562 
2563     switch (*ccode)
2564       {
2565       case OP_CRSTAR:            /* These could be empty; continue */
2566       case OP_CRMINSTAR:
2567       case OP_CRQUERY:
2568       case OP_CRMINQUERY:
2569       case OP_CRPOSSTAR:
2570       case OP_CRPOSQUERY:
2571       break;
2572 
2573       default:                   /* Non-repeat => class must match */
2574       case OP_CRPLUS:            /* These repeats aren't empty */
2575       case OP_CRMINPLUS:
2576       case OP_CRPOSPLUS:
2577       return FALSE;
2578 
2579       case OP_CRRANGE:
2580       case OP_CRMINRANGE:
2581       case OP_CRPOSRANGE:
2582       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2583       break;
2584       }
2585     break;
2586 
2587     /* Opcodes that must match a character */
2588 
2589     case OP_ANY:
2590     case OP_ALLANY:
2591     case OP_ANYBYTE:
2592 
2593     case OP_PROP:
2594     case OP_NOTPROP:
2595     case OP_ANYNL:
2596 
2597     case OP_NOT_HSPACE:
2598     case OP_HSPACE:
2599     case OP_NOT_VSPACE:
2600     case OP_VSPACE:
2601     case OP_EXTUNI:
2602 
2603     case OP_NOT_DIGIT:
2604     case OP_DIGIT:
2605     case OP_NOT_WHITESPACE:
2606     case OP_WHITESPACE:
2607     case OP_NOT_WORDCHAR:
2608     case OP_WORDCHAR:
2609 
2610     case OP_CHAR:
2611     case OP_CHARI:
2612     case OP_NOT:
2613     case OP_NOTI:
2614 
2615     case OP_PLUS:
2616     case OP_PLUSI:
2617     case OP_MINPLUS:
2618     case OP_MINPLUSI:
2619 
2620     case OP_NOTPLUS:
2621     case OP_NOTPLUSI:
2622     case OP_NOTMINPLUS:
2623     case OP_NOTMINPLUSI:
2624 
2625     case OP_POSPLUS:
2626     case OP_POSPLUSI:
2627     case OP_NOTPOSPLUS:
2628     case OP_NOTPOSPLUSI:
2629 
2630     case OP_EXACT:
2631     case OP_EXACTI:
2632     case OP_NOTEXACT:
2633     case OP_NOTEXACTI:
2634 
2635     case OP_TYPEPLUS:
2636     case OP_TYPEMINPLUS:
2637     case OP_TYPEPOSPLUS:
2638     case OP_TYPEEXACT:
2639 
2640     return FALSE;
2641 
2642     /* These are going to continue, as they may be empty, but we have to
2643     fudge the length for the \p and \P cases. */
2644 
2645     case OP_TYPESTAR:
2646     case OP_TYPEMINSTAR:
2647     case OP_TYPEPOSSTAR:
2648     case OP_TYPEQUERY:
2649     case OP_TYPEMINQUERY:
2650     case OP_TYPEPOSQUERY:
2651     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2652     break;
2653 
2654     /* Same for these */
2655 
2656     case OP_TYPEUPTO:
2657     case OP_TYPEMINUPTO:
2658     case OP_TYPEPOSUPTO:
2659     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2660       code += 2;
2661     break;
2662 
2663     /* End of branch */
2664 
2665     case OP_KET:
2666     case OP_KETRMAX:
2667     case OP_KETRMIN:
2668     case OP_KETRPOS:
2669     case OP_ALT:
2670     return TRUE;
2671 
2672     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2673     MINUPTO, and POSUPTO and their caseless and negative versions may be
2674     followed by a multibyte character. */
2675 
2676 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2677     case OP_STAR:
2678     case OP_STARI:
2679     case OP_NOTSTAR:
2680     case OP_NOTSTARI:
2681 
2682     case OP_MINSTAR:
2683     case OP_MINSTARI:
2684     case OP_NOTMINSTAR:
2685     case OP_NOTMINSTARI:
2686 
2687     case OP_POSSTAR:
2688     case OP_POSSTARI:
2689     case OP_NOTPOSSTAR:
2690     case OP_NOTPOSSTARI:
2691 
2692     case OP_QUERY:
2693     case OP_QUERYI:
2694     case OP_NOTQUERY:
2695     case OP_NOTQUERYI:
2696 
2697     case OP_MINQUERY:
2698     case OP_MINQUERYI:
2699     case OP_NOTMINQUERY:
2700     case OP_NOTMINQUERYI:
2701 
2702     case OP_POSQUERY:
2703     case OP_POSQUERYI:
2704     case OP_NOTPOSQUERY:
2705     case OP_NOTPOSQUERYI:
2706 
2707     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2708     break;
2709 
2710     case OP_UPTO:
2711     case OP_UPTOI:
2712     case OP_NOTUPTO:
2713     case OP_NOTUPTOI:
2714 
2715     case OP_MINUPTO:
2716     case OP_MINUPTOI:
2717     case OP_NOTMINUPTO:
2718     case OP_NOTMINUPTOI:
2719 
2720     case OP_POSUPTO:
2721     case OP_POSUPTOI:
2722     case OP_NOTPOSUPTO:
2723     case OP_NOTPOSUPTOI:
2724 
2725     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2726     break;
2727 #endif
2728 
2729     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2730     string. */
2731 
2732     case OP_MARK:
2733     case OP_PRUNE_ARG:
2734     case OP_SKIP_ARG:
2735     case OP_THEN_ARG:
2736     code += code[1];
2737     break;
2738 
2739     /* None of the remaining opcodes are required to match a character. */
2740 
2741     default:
2742     break;
2743     }
2744   }
2745 
2746 return TRUE;
2747 }
2748 
2749 
2750 
2751 /*************************************************
2752 *    Scan compiled regex for non-emptiness       *
2753 *************************************************/
2754 
2755 /* This function is called to check for left recursive calls. We want to check
2756 the current branch of the current pattern to see if it could match the empty
2757 string. If it could, we must look outwards for branches at other levels,
2758 stopping when we pass beyond the bracket which is the subject of the recursion.
2759 This function is called only during the real compile, not during the
2760 pre-compile.
2761 
2762 Arguments:
2763   code        points to start of the recursion
2764   endcode     points to where to stop (current RECURSE item)
2765   bcptr       points to the chain of current (unclosed) branch starts
2766   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2767   cd          pointers to tables etc
2768 
2769 Returns:      TRUE if what is matched could be empty
2770 */
2771 
2772 static BOOL
could_be_empty(const pcre_uchar * code,const pcre_uchar * endcode,branch_chain * bcptr,BOOL utf,compile_data * cd)2773 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2774   branch_chain *bcptr, BOOL utf, compile_data *cd)
2775 {
2776 while (bcptr != NULL && bcptr->current_branch >= code)
2777   {
2778   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2779     return FALSE;
2780   bcptr = bcptr->outer;
2781   }
2782 return TRUE;
2783 }
2784 
2785 
2786 
2787 /*************************************************
2788 *        Base opcode of repeated opcodes         *
2789 *************************************************/
2790 
2791 /* Returns the base opcode for repeated single character type opcodes. If the
2792 opcode is not a repeated character type, it returns with the original value.
2793 
2794 Arguments:  c opcode
2795 Returns:    base opcode for the type
2796 */
2797 
2798 static pcre_uchar
get_repeat_base(pcre_uchar c)2799 get_repeat_base(pcre_uchar c)
2800 {
2801 return (c > OP_TYPEPOSUPTO)? c :
2802        (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2803        (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2804        (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2805        (c >= OP_STARI)?      OP_STARI :
2806                              OP_STAR;
2807 }
2808 
2809 
2810 
#ifdef SUPPORT_UCP
/*************************************************
*        Check a character and a property        *
*************************************************/

/* This function is called by check_auto_possessive() when a property item
is adjacent to a fixed character.

Every case computes whether the character belongs to the property and then
compares that with "negated". The result is therefore TRUE when the character
is outside a non-negated property, or inside a negated one; in other words,
when the property item cannot match the character. An unrecognized property
type gives FALSE.

Arguments:
  c            the character
  ptype        the property type
  pdata        the data for the type
  negated      TRUE if it's a negated property (\P or \p{^)

Returns:       TRUE if auto-possessifying is OK
*/

static BOOL
check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
  BOOL negated)
{
const pcre_uint32 *p;
const ucd_record *prop = GET_UCD(c);

switch(ptype)
  {
  case PT_LAMP:
  return (prop->chartype == ucp_Lu ||
          prop->chartype == ucp_Ll ||
          prop->chartype == ucp_Lt) == negated;

  case PT_GC:
  return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;

  case PT_PC:
  return (pdata == prop->chartype) == negated;

  case PT_SC:
  return (pdata == prop->script) == negated;

  /* These are specials */

  case PT_ALNUM:
  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
          PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;

  /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
  means that Perl space and POSIX space are now identical. PCRE was changed
  at release 8.34. */

  case PT_SPACE:    /* Perl space */
  case PT_PXSPACE:  /* POSIX space */
  switch(c)
    {
    /* The two macros supply the case labels for the horizontal and vertical
    space characters, all of which are members of the property. */

    HSPACE_CASES:
    VSPACE_CASES:
    return negated;

    default:
    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
    }
  break;  /* Control never reaches here */

  case PT_WORD:
  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
          PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
          c == CHAR_UNDERSCORE) == negated;

  /* Scan the caseless set selected by the character's own record. The loop
  has no explicit end test: it assumes the set is in ascending order and ends
  with a value that is not less than any character (presumably NOTACHAR), so
  one of the two tests must eventually succeed. */

  case PT_CLIST:
  p = PRIV(ucd_caseless_sets) + prop->caseset;
  for (;;)
    {
    if (c < *p) return !negated;    /* Passed where c would be: not in set */
    if (c == *p++) return negated;  /* Found in set */
    }
  break;  /* Control never reaches here */
  }

return FALSE;
}
#endif  /* SUPPORT_UCP */
2892 
2893 
2894 
2895 /*************************************************
2896 *        Fill the character property list        *
2897 *************************************************/
2898 
2899 /* Checks whether the code points to an opcode that can take part in auto-
2900 possessification, and if so, fills a list with its properties.
2901 
2902 Arguments:
2903   code        points to start of expression
2904   utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2905   fcc         points to case-flipping table
2906   list        points to output list
2907               list[0] will be filled with the opcode
2908               list[1] will be non-zero if this opcode
2909                 can match an empty character string
2910               list[2..7] depends on the opcode
2911 
2912 Returns:      points to the start of the next opcode if *code is accepted
2913               NULL if *code is not accepted
2914 */
2915 
2916 static const pcre_uchar *
get_chr_property_list(const pcre_uchar * code,BOOL utf,const pcre_uint8 * fcc,pcre_uint32 * list)2917 get_chr_property_list(const pcre_uchar *code, BOOL utf,
2918   const pcre_uint8 *fcc, pcre_uint32 *list)
2919 {
2920 pcre_uchar c = *code;
2921 pcre_uchar base;
2922 const pcre_uchar *end;
2923 pcre_uint32 chr;
2924 
2925 #ifdef SUPPORT_UCP
2926 pcre_uint32 *clist_dest;
2927 const pcre_uint32 *clist_src;
2928 #else
2929 utf = utf;  /* Suppress "unused parameter" compiler warning */
2930 #endif
2931 
2932 list[0] = c;
2933 list[1] = FALSE;
2934 code++;
2935 
2936 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2937   {
2938   base = get_repeat_base(c);
2939   c -= (base - OP_STAR);
2940 
2941   if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2942     code += IMM2_SIZE;
2943 
2944   list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2945 
2946   switch(base)
2947     {
2948     case OP_STAR:
2949     list[0] = OP_CHAR;
2950     break;
2951 
2952     case OP_STARI:
2953     list[0] = OP_CHARI;
2954     break;
2955 
2956     case OP_NOTSTAR:
2957     list[0] = OP_NOT;
2958     break;
2959 
2960     case OP_NOTSTARI:
2961     list[0] = OP_NOTI;
2962     break;
2963 
2964     case OP_TYPESTAR:
2965     list[0] = *code;
2966     code++;
2967     break;
2968     }
2969   c = list[0];
2970   }
2971 
2972 switch(c)
2973   {
2974   case OP_NOT_DIGIT:
2975   case OP_DIGIT:
2976   case OP_NOT_WHITESPACE:
2977   case OP_WHITESPACE:
2978   case OP_NOT_WORDCHAR:
2979   case OP_WORDCHAR:
2980   case OP_ANY:
2981   case OP_ALLANY:
2982   case OP_ANYNL:
2983   case OP_NOT_HSPACE:
2984   case OP_HSPACE:
2985   case OP_NOT_VSPACE:
2986   case OP_VSPACE:
2987   case OP_EXTUNI:
2988   case OP_EODN:
2989   case OP_EOD:
2990   case OP_DOLL:
2991   case OP_DOLLM:
2992   return code;
2993 
2994   case OP_CHAR:
2995   case OP_NOT:
2996   GETCHARINCTEST(chr, code);
2997   list[2] = chr;
2998   list[3] = NOTACHAR;
2999   return code;
3000 
3001   case OP_CHARI:
3002   case OP_NOTI:
3003   list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
3004   GETCHARINCTEST(chr, code);
3005   list[2] = chr;
3006 
3007 #ifdef SUPPORT_UCP
3008   if (chr < 128 || (chr < 256 && !utf))
3009     list[3] = fcc[chr];
3010   else
3011     list[3] = UCD_OTHERCASE(chr);
3012 #elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
3013   list[3] = (chr < 256) ? fcc[chr] : chr;
3014 #else
3015   list[3] = fcc[chr];
3016 #endif
3017 
3018   /* The othercase might be the same value. */
3019 
3020   if (chr == list[3])
3021     list[3] = NOTACHAR;
3022   else
3023     list[4] = NOTACHAR;
3024   return code;
3025 
3026 #ifdef SUPPORT_UCP
3027   case OP_PROP:
3028   case OP_NOTPROP:
3029   if (code[0] != PT_CLIST)
3030     {
3031     list[2] = code[0];
3032     list[3] = code[1];
3033     return code + 2;
3034     }
3035 
3036   /* Convert only if we have enough space. */
3037 
3038   clist_src = PRIV(ucd_caseless_sets) + code[1];
3039   clist_dest = list + 2;
3040   code += 2;
3041 
3042   do {
3043      if (clist_dest >= list + 8)
3044        {
3045        /* Early return if there is not enough space. This should never
3046        happen, since all clists are shorter than 5 character now. */
3047        list[2] = code[0];
3048        list[3] = code[1];
3049        return code;
3050        }
3051      *clist_dest++ = *clist_src;
3052      }
3053   while(*clist_src++ != NOTACHAR);
3054 
3055   /* All characters are stored. The terminating NOTACHAR
3056   is copied form the clist itself. */
3057 
3058   list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3059   return code;
3060 #endif
3061 
3062   case OP_NCLASS:
3063   case OP_CLASS:
3064 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3065   case OP_XCLASS:
3066   if (c == OP_XCLASS)
3067     end = code + GET(code, 0) - 1;
3068   else
3069 #endif
3070     end = code + 32 / sizeof(pcre_uchar);
3071 
3072   switch(*end)
3073     {
3074     case OP_CRSTAR:
3075     case OP_CRMINSTAR:
3076     case OP_CRQUERY:
3077     case OP_CRMINQUERY:
3078     case OP_CRPOSSTAR:
3079     case OP_CRPOSQUERY:
3080     list[1] = TRUE;
3081     end++;
3082     break;
3083 
3084     case OP_CRPLUS:
3085     case OP_CRMINPLUS:
3086     case OP_CRPOSPLUS:
3087     end++;
3088     break;
3089 
3090     case OP_CRRANGE:
3091     case OP_CRMINRANGE:
3092     case OP_CRPOSRANGE:
3093     list[1] = (GET2(end, 1) == 0);
3094     end += 1 + 2 * IMM2_SIZE;
3095     break;
3096     }
3097   list[2] = (pcre_uint32)(end - code);
3098   return end;
3099   }
3100 return NULL;    /* Opcode not accepted */
3101 }
3102 
3103 
3104 
3105 /*************************************************
3106 *    Scan further character sets for match       *
3107 *************************************************/
3108 
3109 /* Checks whether the base and the current opcode have a common character, in
3110 which case the base cannot be possessified.
3111 
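Arguments:
  code        points to the byte code
  utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
  cd          static compile data
  base_list   the data list of the base opcode
  base_end    points to the end of the base opcode (presumably the pointer
                that get_chr_property_list() returned for it); when the base
                is a class, its data starts at base_end - base_list[2]
  rec_limit   points to the number of calls that are still allowed; it is
                decremented on every call, and FALSE is returned at once if
                it is already zero

Returns:      TRUE if the auto-possessification is possible
*/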
3120 
3121 static BOOL
compare_opcodes(const pcre_uchar * code,BOOL utf,const compile_data * cd,const pcre_uint32 * base_list,const pcre_uchar * base_end,int * rec_limit)3122 compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3123   const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3124 {
3125 pcre_uchar c;
3126 pcre_uint32 list[8];
3127 const pcre_uint32 *chr_ptr;
3128 const pcre_uint32 *ochr_ptr;
3129 const pcre_uint32 *list_ptr;
3130 const pcre_uchar *next_code;
3131 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3132 const pcre_uchar *xclass_flags;
3133 #endif
3134 const pcre_uint8 *class_bitset;
3135 const pcre_uint8 *set1, *set2, *set_end;
3136 pcre_uint32 chr;
3137 BOOL accepted, invert_bits;
3138 BOOL entered_a_group = FALSE;
3139 
3140 if (*rec_limit == 0) return FALSE;
3141 --(*rec_limit);
3142 
3143 /* Note: the base_list[1] contains whether the current opcode has greedy
3144 (represented by a non-zero value) quantifier. This is a different from
3145 other character type lists, which stores here that the character iterator
3146 matches to an empty string (also represented by a non-zero value). */
3147 
3148 for(;;)
3149   {
3150   /* All operations move the code pointer forward.
3151   Therefore infinite recursions are not possible. */
3152 
3153   c = *code;
3154 
3155   /* Skip over callouts */
3156 
3157   if (c == OP_CALLOUT)
3158     {
3159     code += PRIV(OP_lengths)[c];
3160     continue;
3161     }
3162 
3163   if (c == OP_ALT)
3164     {
3165     do code += GET(code, 1); while (*code == OP_ALT);
3166     c = *code;
3167     }
3168 
3169   switch(c)
3170     {
3171     case OP_END:
3172     case OP_KETRPOS:
3173     /* TRUE only in greedy case. The non-greedy case could be replaced by
3174     an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3175     uses more memory, which we cannot get at this stage.) */
3176 
3177     return base_list[1] != 0;
3178 
3179     case OP_KET:
3180     /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3181     it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3182     cannot be converted to a possessive form. */
3183 
3184     if (base_list[1] == 0) return FALSE;
3185 
3186     switch(*(code - GET(code, 1)))
3187       {
3188       case OP_ASSERT:
3189       case OP_ASSERT_NOT:
3190       case OP_ASSERTBACK:
3191       case OP_ASSERTBACK_NOT:
3192       case OP_ONCE:
3193       case OP_ONCE_NC:
3194       /* Atomic sub-patterns and assertions can always auto-possessify their
3195       last iterator. However, if the group was entered as a result of checking
3196       a previous iterator, this is not possible. */
3197 
3198       return !entered_a_group;
3199       }
3200 
3201     code += PRIV(OP_lengths)[c];
3202     continue;
3203 
3204     case OP_ONCE:
3205     case OP_ONCE_NC:
3206     case OP_BRA:
3207     case OP_CBRA:
3208     next_code = code + GET(code, 1);
3209     code += PRIV(OP_lengths)[c];
3210 
3211     while (*next_code == OP_ALT)
3212       {
3213       if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3214         return FALSE;
3215       code = next_code + 1 + LINK_SIZE;
3216       next_code += GET(next_code, 1);
3217       }
3218 
3219     entered_a_group = TRUE;
3220     continue;
3221 
3222     case OP_BRAZERO:
3223     case OP_BRAMINZERO:
3224 
3225     next_code = code + 1;
3226     if (*next_code != OP_BRA && *next_code != OP_CBRA
3227         && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3228 
3229     do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3230 
3231     /* The bracket content will be checked by the
3232     OP_BRA/OP_CBRA case above. */
3233     next_code += 1 + LINK_SIZE;
3234     if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3235       return FALSE;
3236 
3237     code += PRIV(OP_lengths)[c];
3238     continue;
3239 
3240     default:
3241     break;
3242     }
3243 
3244   /* Check for a supported opcode, and load its properties. */
3245 
3246   code = get_chr_property_list(code, utf, cd->fcc, list);
3247   if (code == NULL) return FALSE;    /* Unsupported */
3248 
3249   /* If either opcode is a small character list, set pointers for comparing
3250   characters from that list with another list, or with a property. */
3251 
3252   if (base_list[0] == OP_CHAR)
3253     {
3254     chr_ptr = base_list + 2;
3255     list_ptr = list;
3256     }
3257   else if (list[0] == OP_CHAR)
3258     {
3259     chr_ptr = list + 2;
3260     list_ptr = base_list;
3261     }
3262 
3263   /* Character bitsets can also be compared to certain opcodes. */
3264 
3265   else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3266 #ifdef COMPILE_PCRE8
3267       /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3268       || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3269 #endif
3270       )
3271     {
3272 #ifdef COMPILE_PCRE8
3273     if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3274 #else
3275     if (base_list[0] == OP_CLASS)
3276 #endif
3277       {
3278       set1 = (pcre_uint8 *)(base_end - base_list[2]);
3279       list_ptr = list;
3280       }
3281     else
3282       {
3283       set1 = (pcre_uint8 *)(code - list[2]);
3284       list_ptr = base_list;
3285       }
3286 
3287     invert_bits = FALSE;
3288     switch(list_ptr[0])
3289       {
3290       case OP_CLASS:
3291       case OP_NCLASS:
3292       set2 = (pcre_uint8 *)
3293         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3294       break;
3295 
3296 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3297       case OP_XCLASS:
3298       xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3299       if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3300       if ((*xclass_flags & XCL_MAP) == 0)
3301         {
3302         /* No bits are set for characters < 256. */
3303         if (list[1] == 0) return (*xclass_flags & XCL_NOT) == 0;
3304         /* Might be an empty repeat. */
3305         continue;
3306         }
3307       set2 = (pcre_uint8 *)(xclass_flags + 1);
3308       break;
3309 #endif
3310 
3311       case OP_NOT_DIGIT:
3312       invert_bits = TRUE;
3313       /* Fall through */
3314       case OP_DIGIT:
3315       set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3316       break;
3317 
3318       case OP_NOT_WHITESPACE:
3319       invert_bits = TRUE;
3320       /* Fall through */
3321       case OP_WHITESPACE:
3322       set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3323       break;
3324 
3325       case OP_NOT_WORDCHAR:
3326       invert_bits = TRUE;
3327       /* Fall through */
3328       case OP_WORDCHAR:
3329       set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3330       break;
3331 
3332       default:
3333       return FALSE;
3334       }
3335 
3336     /* Because the sets are unaligned, we need
3337     to perform byte comparison here. */
3338     set_end = set1 + 32;
3339     if (invert_bits)
3340       {
3341       do
3342         {
3343         if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3344         }
3345       while (set1 < set_end);
3346       }
3347     else
3348       {
3349       do
3350         {
3351         if ((*set1++ & *set2++) != 0) return FALSE;
3352         }
3353       while (set1 < set_end);
3354       }
3355 
3356     if (list[1] == 0) return TRUE;
3357     /* Might be an empty repeat. */
3358     continue;
3359     }
3360 
3361   /* Some property combinations also acceptable. Unicode property opcodes are
3362   processed specially; the rest can be handled with a lookup table. */
3363 
3364   else
3365     {
3366     pcre_uint32 leftop, rightop;
3367 
3368     leftop = base_list[0];
3369     rightop = list[0];
3370 
3371 #ifdef SUPPORT_UCP
3372     accepted = FALSE; /* Always set in non-unicode case. */
3373     if (leftop == OP_PROP || leftop == OP_NOTPROP)
3374       {
3375       if (rightop == OP_EOD)
3376         accepted = TRUE;
3377       else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3378         {
3379         int n;
3380         const pcre_uint8 *p;
3381         BOOL same = leftop == rightop;
3382         BOOL lisprop = leftop == OP_PROP;
3383         BOOL risprop = rightop == OP_PROP;
3384         BOOL bothprop = lisprop && risprop;
3385 
3386         /* There's a table that specifies how each combination is to be
3387         processed:
3388           0   Always return FALSE (never auto-possessify)
3389           1   Character groups are distinct (possessify if both are OP_PROP)
3390           2   Check character categories in the same group (general or particular)
3391           3   Return TRUE if the two opcodes are not the same
3392           ... see comments below
3393         */
3394 
3395         n = propposstab[base_list[2]][list[2]];
3396         switch(n)
3397           {
3398           case 0: break;
3399           case 1: accepted = bothprop; break;
3400           case 2: accepted = (base_list[3] == list[3]) != same; break;
3401           case 3: accepted = !same; break;
3402 
3403           case 4:  /* Left general category, right particular category */
3404           accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3405           break;
3406 
3407           case 5:  /* Right general category, left particular category */
3408           accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3409           break;
3410 
3411           /* This code is logically tricky. Think hard before fiddling with it.
3412           The posspropstab table has four entries per row. Each row relates to
3413           one of PCRE's special properties such as ALNUM or SPACE or WORD.
3414           Only WORD actually needs all four entries, but using repeats for the
3415           others means they can all use the same code below.
3416 
3417           The first two entries in each row are Unicode general categories, and
3418           apply always, because all the characters they include are part of the
3419           PCRE character set. The third and fourth entries are a general and a
3420           particular category, respectively, that include one or more relevant
3421           characters. One or the other is used, depending on whether the check
3422           is for a general or a particular category. However, in both cases the
3423           category contains more characters than the specials that are defined
3424           for the property being tested against. Therefore, it cannot be used
3425           in a NOTPROP case.
3426 
3427           Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3428           Underscore is covered by ucp_P or ucp_Po. */
3429 
3430           case 6:  /* Left alphanum vs right general category */
3431           case 7:  /* Left space vs right general category */
3432           case 8:  /* Left word vs right general category */
3433           p = posspropstab[n-6];
3434           accepted = risprop && lisprop ==
3435             (list[3] != p[0] &&
3436              list[3] != p[1] &&
3437             (list[3] != p[2] || !lisprop));
3438           break;
3439 
3440           case 9:   /* Right alphanum vs left general category */
3441           case 10:  /* Right space vs left general category */
3442           case 11:  /* Right word vs left general category */
3443           p = posspropstab[n-9];
3444           accepted = lisprop && risprop ==
3445             (base_list[3] != p[0] &&
3446              base_list[3] != p[1] &&
3447             (base_list[3] != p[2] || !risprop));
3448           break;
3449 
3450           case 12:  /* Left alphanum vs right particular category */
3451           case 13:  /* Left space vs right particular category */
3452           case 14:  /* Left word vs right particular category */
3453           p = posspropstab[n-12];
3454           accepted = risprop && lisprop ==
3455             (catposstab[p[0]][list[3]] &&
3456              catposstab[p[1]][list[3]] &&
3457             (list[3] != p[3] || !lisprop));
3458           break;
3459 
3460           case 15:  /* Right alphanum vs left particular category */
3461           case 16:  /* Right space vs left particular category */
3462           case 17:  /* Right word vs left particular category */
3463           p = posspropstab[n-15];
3464           accepted = lisprop && risprop ==
3465             (catposstab[p[0]][base_list[3]] &&
3466              catposstab[p[1]][base_list[3]] &&
3467             (base_list[3] != p[3] || !risprop));
3468           break;
3469           }
3470         }
3471       }
3472 
3473     else
3474 #endif  /* SUPPORT_UCP */
3475 
3476     accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3477            rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3478            autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3479 
3480     if (!accepted) return FALSE;
3481 
3482     if (list[1] == 0) return TRUE;
3483     /* Might be an empty repeat. */
3484     continue;
3485     }
3486 
3487   /* Control reaches here only if one of the items is a small character list.
3488   All characters are checked against the other side. */
3489 
3490   do
3491     {
3492     chr = *chr_ptr;
3493 
3494     switch(list_ptr[0])
3495       {
3496       case OP_CHAR:
3497       ochr_ptr = list_ptr + 2;
3498       do
3499         {
3500         if (chr == *ochr_ptr) return FALSE;
3501         ochr_ptr++;
3502         }
3503       while(*ochr_ptr != NOTACHAR);
3504       break;
3505 
3506       case OP_NOT:
3507       ochr_ptr = list_ptr + 2;
3508       do
3509         {
3510         if (chr == *ochr_ptr)
3511           break;
3512         ochr_ptr++;
3513         }
3514       while(*ochr_ptr != NOTACHAR);
3515       if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3516       break;
3517 
3518       /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3519       set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3520 
3521       case OP_DIGIT:
3522       if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3523       break;
3524 
3525       case OP_NOT_DIGIT:
3526       if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3527       break;
3528 
3529       case OP_WHITESPACE:
3530       if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3531       break;
3532 
3533       case OP_NOT_WHITESPACE:
3534       if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3535       break;
3536 
3537       case OP_WORDCHAR:
3538       if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3539       break;
3540 
3541       case OP_NOT_WORDCHAR:
3542       if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3543       break;
3544 
3545       case OP_HSPACE:
3546       switch(chr)
3547         {
3548         HSPACE_CASES: return FALSE;
3549         default: break;
3550         }
3551       break;
3552 
3553       case OP_NOT_HSPACE:
3554       switch(chr)
3555         {
3556         HSPACE_CASES: break;
3557         default: return FALSE;
3558         }
3559       break;
3560 
3561       case OP_ANYNL:
3562       case OP_VSPACE:
3563       switch(chr)
3564         {
3565         VSPACE_CASES: return FALSE;
3566         default: break;
3567         }
3568       break;
3569 
3570       case OP_NOT_VSPACE:
3571       switch(chr)
3572         {
3573         VSPACE_CASES: break;
3574         default: return FALSE;
3575         }
3576       break;
3577 
3578       case OP_DOLL:
3579       case OP_EODN:
3580       switch (chr)
3581         {
3582         case CHAR_CR:
3583         case CHAR_LF:
3584         case CHAR_VT:
3585         case CHAR_FF:
3586         case CHAR_NEL:
3587 #ifndef EBCDIC
3588         case 0x2028:
3589         case 0x2029:
3590 #endif  /* Not EBCDIC */
3591         return FALSE;
3592         }
3593       break;
3594 
3595       case OP_EOD:    /* Can always possessify before \z */
3596       break;
3597 
3598 #ifdef SUPPORT_UCP
3599       case OP_PROP:
3600       case OP_NOTPROP:
3601       if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3602             list_ptr[0] == OP_NOTPROP))
3603         return FALSE;
3604       break;
3605 #endif
3606 
3607       case OP_NCLASS:
3608       if (chr > 255) return FALSE;
3609       /* Fall through */
3610 
3611       case OP_CLASS:
3612       if (chr > 255) break;
3613       class_bitset = (pcre_uint8 *)
3614         ((list_ptr == list ? code : base_end) - list_ptr[2]);
3615       if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3616       break;
3617 
3618 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3619       case OP_XCLASS:
3620       if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3621           list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3622       break;
3623 #endif
3624 
3625       default:
3626       return FALSE;
3627       }
3628 
3629     chr_ptr++;
3630     }
3631   while(*chr_ptr != NOTACHAR);
3632 
3633   /* At least one character must be matched from this opcode. */
3634 
3635   if (list[1] == 0) return TRUE;
3636   }
3637 
3638 /* Control never reaches here. There used to be a fail-safe return FALSE; here,
3639 but some compilers complain about an unreachable statement. */
3640 
3641 }
3642 
3643 
3644 
3645 /*************************************************
3646 *    Scan compiled regex for auto-possession     *
3647 *************************************************/
3648 
3649 /* Replaces single character iterations with their possessive alternatives
3650 if appropriate. This function modifies the compiled opcode!
3651 
3652 Arguments:
3653   code        points to start of the byte code
3654   utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3655   cd          static compile data
3656 
3657 Returns:      nothing
3658 */
3659 
3660 static void
auto_possessify(pcre_uchar * code,BOOL utf,const compile_data * cd)3661 auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3662 {
3663 register pcre_uchar c;
3664 const pcre_uchar *end;
3665 pcre_uchar *repeat_opcode;
3666 pcre_uint32 list[8];
3667 int rec_limit;
3668 
3669 for (;;)
3670   {
3671   c = *code;
3672 
3673   /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
3674   it may compile without complaining, but may get into a loop here if the code
3675   pointer points to a bad value. This is, of course a documentated possibility,
3676   when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
3677   just give up on this optimization. */
3678 
3679   if (c >= OP_TABLE_LENGTH) return;
3680 
3681   if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3682     {
3683     c -= get_repeat_base(c) - OP_STAR;
3684     end = (c <= OP_MINUPTO) ?
3685       get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3686     list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3687 
3688     rec_limit = 1000;
3689     if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
3690       {
3691       switch(c)
3692         {
3693         case OP_STAR:
3694         *code += OP_POSSTAR - OP_STAR;
3695         break;
3696 
3697         case OP_MINSTAR:
3698         *code += OP_POSSTAR - OP_MINSTAR;
3699         break;
3700 
3701         case OP_PLUS:
3702         *code += OP_POSPLUS - OP_PLUS;
3703         break;
3704 
3705         case OP_MINPLUS:
3706         *code += OP_POSPLUS - OP_MINPLUS;
3707         break;
3708 
3709         case OP_QUERY:
3710         *code += OP_POSQUERY - OP_QUERY;
3711         break;
3712 
3713         case OP_MINQUERY:
3714         *code += OP_POSQUERY - OP_MINQUERY;
3715         break;
3716 
3717         case OP_UPTO:
3718         *code += OP_POSUPTO - OP_UPTO;
3719         break;
3720 
3721         case OP_MINUPTO:
3722         *code += OP_POSUPTO - OP_MINUPTO;
3723         break;
3724         }
3725       }
3726     c = *code;
3727     }
3728   else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3729     {
3730 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3731     if (c == OP_XCLASS)
3732       repeat_opcode = code + GET(code, 1);
3733     else
3734 #endif
3735       repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3736 
3737     c = *repeat_opcode;
3738     if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3739       {
3740       /* end must not be NULL. */
3741       end = get_chr_property_list(code, utf, cd->fcc, list);
3742 
3743       list[1] = (c & 1) == 0;
3744 
3745       rec_limit = 1000;
3746       if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
3747         {
3748         switch (c)
3749           {
3750           case OP_CRSTAR:
3751           case OP_CRMINSTAR:
3752           *repeat_opcode = OP_CRPOSSTAR;
3753           break;
3754 
3755           case OP_CRPLUS:
3756           case OP_CRMINPLUS:
3757           *repeat_opcode = OP_CRPOSPLUS;
3758           break;
3759 
3760           case OP_CRQUERY:
3761           case OP_CRMINQUERY:
3762           *repeat_opcode = OP_CRPOSQUERY;
3763           break;
3764 
3765           case OP_CRRANGE:
3766           case OP_CRMINRANGE:
3767           *repeat_opcode = OP_CRPOSRANGE;
3768           break;
3769           }
3770         }
3771       }
3772     c = *code;
3773     }
3774 
3775   switch(c)
3776     {
3777     case OP_END:
3778     return;
3779 
3780     case OP_TYPESTAR:
3781     case OP_TYPEMINSTAR:
3782     case OP_TYPEPLUS:
3783     case OP_TYPEMINPLUS:
3784     case OP_TYPEQUERY:
3785     case OP_TYPEMINQUERY:
3786     case OP_TYPEPOSSTAR:
3787     case OP_TYPEPOSPLUS:
3788     case OP_TYPEPOSQUERY:
3789     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3790     break;
3791 
3792     case OP_TYPEUPTO:
3793     case OP_TYPEMINUPTO:
3794     case OP_TYPEEXACT:
3795     case OP_TYPEPOSUPTO:
3796     if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3797       code += 2;
3798     break;
3799 
3800 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3801     case OP_XCLASS:
3802     code += GET(code, 1);
3803     break;
3804 #endif
3805 
3806     case OP_MARK:
3807     case OP_PRUNE_ARG:
3808     case OP_SKIP_ARG:
3809     case OP_THEN_ARG:
3810     code += code[1];
3811     break;
3812     }
3813 
3814   /* Add in the fixed length from the table */
3815 
3816   code += PRIV(OP_lengths)[c];
3817 
3818   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3819   a multi-byte character. The length in the table is a minimum, so we have to
3820   arrange to skip the extra bytes. */
3821 
3822 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3823   if (utf) switch(c)
3824     {
3825     case OP_CHAR:
3826     case OP_CHARI:
3827     case OP_NOT:
3828     case OP_NOTI:
3829     case OP_STAR:
3830     case OP_MINSTAR:
3831     case OP_PLUS:
3832     case OP_MINPLUS:
3833     case OP_QUERY:
3834     case OP_MINQUERY:
3835     case OP_UPTO:
3836     case OP_MINUPTO:
3837     case OP_EXACT:
3838     case OP_POSSTAR:
3839     case OP_POSPLUS:
3840     case OP_POSQUERY:
3841     case OP_POSUPTO:
3842     case OP_STARI:
3843     case OP_MINSTARI:
3844     case OP_PLUSI:
3845     case OP_MINPLUSI:
3846     case OP_QUERYI:
3847     case OP_MINQUERYI:
3848     case OP_UPTOI:
3849     case OP_MINUPTOI:
3850     case OP_EXACTI:
3851     case OP_POSSTARI:
3852     case OP_POSPLUSI:
3853     case OP_POSQUERYI:
3854     case OP_POSUPTOI:
3855     case OP_NOTSTAR:
3856     case OP_NOTMINSTAR:
3857     case OP_NOTPLUS:
3858     case OP_NOTMINPLUS:
3859     case OP_NOTQUERY:
3860     case OP_NOTMINQUERY:
3861     case OP_NOTUPTO:
3862     case OP_NOTMINUPTO:
3863     case OP_NOTEXACT:
3864     case OP_NOTPOSSTAR:
3865     case OP_NOTPOSPLUS:
3866     case OP_NOTPOSQUERY:
3867     case OP_NOTPOSUPTO:
3868     case OP_NOTSTARI:
3869     case OP_NOTMINSTARI:
3870     case OP_NOTPLUSI:
3871     case OP_NOTMINPLUSI:
3872     case OP_NOTQUERYI:
3873     case OP_NOTMINQUERYI:
3874     case OP_NOTUPTOI:
3875     case OP_NOTMINUPTOI:
3876     case OP_NOTEXACTI:
3877     case OP_NOTPOSSTARI:
3878     case OP_NOTPOSPLUSI:
3879     case OP_NOTPOSQUERYI:
3880     case OP_NOTPOSUPTOI:
3881     if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3882     break;
3883     }
3884 #else
3885   (void)(utf);  /* Keep compiler happy by referencing function argument */
3886 #endif
3887   }
3888 }
3889 
3890 
3891 
3892 /*************************************************
3893 *           Check for POSIX class syntax         *
3894 *************************************************/
3895 
3896 /* This function is called when the sequence "[:" or "[." or "[=" is
3897 encountered in a character class. It checks whether this is followed by a
3898 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3899 reach an unescaped ']' without the special preceding character, return FALSE.
3900 
3901 Originally, this function only recognized a sequence of letters between the
3902 terminators, but it seems that Perl recognizes any sequence of characters,
3903 though of course unknown POSIX names are subsequently rejected. Perl gives an
3904 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3905 didn't consider this to be a POSIX class. Likewise for [:1234:].
3906 
3907 The problem in trying to be exactly like Perl is in the handling of escapes. We
3908 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3909 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3910 below handles the special cases \\ and \], but does not try to do any other
3911 escape processing. This makes it different from Perl for cases such as
3912 [:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
3913 not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
3914 when Perl does, I think.
3915 
3916 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3917 It seems that the appearance of a nested POSIX class supersedes an apparent
3918 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3919 a digit.
3920 
3921 In Perl, unescaped square brackets may also appear as part of class names. For
3922 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3923 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3924 seem right at all. PCRE does not allow closing square brackets in POSIX class
3925 names.
3926 
3927 Arguments:
3928   ptr      pointer to the initial [
3929   endptr   where to return the end pointer
3930 
3931 Returns:   TRUE or FALSE
3932 */
3933 
3934 static BOOL
check_posix_syntax(const pcre_uchar * ptr,const pcre_uchar ** endptr)3935 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3936 {
3937 pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
3938 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3939 for (++ptr; *ptr != CHAR_NULL; ptr++)
3940   {
3941   if (*ptr == CHAR_BACKSLASH &&
3942       (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
3943        ptr[1] == CHAR_BACKSLASH))
3944     ptr++;
3945   else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
3946             *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3947   else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3948     {
3949     *endptr = ptr;
3950     return TRUE;
3951     }
3952   }
3953 return FALSE;
3954 }
3955 
3956 
3957 
3958 
3959 /*************************************************
3960 *          Check POSIX class name                *
3961 *************************************************/
3962 
3963 /* This function is called to check the name given in a POSIX-style class entry
3964 such as [:alnum:].
3965 
3966 Arguments:
3967   ptr        points to the first letter
3968   len        the length of the name
3969 
3970 Returns:     a value representing the name, or -1 if unknown
3971 */
3972 
3973 static int
check_posix_name(const pcre_uchar * ptr,int len)3974 check_posix_name(const pcre_uchar *ptr, int len)
3975 {
3976 const char *pn = posix_names;
3977 register int yield = 0;
3978 while (posix_name_lengths[yield] != 0)
3979   {
3980   if (len == posix_name_lengths[yield] &&
3981     STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3982   pn += posix_name_lengths[yield] + 1;
3983   yield++;
3984   }
3985 return -1;
3986 }
3987 
3988 
3989 /*************************************************
3990 *    Adjust OP_RECURSE items in repeated group   *
3991 *************************************************/
3992 
3993 /* OP_RECURSE items contain an offset from the start of the regex to the group
3994 that is referenced. This means that groups can be replicated for fixed
3995 repetition simply by copying (because the recursion is allowed to refer to
3996 earlier groups that are outside the current group). However, when a group is
3997 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3998 inserted before it, after it has been compiled. This means that any OP_RECURSE
3999 items within it that refer to the group itself or any contained groups have to
4000 have their offsets adjusted. That one of the jobs of this function. Before it
4001 is called, the partially compiled regex must be temporarily terminated with
4002 OP_END.
4003 
4004 This function has been extended to cope with forward references for recursions
4005 and subroutine calls. It must check the list of such references for the
4006 group we are dealing with. If it finds that one of the recursions in the
4007 current group is on this list, it does not adjust the value in the reference
4008 (which is a group number). After the group has been scanned, all the offsets in
4009 the forward reference list for the group are adjusted.
4010 
4011 Arguments:
4012   group      points to the start of the group
4013   adjust     the amount by which the group is to be moved
4014   utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
4015   cd         contains pointers to tables etc.
4016   save_hwm_offset   the hwm forward reference offset at the start of the group
4017 
4018 Returns:     nothing
4019 */
4020 
4021 static void
adjust_recurse(pcre_uchar * group,int adjust,BOOL utf,compile_data * cd,size_t save_hwm_offset)4022 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4023   size_t save_hwm_offset)
4024 {
4025 int offset;
4026 pcre_uchar *hc;
4027 pcre_uchar *ptr = group;
4028 
4029 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4030   {
4031   for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4032        hc += LINK_SIZE)
4033     {
4034     offset = (int)GET(hc, 0);
4035     if (cd->start_code + offset == ptr + 1) break;
4036     }
4037 
4038   /* If we have not found this recursion on the forward reference list, adjust
4039   the recursion's offset if it's after the start of this group. */
4040 
4041   if (hc >= cd->hwm)
4042     {
4043     offset = (int)GET(ptr, 1);
4044     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
4045     }
4046 
4047   ptr += 1 + LINK_SIZE;
4048   }
4049 
4050 /* Now adjust all forward reference offsets for the group. */
4051 
4052 for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4053      hc += LINK_SIZE)
4054   {
4055   offset = (int)GET(hc, 0);
4056   PUT(hc, 0, offset + adjust);
4057   }
4058 }
4059 
4060 
4061 
4062 /*************************************************
4063 *        Insert an automatic callout point       *
4064 *************************************************/
4065 
4066 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
4067 callout points before each pattern item.
4068 
4069 Arguments:
4070   code           current code pointer
4071   ptr            current pattern pointer
4072   cd             pointers to tables etc
4073 
4074 Returns:         new code pointer
4075 */
4076 
4077 static pcre_uchar *
auto_callout(pcre_uchar * code,const pcre_uchar * ptr,compile_data * cd)4078 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4079 {
4080 *code++ = OP_CALLOUT;
4081 *code++ = 255;
4082 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
4083 PUT(code, LINK_SIZE, 0);                       /* Default length */
4084 return code + 2 * LINK_SIZE;
4085 }
4086 
4087 
4088 
4089 /*************************************************
4090 *         Complete a callout item                *
4091 *************************************************/
4092 
4093 /* A callout item contains the length of the next item in the pattern, which
4094 we can't fill in till after we have reached the relevant point. This is used
4095 for both automatic and manual callouts.
4096 
4097 Arguments:
4098   previous_callout   points to previous callout item
4099   ptr                current pattern pointer
4100   cd                 pointers to tables etc
4101 
4102 Returns:             nothing
4103 */
4104 
4105 static void
complete_callout(pcre_uchar * previous_callout,const pcre_uchar * ptr,compile_data * cd)4106 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4107 {
4108 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4109 PUT(previous_callout, 2 + LINK_SIZE, length);
4110 }
4111 
4112 
4113 
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for ranges of
characters in the "other" case. Each call returns the next one, updating the
start address. A character with multiple other cases is returned on its own
with a special return value.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        -1 when no more
               0 when a range is returned
              >0 the CASESET offset for char with multiple other cases
                in this case, ocptr contains the original

When -1 is returned, none of cptr, ocptr and odptr is written. When a value
greater than zero is returned, odptr is not written.
*/

static int
get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
  pcre_uint32 *odptr)
{
pcre_uint32 c, othercase, next;
unsigned int co;

/* Find the first character that has an other case. If it has multiple other
cases, return its case offset value. */

for (c = *cptr; c <= d; c++)
  {
  if ((co = UCD_CASESET(c)) != 0)
    {
    *ocptr = c++;   /* Character that has the set */
    *cptr = c;      /* Rest of input range */
    return (int)co;
    }
  if ((othercase = UCD_OTHERCASE(c)) != c) break;
  }

if (c > d) return -1;  /* Reached end of range */

/* Found a character that has a single other case. Search for the end of the
range, which is either the end of the input range, or a character that has zero
or more than one other cases. */

*ocptr = othercase;
next = othercase + 1;

/* The run continues only while each successive character maps to the
successor of the previous character's other case. */

for (++c; c <= d; c++)
  {
  if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
  next++;
  }

*odptr = next - 1;     /* End of othercase range */
*cptr = c;             /* Rest of input range */
return 0;
}
#endif  /* SUPPORT_UCP */
4178 
4179 
4180 
4181 /*************************************************
4182 *        Add a character or range to a class     *
4183 *************************************************/
4184 
4185 /* This function packages up the logic of adding a character or range of
4186 characters to a class. The character values in the arguments will be within the
4187 valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4188 mutually recursive with the function immediately below.
4189 
4190 Arguments:
4191   classbits     the bit map for characters < 256
4192   uchardptr     points to the pointer for extra data
4193   options       the options word
4194   cd            contains pointers to tables etc.
4195   start         start of range character
4196   end           end of range character
4197 
4198 Returns:        the number of < 256 characters added
4199                 the pointer to extra data is updated
4200 */
4201 
4202 static int
add_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,pcre_uint32 start,pcre_uint32 end)4203 add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4204   compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4205 {
4206 pcre_uint32 c;
4207 pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4208 int n8 = 0;
4209 
4210 /* If caseless matching is required, scan the range and process alternate
4211 cases. In Unicode, there are 8-bit characters that have alternate cases that
4212 are greater than 255 and vice-versa. Sometimes we can just extend the original
4213 range. */
4214 
4215 if ((options & PCRE_CASELESS) != 0)
4216   {
4217 #ifdef SUPPORT_UCP
4218   if ((options & PCRE_UTF8) != 0)
4219     {
4220     int rc;
4221     pcre_uint32 oc, od;
4222 
4223     options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
4224     c = start;
4225 
4226     while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4227       {
4228       /* Handle a single character that has more than one other case. */
4229 
4230       if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4231         PRIV(ucd_caseless_sets) + rc, oc);
4232 
4233       /* Do nothing if the other case range is within the original range. */
4234 
4235       else if (oc >= start && od <= end) continue;
4236 
4237       /* Extend the original range if there is overlap, noting that if oc < c, we
4238       can't have od > end because a subrange is always shorter than the basic
4239       range. Otherwise, use a recursive call to add the additional range. */
4240 
4241       else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4242       else if (od > end && oc <= end + 1)
4243         {
4244         end = od;       /* Extend upwards */
4245         if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4246         }
4247       else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4248       }
4249     }
4250   else
4251 #endif  /* SUPPORT_UCP */
4252 
4253   /* Not UTF-mode, or no UCP */
4254 
4255   for (c = start; c <= classbits_end; c++)
4256     {
4257     SETBIT(classbits, cd->fcc[c]);
4258     n8++;
4259     }
4260   }
4261 
4262 /* Now handle the original range. Adjust the final value according to the bit
4263 length - this means that the same lists of (e.g.) horizontal spaces can be used
4264 in all cases. */
4265 
4266 #if defined COMPILE_PCRE8
4267 #ifdef SUPPORT_UTF
4268   if ((options & PCRE_UTF8) == 0)
4269 #endif
4270   if (end > 0xff) end = 0xff;
4271 
4272 #elif defined COMPILE_PCRE16
4273 #ifdef SUPPORT_UTF
4274   if ((options & PCRE_UTF16) == 0)
4275 #endif
4276   if (end > 0xffff) end = 0xffff;
4277 
4278 #endif /* COMPILE_PCRE[8|16] */
4279 
4280 /* Use the bitmap for characters < 256. Otherwise use extra data.*/
4281 
4282 for (c = start; c <= classbits_end; c++)
4283   {
4284   /* Regardless of start, c will always be <= 255. */
4285   SETBIT(classbits, c);
4286   n8++;
4287   }
4288 
4289 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4290 if (start <= 0xff) start = 0xff + 1;
4291 
4292 if (end >= start)
4293   {
4294   pcre_uchar *uchardata = *uchardptr;
4295 #ifdef SUPPORT_UTF
4296   if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
4297     {
4298     if (start < end)
4299       {
4300       *uchardata++ = XCL_RANGE;
4301       uchardata += PRIV(ord2utf)(start, uchardata);
4302       uchardata += PRIV(ord2utf)(end, uchardata);
4303       }
4304     else if (start == end)
4305       {
4306       *uchardata++ = XCL_SINGLE;
4307       uchardata += PRIV(ord2utf)(start, uchardata);
4308       }
4309     }
4310   else
4311 #endif  /* SUPPORT_UTF */
4312 
4313   /* Without UTF support, character values are constrained by the bit length,
4314   and can only be > 256 for 16-bit and 32-bit libraries. */
4315 
4316 #ifdef COMPILE_PCRE8
4317     {}
4318 #else
4319   if (start < end)
4320     {
4321     *uchardata++ = XCL_RANGE;
4322     *uchardata++ = start;
4323     *uchardata++ = end;
4324     }
4325   else if (start == end)
4326     {
4327     *uchardata++ = XCL_SINGLE;
4328     *uchardata++ = start;
4329     }
4330 #endif
4331 
4332   *uchardptr = uchardata;   /* Updata extra data pointer */
4333   }
4334 #endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4335 
4336 return n8;    /* Number of 8-bit characters */
4337 }
4338 
4339 
4340 
4341 
4342 /*************************************************
4343 *        Add a list of characters to a class     *
4344 *************************************************/
4345 
4346 /* This function is used for adding a list of case-equivalent characters to a
4347 class, and also for adding a list of horizontal or vertical whitespace. If the
4348 list is in order (which it should be), ranges of characters are detected and
4349 handled appropriately. This function is mutually recursive with the function
4350 above.
4351 
4352 Arguments:
4353   classbits     the bit map for characters < 256
4354   uchardptr     points to the pointer for extra data
4355   options       the options word
4356   cd            contains pointers to tables etc.
4357   p             points to row of 32-bit values, terminated by NOTACHAR
4358   except        character to omit; this is used when adding lists of
4359                   case-equivalent characters to avoid including the one we
4360                   already know about
4361 
4362 Returns:        the number of < 256 characters added
4363                 the pointer to extra data is updated
4364 */
4365 
4366 static int
add_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p,unsigned int except)4367 add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4368   compile_data *cd, const pcre_uint32 *p, unsigned int except)
4369 {
4370 int n8 = 0;
4371 while (p[0] < NOTACHAR)
4372   {
4373   int n = 0;
4374   if (p[0] != except)
4375     {
4376     while(p[n+1] == p[0] + n + 1) n++;
4377     n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4378     }
4379   p += n + 1;
4380   }
4381 return n8;
4382 }
4383 
4384 
4385 
4386 /*************************************************
4387 *    Add characters not in a list to a class     *
4388 *************************************************/
4389 
4390 /* This function is used for adding the complement of a list of horizontal or
4391 vertical whitespace to a class. The list must be in order.
4392 
4393 Arguments:
4394   classbits     the bit map for characters < 256
4395   uchardptr     points to the pointer for extra data
4396   options       the options word
4397   cd            contains pointers to tables etc.
4398   p             points to row of 32-bit values, terminated by NOTACHAR
4399 
4400 Returns:        the number of < 256 characters added
4401                 the pointer to extra data is updated
4402 */
4403 
4404 static int
add_not_list_to_class(pcre_uint8 * classbits,pcre_uchar ** uchardptr,int options,compile_data * cd,const pcre_uint32 * p)4405 add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4406   int options, compile_data *cd, const pcre_uint32 *p)
4407 {
4408 BOOL utf = (options & PCRE_UTF8) != 0;
4409 int n8 = 0;
4410 if (p[0] > 0)
4411   n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4412 while (p[0] < NOTACHAR)
4413   {
4414   while (p[1] == p[0] + 1) p++;
4415   n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4416     (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4417   p++;
4418   }
4419 return n8;
4420 }
4421 
4422 
4423 
4424 /*************************************************
4425 *           Compile one branch                   *
4426 *************************************************/
4427 
4428 /* Scan the pattern, compiling it into a vector. If the options are
4429 changed during the branch, the pointer is used to change the external options
4430 bits. This function is used during the pre-compile phase when we are trying
4431 to find out the amount of memory needed, as well as during the real compile
4432 phase. The value of lengthptr distinguishes the two phases.
4433 
4434 Arguments:
4435   optionsptr        pointer to the option bits
4436   codeptr           points to the pointer to the current code point
4437   ptrptr            points to the current pattern pointer
4438   errorcodeptr      points to error code variable
4439   firstcharptr      place to put the first required character
4440   firstcharflagsptr place to put the first character flags, or a negative number
4441   reqcharptr        place to put the last required character
4442   reqcharflagsptr   place to put the last required character flags, or a negative number
4443   bcptr             points to current branch chain
4444   cond_depth        conditional nesting depth
4445   cd                contains pointers to tables etc.
4446   lengthptr         NULL during the real compile phase
4447                     points to length accumulator during pre-compile phase
4448 
4449 Returns:            TRUE on success
4450                     FALSE, with *errorcodeptr set non-zero on error
4451 */
4452 
4453 static BOOL
compile_branch(int * optionsptr,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,pcre_uint32 * firstcharptr,pcre_int32 * firstcharflagsptr,pcre_uint32 * reqcharptr,pcre_int32 * reqcharflagsptr,branch_chain * bcptr,int cond_depth,compile_data * cd,int * lengthptr)4454 compile_branch(int *optionsptr, pcre_uchar **codeptr,
4455   const pcre_uchar **ptrptr, int *errorcodeptr,
4456   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4457   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4458   branch_chain *bcptr, int cond_depth,
4459   compile_data *cd, int *lengthptr)
4460 {
4461 int repeat_type, op_type;
4462 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
4463 int bravalue = 0;
4464 int greedy_default, greedy_non_default;
4465 pcre_uint32 firstchar, reqchar;
4466 pcre_int32 firstcharflags, reqcharflags;
4467 pcre_uint32 zeroreqchar, zerofirstchar;
4468 pcre_int32 zeroreqcharflags, zerofirstcharflags;
4469 pcre_int32 req_caseopt, reqvary, tempreqvary;
4470 int options = *optionsptr;               /* May change dynamically */
4471 int after_manual_callout = 0;
4472 int length_prevgroup = 0;
4473 register pcre_uint32 c;
4474 int escape;
4475 register pcre_uchar *code = *codeptr;
4476 pcre_uchar *last_code = code;
4477 pcre_uchar *orig_code = code;
4478 pcre_uchar *tempcode;
4479 BOOL inescq = FALSE;
4480 BOOL groupsetfirstchar = FALSE;
4481 const pcre_uchar *ptr = *ptrptr;
4482 const pcre_uchar *tempptr;
4483 const pcre_uchar *nestptr = NULL;
4484 pcre_uchar *previous = NULL;
4485 pcre_uchar *previous_callout = NULL;
4486 size_t item_hwm_offset = 0;
4487 pcre_uint8 classbits[32];
4488 
4489 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4490 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4491 dynamically as we process the pattern. */
4492 
4493 #ifdef SUPPORT_UTF
4494 /* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4495 BOOL utf = (options & PCRE_UTF8) != 0;
4496 #ifndef COMPILE_PCRE32
4497 pcre_uchar utf_chars[6];
4498 #endif
4499 #else
4500 BOOL utf = FALSE;
4501 #endif
4502 
4503 /* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4504 class_uchardata always so that it can be passed to add_to_class() always,
4505 though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4506 alternative calls for the different cases. */
4507 
4508 pcre_uchar *class_uchardata;
4509 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4510 BOOL xclass;
4511 pcre_uchar *class_uchardata_base;
4512 #endif
4513 
4514 #ifdef PCRE_DEBUG
4515 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4516 #endif
4517 
4518 /* Set up the default and non-default settings for greediness */
4519 
4520 greedy_default = ((options & PCRE_UNGREEDY) != 0);
4521 greedy_non_default = greedy_default ^ 1;
4522 
4523 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
4524 matching encountered yet". It gets changed to REQ_NONE if we hit something that
4525 matches a non-fixed char first char; reqchar just remains unset if we never
4526 find one.
4527 
4528 When we hit a repeat whose minimum is zero, we may have to adjust these values
4529 to take the zero repeat into account. This is implemented by setting them to
4530 zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4531 item types that can be repeated set these backoff variables appropriately. */
4532 
4533 firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4534 firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4535 
4536 /* The variable req_caseopt contains either the REQ_CASELESS value
4537 or zero, according to the current setting of the caseless flag. The
4538 REQ_CASELESS leaves the lower 28 bits empty. It is added into the
4539 firstchar or reqchar variables to record the case status of the
4540 value. This is used only for ASCII characters. */
4541 
4542 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4543 
4544 /* Switch on next character until the end of the branch */
4545 
4546 for (;; ptr++)
4547   {
4548   BOOL negate_class;
4549   BOOL should_flip_negation;
4550   BOOL possessive_quantifier;
4551   BOOL is_quantifier;
4552   BOOL is_recurse;
4553   BOOL reset_bracount;
4554   int class_has_8bitchar;
4555   int class_one_char;
4556 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4557   BOOL xclass_has_prop;
4558 #endif
4559   int newoptions;
4560   int recno;
4561   int refsign;
4562   int skipbytes;
4563   pcre_uint32 subreqchar, subfirstchar;
4564   pcre_int32 subreqcharflags, subfirstcharflags;
4565   int terminator;
4566   unsigned int mclength;
4567   unsigned int tempbracount;
4568   pcre_uint32 ec;
4569   pcre_uchar mcbuffer[8];
4570 
4571   /* Come here to restart the loop without advancing the pointer. */
4572 
4573   REDO_LOOP:
4574 
4575   /* Get next character in the pattern */
4576 
4577   c = *ptr;
4578 
4579   /* If we are at the end of a nested substitution, revert to the outer level
4580   string. Nesting only happens one level deep. */
4581 
4582   if (c == CHAR_NULL && nestptr != NULL)
4583     {
4584     ptr = nestptr;
4585     nestptr = NULL;
4586     c = *ptr;
4587     }
4588 
4589   /* If we are in the pre-compile phase, accumulate the length used for the
4590   previous cycle of this loop. */
4591 
4592   if (lengthptr != NULL)
4593     {
4594 #ifdef PCRE_DEBUG
4595     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
4596 #endif
4597     if (code > cd->start_workspace + cd->workspace_size -
4598         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
4599       {
4600       *errorcodeptr = (code >= cd->start_workspace + cd->workspace_size)?
4601         ERR52 : ERR87;
4602       goto FAILED;
4603       }
4604 
4605     /* There is at least one situation where code goes backwards: this is the
4606     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4607     the class is simply eliminated. However, it is created first, so we have to
4608     allow memory for it. Therefore, don't ever reduce the length at this point.
4609     */
4610 
4611     if (code < last_code) code = last_code;
4612 
4613     /* Paranoid check for integer overflow */
4614 
4615     if (OFLOW_MAX - *lengthptr < code - last_code)
4616       {
4617       *errorcodeptr = ERR20;
4618       goto FAILED;
4619       }
4620 
4621     *lengthptr += (int)(code - last_code);
4622     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4623       (int)(code - last_code), c, c));
4624 
4625     /* If "previous" is set and it is not at the start of the work space, move
4626     it back to there, in order to avoid filling up the work space. Otherwise,
4627     if "previous" is NULL, reset the current code pointer to the start. */
4628 
4629     if (previous != NULL)
4630       {
4631       if (previous > orig_code)
4632         {
4633         memmove(orig_code, previous, IN_UCHARS(code - previous));
4634         code -= previous - orig_code;
4635         previous = orig_code;
4636         }
4637       }
4638     else code = orig_code;
4639 
4640     /* Remember where this code item starts so we can pick up the length
4641     next time round. */
4642 
4643     last_code = code;
4644     }
4645 
4646   /* In the real compile phase, just check the workspace used by the forward
4647   reference list. */
4648 
4649   else if (cd->hwm > cd->start_workspace + cd->workspace_size)
4650     {
4651     *errorcodeptr = ERR52;
4652     goto FAILED;
4653     }
4654 
4655   /* If in \Q...\E, check for the end; if not, we have a literal. Otherwise an
4656   isolated \E is ignored. */
4657 
4658   if (c != CHAR_NULL)
4659     {
4660     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4661       {
4662       inescq = FALSE;
4663       ptr++;
4664       continue;
4665       }
4666     else if (inescq)
4667       {
4668       if (previous_callout != NULL)
4669         {
4670         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
4671           complete_callout(previous_callout, ptr, cd);
4672         previous_callout = NULL;
4673         }
4674       if ((options & PCRE_AUTO_CALLOUT) != 0)
4675         {
4676         previous_callout = code;
4677         code = auto_callout(code, ptr, cd);
4678         }
4679       goto NORMAL_CHAR;
4680       }
4681 
4682     /* Check for the start of a \Q...\E sequence. We must do this here rather
4683     than later in case it is immediately followed by \E, which turns it into a
4684     "do nothing" sequence. */
4685 
4686     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4687       {
4688       inescq = TRUE;
4689       ptr++;
4690       continue;
4691       }
4692     }
4693 
4694   /* In extended mode, skip white space and comments. */
4695 
4696   if ((options & PCRE_EXTENDED) != 0)
4697     {
4698     const pcre_uchar *wscptr = ptr;
4699     while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4700     if (c == CHAR_NUMBER_SIGN)
4701       {
4702       ptr++;
4703       while (*ptr != CHAR_NULL)
4704         {
4705         if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4706           {                          /* IS_NEWLINE sets cd->nllen. */
4707           ptr += cd->nllen;
4708           break;
4709           }
4710         ptr++;
4711 #ifdef SUPPORT_UTF
4712         if (utf) FORWARDCHAR(ptr);
4713 #endif
4714         }
4715       }
4716 
4717     /* If we skipped any characters, restart the loop. Otherwise, we didn't see
4718     a comment. */
4719 
4720     if (ptr > wscptr) goto REDO_LOOP;
4721     }
4722 
4723   /* Skip over (?# comments. We need to do this here because we want to know if
4724   the next thing is a quantifier, and these comments may come between an item
4725   and its quantifier. */
4726 
4727   if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
4728       ptr[2] == CHAR_NUMBER_SIGN)
4729     {
4730     ptr += 3;
4731     while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4732     if (*ptr == CHAR_NULL)
4733       {
4734       *errorcodeptr = ERR18;
4735       goto FAILED;
4736       }
4737     continue;
4738     }
4739 
4740   /* See if the next thing is a quantifier. */
4741 
4742   is_quantifier =
4743     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4744     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4745 
4746   /* Fill in length of a previous callout, except when the next thing is a
4747   quantifier or when processing a property substitution string in UCP mode. */
4748 
4749   if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4750        after_manual_callout-- <= 0)
4751     {
4752     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4753       complete_callout(previous_callout, ptr, cd);
4754     previous_callout = NULL;
4755     }
4756 
4757   /* Create auto callout, except for quantifiers, or while processing property
4758   strings that are substituted for \w etc in UCP mode. */
4759 
4760   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4761     {
4762     previous_callout = code;
4763     code = auto_callout(code, ptr, cd);
4764     }
4765 
4766   /* Process the next pattern item. */
4767 
4768   switch(c)
4769     {
4770     /* ===================================================================*/
4771     case CHAR_NULL:                /* The branch terminates at string end */
4772     case CHAR_VERTICAL_LINE:       /* or | or ) */
4773     case CHAR_RIGHT_PARENTHESIS:
4774     *firstcharptr = firstchar;
4775     *firstcharflagsptr = firstcharflags;
4776     *reqcharptr = reqchar;
4777     *reqcharflagsptr = reqcharflags;
4778     *codeptr = code;
4779     *ptrptr = ptr;
4780     if (lengthptr != NULL)
4781       {
4782       if (OFLOW_MAX - *lengthptr < code - last_code)
4783         {
4784         *errorcodeptr = ERR20;
4785         goto FAILED;
4786         }
4787       *lengthptr += (int)(code - last_code);   /* To include callout length */
4788       DPRINTF((">> end branch\n"));
4789       }
4790     return TRUE;
4791 
4792 
4793     /* ===================================================================*/
4794     /* Handle single-character metacharacters. In multiline mode, ^ disables
4795     the setting of any following char as a first character. */
4796 
4797     case CHAR_CIRCUMFLEX_ACCENT:
4798     previous = NULL;
4799     if ((options & PCRE_MULTILINE) != 0)
4800       {
4801       if (firstcharflags == REQ_UNSET)
4802         zerofirstcharflags = firstcharflags = REQ_NONE;
4803       *code++ = OP_CIRCM;
4804       }
4805     else *code++ = OP_CIRC;
4806     break;
4807 
4808     case CHAR_DOLLAR_SIGN:
4809     previous = NULL;
4810     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4811     break;
4812 
4813     /* There can never be a first char if '.' is first, whatever happens about
4814     repeats. The value of reqchar doesn't change either. */
4815 
4816     case CHAR_DOT:
4817     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4818     zerofirstchar = firstchar;
4819     zerofirstcharflags = firstcharflags;
4820     zeroreqchar = reqchar;
4821     zeroreqcharflags = reqcharflags;
4822     previous = code;
4823     item_hwm_offset = cd->hwm - cd->start_workspace;
4824     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4825     break;
4826 
4827 
4828     /* ===================================================================*/
4829     /* Character classes. If the included characters are all < 256, we build a
4830     32-byte bitmap of the permitted characters, except in the special case
4831     where there is only one such character. For negated classes, we build the
4832     map as usual, then invert it at the end. However, we use a different opcode
4833     so that data characters > 255 can be handled correctly.
4834 
4835     If the class contains characters outside the 0-255 range, a different
4836     opcode is compiled. It may optionally have a bit map for characters < 256,
4837     but those above are explicitly listed afterwards. A flag byte tells
4838     whether the bitmap is present, and whether this is a negated class or not.
4839 
4840     In JavaScript compatibility mode, an isolated ']' causes an error. In
4841     default (Perl) mode, it is treated as a data character. */
4842 
4843     case CHAR_RIGHT_SQUARE_BRACKET:
4844     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4845       {
4846       *errorcodeptr = ERR64;
4847       goto FAILED;
4848       }
4849     goto NORMAL_CHAR;
4850 
4851     /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4852     used for "start of word" and "end of word". As these are otherwise illegal
4853     sequences, we don't break anything by recognizing them. They are replaced
4854     by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4855     erroneous and are handled by the normal code below. */
4856 
4857     case CHAR_LEFT_SQUARE_BRACKET:
4858     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4859       {
4860       nestptr = ptr + 7;
4861       ptr = sub_start_of_word;
4862       goto REDO_LOOP;
4863       }
4864 
4865     if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4866       {
4867       nestptr = ptr + 7;
4868       ptr = sub_end_of_word;
4869       goto REDO_LOOP;
4870       }
4871 
4872     /* Handle a real character class. */
4873 
4874     previous = code;
4875     item_hwm_offset = cd->hwm - cd->start_workspace;
4876 
4877     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4878     they are encountered at the top level, so we'll do that too. */
4879 
4880     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4881          ptr[1] == CHAR_EQUALS_SIGN) &&
4882         check_posix_syntax(ptr, &tempptr))
4883       {
4884       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4885       goto FAILED;
4886       }
4887 
4888     /* If the first character is '^', set the negation flag and skip it. Also,
4889     if the first few characters (either before or after ^) are \Q\E or \E we
4890     skip them too. This makes for compatibility with Perl. */
4891 
4892     negate_class = FALSE;
4893     for (;;)
4894       {
4895       c = *(++ptr);
4896       if (c == CHAR_BACKSLASH)
4897         {
4898         if (ptr[1] == CHAR_E)
4899           ptr++;
4900         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4901           ptr += 3;
4902         else
4903           break;
4904         }
4905       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4906         negate_class = TRUE;
4907       else break;
4908       }
4909 
4910     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4911     an initial ']' is taken as a data character -- the code below handles
4912     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4913     [^] must match any character, so generate OP_ALLANY. */
4914 
4915     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4916         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4917       {
4918       *code++ = negate_class? OP_ALLANY : OP_FAIL;
4919       if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4920       zerofirstchar = firstchar;
4921       zerofirstcharflags = firstcharflags;
4922       break;
4923       }
4924 
4925     /* If a class contains a negative special such as \S, we need to flip the
4926     negation flag at the end, so that support for characters > 255 works
4927     correctly (they are all included in the class). */
4928 
4929     should_flip_negation = FALSE;
4930 
4931     /* Extended class (xclass) will be used when characters > 255
4932     might match. */
4933 
4934 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4935     xclass = FALSE;
4936     class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4937     class_uchardata_base = class_uchardata;   /* Save the start */
4938 #endif
4939 
4940     /* For optimization purposes, we track some properties of the class:
4941     class_has_8bitchar will be non-zero if the class contains at least one <
4942     256 character; class_one_char will be 1 if the class contains just one
4943     character; xclass_has_prop will be TRUE if unicode property checks
4944     are present in the class. */
4945 
4946     class_has_8bitchar = 0;
4947     class_one_char = 0;
4948 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4949     xclass_has_prop = FALSE;
4950 #endif
4951 
4952     /* Initialize the 32-char bit map to all zeros. We build the map in a
4953     temporary bit of memory, in case the class contains fewer than two
4954     8-bit characters because in that case the compiled code doesn't use the bit
4955     map. */
4956 
4957     memset(classbits, 0, 32 * sizeof(pcre_uint8));
4958 
4959     /* Process characters until ] is reached. By writing this as a "do" it
4960     means that an initial ] is taken as a data character. At the start of the
4961     loop, c contains the first byte of the character. */
4962 
4963     if (c != CHAR_NULL) do
4964       {
4965       const pcre_uchar *oldptr;
4966 
4967 #ifdef SUPPORT_UTF
4968       if (utf && HAS_EXTRALEN(c))
4969         {                           /* Braces are required because the */
4970         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
4971         }
4972 #endif
4973 
4974 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4975       /* In the pre-compile phase, accumulate the length of any extra
4976       data and reset the pointer. This is so that very large classes that
4977       contain a zillion > 255 characters no longer overwrite the work space
4978       (which is on the stack). We have to remember that there was XCLASS data,
4979       however. */
4980 
4981       if (class_uchardata > class_uchardata_base) xclass = TRUE;
4982 
4983       if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4984         {
4985         *lengthptr += (int)(class_uchardata - class_uchardata_base);
4986         class_uchardata = class_uchardata_base;
4987         }
4988 #endif
4989 
4990       /* Inside \Q...\E everything is literal except \E */
4991 
4992       if (inescq)
4993         {
4994         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
4995           {
4996           inescq = FALSE;                   /* Reset literal state */
4997           ptr++;                            /* Skip the 'E' */
4998           continue;                         /* Carry on with next */
4999           }
5000         goto CHECK_RANGE;                   /* Could be range if \E follows */
5001         }
5002 
5003       /* Handle POSIX class names. Perl allows a negation extension of the
5004       form [:^name:]. A square bracket that doesn't match the syntax is
5005       treated as a literal. We also recognize the POSIX constructions
5006       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
5007       5.6 and 5.8 do. */
5008 
5009       if (c == CHAR_LEFT_SQUARE_BRACKET &&
5010           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5011            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
5012         {
5013         BOOL local_negate = FALSE;
5014         int posix_class, taboffset, tabopt;
5015         register const pcre_uint8 *cbits = cd->cbits;
5016         pcre_uint8 pbits[32];
5017 
5018         if (ptr[1] != CHAR_COLON)
5019           {
5020           *errorcodeptr = ERR31;
5021           goto FAILED;
5022           }
5023 
5024         ptr += 2;
5025         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
5026           {
5027           local_negate = TRUE;
5028           should_flip_negation = TRUE;  /* Note negative special */
5029           ptr++;
5030           }
5031 
5032         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
5033         if (posix_class < 0)
5034           {
5035           *errorcodeptr = ERR30;
5036           goto FAILED;
5037           }
5038 
5039         /* If matching is caseless, upper and lower are converted to
5040         alpha. This relies on the fact that the class table starts with
5041         alpha, lower, upper as the first 3 entries. */
5042 
5043         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
5044           posix_class = 0;
5045 
5046         /* When PCRE_UCP is set, some of the POSIX classes are converted to
5047         different escape sequences that use Unicode properties \p or \P. Others
5048         that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5049         directly. */
5050 
5051 #ifdef SUPPORT_UCP
5052         if ((options & PCRE_UCP) != 0)
5053           {
5054           unsigned int ptype = 0;
5055           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5056 
5057           /* The posix_substitutes table specifies which POSIX classes can be
5058           converted to \p or \P items. */
5059 
5060           if (posix_substitutes[pc] != NULL)
5061             {
5062             nestptr = tempptr + 1;
5063             ptr = posix_substitutes[pc] - 1;
5064             continue;
5065             }
5066 
5067           /* There are three other classes that generate special property calls
5068           that are recognized only in an XCLASS. */
5069 
5070           else switch(posix_class)
5071             {
5072             case PC_GRAPH:
5073             ptype = PT_PXGRAPH;
5074             /* Fall through */
5075             case PC_PRINT:
5076             if (ptype == 0) ptype = PT_PXPRINT;
5077             /* Fall through */
5078             case PC_PUNCT:
5079             if (ptype == 0) ptype = PT_PXPUNCT;
5080             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5081             *class_uchardata++ = ptype;
5082             *class_uchardata++ = 0;
5083             xclass_has_prop = TRUE;
5084             ptr = tempptr + 1;
5085             continue;
5086 
5087             /* For the other POSIX classes (ascii, cntrl, xdigit) we are going
5088             to fall through to the non-UCP case and build a bit map for
5089             characters with code points less than 256. If we are in a negated
5090             POSIX class, characters with code points greater than 255 must
5091             either all match or all not match. In the special case where we
5092             have not yet generated any xclass data, and this is the final item
5093             in the overall class, we need do nothing: later on, the opcode
5094             OP_NCLASS will be used to indicate that characters greater than 255
5095             are acceptable. If we have already seen an xclass item or one may
5096             follow (we have to assume that it might if this is not the end of
5097             the class), explicitly list all wide codepoints, which will then
5098             either not match or match, depending on whether the class is or is
5099             not negated. */
5100 
5101             default:
5102             if (local_negate &&
5103                 (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
5104               {
5105               *class_uchardata++ = XCL_RANGE;
5106               class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5107               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5108               }
5109             break;
5110             }
5111           }
5112 #endif
5113         /* In the non-UCP case, or when UCP makes no difference, we build the
5114         bit map for the POSIX class in a chunk of local store because we may be
5115         adding and subtracting from it, and we don't want to subtract bits that
5116         may be in the main map already. At the end we or the result into the
5117         bit map that is being built. */
5118 
5119         posix_class *= 3;
5120 
5121         /* Copy in the first table (always present) */
5122 
5123         memcpy(pbits, cbits + posix_class_maps[posix_class],
5124           32 * sizeof(pcre_uint8));
5125 
5126         /* If there is a second table, add or remove it as required. */
5127 
5128         taboffset = posix_class_maps[posix_class + 1];
5129         tabopt = posix_class_maps[posix_class + 2];
5130 
5131         if (taboffset >= 0)
5132           {
5133           if (tabopt >= 0)
5134             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5135           else
5136             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5137           }
5138 
5139         /* Now see if we need to remove any special characters. An option
5140         value of 1 removes vertical space and 2 removes underscore. */
5141 
5142         if (tabopt < 0) tabopt = -tabopt;
5143         if (tabopt == 1) pbits[1] &= ~0x3c;
5144           else if (tabopt == 2) pbits[11] &= 0x7f;
5145 
5146         /* Add the POSIX table or its complement into the main table that is
5147         being built and we are done. */
5148 
5149         if (local_negate)
5150           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5151         else
5152           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5153 
5154         ptr = tempptr + 1;
5155         /* Every class contains at least one < 256 character. */
5156         class_has_8bitchar = 1;
5157         /* Every class contains at least two characters. */
5158         class_one_char = 2;
5159         continue;    /* End of POSIX syntax handling */
5160         }
5161 
5162       /* Backslash may introduce a single character, or it may introduce one
5163       of the specials, which just set a flag. The sequence \b is a special
5164       case. Inside a class (and only there) it is treated as backspace. We
5165       assume that other escapes have more than one character in them, so
5166       speculatively set both class_has_8bitchar and class_one_char bigger
5167       than one. Unrecognized escapes fall through and are either treated
5168       as literal characters (by default), or are faulted if
5169       PCRE_EXTRA is set. */
5170 
5171       if (c == CHAR_BACKSLASH)
5172         {
5173         escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5174           TRUE);
5175         if (*errorcodeptr != 0) goto FAILED;
5176         if (escape == 0) c = ec;
5177         else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5178         else if (escape == ESC_N)          /* \N is not supported in a class */
5179           {
5180           *errorcodeptr = ERR71;
5181           goto FAILED;
5182           }
5183         else if (escape == ESC_Q)            /* Handle start of quoted string */
5184           {
5185           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5186             {
5187             ptr += 2; /* avoid empty string */
5188             }
5189           else inescq = TRUE;
5190           continue;
5191           }
5192         else if (escape == ESC_E) continue;  /* Ignore orphan \E */
5193 
5194         else
5195           {
5196           register const pcre_uint8 *cbits = cd->cbits;
5197           /* Every class contains at least two < 256 characters. */
5198           class_has_8bitchar++;
5199           /* Every class contains at least two characters. */
5200           class_one_char += 2;
5201 
5202           switch (escape)
5203             {
5204 #ifdef SUPPORT_UCP
5205             case ESC_du:     /* These are the values given for \d etc */
5206             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
5207             case ESC_wu:     /* escape sequence with an appropriate \p */
5208             case ESC_WU:     /* or \P to test Unicode properties instead */
5209             case ESC_su:     /* of the default ASCII testing. */
5210             case ESC_SU:
5211             nestptr = ptr;
5212             ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
5213             class_has_8bitchar--;                /* Undo! */
5214             continue;
5215 #endif
5216             case ESC_d:
5217             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5218             continue;
5219 
5220             case ESC_D:
5221             should_flip_negation = TRUE;
5222             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5223             continue;
5224 
5225             case ESC_w:
5226             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5227             continue;
5228 
5229             case ESC_W:
5230             should_flip_negation = TRUE;
5231             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5232             continue;
5233 
5234             /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5235             5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5236             previously set by something earlier in the character class.
5237             Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5238             we could just adjust the appropriate bit. From PCRE 8.34 we no
5239             longer treat \s and \S specially. */
5240 
5241             case ESC_s:
5242             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5243             continue;
5244 
5245             case ESC_S:
5246             should_flip_negation = TRUE;
5247             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5248             continue;
5249 
5250             /* The rest apply in both UCP and non-UCP cases. */
5251 
5252             case ESC_h:
5253             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5254               PRIV(hspace_list), NOTACHAR);
5255             continue;
5256 
5257             case ESC_H:
5258             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5259               cd, PRIV(hspace_list));
5260             continue;
5261 
5262             case ESC_v:
5263             (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5264               PRIV(vspace_list), NOTACHAR);
5265             continue;
5266 
5267             case ESC_V:
5268             (void)add_not_list_to_class(classbits, &class_uchardata, options,
5269               cd, PRIV(vspace_list));
5270             continue;
5271 
5272             case ESC_p:
5273             case ESC_P:
5274 #ifdef SUPPORT_UCP
5275               {
5276               BOOL negated;
5277               unsigned int ptype = 0, pdata = 0;
5278               if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5279                 goto FAILED;
5280               *class_uchardata++ = ((escape == ESC_p) != negated)?
5281                 XCL_PROP : XCL_NOTPROP;
5282               *class_uchardata++ = ptype;
5283               *class_uchardata++ = pdata;
5284               xclass_has_prop = TRUE;
5285               class_has_8bitchar--;                /* Undo! */
5286               continue;
5287               }
5288 #else
5289             *errorcodeptr = ERR45;
5290             goto FAILED;
5291 #endif
5292             /* Unrecognized escapes are faulted if PCRE is running in its
5293             strict mode. By default, for compatibility with Perl, they are
5294             treated as literals. */
5295 
5296             default:
5297             if ((options & PCRE_EXTRA) != 0)
5298               {
5299               *errorcodeptr = ERR7;
5300               goto FAILED;
5301               }
5302             class_has_8bitchar--;    /* Undo the speculative increase. */
5303             class_one_char -= 2;     /* Undo the speculative increase. */
5304             c = *ptr;                /* Get the final character and fall through */
5305             break;
5306             }
5307           }
5308 
5309         /* Fall through if the escape just defined a single character (c >= 0).
5310         This may be greater than 256. */
5311 
5312         escape = 0;
5313 
5314         }   /* End of backslash handling */
5315 
5316       /* A character may be followed by '-' to form a range. However, Perl does
5317       not permit ']' to be the end of the range. A '-' character at the end is
5318       treated as a literal. Perl ignores orphaned \E sequences entirely. The
5319       code for handling \Q and \E is messy. */
5320 
5321       CHECK_RANGE:
5322       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5323         {
5324         inescq = FALSE;
5325         ptr += 2;
5326         }
5327       oldptr = ptr;
5328 
5329       /* Remember if \r or \n were explicitly used */
5330 
5331       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5332 
5333       /* Check for range */
5334 
5335       if (!inescq && ptr[1] == CHAR_MINUS)
5336         {
5337         pcre_uint32 d;
5338         ptr += 2;
5339         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5340 
5341         /* If we hit \Q (not followed by \E) at this point, go into escaped
5342         mode. */
5343 
5344         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5345           {
5346           ptr += 2;
5347           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5348             { ptr += 2; continue; }
5349           inescq = TRUE;
5350           break;
5351           }
5352 
5353         /* Minus (hyphen) at the end of a class is treated as a literal, so put
5354         back the pointer and jump to handle the character that preceded it. */
5355 
5356         if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5357           {
5358           ptr = oldptr;
5359           goto CLASS_SINGLE_CHARACTER;
5360           }
5361 
5362         /* Otherwise, we have a potential range; pick up the next character */
5363 
5364 #ifdef SUPPORT_UTF
5365         if (utf)
5366           {                           /* Braces are required because the */
5367           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
5368           }
5369         else
5370 #endif
5371         d = *ptr;  /* Not UTF-8 mode */
5372 
5373         /* The second part of a range can be a single-character escape
5374         sequence, but not any of the other escapes. Perl treats a hyphen as a
5375         literal in such circumstances. However, in Perl's warning mode, a
5376         warning is given, so PCRE now faults it as it is almost certainly a
5377         mistake on the user's part. */
5378 
5379         if (!inescq)
5380           {
5381           if (d == CHAR_BACKSLASH)
5382             {
5383             int descape;
5384             descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5385             if (*errorcodeptr != 0) goto FAILED;
5386 
5387             /* 0 means a character was put into d; \b is backspace; any other
5388             special causes an error. */
5389 
5390             if (descape != 0)
5391               {
5392               if (descape == ESC_b) d = CHAR_BS; else
5393                 {
5394                 *errorcodeptr = ERR83;
5395                 goto FAILED;
5396                 }
5397               }
5398             }
5399 
5400           /* A hyphen followed by a POSIX class is treated in the same way. */
5401 
5402           else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5403                    (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5404                     ptr[1] == CHAR_EQUALS_SIGN) &&
5405                    check_posix_syntax(ptr, &tempptr))
5406             {
5407             *errorcodeptr = ERR83;
5408             goto FAILED;
5409             }
5410           }
5411 
5412         /* Check that the two values are in the correct order. Optimize
5413         one-character ranges. */
5414 
5415         if (d < c)
5416           {
5417           *errorcodeptr = ERR8;
5418           goto FAILED;
5419           }
5420         if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
5421 
5422         /* We have found a character range, so single character optimizations
5423         cannot be done anymore. Any value greater than 1 indicates that there
5424         is more than one character. */
5425 
5426         class_one_char = 2;
5427 
5428         /* Remember an explicit \r or \n, and add the range to the class. */
5429 
5430         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5431 
5432         class_has_8bitchar +=
5433           add_to_class(classbits, &class_uchardata, options, cd, c, d);
5434 
5435         continue;   /* Go get the next char in the class */
5436         }
5437 
5438       /* Handle a single character - we can get here for a normal non-escape
5439       char, or after \ that introduces a single character or for an apparent
5440       range that isn't. Only the value 1 matters for class_one_char, so don't
5441       increase it if it is already 2 or more ... just in case there's a class
5442       with a zillion characters in it. */
5443 
5444       CLASS_SINGLE_CHARACTER:
5445       if (class_one_char < 2) class_one_char++;
5446 
5447       /* If xclass_has_prop is false and class_one_char is 1, we have the first
5448       single character in the class, and there have been no prior ranges, or
5449       XCLASS items generated by escapes. If this is the final character in the
5450       class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5451       if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5452       can cause firstchar to be set. Otherwise, there can be no first char if
5453       this item is first, whatever repeat count may follow. In the case of
5454       reqchar, save the previous value for reinstating. */
5455 
5456       if (!inescq &&
5457 #ifdef SUPPORT_UCP
5458           !xclass_has_prop &&
5459 #endif
5460           class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5461         {
5462         ptr++;
5463         zeroreqchar = reqchar;
5464         zeroreqcharflags = reqcharflags;
5465 
5466         if (negate_class)
5467           {
5468 #ifdef SUPPORT_UCP
5469           int d;
5470 #endif
5471           if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5472           zerofirstchar = firstchar;
5473           zerofirstcharflags = firstcharflags;
5474 
5475           /* For caseless UTF-8 mode when UCP support is available, check
5476           whether this character has more than one other case. If so, generate
5477           a special OP_NOTPROP item instead of OP_NOTI. */
5478 
5479 #ifdef SUPPORT_UCP
5480           if (utf && (options & PCRE_CASELESS) != 0 &&
5481               (d = UCD_CASESET(c)) != 0)
5482             {
5483             *code++ = OP_NOTPROP;
5484             *code++ = PT_CLIST;
5485             *code++ = d;
5486             }
5487           else
5488 #endif
5489           /* Char has only one other case, or UCP not available */
5490 
5491             {
5492             *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5493 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5494             if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5495               code += PRIV(ord2utf)(c, code);
5496             else
5497 #endif
5498               *code++ = c;
5499             }
5500 
5501           /* We are finished with this character class */
5502 
5503           goto END_CLASS;
5504           }
5505 
5506         /* For a single, positive character, get the value into mcbuffer, and
5507         then we can handle this with the normal one-character code. */
5508 
5509 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5510         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5511           mclength = PRIV(ord2utf)(c, mcbuffer);
5512         else
5513 #endif
5514           {
5515           mcbuffer[0] = c;
5516           mclength = 1;
5517           }
5518         goto ONE_CHAR;
5519         }       /* End of 1-char optimization */
5520 
5521       /* There is more than one character in the class, or an XCLASS item
5522       has been generated. Add this character to the class. */
5523 
5524       class_has_8bitchar +=
5525         add_to_class(classbits, &class_uchardata, options, cd, c, c);
5526       }
5527 
5528     /* Loop until ']' reached. This "while" is the end of the "do" far above.
5529     If we are at the end of an internal nested string, revert to the outer
5530     string. */
5531 
5532     while (((c = *(++ptr)) != CHAR_NULL ||
5533            (nestptr != NULL &&
5534              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5535            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5536 
5537     /* Check for missing terminating ']' */
5538 
5539     if (c == CHAR_NULL)
5540       {
5541       *errorcodeptr = ERR6;
5542       goto FAILED;
5543       }
5544 
5545     /* We will need an XCLASS if data has been placed in class_uchardata. In
5546     the second phase this is a sufficient test. However, in the pre-compile
5547     phase, class_uchardata gets emptied to prevent workspace overflow, so
5548     only if the very last character in the class needs XCLASS will it contain
5549     anything at this point. For this reason, xclass gets set TRUE above when
5550     class_uchardata is emptied, and that's why this code is the way it is here
5551     instead of just doing a test on class_uchardata below. */
5552 
5553 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5554     if (class_uchardata > class_uchardata_base) xclass = TRUE;
5555 #endif
5556 
5557     /* If this is the first thing in the branch, there can be no first char
5558     setting, whatever the repeat count. Any reqchar setting must remain
5559     unchanged after any kind of repeat. */
5560 
5561     if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5562     zerofirstchar = firstchar;
5563     zerofirstcharflags = firstcharflags;
5564     zeroreqchar = reqchar;
5565     zeroreqcharflags = reqcharflags;
5566 
5567     /* If there are characters with values > 255, we have to compile an
5568     extended class, with its own opcode, unless there was a negated special
5569     such as \S in the class, and PCRE_UCP is not set, because in that case all
5570     characters > 255 are in the class, so any that were explicitly given as
5571     well can be ignored. If (when there are explicit characters > 255 that must
5572     be listed) there are no characters < 256, we can omit the bitmap in the
5573     actual compiled code. */
5574 
5575 #ifdef SUPPORT_UTF
5576     if (xclass && (xclass_has_prop || !should_flip_negation ||
5577         (options & PCRE_UCP) != 0))
5578 #elif !defined COMPILE_PCRE8
5579     if (xclass && (xclass_has_prop || !should_flip_negation))
5580 #endif
5581 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5582       {
5583       /* For non-UCP wide characters, in a non-negative class containing \S or
5584       similar (should_flip_negation is set), all characters greater than 255
5585       must be in the class. */
5586 
5587       if (
5588 #if defined COMPILE_PCRE8
5589            utf &&
5590 #endif
5591            should_flip_negation && !negate_class && (options & PCRE_UCP) == 0)
5592         {
5593         *class_uchardata++ = XCL_RANGE;
5594         if (utf)   /* Will always be utf in the 8-bit library */
5595           {
5596           class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5597           class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5598           }
5599         else       /* Can only happen for the 16-bit & 32-bit libraries */
5600           {
5601 #if defined COMPILE_PCRE16
5602           *class_uchardata++ = 0x100;
5603           *class_uchardata++ = 0xffffu;
5604 #elif defined COMPILE_PCRE32
5605           *class_uchardata++ = 0x100;
5606           *class_uchardata++ = 0xffffffffu;
5607 #endif
5608           }
5609         }
5610 
5611       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
5612       *code++ = OP_XCLASS;
5613       code += LINK_SIZE;
5614       *code = negate_class? XCL_NOT:0;
5615       if (xclass_has_prop) *code |= XCL_HASPROP;
5616 
5617       /* If the map is required, move up the extra data to make room for it;
5618       otherwise just move the code pointer to the end of the extra data. */
5619 
5620       if (class_has_8bitchar > 0)
5621         {
5622         *code++ |= XCL_MAP;
5623         memmove(code + (32 / sizeof(pcre_uchar)), code,
5624           IN_UCHARS(class_uchardata - code));
5625         if (negate_class && !xclass_has_prop)
5626           for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5627         memcpy(code, classbits, 32);
5628         code = class_uchardata + (32 / sizeof(pcre_uchar));
5629         }
5630       else code = class_uchardata;
5631 
5632       /* Now fill in the complete length of the item */
5633 
5634       PUT(previous, 1, (int)(code - previous));
5635       break;   /* End of class handling */
5636       }
5637 
5638     /* Even though any XCLASS list is now discarded, we must allow for
5639     its memory. */
5640 
5641     if (lengthptr != NULL)
5642       *lengthptr += (int)(class_uchardata - class_uchardata_base);
5643 #endif
5644 
5645     /* If there are no characters > 255, or they are all to be included or
5646     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5647     whole class was negated and whether there were negative specials such as \S
5648     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5649     negating it if necessary. */
5650 
5651     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5652     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
5653       {
5654       if (negate_class)
5655         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5656       memcpy(code, classbits, 32);
5657       }
5658     code += 32 / sizeof(pcre_uchar);
5659 
5660     END_CLASS:
5661     break;
5662 
5663 
5664     /* ===================================================================*/
5665     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5666     has been tested above. */
5667 
5668     case CHAR_LEFT_CURLY_BRACKET:
5669     if (!is_quantifier) goto NORMAL_CHAR;
5670     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5671     if (*errorcodeptr != 0) goto FAILED;
5672     goto REPEAT;
5673 
5674     case CHAR_ASTERISK:
5675     repeat_min = 0;
5676     repeat_max = -1;
5677     goto REPEAT;
5678 
5679     case CHAR_PLUS:
5680     repeat_min = 1;
5681     repeat_max = -1;
5682     goto REPEAT;
5683 
5684     case CHAR_QUESTION_MARK:
5685     repeat_min = 0;
5686     repeat_max = 1;
5687 
5688     REPEAT:
5689     if (previous == NULL)
5690       {
5691       *errorcodeptr = ERR9;
5692       goto FAILED;
5693       }
5694 
5695     if (repeat_min == 0)
5696       {
5697       firstchar = zerofirstchar;    /* Adjust for zero repeat */
5698       firstcharflags = zerofirstcharflags;
5699       reqchar = zeroreqchar;        /* Ditto */
5700       reqcharflags = zeroreqcharflags;
5701       }
5702 
5703     /* Remember whether this is a variable length repeat */
5704 
5705     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5706 
5707     op_type = 0;                    /* Default single-char op codes */
5708     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
5709 
5710     /* Save start of previous item, in case we have to move it up in order to
5711     insert something before it. */
5712 
5713     tempcode = previous;
5714 
5715     /* Before checking for a possessive quantifier, we must skip over
5716     whitespace and comments in extended mode because Perl allows white space at
5717     this point. */
5718 
5719     if ((options & PCRE_EXTENDED) != 0)
5720       {
5721       const pcre_uchar *p = ptr + 1;
5722       for (;;)
5723         {
5724         while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5725         if (*p != CHAR_NUMBER_SIGN) break;
5726         p++;
5727         while (*p != CHAR_NULL)
5728           {
5729           if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5730             {                        /* IS_NEWLINE sets cd->nllen. */
5731             p += cd->nllen;
5732             break;
5733             }
5734           p++;
5735 #ifdef SUPPORT_UTF
5736           if (utf) FORWARDCHAR(p);
5737 #endif
5738           }           /* Loop for comment characters */
5739         }             /* Loop for multiple comments */
5740       ptr = p - 1;    /* Character before the next significant one. */
5741       }
5742 
5743     /* We also need to skip over (?# comments, which are not dependent on
5744     extended mode. */
5745 
5746     if (ptr[1] == CHAR_LEFT_PARENTHESIS && ptr[2] == CHAR_QUESTION_MARK &&
5747         ptr[3] == CHAR_NUMBER_SIGN)
5748       {
5749       ptr += 4;
5750       while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5751       if (*ptr == CHAR_NULL)
5752         {
5753         *errorcodeptr = ERR18;
5754         goto FAILED;
5755         }
5756       }
5757 
5758     /* If the next character is '+', we have a possessive quantifier. This
5759     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5760     If the next character is '?' this is a minimizing repeat, by default,
5761     but if PCRE_UNGREEDY is set, it works the other way round. We change the
5762     repeat type to the non-default. */
5763 
5764     if (ptr[1] == CHAR_PLUS)
5765       {
5766       repeat_type = 0;                  /* Force greedy */
5767       possessive_quantifier = TRUE;
5768       ptr++;
5769       }
5770     else if (ptr[1] == CHAR_QUESTION_MARK)
5771       {
5772       repeat_type = greedy_non_default;
5773       ptr++;
5774       }
5775     else repeat_type = greedy_default;
5776 
5777     /* If previous was a recursion call, wrap it in atomic brackets so that
5778     previous becomes the atomic group. All recursions were so wrapped in the
5779     past, but it no longer happens for non-repeated recursions. In fact, the
5780     repeated ones could be re-implemented independently so as not to need this,
5781     but for the moment we rely on the code for repeating groups. */
5782 
5783     if (*previous == OP_RECURSE)
5784       {
5785       memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5786       *previous = OP_ONCE;
5787       PUT(previous, 1, 2 + 2*LINK_SIZE);
5788       previous[2 + 2*LINK_SIZE] = OP_KET;
5789       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5790       code += 2 + 2 * LINK_SIZE;
5791       length_prevgroup = 3 + 3*LINK_SIZE;
5792 
5793       /* When actually compiling, we need to check whether this was a forward
5794       reference, and if so, adjust the offset. */
5795 
5796       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5797         {
5798         int offset = GET(cd->hwm, -LINK_SIZE);
5799         if (offset == previous + 1 - cd->start_code)
5800           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5801         }
5802       }
5803 
5804     /* Now handle repetition for the different types of item. */
5805 
5806     /* If previous was a character or negated character match, abolish the item
5807     and generate a repeat item instead. If a char item has a minimum of more
5808     than one, ensure that it is set in reqchar - it might not be if a sequence
5809     such as x{3} is the first thing in a branch because the x will have gone
5810     into firstchar instead.  */
5811 
5812     if (*previous == OP_CHAR || *previous == OP_CHARI
5813         || *previous == OP_NOT || *previous == OP_NOTI)
5814       {
5815       switch (*previous)
5816         {
5817         default: /* Make compiler happy. */
5818         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
5819         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5820         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
5821         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
5822         }
5823 
5824       /* Deal with UTF characters that take up more than one character. It's
5825       easier to write this out separately than try to macrify it. Use c to
5826       hold the length of the character in bytes, plus UTF_LENGTH to flag that
5827       it's a length rather than a small character. */
5828 
5829 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5830       if (utf && NOT_FIRSTCHAR(code[-1]))
5831         {
5832         pcre_uchar *lastchar = code - 1;
5833         BACKCHAR(lastchar);
5834         c = (int)(code - lastchar);     /* Length of UTF-8 character */
5835         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5836         c |= UTF_LENGTH;                /* Flag c as a length */
5837         }
5838       else
5839 #endif /* SUPPORT_UTF */
5840 
5841       /* Handle the case of a single character - either with no UTF support, or
5842       with UTF disabled, or for a single character UTF character. */
5843         {
5844         c = code[-1];
5845         if (*previous <= OP_CHARI && repeat_min > 1)
5846           {
5847           reqchar = c;
5848           reqcharflags = req_caseopt | cd->req_varyopt;
5849           }
5850         }
5851 
5852       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5853       }
5854 
5855     /* If previous was a character type match (\d or similar), abolish it and
5856     create a suitable repeat item. The code is shared with single-character
5857     repeats by setting op_type to add a suitable offset into repeat_type. Note
5858     that the Unicode property types will be present only when SUPPORT_UCP is
5859     defined, but we don't wrap the little bits of code here because it just
5860     makes it horribly messy. */
5861 
5862     else if (*previous < OP_EODN)
5863       {
5864       pcre_uchar *oldcode;
5865       int prop_type, prop_value;
5866       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
5867       c = *previous;
5868 
5869       OUTPUT_SINGLE_REPEAT:
5870       if (*previous == OP_PROP || *previous == OP_NOTPROP)
5871         {
5872         prop_type = previous[1];
5873         prop_value = previous[2];
5874         }
5875       else prop_type = prop_value = -1;
5876 
5877       oldcode = code;
5878       code = previous;                  /* Usually overwrite previous item */
5879 
5880       /* If the maximum is zero then the minimum must also be zero; Perl allows
5881       this case, so we do too - by simply omitting the item altogether. */
5882 
5883       if (repeat_max == 0) goto END_REPEAT;
5884 
5885       /* Combine the op_type with the repeat_type */
5886 
5887       repeat_type += op_type;
5888 
5889       /* A minimum of zero is handled either as the special case * or ?, or as
5890       an UPTO, with the maximum given. */
5891 
5892       if (repeat_min == 0)
5893         {
5894         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5895           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5896         else
5897           {
5898           *code++ = OP_UPTO + repeat_type;
5899           PUT2INC(code, 0, repeat_max);
5900           }
5901         }
5902 
5903       /* A repeat minimum of 1 is optimized into some special cases. If the
5904       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5905       left in place and, if the maximum is greater than 1, we use OP_UPTO with
5906       one less than the maximum. */
5907 
5908       else if (repeat_min == 1)
5909         {
5910         if (repeat_max == -1)
5911           *code++ = OP_PLUS + repeat_type;
5912         else
5913           {
5914           code = oldcode;                 /* leave previous item in place */
5915           if (repeat_max == 1) goto END_REPEAT;
5916           *code++ = OP_UPTO + repeat_type;
5917           PUT2INC(code, 0, repeat_max - 1);
5918           }
5919         }
5920 
5921       /* The case {n,n} is just an EXACT, while the general case {n,m} is
5922       handled as an EXACT followed by an UPTO. */
5923 
5924       else
5925         {
5926         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
5927         PUT2INC(code, 0, repeat_min);
5928 
5929         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5930         we have to insert the character for the previous code. For a repeated
5931         Unicode property match, there are two extra bytes that define the
5932         required property. In UTF-8 mode, long characters have their length in
5933         c, with the UTF_LENGTH bit as a flag. */
5934 
5935         if (repeat_max < 0)
5936           {
5937 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5938           if (utf && (c & UTF_LENGTH) != 0)
5939             {
5940             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5941             code += c & 7;
5942             }
5943           else
5944 #endif
5945             {
5946             *code++ = c;
5947             if (prop_type >= 0)
5948               {
5949               *code++ = prop_type;
5950               *code++ = prop_value;
5951               }
5952             }
5953           *code++ = OP_STAR + repeat_type;
5954           }
5955 
5956         /* Else insert an UPTO if the max is greater than the min, again
5957         preceded by the character, for the previously inserted code. If the
5958         UPTO is just for 1 instance, we can use QUERY instead. */
5959 
5960         else if (repeat_max != repeat_min)
5961           {
5962 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5963           if (utf && (c & UTF_LENGTH) != 0)
5964             {
5965             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5966             code += c & 7;
5967             }
5968           else
5969 #endif
5970           *code++ = c;
5971           if (prop_type >= 0)
5972             {
5973             *code++ = prop_type;
5974             *code++ = prop_value;
5975             }
5976           repeat_max -= repeat_min;
5977 
5978           if (repeat_max == 1)
5979             {
5980             *code++ = OP_QUERY + repeat_type;
5981             }
5982           else
5983             {
5984             *code++ = OP_UPTO + repeat_type;
5985             PUT2INC(code, 0, repeat_max);
5986             }
5987           }
5988         }
5989 
5990       /* The character or character type itself comes last in all cases. */
5991 
5992 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5993       if (utf && (c & UTF_LENGTH) != 0)
5994         {
5995         memcpy(code, utf_chars, IN_UCHARS(c & 7));
5996         code += c & 7;
5997         }
5998       else
5999 #endif
6000       *code++ = c;
6001 
6002       /* For a repeated Unicode property match, there are two extra bytes that
6003       define the required property. */
6004 
6005 #ifdef SUPPORT_UCP
6006       if (prop_type >= 0)
6007         {
6008         *code++ = prop_type;
6009         *code++ = prop_value;
6010         }
6011 #endif
6012       }
6013 
6014     /* If previous was a character class or a back reference, we put the repeat
6015     stuff after it, but just skip the item if the repeat was {0,0}. */
6016 
6017     else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
6018 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6019              *previous == OP_XCLASS ||
6020 #endif
6021              *previous == OP_REF   || *previous == OP_REFI ||
6022              *previous == OP_DNREF || *previous == OP_DNREFI)
6023       {
6024       if (repeat_max == 0)
6025         {
6026         code = previous;
6027         goto END_REPEAT;
6028         }
6029 
6030       if (repeat_min == 0 && repeat_max == -1)
6031         *code++ = OP_CRSTAR + repeat_type;
6032       else if (repeat_min == 1 && repeat_max == -1)
6033         *code++ = OP_CRPLUS + repeat_type;
6034       else if (repeat_min == 0 && repeat_max == 1)
6035         *code++ = OP_CRQUERY + repeat_type;
6036       else
6037         {
6038         *code++ = OP_CRRANGE + repeat_type;
6039         PUT2INC(code, 0, repeat_min);
6040         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
6041         PUT2INC(code, 0, repeat_max);
6042         }
6043       }
6044 
6045     /* If previous was a bracket group, we may have to replicate it in certain
6046     cases. Note that at this point we can encounter only the "basic" bracket
6047     opcodes such as BRA and CBRA, as this is the place where they get converted
6048     into the more special varieties such as BRAPOS and SBRA. A test for >=
6049     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
6050     ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
6051     Originally, PCRE did not allow repetition of assertions, but now it does,
6052     for Perl compatibility. */
6053 
6054     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
6055       {
6056       register int i;
6057       int len = (int)(code - previous);
6058       size_t base_hwm_offset = item_hwm_offset;
6059       pcre_uchar *bralink = NULL;
6060       pcre_uchar *brazeroptr = NULL;
6061 
6062       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
6063       we just ignore the repeat. */
6064 
6065       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
6066         goto END_REPEAT;
6067 
6068       /* There is no sense in actually repeating assertions. The only potential
6069       use of repetition is in cases when the assertion is optional. Therefore,
6070       if the minimum is greater than zero, just ignore the repeat. If the
6071       maximum is not zero or one, set it to 1. */
6072 
6073       if (*previous < OP_ONCE)    /* Assertion */
6074         {
6075         if (repeat_min > 0) goto END_REPEAT;
6076         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
6077         }
6078 
6079       /* The case of a zero minimum is special because of the need to stick
6080       OP_BRAZERO in front of it, and because the group appears once in the
6081       data, whereas in other cases it appears the minimum number of times. For
6082       this reason, it is simplest to treat this case separately, as otherwise
6083       the code gets far too messy. There are several special subcases when the
6084       minimum is zero. */
6085 
6086       if (repeat_min == 0)
6087         {
6088         /* If the maximum is also zero, we used to just omit the group from the
6089         output altogether, like this:
6090 
6091         ** if (repeat_max == 0)
6092         **   {
6093         **   code = previous;
6094         **   goto END_REPEAT;
6095         **   }
6096 
6097         However, that fails when a group or a subgroup within it is referenced
6098         as a subroutine from elsewhere in the pattern, so now we stick in
6099         OP_SKIPZERO in front of it so that it is skipped on execution. As we
6100         don't have a list of which groups are referenced, we cannot do this
6101         selectively.
6102 
6103         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
6104         and do no more at this point. However, we do need to adjust any
6105         OP_RECURSE calls inside the group that refer to the group itself or any
6106         internal or forward referenced group, because the offset is from the
6107         start of the whole regex. Temporarily terminate the pattern while doing
6108         this. */
6109 
6110         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
6111           {
6112           *code = OP_END;
6113           adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
6114           memmove(previous + 1, previous, IN_UCHARS(len));
6115           code++;
6116           if (repeat_max == 0)
6117             {
6118             *previous++ = OP_SKIPZERO;
6119             goto END_REPEAT;
6120             }
6121           brazeroptr = previous;    /* Save for possessive optimizing */
6122           *previous++ = OP_BRAZERO + repeat_type;
6123           }
6124 
6125         /* If the maximum is greater than 1 and limited, we have to replicate
6126         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6127         The first one has to be handled carefully because it's the original
6128         copy, which has to be moved up. The remainder can be handled by code
6129         that is common with the non-zero minimum case below. We have to
6130           adjust the value of repeat_max, since one less copy is required. Once
6131         again, we may have to adjust any OP_RECURSE calls inside the group. */
6132 
6133         else
6134           {
6135           int offset;
6136           *code = OP_END;
6137           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
6138           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6139           code += 2 + LINK_SIZE;
6140           *previous++ = OP_BRAZERO + repeat_type;
6141           *previous++ = OP_BRA;
6142 
6143           /* We chain together the bracket offset fields that have to be
6144           filled in later when the ends of the brackets are reached. */
6145 
6146           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
6147           bralink = previous;
6148           PUTINC(previous, 0, offset);
6149           }
6150 
6151         repeat_max--;
6152         }
6153 
6154       /* If the minimum is greater than zero, replicate the group as many
6155       times as necessary, and adjust the maximum to the number of subsequent
6156       copies that we need. If we set a first char from the group, and didn't
6157       set a required char, copy the latter from the former. If there are any
6158       forward reference subroutine calls in the group, there will be entries on
6159       the workspace list; replicate these with an appropriate increment. */
6160 
6161       else
6162         {
6163         if (repeat_min > 1)
6164           {
6165           /* In the pre-compile phase, we don't actually do the replication. We
6166           just adjust the length as if we had. Do some paranoid checks for
6167           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6168           integer type when available, otherwise double. */
6169 
6170           if (lengthptr != NULL)
6171             {
6172             int delta = (repeat_min - 1)*length_prevgroup;
6173             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6174                   (INT64_OR_DOUBLE)length_prevgroup >
6175                     (INT64_OR_DOUBLE)INT_MAX ||
6176                 OFLOW_MAX - *lengthptr < delta)
6177               {
6178               *errorcodeptr = ERR20;
6179               goto FAILED;
6180               }
6181             *lengthptr += delta;
6182             }
6183 
6184           /* This is compiling for real. If there is a set first byte for
6185           the group, and we have not yet set a "required byte", set it. Make
6186           sure there is enough workspace for copying forward references before
6187           doing the copy. */
6188 
6189           else
6190             {
6191             if (groupsetfirstchar && reqcharflags < 0)
6192               {
6193               reqchar = firstchar;
6194               reqcharflags = firstcharflags;
6195               }
6196 
6197             for (i = 1; i < repeat_min; i++)
6198               {
6199               pcre_uchar *hc;
6200               size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6201               memcpy(code, previous, IN_UCHARS(len));
6202 
6203               while (cd->hwm > cd->start_workspace + cd->workspace_size -
6204                      WORK_SIZE_SAFETY_MARGIN -
6205                      (this_hwm_offset - base_hwm_offset))
6206                 {
6207                 *errorcodeptr = expand_workspace(cd);
6208                 if (*errorcodeptr != 0) goto FAILED;
6209                 }
6210 
6211               for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6212                    hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6213                    hc += LINK_SIZE)
6214                 {
6215                 PUT(cd->hwm, 0, GET(hc, 0) + len);
6216                 cd->hwm += LINK_SIZE;
6217                 }
6218               base_hwm_offset = this_hwm_offset;
6219               code += len;
6220               }
6221             }
6222           }
6223 
6224         if (repeat_max > 0) repeat_max -= repeat_min;
6225         }
6226 
6227       /* This code is common to both the zero and non-zero minimum cases. If
6228       the maximum is limited, it replicates the group in a nested fashion,
6229       remembering the bracket starts on a stack. In the case of a zero minimum,
6230       the first one was set up above. In all cases the repeat_max now specifies
6231       the number of additional copies needed. Again, we must remember to
6232       replicate entries on the forward reference list. */
6233 
6234       if (repeat_max >= 0)
6235         {
6236         /* In the pre-compile phase, we don't actually do the replication. We
6237         just adjust the length as if we had. For each repetition we must add 1
6238         to the length for BRAZERO and for all but the last repetition we must
6239         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6240         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6241         a 64-bit integer type when available, otherwise double. */
6242 
6243         if (lengthptr != NULL && repeat_max > 0)
6244           {
6245           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6246                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
6247           if ((INT64_OR_DOUBLE)repeat_max *
6248                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6249                   > (INT64_OR_DOUBLE)INT_MAX ||
6250               OFLOW_MAX - *lengthptr < delta)
6251             {
6252             *errorcodeptr = ERR20;
6253             goto FAILED;
6254             }
6255           *lengthptr += delta;
6256           }
6257 
6258         /* This is compiling for real */
6259 
6260         else for (i = repeat_max - 1; i >= 0; i--)
6261           {
6262           pcre_uchar *hc;
6263           size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6264 
6265           *code++ = OP_BRAZERO + repeat_type;
6266 
6267           /* All but the final copy start a new nesting, maintaining the
6268           chain of brackets outstanding. */
6269 
6270           if (i != 0)
6271             {
6272             int offset;
6273             *code++ = OP_BRA;
6274             offset = (bralink == NULL)? 0 : (int)(code - bralink);
6275             bralink = code;
6276             PUTINC(code, 0, offset);
6277             }
6278 
6279           memcpy(code, previous, IN_UCHARS(len));
6280 
6281           /* Ensure there is enough workspace for forward references before
6282           copying them. */
6283 
6284           while (cd->hwm > cd->start_workspace + cd->workspace_size -
6285                  WORK_SIZE_SAFETY_MARGIN -
6286                  (this_hwm_offset - base_hwm_offset))
6287             {
6288             *errorcodeptr = expand_workspace(cd);
6289             if (*errorcodeptr != 0) goto FAILED;
6290             }
6291 
6292           for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6293                hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6294                hc += LINK_SIZE)
6295             {
6296             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6297             cd->hwm += LINK_SIZE;
6298             }
6299           base_hwm_offset = this_hwm_offset;
6300           code += len;
6301           }
6302 
6303         /* Now chain through the pending brackets, and fill in their length
6304         fields (which are holding the chain links pro tem). */
6305 
6306         while (bralink != NULL)
6307           {
6308           int oldlinkoffset;
6309           int offset = (int)(code - bralink + 1);
6310           pcre_uchar *bra = code - offset;
6311           oldlinkoffset = GET(bra, 1);
6312           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6313           *code++ = OP_KET;
6314           PUTINC(code, 0, offset);
6315           PUT(bra, 1, offset);
6316           }
6317         }
6318 
6319       /* If the maximum is unlimited, set a repeater in the final copy. For
6320       ONCE brackets, that's all we need to do. However, possessively repeated
6321       ONCE brackets can be converted into non-capturing brackets, as the
6322       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6323       deal with possessive ONCEs specially.
6324 
6325       Otherwise, when we are doing the actual compile phase, check to see
6326       whether this group is one that could match an empty string. If so,
6327       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6328       that runtime checking can be done. [This check is also applied to ONCE
6329       groups at runtime, but in a different way.]
6330 
6331       Then, if the quantifier was possessive and the bracket is not a
6332       conditional, we convert the BRA code to the POS form, and the KET code to
6333       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6334       subpattern at both the start and at the end.) The use of special opcodes
6335       makes it possible to reduce greatly the stack usage in pcre_exec(). If
6336       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6337 
6338       Then, if the minimum number of matches is 1 or 0, cancel the possessive
6339       flag so that the default action below, of wrapping everything inside
6340       atomic brackets, does not happen. When the minimum is greater than 1,
6341       there will be earlier copies of the group, and so we still have to wrap
6342       the whole thing. */
6343 
6344       else
6345         {
6346         pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6347         pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6348 
6349         /* Convert possessive ONCE brackets to non-capturing */
6350 
6351         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6352             possessive_quantifier) *bracode = OP_BRA;
6353 
6354         /* For non-possessive ONCE brackets, all we need to do is to
6355         set the KET. */
6356 
6357         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6358           *ketcode = OP_KETRMAX + repeat_type;
6359 
6360         /* Handle non-ONCE brackets and possessive ONCEs (which have been
6361         converted to non-capturing above). */
6362 
6363         else
6364           {
6365           /* In the compile phase, check for empty string matching. */
6366 
6367           if (lengthptr == NULL)
6368             {
6369             pcre_uchar *scode = bracode;
6370             do
6371               {
6372               if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6373                 {
6374                 *bracode += OP_SBRA - OP_BRA;
6375                 break;
6376                 }
6377               scode += GET(scode, 1);
6378               }
6379             while (*scode == OP_ALT);
6380             }
6381 
6382           /* A conditional group with only one branch has an implicit empty
6383           alternative branch. */
6384 
6385           if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6386             *bracode = OP_SCOND;
6387 
6388           /* Handle possessive quantifiers. */
6389 
6390           if (possessive_quantifier)
6391             {
6392             /* For COND brackets, we wrap the whole thing in a possessively
6393             repeated non-capturing bracket, because we have not invented POS
6394             versions of the COND opcodes. Because we are moving code along, we
6395             must ensure that any pending recursive references are updated. */
6396 
6397             if (*bracode == OP_COND || *bracode == OP_SCOND)
6398               {
6399               int nlen = (int)(code - bracode);
6400               *code = OP_END;
6401               adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6402               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6403               code += 1 + LINK_SIZE;
6404               nlen += 1 + LINK_SIZE;
6405               *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6406               *code++ = OP_KETRPOS;
6407               PUTINC(code, 0, nlen);
6408               PUT(bracode, 1, nlen);
6409               }
6410 
6411             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6412 
6413             else
6414               {
6415               *bracode += 1;              /* Switch to xxxPOS opcodes */
6416               *ketcode = OP_KETRPOS;
6417               }
6418 
6419             /* If the minimum is zero, mark it as possessive, then unset the
6420             possessive flag when the minimum is 0 or 1. */
6421 
6422             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6423             if (repeat_min < 2) possessive_quantifier = FALSE;
6424             }
6425 
6426           /* Non-possessive quantifier */
6427 
6428           else *ketcode = OP_KETRMAX + repeat_type;
6429           }
6430         }
6431       }
6432 
6433     /* If previous is OP_FAIL, it was generated by an empty class [] in
6434     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6435     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6436     error above. We can just ignore the repeat in JS case. */
6437 
6438     else if (*previous == OP_FAIL) goto END_REPEAT;
6439 
6440     /* Else there's some kind of shambles */
6441 
6442     else
6443       {
6444       *errorcodeptr = ERR11;
6445       goto FAILED;
6446       }
6447 
6448     /* If the character following a repeat is '+', possessive_quantifier is
6449     TRUE. For some opcodes, there are special alternative opcodes for this
6450     case. For anything else, we wrap the entire repeated item inside OP_ONCE
6451     brackets. Logically, the '+' notation is just syntactic sugar, taken from
6452     Sun's Java package, but the special opcodes can optimize it.
6453 
6454     Some (but not all) possessively repeated subpatterns have already been
6455     completely handled in the code just above. For them, possessive_quantifier
6456     is always FALSE at this stage. Note that the repeated item starts at
6457     tempcode, not at previous, which might be the first part of a string whose
6458     (former) last char we repeated. */
6459 
6460     if (possessive_quantifier)
6461       {
6462       int len;
6463 
6464       /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6465       However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6466       {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6467       remains is greater than zero, there's a further opcode that can be
6468       handled. If not, do nothing, leaving the EXACT alone. */
6469 
6470       switch(*tempcode)
6471         {
6472         case OP_TYPEEXACT:
6473         tempcode += PRIV(OP_lengths)[*tempcode] +
6474           ((tempcode[1 + IMM2_SIZE] == OP_PROP
6475           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6476         break;
6477 
6478         /* CHAR opcodes are used for exacts whose count is 1. */
6479 
6480         case OP_CHAR:
6481         case OP_CHARI:
6482         case OP_NOT:
6483         case OP_NOTI:
6484         case OP_EXACT:
6485         case OP_EXACTI:
6486         case OP_NOTEXACT:
6487         case OP_NOTEXACTI:
6488         tempcode += PRIV(OP_lengths)[*tempcode];
6489 #ifdef SUPPORT_UTF
6490         if (utf && HAS_EXTRALEN(tempcode[-1]))
6491           tempcode += GET_EXTRALEN(tempcode[-1]);
6492 #endif
6493         break;
6494 
6495         /* For the class opcodes, the repeat operator appears at the end;
6496         adjust tempcode to point to it. */
6497 
6498         case OP_CLASS:
6499         case OP_NCLASS:
6500         tempcode += 1 + 32/sizeof(pcre_uchar);
6501         break;
6502 
6503 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6504         case OP_XCLASS:
6505         tempcode += GET(tempcode, 1);
6506         break;
6507 #endif
6508         }
6509 
6510       /* If tempcode is equal to code (which points to the end of the repeated
6511       item), it means we have skipped an EXACT item but there is no following
6512       QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6513       all other cases, tempcode will be pointing to the repeat opcode, and will
6514       be less than code, so the value of len will be greater than 0. */
6515 
6516       len = (int)(code - tempcode);
6517       if (len > 0)
6518         {
6519         unsigned int repcode = *tempcode;
6520 
6521         /* There is a table for possessifying opcodes, all of which are less
6522         than OP_CALLOUT. A zero entry means there is no possessified version.
6523         */
6524 
6525         if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6526           *tempcode = opcode_possessify[repcode];
6527 
6528         /* For opcode without a special possessified version, wrap the item in
6529         ONCE brackets. Because we are moving code along, we must ensure that any
6530         pending recursive references are updated. */
6531 
6532         else
6533           {
6534           *code = OP_END;
6535           adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6536           memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6537           code += 1 + LINK_SIZE;
6538           len += 1 + LINK_SIZE;
6539           tempcode[0] = OP_ONCE;
6540           *code++ = OP_KET;
6541           PUTINC(code, 0, len);
6542           PUT(tempcode, 1, len);
6543           }
6544         }
6545 
6546 #ifdef NEVER
6547       if (len > 0) switch (*tempcode)
6548         {
6549         case OP_STAR:  *tempcode = OP_POSSTAR; break;
6550         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
6551         case OP_QUERY: *tempcode = OP_POSQUERY; break;
6552         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
6553 
6554         case OP_STARI:  *tempcode = OP_POSSTARI; break;
6555         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
6556         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6557         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
6558 
6559         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
6560         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
6561         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6562         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
6563 
6564         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
6565         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
6566         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6567         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
6568 
6569         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
6570         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
6571         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6572         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6573 
6574         case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6575         case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6576         case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6577         case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6578 
6579         /* Because we are moving code along, we must ensure that any
6580         pending recursive references are updated. */
6581 
6582         default:
6583         *code = OP_END;
6584         adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6585         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6586         code += 1 + LINK_SIZE;
6587         len += 1 + LINK_SIZE;
6588         tempcode[0] = OP_ONCE;
6589         *code++ = OP_KET;
6590         PUTINC(code, 0, len);
6591         PUT(tempcode, 1, len);
6592         break;
6593         }
6594 #endif
6595       }
6596 
6597     /* In all cases we no longer have a previous item. We also set the
6598     "follows varying string" flag for subsequently encountered reqchars if
6599     it isn't already set and we have just passed a varying length item. */
6600 
6601     END_REPEAT:
6602     previous = NULL;
6603     cd->req_varyopt |= reqvary;
6604     break;
6605 
6606 
6607     /* ===================================================================*/
6608     /* Start of nested parenthesized sub-expression, or comment or lookahead or
6609     lookbehind or option setting or condition or all the other extended
6610     parenthesis forms.  */
6611 
6612     case CHAR_LEFT_PARENTHESIS:
6613     ptr++;
6614 
6615     /* Now deal with various "verbs" that can be introduced by '*'. */
6616 
6617     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6618          || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6619       {
6620       int i, namelen;
6621       int arglen = 0;
6622       const char *vn = verbnames;
6623       const pcre_uchar *name = ptr + 1;
6624       const pcre_uchar *arg = NULL;
6625       previous = NULL;
6626       ptr++;
6627       while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6628       namelen = (int)(ptr - name);
6629 
6630       /* It appears that Perl allows any characters whatsoever, other than
6631       a closing parenthesis, to appear in arguments, so we no longer insist on
6632       letters, digits, and underscores. */
6633 
6634       if (*ptr == CHAR_COLON)
6635         {
6636         arg = ++ptr;
6637         while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6638         arglen = (int)(ptr - arg);
6639         if ((unsigned int)arglen > MAX_MARK)
6640           {
6641           *errorcodeptr = ERR75;
6642           goto FAILED;
6643           }
6644         }
6645 
6646       if (*ptr != CHAR_RIGHT_PARENTHESIS)
6647         {
6648         *errorcodeptr = ERR60;
6649         goto FAILED;
6650         }
6651 
6652       /* Scan the table of verb names */
6653 
6654       for (i = 0; i < verbcount; i++)
6655         {
6656         if (namelen == verbs[i].len &&
6657             STRNCMP_UC_C8(name, vn, namelen) == 0)
6658           {
6659           int setverb;
6660 
6661           /* Check for open captures before ACCEPT and convert it to
6662           ASSERT_ACCEPT if in an assertion. */
6663 
6664           if (verbs[i].op == OP_ACCEPT)
6665             {
6666             open_capitem *oc;
6667             if (arglen != 0)
6668               {
6669               *errorcodeptr = ERR59;
6670               goto FAILED;
6671               }
6672             cd->had_accept = TRUE;
6673             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6674               {
6675               if (lengthptr != NULL)
6676                 {
6677 #ifdef COMPILE_PCRE8
6678                 *lengthptr += 1 + IMM2_SIZE;
6679 #elif defined COMPILE_PCRE16
6680                 *lengthptr += 2 + IMM2_SIZE;
6681 #elif defined COMPILE_PCRE32
6682                 *lengthptr += 4 + IMM2_SIZE;
6683 #endif
6684                 }
6685               else
6686                 {
6687                 *code++ = OP_CLOSE;
6688                 PUT2INC(code, 0, oc->number);
6689                 }
6690               }
6691             setverb = *code++ =
6692               (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6693 
6694             /* Do not set firstchar after *ACCEPT */
6695             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6696             }
6697 
6698           /* Handle other cases with/without an argument */
6699 
6700           else if (arglen == 0)
6701             {
6702             if (verbs[i].op < 0)   /* Argument is mandatory */
6703               {
6704               *errorcodeptr = ERR66;
6705               goto FAILED;
6706               }
6707             setverb = *code++ = verbs[i].op;
6708             }
6709 
6710           else
6711             {
6712             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
6713               {
6714               *errorcodeptr = ERR59;
6715               goto FAILED;
6716               }
6717             setverb = *code++ = verbs[i].op_arg;
6718             if (lengthptr != NULL)    /* In pass 1 just add in the length */
6719               {                       /* to avoid potential workspace */
6720               *lengthptr += arglen;   /* overflow. */
6721               *code++ = 0;
6722               }
6723             else
6724               {
6725               *code++ = arglen;
6726               memcpy(code, arg, IN_UCHARS(arglen));
6727               code += arglen;
6728               }
6729             *code++ = 0;
6730             }
6731 
6732           switch (setverb)
6733             {
6734             case OP_THEN:
6735             case OP_THEN_ARG:
6736             cd->external_flags |= PCRE_HASTHEN;
6737             break;
6738 
6739             case OP_PRUNE:
6740             case OP_PRUNE_ARG:
6741             case OP_SKIP:
6742             case OP_SKIP_ARG:
6743             cd->had_pruneorskip = TRUE;
6744             break;
6745             }
6746 
6747           break;  /* Found verb, exit loop */
6748           }
6749 
6750         vn += verbs[i].len + 1;
6751         }
6752 
6753       if (i < verbcount) continue;    /* Successfully handled a verb */
6754       *errorcodeptr = ERR60;          /* Verb not recognized */
6755       goto FAILED;
6756       }
6757 
6758     /* Initialize for "real" parentheses */
6759 
6760     newoptions = options;
6761     skipbytes = 0;
6762     bravalue = OP_CBRA;
6763     item_hwm_offset = cd->hwm - cd->start_workspace;
6764     reset_bracount = FALSE;
6765 
6766     /* Deal with the extended parentheses; all are introduced by '?', and the
6767     appearance of any of them means that this is not a capturing group. */
6768 
6769     if (*ptr == CHAR_QUESTION_MARK)
6770       {
6771       int i, set, unset, namelen;
6772       int *optset;
6773       const pcre_uchar *name;
6774       pcre_uchar *slot;
6775 
6776       switch (*(++ptr))
6777         {
6778         /* ------------------------------------------------------------ */
6779         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
6780         reset_bracount = TRUE;
6781         cd->dupgroups = TRUE;     /* Record (?| encountered */
6782         /* Fall through */
6783 
6784         /* ------------------------------------------------------------ */
6785         case CHAR_COLON:          /* Non-capturing bracket */
6786         bravalue = OP_BRA;
6787         ptr++;
6788         break;
6789 
6790 
6791         /* ------------------------------------------------------------ */
6792         case CHAR_LEFT_PARENTHESIS:
6793         bravalue = OP_COND;       /* Conditional group */
6794         tempptr = ptr;
6795 
6796         /* A condition can be an assertion, a number (referring to a numbered
6797         group's having been set), a name (referring to a named group), or 'R',
6798         referring to recursion. R<digits> and R&name are also permitted for
6799         recursion tests.
6800 
6801         There are ways of testing a named group: (?(name)) is used by Python;
6802         Perl 5.10 onwards uses (?(<name>) or (?('name')).
6803 
6804         There is one unfortunate ambiguity, caused by history. 'R' can be the
6805         recursive thing or the name 'R' (and similarly for 'R' followed by
6806         digits). We look for a name first; if not found, we try the other case.
6807 
6808         For compatibility with auto-callouts, we allow a callout to be
6809         specified before a condition that is an assertion. First, check for the
6810         syntax of a callout; if found, adjust the temporary pointer that is
6811         used to check for an assertion condition. That's all that is needed! */
6812 
6813         if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6814           {
6815           for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6816           if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6817             tempptr += i + 1;
6818 
6819           /* tempptr should now be pointing to the opening parenthesis of the
6820           assertion condition. */
6821 
6822           if (*tempptr != CHAR_LEFT_PARENTHESIS)
6823             {
6824             *errorcodeptr = ERR28;
6825             goto FAILED;
6826             }
6827           }
6828 
6829         /* For conditions that are assertions, check the syntax, and then exit
6830         the switch. This will take control down to where bracketed groups,
6831         including assertions, are processed. */
6832 
6833         if (tempptr[1] == CHAR_QUESTION_MARK &&
6834               (tempptr[2] == CHAR_EQUALS_SIGN ||
6835                tempptr[2] == CHAR_EXCLAMATION_MARK ||
6836                  (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6837                    (tempptr[3] == CHAR_EQUALS_SIGN ||
6838                     tempptr[3] == CHAR_EXCLAMATION_MARK))))
6839           {
6840           cd->iscondassert = TRUE;
6841           break;
6842           }
6843 
6844         /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6845         need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6846 
6847         code[1+LINK_SIZE] = OP_CREF;
6848         skipbytes = 1+IMM2_SIZE;
6849         refsign = -1;     /* => not a number */
6850         namelen = -1;     /* => not a name; must set to avoid warning */
6851         name = NULL;      /* Always set to avoid warning */
6852         recno = 0;        /* Always set to avoid warning */
6853 
6854         /* Check for a test for recursion in a named group. */
6855 
6856         ptr++;
6857         if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6858           {
6859           terminator = -1;
6860           ptr += 2;
6861           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
6862           }
6863 
6864         /* Check for a test for a named group's having been set, using the Perl
6865         syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6866         syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6867 
6868         else if (*ptr == CHAR_LESS_THAN_SIGN)
6869           {
6870           terminator = CHAR_GREATER_THAN_SIGN;
6871           ptr++;
6872           }
6873         else if (*ptr == CHAR_APOSTROPHE)
6874           {
6875           terminator = CHAR_APOSTROPHE;
6876           ptr++;
6877           }
6878         else
6879           {
6880           terminator = CHAR_NULL;
6881           if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6882             else if (IS_DIGIT(*ptr)) refsign = 0;
6883           }
6884 
6885         /* Handle a number */
6886 
6887         if (refsign >= 0)
6888           {
6889           while (IS_DIGIT(*ptr))
6890             {
6891             if (recno > INT_MAX / 10 - 1)  /* Integer overflow */
6892               {
6893               while (IS_DIGIT(*ptr)) ptr++;
6894               *errorcodeptr = ERR61;
6895               goto FAILED;
6896               }
6897             recno = recno * 10 + (int)(*ptr - CHAR_0);
6898             ptr++;
6899             }
6900           }
6901 
6902         /* Otherwise we expect to read a name; anything else is an error. When
6903         a name is one of a number of duplicates, a different opcode is used and
6904         it needs more memory. Unfortunately we cannot tell whether a name is a
6905         duplicate in the first pass, so we have to allow for more memory. */
6906 
6907         else
6908           {
6909           if (IS_DIGIT(*ptr))
6910             {
6911             *errorcodeptr = ERR84;
6912             goto FAILED;
6913             }
6914           if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6915             {
6916             *errorcodeptr = ERR28;   /* Assertion expected */
6917             goto FAILED;
6918             }
6919           name = ptr++;
6920           while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6921             {
6922             ptr++;
6923             }
6924           namelen = (int)(ptr - name);
6925           if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6926           }
6927 
6928         /* Check the terminator */
6929 
6930         if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6931             *ptr++ != CHAR_RIGHT_PARENTHESIS)
6932           {
6933           ptr--;                  /* Error offset */
6934           *errorcodeptr = ERR26;  /* Malformed number or name */
6935           goto FAILED;
6936           }
6937 
6938         /* Do no further checking in the pre-compile phase. */
6939 
6940         if (lengthptr != NULL) break;
6941 
6942         /* In the real compile we do the work of looking for the actual
6943         reference. If refsign is not negative, it means we have a number in
6944         recno. */
6945 
6946         if (refsign >= 0)
6947           {
6948           if (recno <= 0)
6949             {
6950             *errorcodeptr = ERR35;
6951             goto FAILED;
6952             }
6953           if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6954             cd->bracount - recno + 1 : recno + cd->bracount;
6955           if (recno <= 0 || recno > cd->final_bracount)
6956             {
6957             *errorcodeptr = ERR15;
6958             goto FAILED;
6959             }
6960           PUT2(code, 2+LINK_SIZE, recno);
6961           if (recno > cd->top_backref) cd->top_backref = recno;
6962           break;
6963           }
6964 
6965         /* Otherwise look for the name. */
6966 
6967         slot = cd->name_table;
6968         for (i = 0; i < cd->names_found; i++)
6969           {
6970           if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6971             slot[IMM2_SIZE+namelen] == 0) break;
6972           slot += cd->name_entry_size;
6973           }
6974 
6975         /* Found the named subpattern. If the name is duplicated, add one to
6976         the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6977         appropriate data values. Otherwise, just insert the unique subpattern
6978         number. */
6979 
6980         if (i < cd->names_found)
6981           {
6982           int offset = i++;
6983           int count = 1;
6984           recno = GET2(slot, 0);   /* Number from first found */
6985           if (recno > cd->top_backref) cd->top_backref = recno;
6986           for (; i < cd->names_found; i++)
6987             {
6988             slot += cd->name_entry_size;
6989             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6990               (slot+IMM2_SIZE)[namelen] != 0) break;
6991             count++;
6992             }
6993 
6994           if (count > 1)
6995             {
6996             PUT2(code, 2+LINK_SIZE, offset);
6997             PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6998             skipbytes += IMM2_SIZE;
6999             code[1+LINK_SIZE]++;
7000             }
7001           else  /* Not a duplicated name */
7002             {
7003             PUT2(code, 2+LINK_SIZE, recno);
7004             }
7005           }
7006 
7007         /* If terminator == CHAR_NULL it means that the name followed directly
7008         after the opening parenthesis [e.g. (?(abc)...] and in this case there
7009         are some further alternatives to try. For the cases where terminator !=
7010         CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
7011         we have now checked all the possibilities, so give an error. */
7012 
7013         else if (terminator != CHAR_NULL)
7014           {
7015           *errorcodeptr = ERR15;
7016           goto FAILED;
7017           }
7018 
7019         /* Check for (?(R) for recursion. Allow digits after R to specify a
7020         specific group number. */
7021 
7022         else if (*name == CHAR_R)
7023           {
7024           recno = 0;
7025           for (i = 1; i < namelen; i++)
7026             {
7027             if (!IS_DIGIT(name[i]))
7028               {
7029               *errorcodeptr = ERR15;
7030               goto FAILED;
7031               }
7032             if (recno > INT_MAX / 10 - 1)   /* Integer overflow */
7033               {
7034               *errorcodeptr = ERR61;
7035               goto FAILED;
7036               }
7037             recno = recno * 10 + name[i] - CHAR_0;
7038             }
7039           if (recno == 0) recno = RREF_ANY;
7040           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
7041           PUT2(code, 2+LINK_SIZE, recno);
7042           }
7043 
7044         /* Similarly, check for the (?(DEFINE) "condition", which is always
7045         false. */
7046 
7047         else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
7048           {
7049           code[1+LINK_SIZE] = OP_DEF;
7050           skipbytes = 1;
7051           }
7052 
7053         /* Reference to an unidentified subpattern. */
7054 
7055         else
7056           {
7057           *errorcodeptr = ERR15;
7058           goto FAILED;
7059           }
7060         break;
7061 
7062 
7063         /* ------------------------------------------------------------ */
7064         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
7065         bravalue = OP_ASSERT;
7066         cd->assert_depth += 1;
7067         ptr++;
7068         break;
7069 
7070         /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
7071         thing to do, but Perl allows all assertions to be quantified, and when
7072         they contain capturing parentheses there may be a potential use for
7073         this feature. Not that that applies to a quantified (?!) but we allow
7074         it for uniformity. */
7075 
7076         /* ------------------------------------------------------------ */
7077         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
7078         ptr++;
7079         if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
7080              ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
7081             (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
7082           {
7083           *code++ = OP_FAIL;
7084           previous = NULL;
7085           continue;
7086           }
7087         bravalue = OP_ASSERT_NOT;
7088         cd->assert_depth += 1;
7089         break;
7090 
7091 
7092         /* ------------------------------------------------------------ */
7093         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
7094         switch (ptr[1])
7095           {
7096           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
7097           bravalue = OP_ASSERTBACK;
7098           cd->assert_depth += 1;
7099           ptr += 2;
7100           break;
7101 
7102           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
7103           bravalue = OP_ASSERTBACK_NOT;
7104           cd->assert_depth += 1;
7105           ptr += 2;
7106           break;
7107 
7108           default:                /* Could be name define, else bad */
7109           if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
7110             goto DEFINE_NAME;
7111           ptr++;                  /* Correct offset for error */
7112           *errorcodeptr = ERR24;
7113           goto FAILED;
7114           }
7115         break;
7116 
7117 
7118         /* ------------------------------------------------------------ */
7119         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
7120         bravalue = OP_ONCE;
7121         ptr++;
7122         break;
7123 
7124 
7125         /* ------------------------------------------------------------ */
7126         case CHAR_C:                 /* Callout - may be followed by digits; */
7127         previous_callout = code;     /* Save for later completion */
7128         after_manual_callout = 1;    /* Skip one item before completing */
7129         *code++ = OP_CALLOUT;
7130           {
7131           int n = 0;
7132           ptr++;
7133           while(IS_DIGIT(*ptr))
7134             n = n * 10 + *ptr++ - CHAR_0;
7135           if (*ptr != CHAR_RIGHT_PARENTHESIS)
7136             {
7137             *errorcodeptr = ERR39;
7138             goto FAILED;
7139             }
7140           if (n > 255)
7141             {
7142             *errorcodeptr = ERR38;
7143             goto FAILED;
7144             }
7145           *code++ = n;
7146           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
7147           PUT(code, LINK_SIZE, 0);                          /* Default length */
7148           code += 2 * LINK_SIZE;
7149           }
7150         previous = NULL;
7151         continue;
7152 
7153 
7154         /* ------------------------------------------------------------ */
7155         case CHAR_P:              /* Python-style named subpattern handling */
7156         if (*(++ptr) == CHAR_EQUALS_SIGN ||
7157             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
7158           {
7159           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7160           terminator = CHAR_RIGHT_PARENTHESIS;
7161           goto NAMED_REF_OR_RECURSE;
7162           }
7163         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
7164           {
7165           *errorcodeptr = ERR41;
7166           goto FAILED;
7167           }
7168         /* Fall through to handle (?P< as (?< is handled */
7169 
7170 
7171         /* ------------------------------------------------------------ */
7172         DEFINE_NAME:    /* Come here from (?< handling */
7173         case CHAR_APOSTROPHE:
7174         terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7175           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7176         name = ++ptr;
7177         if (IS_DIGIT(*ptr))
7178           {
7179           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7180           goto FAILED;
7181           }
7182         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7183         namelen = (int)(ptr - name);
7184 
7185         /* In the pre-compile phase, do a syntax check, remember the longest
7186         name, and then remember the group in a vector, expanding it if
7187         necessary. Duplicates for the same number are skipped; other duplicates
7188         are checked for validity. In the actual compile, there is nothing to
7189         do. */
7190 
7191         if (lengthptr != NULL)
7192           {
7193           named_group *ng;
7194           pcre_uint32 number = cd->bracount + 1;
7195 
7196           if (*ptr != (pcre_uchar)terminator)
7197             {
7198             *errorcodeptr = ERR42;
7199             goto FAILED;
7200             }
7201 
7202           if (cd->names_found >= MAX_NAME_COUNT)
7203             {
7204             *errorcodeptr = ERR49;
7205             goto FAILED;
7206             }
7207 
7208           if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
7209             {
7210             cd->name_entry_size = namelen + IMM2_SIZE + 1;
7211             if (namelen > MAX_NAME_SIZE)
7212               {
7213               *errorcodeptr = ERR48;
7214               goto FAILED;
7215               }
7216             }
7217 
7218           /* Scan the list to check for duplicates. For duplicate names, if the
7219           number is the same, break the loop, which causes the name to be
7220           discarded; otherwise, if DUPNAMES is not set, give an error.
7221           If it is set, allow the name with a different number, but continue
7222           scanning in case this is a duplicate with the same number. For
7223           non-duplicate names, give an error if the number is duplicated. */
7224 
7225           ng = cd->named_groups;
7226           for (i = 0; i < cd->names_found; i++, ng++)
7227             {
7228             if (namelen == ng->length &&
7229                 STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7230               {
7231               if (ng->number == number) break;
7232               if ((options & PCRE_DUPNAMES) == 0)
7233                 {
7234                 *errorcodeptr = ERR43;
7235                 goto FAILED;
7236                 }
7237               cd->dupnames = TRUE;  /* Duplicate names exist */
7238               }
7239             else if (ng->number == number)
7240               {
7241               *errorcodeptr = ERR65;
7242               goto FAILED;
7243               }
7244             }
7245 
7246           if (i >= cd->names_found)     /* Not a duplicate with same number */
7247             {
7248             /* Increase the list size if necessary */
7249 
7250             if (cd->names_found >= cd->named_group_list_size)
7251               {
7252               int newsize = cd->named_group_list_size * 2;
7253               named_group *newspace = (PUBL(malloc))
7254                 (newsize * sizeof(named_group));
7255 
7256               if (newspace == NULL)
7257                 {
7258                 *errorcodeptr = ERR21;
7259                 goto FAILED;
7260                 }
7261 
7262               memcpy(newspace, cd->named_groups,
7263                 cd->named_group_list_size * sizeof(named_group));
7264               if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7265                 (PUBL(free))((void *)cd->named_groups);
7266               cd->named_groups = newspace;
7267               cd->named_group_list_size = newsize;
7268               }
7269 
7270             cd->named_groups[cd->names_found].name = name;
7271             cd->named_groups[cd->names_found].length = namelen;
7272             cd->named_groups[cd->names_found].number = number;
7273             cd->names_found++;
7274             }
7275           }
7276 
7277         ptr++;                    /* Move past > or ' in both passes. */
7278         goto NUMBERED_GROUP;
7279 
7280 
7281         /* ------------------------------------------------------------ */
7282         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
7283         terminator = CHAR_RIGHT_PARENTHESIS;
7284         is_recurse = TRUE;
7285         /* Fall through */
7286 
7287         /* We come here from the Python syntax above that handles both
7288         references (?P=name) and recursion (?P>name), as well as falling
7289         through from the Perl recursion syntax (?&name). We also come here from
7290         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7291         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7292 
7293         NAMED_REF_OR_RECURSE:
7294         name = ++ptr;
7295         if (IS_DIGIT(*ptr))
7296           {
7297           *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7298           goto FAILED;
7299           }
7300         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7301         namelen = (int)(ptr - name);
7302 
7303         /* In the pre-compile phase, do a syntax check. We used to just set
7304         a dummy reference number, because it was not used in the first pass.
7305         However, with the change of recursive back references to be atomic,
7306         we have to look for the number so that this state can be identified, as
7307         otherwise the incorrect length is computed. If it's not a backwards
7308         reference, the dummy number will do. */
7309 
7310         if (lengthptr != NULL)
7311           {
7312           named_group *ng;
7313           recno = 0;
7314 
7315           if (namelen == 0)
7316             {
7317             *errorcodeptr = ERR62;
7318             goto FAILED;
7319             }
7320           if (*ptr != (pcre_uchar)terminator)
7321             {
7322             *errorcodeptr = ERR42;
7323             goto FAILED;
7324             }
7325           if (namelen > MAX_NAME_SIZE)
7326             {
7327             *errorcodeptr = ERR48;
7328             goto FAILED;
7329             }
7330 
7331           /* Count named back references. */
7332 
7333           if (!is_recurse) cd->namedrefcount++;
7334 
7335           /* We have to allow for a named reference to a duplicated name (this
7336           cannot be determined until the second pass). This needs an extra
7337           16-bit data item. */
7338 
7339           *lengthptr += IMM2_SIZE;
7340 
7341           /* If this is a forward reference and we are within a (?|...) group,
7342           the reference may end up as the number of a group which we are
7343           currently inside, that is, it could be a recursive reference. In the
7344           real compile this will be picked up and the reference wrapped with
7345           OP_ONCE to make it atomic, so we must allow space in case this occurs. */
7346 
7347           /* In fact, this can happen for a non-forward reference because
7348           another group with the same number might be created later. This
7349           issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7350           only mode, we finesse the bug by allowing more memory always. */
7351 
7352           *lengthptr += 4 + 4*LINK_SIZE;
7353 
7354           /* It is even worse than that. The current reference may be to an
7355           existing named group with a different number (so apparently not
7356           recursive) but which later on is also attached to a group with the
7357           current number. This can only happen if (?| has been previously
7358           encountered. In that case, we allow yet more memory, just in case.
7359           (Again, this is fixed "properly" in PCRE2.) */
7360 
7361           if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
7362 
7363           /* Otherwise, check for recursion here. The name table does not exist
7364           in the first pass; instead we must scan the list of names encountered
7365           so far in order to get the number. If the name is not found, leave
7366           the value of recno as 0 for a forward reference. */
7367 
7368           /* This patch (removing "else") fixes a problem when a reference is
7369           to multiple identically named nested groups from within the nest.
7370           Once again, it is not the "proper" fix, and it results in an
7371           over-allocation of memory. */
7372 
7373           /* else */
7374             {
7375             ng = cd->named_groups;
7376             for (i = 0; i < cd->names_found; i++, ng++)
7377               {
7378               if (namelen == ng->length &&
7379                   STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7380                 {
7381                 open_capitem *oc;
7382                 recno = ng->number;
7383                 if (is_recurse) break;
7384                 for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7385                   {
7386                   if (oc->number == recno)
7387                     {
7388                     oc->flag = TRUE;
7389                     break;
7390                     }
7391                   }
7392                 }
7393               }
7394             }
7395           }
7396 
7397         /* In the real compile, search the name table. We check the name
7398         first, and then check that we have reached the end of the name in the
7399         table. That way, if the name is longer than any in the table, the
7400         comparison will fail without reading beyond the table entry. */
7401 
7402         else
7403           {
7404           slot = cd->name_table;
7405           for (i = 0; i < cd->names_found; i++)
7406             {
7407             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
7408                 slot[IMM2_SIZE+namelen] == 0)
7409               break;
7410             slot += cd->name_entry_size;
7411             }
7412 
7413           if (i < cd->names_found)
7414             {
7415             recno = GET2(slot, 0);
7416             }
7417           else
7418             {
7419             *errorcodeptr = ERR15;
7420             goto FAILED;
7421             }
7422           }
7423 
7424         /* In both phases, for recursions, we can now go to the code that
7425         handles numerical recursion. */
7426 
7427         if (is_recurse) goto HANDLE_RECURSION;
7428 
7429         /* In the second pass we must see if the name is duplicated. If so, we
7430         generate a different opcode. */
7431 
7432         if (lengthptr == NULL && cd->dupnames)
7433           {
7434           int count = 1;
7435           unsigned int index = i;
7436           pcre_uchar *cslot = slot + cd->name_entry_size;
7437 
7438           for (i++; i < cd->names_found; i++)
7439             {
7440             if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7441             count++;
7442             cslot += cd->name_entry_size;
7443             }
7444 
7445           if (count > 1)
7446             {
7447             if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7448             previous = code;
7449             item_hwm_offset = cd->hwm - cd->start_workspace;
7450             *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7451             PUT2INC(code, 0, index);
7452             PUT2INC(code, 0, count);
7453 
7454             /* Process each potentially referenced group. */
7455 
7456             for (; slot < cslot; slot += cd->name_entry_size)
7457               {
7458               open_capitem *oc;
7459               recno = GET2(slot, 0);
7460               cd->backref_map |= (recno < 32)? (1 << recno) : 1;
7461               if (recno > cd->top_backref) cd->top_backref = recno;
7462 
7463               /* Check to see if this back reference is recursive, that is, it
7464               is inside the group that it references. A flag is set so that the
7465               group can be made atomic. */
7466 
7467               for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7468                 {
7469                 if (oc->number == recno)
7470                   {
7471                   oc->flag = TRUE;
7472                   break;
7473                   }
7474                 }
7475               }
7476 
7477             continue;  /* End of back ref handling */
7478             }
7479           }
7480 
7481         /* First pass, or a non-duplicated name. */
7482 
7483         goto HANDLE_REFERENCE;
7484 
7485 
7486         /* ------------------------------------------------------------ */
7487         case CHAR_R:              /* Recursion, same as (?0) */
7488         recno = 0;
7489         if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7490           {
7491           *errorcodeptr = ERR29;
7492           goto FAILED;
7493           }
7494         goto HANDLE_RECURSION;
7495 
7496 
7497         /* ------------------------------------------------------------ */
7498         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
7499         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7500         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7501           {
7502           const pcre_uchar *called;
7503           terminator = CHAR_RIGHT_PARENTHESIS;
7504 
7505           /* Come here from the \g<...> and \g'...' code (Oniguruma
7506           compatibility). However, the syntax has been checked to ensure that
7507           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7508           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7509           ever be taken. */
7510 
7511           HANDLE_NUMERICAL_RECURSION:
7512 
7513           if ((refsign = *ptr) == CHAR_PLUS)
7514             {
7515             ptr++;
7516             if (!IS_DIGIT(*ptr))
7517               {
7518               *errorcodeptr = ERR63;
7519               goto FAILED;
7520               }
7521             }
7522           else if (refsign == CHAR_MINUS)
7523             {
7524             if (!IS_DIGIT(ptr[1]))
7525               goto OTHER_CHAR_AFTER_QUERY;
7526             ptr++;
7527             }
7528 
7529           recno = 0;
7530           while(IS_DIGIT(*ptr))
7531             {
7532             if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7533               {
7534               while (IS_DIGIT(*ptr)) ptr++;
7535               *errorcodeptr = ERR61;
7536               goto FAILED;
7537               }
7538             recno = recno * 10 + *ptr++ - CHAR_0;
7539             }
7540 
7541           if (*ptr != (pcre_uchar)terminator)
7542             {
7543             *errorcodeptr = ERR29;
7544             goto FAILED;
7545             }
7546 
7547           if (refsign == CHAR_MINUS)
7548             {
7549             if (recno == 0)
7550               {
7551               *errorcodeptr = ERR58;
7552               goto FAILED;
7553               }
7554             recno = cd->bracount - recno + 1;
7555             if (recno <= 0)
7556               {
7557               *errorcodeptr = ERR15;
7558               goto FAILED;
7559               }
7560             }
7561           else if (refsign == CHAR_PLUS)
7562             {
7563             if (recno == 0)
7564               {
7565               *errorcodeptr = ERR58;
7566               goto FAILED;
7567               }
7568             recno += cd->bracount;
7569             }
7570 
7571           /* Come here from code above that handles a named recursion */
7572 
7573           HANDLE_RECURSION:
7574 
7575           previous = code;
7576           item_hwm_offset = cd->hwm - cd->start_workspace;
7577           called = cd->start_code;
7578 
7579           /* When we are actually compiling, find the bracket that is being
7580           referenced. Temporarily end the regex in case it doesn't exist before
7581           this point. If we end up with a forward reference, first check that
7582           the bracket does occur later so we can give the error (and position)
7583           now. Then remember this forward reference in the workspace so it can
7584           be filled in at the end. */
7585 
7586           if (lengthptr == NULL)
7587             {
7588             *code = OP_END;
7589             if (recno != 0)
7590               called = PRIV(find_bracket)(cd->start_code, utf, recno);
7591 
7592             /* Forward reference */
7593 
7594             if (called == NULL)
7595               {
7596               if (recno > cd->final_bracount)
7597                 {
7598                 *errorcodeptr = ERR15;
7599                 goto FAILED;
7600                 }
7601 
7602               /* Fudge the value of "called" so that when it is inserted as an
7603               offset below, what is actually inserted is the reference number
7604               of the group. Then remember the forward reference. */
7605 
7606               called = cd->start_code + recno;
7607               if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7608                   WORK_SIZE_SAFETY_MARGIN)
7609                 {
7610                 *errorcodeptr = expand_workspace(cd);
7611                 if (*errorcodeptr != 0) goto FAILED;
7612                 }
7613               PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7614               }
7615 
7616             /* If not a forward reference, and the subpattern is still open,
7617             this is a recursive call. We check to see if this is a left
7618             recursion that could loop for ever, and diagnose that case. We
7619             must not, however, do this check if we are in a conditional
7620             subpattern because the condition might be testing for recursion in
7621             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7622             Forever loops are also detected at runtime, so those that occur in
7623             conditional subpatterns will be picked up then. */
7624 
7625             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7626                      could_be_empty(called, code, bcptr, utf, cd))
7627               {
7628               *errorcodeptr = ERR40;
7629               goto FAILED;
7630               }
7631             }
7632 
7633           /* Insert the recursion/subroutine item. It does not have a set first
7634           character (relevant if it is repeated, because it will then be
7635           wrapped with ONCE brackets). */
7636 
7637           *code = OP_RECURSE;
7638           PUT(code, 1, (int)(called - cd->start_code));
7639           code += 1 + LINK_SIZE;
7640           groupsetfirstchar = FALSE;
7641           }
7642 
7643         /* Can't determine a first byte now */
7644 
7645         if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7646         zerofirstchar = firstchar;
7647         zerofirstcharflags = firstcharflags;
7648         continue;
7649 
7650 
7651         /* ------------------------------------------------------------ */
7652         default:              /* Other characters: check option setting */
7653         OTHER_CHAR_AFTER_QUERY:
7654         set = unset = 0;
7655         optset = &set;
7656 
7657         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
7658           {
7659           switch (*ptr++)
7660             {
7661             case CHAR_MINUS: optset = &unset; break;
7662 
7663             case CHAR_J:    /* Record that it changed in the external options */
7664             *optset |= PCRE_DUPNAMES;
7665             cd->external_flags |= PCRE_JCHANGED;
7666             break;
7667 
7668             case CHAR_i: *optset |= PCRE_CASELESS; break;
7669             case CHAR_m: *optset |= PCRE_MULTILINE; break;
7670             case CHAR_s: *optset |= PCRE_DOTALL; break;
7671             case CHAR_x: *optset |= PCRE_EXTENDED; break;
7672             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7673             case CHAR_X: *optset |= PCRE_EXTRA; break;
7674 
7675             default:  *errorcodeptr = ERR12;
7676                       ptr--;    /* Correct the offset */
7677                       goto FAILED;
7678             }
7679           }
7680 
7681         /* Set up the changed option bits, but don't change anything yet. */
7682 
7683         newoptions = (options | set) & (~unset);
7684 
7685         /* If the options ended with ')' this is not the start of a nested
7686         group with option changes, so the options change at this level.
7687         If we are not at the pattern start, reset the greedy defaults and the
7688         case value for firstchar and reqchar. */
7689 
7690         if (*ptr == CHAR_RIGHT_PARENTHESIS)
7691           {
7692           greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7693           greedy_non_default = greedy_default ^ 1;
7694           req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7695 
7696           /* Change options at this level, and pass them back for use
7697           in subsequent branches. */
7698 
7699           *optionsptr = options = newoptions;
7700           previous = NULL;       /* This item can't be repeated */
7701           continue;              /* It is complete */
7702           }
7703 
7704         /* If the options ended with ':' we are heading into a nested group
7705         with possible change of options. Such groups are non-capturing and are
7706         not assertions of any kind. All we need to do is skip over the ':';
7707         the newoptions value is handled below. */
7708 
7709         bravalue = OP_BRA;
7710         ptr++;
7711         }     /* End of switch for character following (? */
7712       }       /* End of (? handling */
7713 
7714     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7715     is set, all unadorned brackets become non-capturing and behave like (?:...)
7716     brackets. */
7717 
7718     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7719       {
7720       bravalue = OP_BRA;
7721       }
7722 
7723     /* Else we have a capturing group. */
7724 
7725     else
7726       {
7727       NUMBERED_GROUP:
7728       cd->bracount += 1;
7729       PUT2(code, 1+LINK_SIZE, cd->bracount);
7730       skipbytes = IMM2_SIZE;
7731       }
7732 
7733     /* Process nested bracketed regex. First check for parentheses nested too
7734     deeply. */
7735 
7736     if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7737       {
7738       *errorcodeptr = ERR82;
7739       goto FAILED;
7740       }
7741 
7742     /* All assertions used not to be repeatable, but this was changed for Perl
7743     compatibility. All kinds can now be repeated except for assertions that are
7744     conditions (Perl also forbids these to be repeated). We copy code into a
7745     non-register variable (tempcode) in order to be able to pass its address
7746     because some compilers complain otherwise. At the start of a conditional
7747     group whose condition is an assertion, cd->iscondassert is set. We unset it
7748     here so as to allow assertions later in the group to be quantified. */
7749 
7750     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7751         cd->iscondassert)
7752       {
7753       previous = NULL;
7754       cd->iscondassert = FALSE;
7755       }
7756     else
7757       {
7758       previous = code;
7759       item_hwm_offset = cd->hwm - cd->start_workspace;
7760       }
7761 
7762     *code = bravalue;
7763     tempcode = code;
7764     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
7765     tempbracount = cd->bracount;          /* Save value before bracket */
7766     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
7767 
7768     if (!compile_regex(
7769          newoptions,                      /* The complete new option state */
7770          &tempcode,                       /* Where to put code (updated) */
7771          &ptr,                            /* Input pointer (updated) */
7772          errorcodeptr,                    /* Where to put an error message */
7773          (bravalue == OP_ASSERTBACK ||
7774           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7775          reset_bracount,                  /* True if (?| group */
7776          skipbytes,                       /* Skip over bracket number */
7777          cond_depth +
7778            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
7779          &subfirstchar,                   /* For possible first char */
7780          &subfirstcharflags,
7781          &subreqchar,                     /* For possible last char */
7782          &subreqcharflags,
7783          bcptr,                           /* Current branch chain */
7784          cd,                              /* Tables block */
7785          (lengthptr == NULL)? NULL :      /* Actual compile phase */
7786            &length_prevgroup              /* Pre-compile phase */
7787          ))
7788       goto FAILED;
7789 
7790     cd->parens_depth -= 1;
7791 
7792     /* If this was an atomic group and there are no capturing groups within it,
7793     generate OP_ONCE_NC instead of OP_ONCE. */
7794 
7795     if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7796       *code = OP_ONCE_NC;
7797 
7798     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7799       cd->assert_depth -= 1;
7800 
7801     /* At the end of compiling, code is still pointing to the start of the
7802     group, while tempcode has been updated to point past the end of the group.
7803     The pattern pointer (ptr) is on the bracket.
7804 
7805     If this is a conditional bracket, check that there are no more than
7806     two branches in the group, or just one if it's a DEFINE group. We do this
7807     in the real compile phase, not in the pre-pass, where the whole group may
7808     not be available. */
7809 
7810     if (bravalue == OP_COND && lengthptr == NULL)
7811       {
7812       pcre_uchar *tc = code;
7813       int condcount = 0;
7814 
7815       do {
7816          condcount++;
7817          tc += GET(tc,1);
7818          }
7819       while (*tc != OP_KET);
7820 
7821       /* A DEFINE group is never obeyed inline (the "condition" is always
7822       false). It must have only one branch. */
7823 
7824       if (code[LINK_SIZE+1] == OP_DEF)
7825         {
7826         if (condcount > 1)
7827           {
7828           *errorcodeptr = ERR54;
7829           goto FAILED;
7830           }
7831         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
7832         }
7833 
7834       /* A "normal" conditional group. If there is just one branch, we must not
7835       make use of its firstchar or reqchar, because this is equivalent to an
7836       empty second branch. */
7837 
7838       else
7839         {
7840         if (condcount > 2)
7841           {
7842           *errorcodeptr = ERR27;
7843           goto FAILED;
7844           }
7845         if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7846         }
7847       }
7848 
7849     /* Error if hit end of pattern */
7850 
7851     if (*ptr != CHAR_RIGHT_PARENTHESIS)
7852       {
7853       *errorcodeptr = ERR14;
7854       goto FAILED;
7855       }
7856 
7857     /* In the pre-compile phase, update the length by the length of the group,
7858     less the brackets at either end. Then reduce the compiled code to just a
7859     set of non-capturing brackets so that it doesn't use much memory if it is
7860     duplicated by a quantifier.*/
7861 
7862     if (lengthptr != NULL)
7863       {
7864       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7865         {
7866         *errorcodeptr = ERR20;
7867         goto FAILED;
7868         }
7869       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7870       code++;   /* This already contains bravalue */
7871       PUTINC(code, 0, 1 + LINK_SIZE);
7872       *code++ = OP_KET;
7873       PUTINC(code, 0, 1 + LINK_SIZE);
7874       break;    /* No need to waste time with special character handling */
7875       }
7876 
7877     /* Otherwise update the main code pointer to the end of the group. */
7878 
7879     code = tempcode;
7880 
7881     /* For a DEFINE group, required and first character settings are not
7882     relevant. */
7883 
7884     if (bravalue == OP_DEF) break;
7885 
7886     /* Handle updating of the required and first characters for other types of
7887     group. Update for normal brackets of all kinds, and conditions with two
7888     branches (see code above). If the bracket is followed by a quantifier with
7889     zero repeat, we have to back off. Hence the definition of zeroreqchar and
7890     zerofirstchar outside the main loop so that they can be accessed for the
7891     back off. */
7892 
7893     zeroreqchar = reqchar;
7894     zeroreqcharflags = reqcharflags;
7895     zerofirstchar = firstchar;
7896     zerofirstcharflags = firstcharflags;
7897     groupsetfirstchar = FALSE;
7898 
7899     if (bravalue >= OP_ONCE)
7900       {
7901       /* If we have not yet set a firstchar in this branch, take it from the
7902       subpattern, remembering that it was set here so that a repeat of more
7903       than one can replicate it as reqchar if necessary. If the subpattern has
7904       no firstchar, set "none" for the whole branch. In both cases, a zero
7905       repeat forces firstchar to "none". */
7906 
7907       if (firstcharflags == REQ_UNSET)
7908         {
7909         if (subfirstcharflags >= 0)
7910           {
7911           firstchar = subfirstchar;
7912           firstcharflags = subfirstcharflags;
7913           groupsetfirstchar = TRUE;
7914           }
7915         else firstcharflags = REQ_NONE;
7916         zerofirstcharflags = REQ_NONE;
7917         }
7918 
7919       /* If firstchar was previously set, convert the subpattern's firstchar
7920       into reqchar if there wasn't one, using the vary flag that was in
7921       existence beforehand. */
7922 
7923       else if (subfirstcharflags >= 0 && subreqcharflags < 0)
7924         {
7925         subreqchar = subfirstchar;
7926         subreqcharflags = subfirstcharflags | tempreqvary;
7927         }
7928 
7929       /* If the subpattern set a required byte (or set a first byte that isn't
7930       really the first byte - see above), set it. */
7931 
7932       if (subreqcharflags >= 0)
7933         {
7934         reqchar = subreqchar;
7935         reqcharflags = subreqcharflags;
7936         }
7937       }
7938 
7939     /* For a forward assertion, we take the reqchar, if set, provided that the
7940     group has also set a first char. This can be helpful if the pattern that
7941     follows the assertion doesn't set a different char. For example, it's
7942     useful for /(?=abcde).+/. We can't set firstchar for an assertion, however
7943     because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7944     the "real" "a" would then become a reqchar instead of a firstchar. This is
7945     overcome by a scan at the end if there's no firstchar, looking for an
7946     asserted first char. */
7947 
7948     else if (bravalue == OP_ASSERT && subreqcharflags >= 0 &&
7949              subfirstcharflags >= 0)
7950       {
7951       reqchar = subreqchar;
7952       reqcharflags = subreqcharflags;
7953       }
7954     break;     /* End of processing '(' */
7955 
7956 
7957     /* ===================================================================*/
7958     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7959     are arranged to be the negation of the corresponding OP_values in the
7960     default case when PCRE_UCP is not set. For the back references, the values
7961     are negative the reference number. Only back references and those types
7962     that consume a character may be repeated. We can test for values between
7963     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7964     ever created. */
7965 
7966     case CHAR_BACKSLASH:
7967     tempptr = ptr;
7968     escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7969     if (*errorcodeptr != 0) goto FAILED;
7970 
7971     if (escape == 0)                  /* The escape coded a single character */
7972       c = ec;
7973     else
7974       {
7975       /* For metasequences that actually match a character, we disable the
7976       setting of a first character if it hasn't already been set. */
7977 
7978       if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7979         firstcharflags = REQ_NONE;
7980 
7981       /* Set values to reset to if this is followed by a zero repeat. */
7982 
7983       zerofirstchar = firstchar;
7984       zerofirstcharflags = firstcharflags;
7985       zeroreqchar = reqchar;
7986       zeroreqcharflags = reqcharflags;
7987 
7988       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7989       is a subroutine call by number (Oniguruma syntax). In fact, the value
7990       ESC_g is returned only for these cases. So we don't need to check for <
7991       or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7992       -n, and for the Perl syntax \g{name} the result is ESC_k (as
7993       that is a synonym for a named back reference). */
7994 
7995       if (escape == ESC_g)
7996         {
7997         const pcre_uchar *p;
7998         pcre_uint32 cf;
7999 
8000         item_hwm_offset = cd->hwm - cd->start_workspace;   /* Normally this is set when '(' is read */
8001         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
8002           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
8003 
8004         /* These two statements stop the compiler from warning about possibly
8005         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
8006         fact, because we do the check for a number below, the paths that
8007         would actually be in error are never taken. */
8008 
8009         skipbytes = 0;
8010         reset_bracount = FALSE;
8011 
8012         /* If it's not a signed or unsigned number, treat it as a name. */
8013 
8014         cf = ptr[1];
8015         if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
8016           {
8017           is_recurse = TRUE;
8018           goto NAMED_REF_OR_RECURSE;
8019           }
8020 
8021         /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
8022         or a digit. */
8023 
8024         p = ptr + 2;
8025         while (IS_DIGIT(*p)) p++;
8026         if (*p != (pcre_uchar)terminator)
8027           {
8028           *errorcodeptr = ERR57;
8029           goto FAILED;
8030           }
8031         ptr++;
8032         goto HANDLE_NUMERICAL_RECURSION;
8033         }
8034 
8035       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
8036       We also support \k{name} (.NET syntax).  */
8037 
8038       if (escape == ESC_k)
8039         {
8040         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
8041           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
8042           {
8043           *errorcodeptr = ERR69;
8044           goto FAILED;
8045           }
8046         is_recurse = FALSE;
8047         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
8048           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
8049           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
8050         goto NAMED_REF_OR_RECURSE;
8051         }
8052 
8053       /* Back references are handled specially; must disable firstchar if
8054       not set to cope with cases like (?=(\w+))\1: which would otherwise set
8055       ':' later. */
8056 
8057       if (escape < 0)
8058         {
8059         open_capitem *oc;
8060         recno = -escape;
8061 
8062         /* Come here from named backref handling when the reference is to a
8063         single group (i.e. not to a duplicated name). */
8064 
8065         HANDLE_REFERENCE:
8066         if (firstcharflags == REQ_UNSET) zerofirstcharflags = firstcharflags = REQ_NONE;
8067         previous = code;
8068         item_hwm_offset = cd->hwm - cd->start_workspace;
8069         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
8070         PUT2INC(code, 0, recno);
8071         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
8072         if (recno > cd->top_backref) cd->top_backref = recno;
8073 
8074         /* Check to see if this back reference is recursive, that is, it
8075         is inside the group that it references. A flag is set so that the
8076         group can be made atomic. */
8077 
8078         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
8079           {
8080           if (oc->number == recno)
8081             {
8082             oc->flag = TRUE;
8083             break;
8084             }
8085           }
8086         }
8087 
8088       /* So are Unicode property matches, if supported. */
8089 
8090 #ifdef SUPPORT_UCP
8091       else if (escape == ESC_P || escape == ESC_p)
8092         {
8093         BOOL negated;
8094         unsigned int ptype = 0, pdata = 0;
8095         if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8096           goto FAILED;
8097         previous = code;
8098         item_hwm_offset = cd->hwm - cd->start_workspace;
8099         *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8100         *code++ = ptype;
8101         *code++ = pdata;
8102         }
8103 #else
8104 
8105       /* If Unicode properties are not supported, \X, \P, and \p are not
8106       allowed. */
8107 
8108       else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
8109         {
8110         *errorcodeptr = ERR45;
8111         goto FAILED;
8112         }
8113 #endif
8114 
8115       /* For the rest (including \X when Unicode properties are supported), we
8116       can obtain the OP value by negating the escape value in the default
8117       situation when PCRE_UCP is not set. When it *is* set, we substitute
8118       Unicode property tests. Note that \b and \B do a one-character
8119       lookbehind, and \A also behaves as if it does. */
8120 
8121       else
8122         {
8123         if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
8124              cd->max_lookbehind == 0)
8125           cd->max_lookbehind = 1;
8126 #ifdef SUPPORT_UCP
8127         if (escape >= ESC_DU && escape <= ESC_wu)
8128           {
8129           nestptr = ptr + 1;                   /* Where to resume */
8130           ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
8131           }
8132         else
8133 #endif
8134         /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
8135         so that it works in DFA mode and in lookbehinds. */
8136 
8137           {
8138           previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
8139           item_hwm_offset = cd->hwm - cd->start_workspace;
8140           *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
8141           }
8142         }
8143       continue;
8144       }
8145 
8146     /* We have a data character whose value is in c. In UTF-8 mode it may have
8147     a value > 127. We set its representation in the length/buffer, and then
8148     handle it as a data character. */
8149 
8150 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
8151     if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
8152       mclength = PRIV(ord2utf)(c, mcbuffer);
8153     else
8154 #endif
8155 
8156      {
8157      mcbuffer[0] = c;
8158      mclength = 1;
8159      }
8160     goto ONE_CHAR;
8161 
8162 
8163     /* ===================================================================*/
8164     /* Handle a literal character. It is guaranteed not to be whitespace or #
8165     when the extended flag is set. If we are in a UTF mode, it may be a
8166     multi-unit literal character. */
8167 
8168     default:
8169     NORMAL_CHAR:
8170     mclength = 1;
8171     mcbuffer[0] = c;
8172 
8173 #ifdef SUPPORT_UTF
8174     if (utf && HAS_EXTRALEN(c))
8175       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
8176 #endif
8177 
8178     /* At this point we have the character's bytes in mcbuffer, and the length
8179     in mclength. When not in UTF-8 mode, the length is always 1. */
8180 
8181     ONE_CHAR:
8182     previous = code;
8183     item_hwm_offset = cd->hwm - cd->start_workspace;
8184 
8185     /* For caseless UTF-8 mode when UCP support is available, check whether
8186     this character has more than one other case. If so, generate a special
8187     OP_PROP item instead of OP_CHARI. */
8188 
8189 #ifdef SUPPORT_UCP
8190     if (utf && (options & PCRE_CASELESS) != 0)
8191       {
8192       GETCHAR(c, mcbuffer);
8193       if ((c = UCD_CASESET(c)) != 0)
8194         {
8195         *code++ = OP_PROP;
8196         *code++ = PT_CLIST;
8197         *code++ = c;
8198         if (firstcharflags == REQ_UNSET)
8199           firstcharflags = zerofirstcharflags = REQ_NONE;
8200         break;
8201         }
8202       }
8203 #endif
8204 
8205     /* Caseful matches, or not one of the multicase characters. */
8206 
8207     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8208     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
8209 
8210     /* Remember if \r or \n were seen */
8211 
8212     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8213       cd->external_flags |= PCRE_HASCRORLF;
8214 
8215     /* Set the first and required bytes appropriately. If no previous first
8216     byte, set it from this character, but revert to none on a zero repeat.
8217     Otherwise, leave the firstchar value alone, and don't change it on a zero
8218     repeat. */
8219 
8220     if (firstcharflags == REQ_UNSET)
8221       {
8222       zerofirstcharflags = REQ_NONE;
8223       zeroreqchar = reqchar;
8224       zeroreqcharflags = reqcharflags;
8225 
8226       /* If the character is more than one byte long, we can set firstchar
8227       only if it is not to be matched caselessly. */
8228 
8229       if (mclength == 1 || req_caseopt == 0)
8230         {
8231         firstchar = mcbuffer[0];
8232         firstcharflags = req_caseopt;
8233 
8234         if (mclength != 1)
8235           {
8236           reqchar = code[-1];
8237           reqcharflags = cd->req_varyopt;
8238           }
8239         }
8240       else firstcharflags = reqcharflags = REQ_NONE;
8241       }
8242 
8243     /* firstchar was previously set; we can set reqchar only if the length is
8244     1 or the matching is caseful. */
8245 
8246     else
8247       {
8248       zerofirstchar = firstchar;
8249       zerofirstcharflags = firstcharflags;
8250       zeroreqchar = reqchar;
8251       zeroreqcharflags = reqcharflags;
8252       if (mclength == 1 || req_caseopt == 0)
8253         {
8254         reqchar = code[-1];
8255         reqcharflags = req_caseopt | cd->req_varyopt;
8256         }
8257       }
8258 
8259     break;            /* End of literal character handling */
8260     }
8261   }                   /* end of big loop */
8262 
8263 
8264 /* Control never reaches here by falling through, only by a goto for all the
8265 error states. Pass back the position in the pattern so that it can be displayed
8266 to the user for diagnosing the error. */
8267 
8268 FAILED:
8269 *ptrptr = ptr;
8270 return FALSE;
8271 }
8272 
8273 
8274 
8275 /*************************************************
8276 *     Compile sequence of alternatives           *
8277 *************************************************/
8278 
8279 /* On entry, ptr is pointing past the bracket character, but on return it
8280 points to the closing bracket, or vertical bar, or end of string. The code
8281 variable is pointing at the byte into which the BRA operator has been stored.
8282 This function is used during the pre-compile phase when we are trying to find
8283 out the amount of memory needed, as well as during the real compile phase. The
8284 value of lengthptr distinguishes the two phases.
8285 
8286 Arguments:
8287   options           option bits, including any changes for this subpattern
8288   codeptr           -> the address of the current code pointer
8289   ptrptr            -> the address of the current pattern pointer
8290   errorcodeptr      -> pointer to error code variable
8291   lookbehind        TRUE if this is a lookbehind assertion
8292   reset_bracount    TRUE to reset the count for each branch
8293   skipbytes         skip this many bytes at start (for brackets and OP_COND)
8294   cond_depth        depth of nesting for conditional subpatterns
8295   firstcharptr      place to put the first required character
8296   firstcharflagsptr place to put the first character flags, or a negative number
8297   reqcharptr        place to put the last required character
8298   reqcharflagsptr   place to put the last required character flags, or a negative number
8299   bcptr             pointer to the chain of currently open branches
8300   cd                points to the data block with tables pointers etc.
8301   lengthptr         NULL during the real compile phase
8302                     points to length accumulator during pre-compile phase
8303 
8304 Returns:            TRUE on success
8305 */
8306 
8307 static BOOL
8308 compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
8309   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
8310   int cond_depth,
8311   pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
8312   pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
8313   branch_chain *bcptr, compile_data *cd, int *lengthptr)
8314 {
8315 const pcre_uchar *ptr = *ptrptr;
8316 pcre_uchar *code = *codeptr;
8317 pcre_uchar *last_branch = code;
8318 pcre_uchar *start_bracket = code;
8319 pcre_uchar *reverse_count = NULL;
8320 open_capitem capitem;
8321 int capnumber = 0;
8322 pcre_uint32 firstchar, reqchar;
8323 pcre_int32 firstcharflags, reqcharflags;
8324 pcre_uint32 branchfirstchar, branchreqchar;
8325 pcre_int32 branchfirstcharflags, branchreqcharflags;
8326 int length;
8327 unsigned int orig_bracount;
8328 unsigned int max_bracount;
8329 branch_chain bc;
8330 size_t save_hwm_offset;
8331 
8332 /* If set, call the external function that checks for stack availability. */
8333 
8334 if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
8335   {
8336   *errorcodeptr= ERR85;
8337   return FALSE;
8338   }
8339 
8340 /* Miscellaneous initialization */
8341 
8342 bc.outer = bcptr;
8343 bc.current_branch = code;
8344 
8345 firstchar = reqchar = 0;
8346 firstcharflags = reqcharflags = REQ_UNSET;
8347 
8348 save_hwm_offset = cd->hwm - cd->start_workspace;
8349 
8350 /* Accumulate the length for use in the pre-compile phase. Start with the
8351 length of the BRA and KET and any extra bytes that are required at the
8352 beginning. We accumulate in a local variable to save frequent testing of
8353 lengthptr for NULL. We cannot do this by looking at the value of code at the
8354 start and end of each alternative, because compiled items are discarded during
8355 the pre-compile phase so that the work space is not exceeded. */
8356 
8357 length = 2 + 2*LINK_SIZE + skipbytes;
8358 
8359 /* WARNING: If the above line is changed for any reason, you must also change
8360 the code that abstracts option settings at the start of the pattern and makes
8361 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
8362 pre-compile phase to find out whether anything has yet been compiled or not. */
8363 
8364 /* If this is a capturing subpattern, add to the chain of open capturing items
8365 so that we can detect them if (*ACCEPT) is encountered. This is also used to
8366 detect groups that contain recursive back references to themselves. Note that
8367 only OP_CBRA need be tested here; changing this opcode to one of its variants,
8368 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
8369 
8370 if (*code == OP_CBRA)
8371   {
8372   capnumber = GET2(code, 1 + LINK_SIZE);
8373   capitem.number = capnumber;
8374   capitem.next = cd->open_caps;
8375   capitem.flag = FALSE;
8376   cd->open_caps = &capitem;
8377   }
8378 
8379 /* Offset is set zero to mark that this bracket is still open */
8380 
8381 PUT(code, 1, 0);
8382 code += 1 + LINK_SIZE + skipbytes;
8383 
8384 /* Loop for each alternative branch */
8385 
8386 orig_bracount = max_bracount = cd->bracount;
8387 for (;;)
8388   {
8389   /* For a (?| group, reset the capturing bracket count so that each branch
8390   uses the same numbers. */
8391 
8392   if (reset_bracount) cd->bracount = orig_bracount;
8393 
8394   /* Set up dummy OP_REVERSE if lookbehind assertion */
8395 
8396   if (lookbehind)
8397     {
8398     *code++ = OP_REVERSE;
8399     reverse_count = code;
8400     PUTINC(code, 0, 0);
8401     length += 1 + LINK_SIZE;
8402     }
8403 
8404   /* Now compile the branch; in the pre-compile phase its length gets added
8405   into the length. */
8406 
8407   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
8408         &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
8409         cond_depth, cd, (lengthptr == NULL)? NULL : &length))
8410     {
8411     *ptrptr = ptr;
8412     return FALSE;
8413     }
8414 
8415   /* Keep the highest bracket count in case (?| was used and some branch
8416   has fewer than the rest. */
8417 
8418   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
8419 
8420   /* In the real compile phase, there is some post-processing to be done. */
8421 
8422   if (lengthptr == NULL)
8423     {
8424     /* If this is the first branch, the firstchar and reqchar values for the
8425     branch become the values for the regex. */
8426 
8427     if (*last_branch != OP_ALT)
8428       {
8429       firstchar = branchfirstchar;
8430       firstcharflags = branchfirstcharflags;
8431       reqchar = branchreqchar;
8432       reqcharflags = branchreqcharflags;
8433       }
8434 
8435     /* If this is not the first branch, the first char and reqchar have to
8436     match the values from all the previous branches, except that if the
8437     previous value for reqchar didn't have REQ_VARY set, it can still match,
8438     and we set REQ_VARY for the regex. */
8439 
8440     else
8441       {
8442       /* If we previously had a firstchar, but it doesn't match the new branch,
8443       we have to abandon the firstchar for the regex, but if there was
8444       previously no reqchar, it takes on the value of the old firstchar. */
8445 
8446       if (firstcharflags >= 0 &&
8447           (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
8448         {
8449         if (reqcharflags < 0)
8450           {
8451           reqchar = firstchar;
8452           reqcharflags = firstcharflags;
8453           }
8454         firstcharflags = REQ_NONE;
8455         }
8456 
8457       /* If we (now or from before) have no firstchar, a firstchar from the
8458       branch becomes a reqchar if there isn't a branch reqchar. */
8459 
8460       if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
8461         {
8462         branchreqchar = branchfirstchar;
8463         branchreqcharflags = branchfirstcharflags;
8464         }
8465 
8466       /* Now ensure that the reqchars match */
8467 
8468       if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
8469           reqchar != branchreqchar)
8470         reqcharflags = REQ_NONE;
8471       else
8472         {
8473         reqchar = branchreqchar;
8474         reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
8475         }
8476       }
8477 
8478     /* If lookbehind, check that this branch matches a fixed-length string, and
8479     put the length into the OP_REVERSE item. Temporarily mark the end of the
8480     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
8481     because there may be forward references that we can't check here. Set a
8482     flag to cause another lookbehind check at the end. Why not do it all at the
8483     end? Because common, erroneous checks are picked up here and the offset of
8484     the problem can be shown. */
8485 
8486     if (lookbehind)
8487       {
8488       int fixed_length;
8489       *code = OP_END;
8490       fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
8491         FALSE, cd, NULL);
8492       DPRINTF(("fixed length = %d\n", fixed_length));
8493       if (fixed_length == -3)
8494         {
8495         cd->check_lookbehind = TRUE;
8496         }
8497       else if (fixed_length < 0)
8498         {
8499         *errorcodeptr = (fixed_length == -2)? ERR36 :
8500                         (fixed_length == -4)? ERR70: ERR25;
8501         *ptrptr = ptr;
8502         return FALSE;
8503         }
8504       else
8505         {
8506         if (fixed_length > cd->max_lookbehind)
8507           cd->max_lookbehind = fixed_length;
8508         PUT(reverse_count, 0, fixed_length);
8509         }
8510       }
8511     }
8512 
8513   /* Reached end of expression, either ')' or end of pattern. In the real
8514   compile phase, go back through the alternative branches and reverse the chain
8515   of offsets, with the field in the BRA item now becoming an offset to the
8516   first alternative. If there are no alternatives, it points to the end of the
8517   group. The length in the terminating ket is always the length of the whole
8518   bracketed item. Return leaving the pointer at the terminating char. */
8519 
8520   if (*ptr != CHAR_VERTICAL_LINE)
8521     {
8522     if (lengthptr == NULL)
8523       {
8524       int branch_length = (int)(code - last_branch);
8525       do
8526         {
8527         int prev_length = GET(last_branch, 1);
8528         PUT(last_branch, 1, branch_length);
8529         branch_length = prev_length;
8530         last_branch -= branch_length;
8531         }
8532       while (branch_length > 0);
8533       }
8534 
8535     /* Fill in the ket */
8536 
8537     *code = OP_KET;
8538     PUT(code, 1, (int)(code - start_bracket));
8539     code += 1 + LINK_SIZE;
8540 
8541     /* If it was a capturing subpattern, check to see if it contained any
8542     recursive back references. If so, we must wrap it in atomic brackets.
8543     Because we are moving code along, we must ensure that any pending recursive
8544     references are updated. In any event, remove the block from the chain. */
8545 
8546     if (capnumber > 0)
8547       {
8548       if (cd->open_caps->flag)
8549         {
8550         *code = OP_END;
8551         adjust_recurse(start_bracket, 1 + LINK_SIZE,
8552           (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
8553         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8554           IN_UCHARS(code - start_bracket));
8555         *start_bracket = OP_ONCE;
8556         code += 1 + LINK_SIZE;
8557         PUT(start_bracket, 1, (int)(code - start_bracket));
8558         *code = OP_KET;
8559         PUT(code, 1, (int)(code - start_bracket));
8560         code += 1 + LINK_SIZE;
8561         length += 2 + 2*LINK_SIZE;
8562         }
8563       cd->open_caps = cd->open_caps->next;
8564       }
8565 
8566     /* Retain the highest bracket number, in case resetting was used. */
8567 
8568     cd->bracount = max_bracount;
8569 
8570     /* Set values to pass back */
8571 
8572     *codeptr = code;
8573     *ptrptr = ptr;
8574     *firstcharptr = firstchar;
8575     *firstcharflagsptr = firstcharflags;
8576     *reqcharptr = reqchar;
8577     *reqcharflagsptr = reqcharflags;
8578     if (lengthptr != NULL)
8579       {
8580       if (OFLOW_MAX - *lengthptr < length)
8581         {
8582         *errorcodeptr = ERR20;
8583         return FALSE;
8584         }
8585       *lengthptr += length;
8586       }
8587     return TRUE;
8588     }
8589 
8590   /* Another branch follows. In the pre-compile phase, we can move the code
8591   pointer back to where it was for the start of the first branch. (That is,
8592   pretend that each branch is the only one.)
8593 
8594   In the real compile phase, insert an ALT node. Its length field points back
8595   to the previous branch while the bracket remains open. At the end the chain
8596   is reversed. It's done like this so that the start of the bracket has a
8597   zero offset until it is closed, making it possible to detect recursion. */
8598 
8599   if (lengthptr != NULL)
8600     {
8601     code = *codeptr + 1 + LINK_SIZE + skipbytes;
8602     length += 1 + LINK_SIZE;
8603     }
8604   else
8605     {
8606     *code = OP_ALT;
8607     PUT(code, 1, (int)(code - last_branch));
8608     bc.current_branch = last_branch = code;
8609     code += 1 + LINK_SIZE;
8610     }
8611 
8612   ptr++;
8613   }
8614 /* Control never reaches here */
8615 }
8616 
8617 
8618 
8619 
8620 /*************************************************
8621 *          Check for anchored expression         *
8622 *************************************************/
8623 
8624 /* Try to find out if this is an anchored regular expression. Consider each
8625 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8626 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8627 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8628 be found, because ^ generates OP_CIRCM in that mode.
8629 
8630 We can also consider a regex to be anchored if OP_SOM starts all its branches.
8631 This is the code for \G, which means "match at start of match position, taking
8632 into account the match offset".
8633 
8634 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8635 because that will try the rest of the pattern at all possible matching points,
8636 so there is no point trying again.... er ....
8637 
8638 .... except when the .* appears inside capturing parentheses, and there is a
8639 subsequent back reference to those parentheses. We haven't enough information
8640 to catch that case precisely.
8641 
8642 At first, the best we could do was to detect when .* was in capturing brackets
8643 and the highest back reference was greater than or equal to that level.
8644 However, by keeping a bitmap of the first 31 back references, we can catch some
8645 of the more common cases more precisely.
8646 
8647 ... A second exception is when the .* appears inside an atomic group, because
8648 this prevents the number of characters it matches from being adjusted.
8649 
8650 Arguments:
8651   code           points to start of expression (the bracket)
8652   bracket_map    a bitmap of which brackets we are inside while testing; this
8653                   handles up to substring 31; after that we just have to take
8654                   the less precise approach
8655   cd             points to the compile data block
8656   atomcount      atomic group level
8657 
8658 Returns:     TRUE or FALSE
8659 */
8660 
8661 static BOOL
is_anchored(register const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount)8662 is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8663   compile_data *cd, int atomcount)
8664 {
8665 do {
8666    const pcre_uchar *scode = first_significant_code(
8667      code + PRIV(OP_lengths)[*code], FALSE);
8668    register int op = *scode;
8669 
8670    /* Non-capturing brackets */
8671 
8672    if (op == OP_BRA  || op == OP_BRAPOS ||
8673        op == OP_SBRA || op == OP_SBRAPOS)
8674      {
8675      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8676      }
8677 
8678    /* Capturing brackets */
8679 
8680    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8681             op == OP_SCBRA || op == OP_SCBRAPOS)
8682      {
8683      int n = GET2(scode, 1+LINK_SIZE);
8684      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8685      if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8686      }
8687 
8688    /* Positive forward assertion */
8689 
8690    else if (op == OP_ASSERT)
8691      {
8692      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8693      }
8694 
8695    /* Condition; not anchored if no second branch */
8696 
8697    else if (op == OP_COND)
8698      {
8699      if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8700      if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8701      }
8702 
8703    /* Atomic groups */
8704 
8705    else if (op == OP_ONCE || op == OP_ONCE_NC)
8706      {
8707      if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8708        return FALSE;
8709      }
8710 
8711    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8712    it isn't in brackets that are or may be referenced or inside an atomic
8713    group. */
8714 
8715    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8716              op == OP_TYPEPOSSTAR))
8717      {
8718      if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
8719          atomcount > 0 || cd->had_pruneorskip)
8720        return FALSE;
8721      }
8722 
8723    /* Check for explicit anchoring */
8724 
8725    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8726 
8727    code += GET(code, 1);
8728    }
8729 while (*code == OP_ALT);   /* Loop for each alternative */
8730 return TRUE;
8731 }
8732 
8733 
8734 
8735 /*************************************************
8736 *         Check for starting with ^ or .*        *
8737 *************************************************/
8738 
8739 /* This is called to find out if every branch starts with ^ or .* so that
8740 "first char" processing can be done to speed things up in multiline
8741 matching and for non-DOTALL patterns that start with .* (which must start at
8742 the beginning or after \n). As in the case of is_anchored() (see above), we
8743 have to take account of back references to capturing brackets that contain .*
8744 because in that case we can't make the assumption. Also, the appearance of .*
8745 inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8746 or *SKIP does not count, because once again the assumption no longer holds.
8747 
8748 Arguments:
8749   code           points to start of expression (the bracket)
8750   bracket_map    a bitmap of which brackets we are inside while testing; this
8751                   handles up to substring 31; after that we just have to take
8752                   the less precise approach
8753   cd             points to the compile data
8754   atomcount      atomic group level
8755   inassert       TRUE if in an assertion
8756 
8757 Returns:         TRUE or FALSE
8758 */
8759 
8760 static BOOL
is_startline(const pcre_uchar * code,unsigned int bracket_map,compile_data * cd,int atomcount,BOOL inassert)8761 is_startline(const pcre_uchar *code, unsigned int bracket_map,
8762   compile_data *cd, int atomcount, BOOL inassert)
8763 {
8764 do {
8765    const pcre_uchar *scode = first_significant_code(
8766      code + PRIV(OP_lengths)[*code], FALSE);
8767    register int op = *scode;
8768 
8769    /* If we are at the start of a conditional assertion group, *both* the
8770    conditional assertion *and* what follows the condition must satisfy the test
8771    for start of line. Other kinds of condition fail. Note that there may be an
8772    auto-callout at the start of a condition. */
8773 
8774    if (op == OP_COND)
8775      {
8776      scode += 1 + LINK_SIZE;
8777      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8778      switch (*scode)
8779        {
8780        case OP_CREF:
8781        case OP_DNCREF:
8782        case OP_RREF:
8783        case OP_DNRREF:
8784        case OP_DEF:
8785        case OP_FAIL:
8786        return FALSE;
8787 
8788        default:     /* Assertion */
8789        if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
8790        do scode += GET(scode, 1); while (*scode == OP_ALT);
8791        scode += 1 + LINK_SIZE;
8792        break;
8793        }
8794      scode = first_significant_code(scode, FALSE);
8795      op = *scode;
8796      }
8797 
8798    /* Non-capturing brackets */
8799 
8800    if (op == OP_BRA  || op == OP_BRAPOS ||
8801        op == OP_SBRA || op == OP_SBRAPOS)
8802      {
8803      if (!is_startline(scode, bracket_map, cd, atomcount, inassert)) return FALSE;
8804      }
8805 
8806    /* Capturing brackets */
8807 
8808    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8809             op == OP_SCBRA || op == OP_SCBRAPOS)
8810      {
8811      int n = GET2(scode, 1+LINK_SIZE);
8812      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8813      if (!is_startline(scode, new_map, cd, atomcount, inassert)) return FALSE;
8814      }
8815 
8816    /* Positive forward assertions */
8817 
8818    else if (op == OP_ASSERT)
8819      {
8820      if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
8821      }
8822 
8823    /* Atomic brackets */
8824 
8825    else if (op == OP_ONCE || op == OP_ONCE_NC)
8826      {
8827      if (!is_startline(scode, bracket_map, cd, atomcount + 1, inassert)) return FALSE;
8828      }
8829 
8830    /* .* means "start at start or after \n" if it isn't in atomic brackets or
8831    brackets that may be referenced or an assertion, as long as the pattern does
8832    not contain *PRUNE or *SKIP, because these break the feature. Consider, for
8833    example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e.
8834    not at the start of a line. */
8835 
8836    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8837      {
8838      if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
8839          atomcount > 0 || cd->had_pruneorskip || inassert)
8840        return FALSE;
8841      }
8842 
8843    /* Check for explicit circumflex; anything else gives a FALSE result. Note
8844    in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8845    because the number of characters matched by .* cannot be adjusted inside
8846    them. */
8847 
8848    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8849 
8850    /* Move on to the next alternative */
8851 
8852    code += GET(code, 1);
8853    }
8854 while (*code == OP_ALT);  /* Loop for each alternative */
8855 return TRUE;
8856 }
8857 
8858 
8859 
8860 /*************************************************
8861 *       Check for asserted fixed first char      *
8862 *************************************************/
8863 
8864 /* During compilation, the "first char" settings from forward assertions are
8865 discarded, because they can cause conflicts with actual literals that follow.
8866 However, if we end up without a first char setting for an unanchored pattern,
8867 it is worth scanning the regex to see if there is an initial asserted first
8868 char. If all branches start with the same asserted char, or with a
8869 non-conditional bracket all of whose alternatives start with the same asserted
8870 char (recurse ad lib), then we return that char, with the flags set to zero or
8871 REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8872 
8873 Arguments:
8874   code       points to start of expression (the bracket)
8875   flags      points to the first char flags, or to REQ_NONE
8876   inassert   TRUE if in an assertion
8877 
8878 Returns:     the fixed first char, or 0 with REQ_NONE in flags
8879 */
8880 
8881 static pcre_uint32
find_firstassertedchar(const pcre_uchar * code,pcre_int32 * flags,BOOL inassert)8882 find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8883   BOOL inassert)
8884 {
8885 register pcre_uint32 c = 0;
8886 int cflags = REQ_NONE;
8887 
8888 *flags = REQ_NONE;
8889 do {
8890    pcre_uint32 d;
8891    int dflags;
8892    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8893              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8894    const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8895      TRUE);
8896    register pcre_uchar op = *scode;
8897 
8898    switch(op)
8899      {
8900      default:
8901      return 0;
8902 
8903      case OP_BRA:
8904      case OP_BRAPOS:
8905      case OP_CBRA:
8906      case OP_SCBRA:
8907      case OP_CBRAPOS:
8908      case OP_SCBRAPOS:
8909      case OP_ASSERT:
8910      case OP_ONCE:
8911      case OP_ONCE_NC:
8912      d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8913      if (dflags < 0)
8914        return 0;
8915      if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
8916      break;
8917 
8918      case OP_EXACT:
8919      scode += IMM2_SIZE;
8920      /* Fall through */
8921 
8922      case OP_CHAR:
8923      case OP_PLUS:
8924      case OP_MINPLUS:
8925      case OP_POSPLUS:
8926      if (!inassert) return 0;
8927      if (cflags < 0) { c = scode[1]; cflags = 0; }
8928        else if (c != scode[1]) return 0;
8929      break;
8930 
8931      case OP_EXACTI:
8932      scode += IMM2_SIZE;
8933      /* Fall through */
8934 
8935      case OP_CHARI:
8936      case OP_PLUSI:
8937      case OP_MINPLUSI:
8938      case OP_POSPLUSI:
8939      if (!inassert) return 0;
8940      if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8941        else if (c != scode[1]) return 0;
8942      break;
8943      }
8944 
8945    code += GET(code, 1);
8946    }
8947 while (*code == OP_ALT);
8948 
8949 *flags = cflags;
8950 return c;
8951 }
8952 
8953 
8954 
8955 /*************************************************
8956 *     Add an entry to the name/number table      *
8957 *************************************************/
8958 
8959 /* This function is called between compiling passes to add an entry to the
8960 name/number table, maintaining alphabetical order. Checking for permitted
8961 and forbidden duplicates has already been done.
8962 
8963 Arguments:
8964   cd           the compile data block
8965   name         the name to add
8966   length       the length of the name
8967   groupno      the group number
8968 
8969 Returns:       nothing
8970 */
8971 
8972 static void
add_name(compile_data * cd,const pcre_uchar * name,int length,unsigned int groupno)8973 add_name(compile_data *cd, const pcre_uchar *name, int length,
8974   unsigned int groupno)
8975 {
8976 int i;
8977 pcre_uchar *slot = cd->name_table;
8978 
8979 for (i = 0; i < cd->names_found; i++)
8980   {
8981   int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8982   if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8983     crc = -1; /* Current name is a substring */
8984 
8985   /* Make space in the table and break the loop for an earlier name. For a
8986   duplicate or later name, carry on. We do this for duplicates so that in the
8987   simple case (when ?(| is not used) they are in order of their numbers. In all
8988   cases they are in the order in which they appear in the pattern. */
8989 
8990   if (crc < 0)
8991     {
8992     memmove(slot + cd->name_entry_size, slot,
8993       IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8994     break;
8995     }
8996 
8997   /* Continue the loop for a later or duplicate name */
8998 
8999   slot += cd->name_entry_size;
9000   }
9001 
9002 PUT2(slot, 0, groupno);
9003 memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
9004 slot[IMM2_SIZE + length] = 0;
9005 cd->names_found++;
9006 }
9007 
9008 
9009 
9010 /*************************************************
9011 *        Compile a Regular Expression            *
9012 *************************************************/
9013 
9014 /* This function takes a string and returns a pointer to a block of store
9015 holding a compiled version of the expression. The original API for this
9016 function had no error code return variable; it is retained for backwards
9017 compatibility. The new function is given a new name.
9018 
9019 Arguments:
9020   pattern       the regular expression
9021   options       various option bits
9022   errorcodeptr  pointer to error code variable (pcre_compile2() only)
9023                   can be NULL if you don't want a code value
9024   errorptr      pointer to pointer to error text
9025   erroroffset   ptr offset in pattern where error was detected
9026   tables        pointer to character tables or NULL
9027 
9028 Returns:        pointer to compiled data block, or NULL on error,
9029                 with errorptr and erroroffset set
9030 */
9031 
/* Compatibility entry point that has no error code argument. At most one of
the definitions below is compiled, selected by the library width and, for the
8-bit library, by ERLANG_INTEGRATION. Each one hands its arguments unchanged to
the corresponding *_compile2() function, with NULL as the error code pointer,
and returns whatever that function returns. */

#if defined COMPILE_PCRE8

#if defined(ERLANG_INTEGRATION)
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
erts_pcre_compile(const char *pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return erts_pcre_compile2(pattern, options, NULL, errorptr, erroroffset,
  tables);
}
#else
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}
#endif  /* ERLANG_INTEGRATION */

#elif defined COMPILE_PCRE16

PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE32

PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#endif  /* COMPILE_PCRE8 / 16 / 32 */
9065 
9066 
9067 #if defined COMPILE_PCRE8
9068 #if defined(ERLANG_INTEGRATION)
9069 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
erts_pcre_compile2(const char * pattern,int options,int * errorcodeptr,const char ** errorptr,int * erroroffset,const unsigned char * tables)9070 erts_pcre_compile2(const char *pattern, int options, int *errorcodeptr,
9071   const char **errorptr, int *erroroffset, const unsigned char *tables)
9072 #else
9073 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
9074 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
9075   const char **errorptr, int *erroroffset, const unsigned char *tables)
9076 #endif
9077 #elif defined COMPILE_PCRE16
9078 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
9079 pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
9080   const char **errorptr, int *erroroffset, const unsigned char *tables)
9081 #elif defined COMPILE_PCRE32
9082 PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
9083 pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
9084   const char **errorptr, int *erroroffset, const unsigned char *tables)
9085 #endif
9086 {
9087 REAL_PCRE *re;
9088 int length = 1;  /* For final END opcode */
9089 pcre_int32 firstcharflags, reqcharflags;
9090 pcre_uint32 firstchar, reqchar;
9091 pcre_uint32 limit_match = PCRE_UINT32_MAX;
9092 pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
9093 int newline;
9094 int errorcode = 0;
9095 int skipatstart = 0;
9096 BOOL utf;
9097 BOOL never_utf = FALSE;
9098 size_t size;
9099 pcre_uchar *code;
9100 const pcre_uchar *codestart;
9101 const pcre_uchar *ptr;
9102 compile_data compile_block;
9103 compile_data *cd = &compile_block;
9104 
9105 /* This space is used for "compiling" into during the first phase, when we are
9106 computing the amount of memory that is needed. Compiled items are thrown away
9107 as soon as possible, so that a fairly large buffer should be sufficient for
9108 this purpose. The same space is used in the second phase for remembering where
9109 to fill in forward references to subpatterns. That may overflow, in which case
9110 new memory is obtained from malloc(). */
9111 
9112 pcre_uchar cworkspace[COMPILE_WORK_SIZE];
9113 
9114 /* This vector is used for remembering name groups during the pre-compile. In a
9115 similar way to cworkspace, it can be expanded using malloc() if necessary. */
9116 
9117 named_group named_groups[NAMED_GROUP_LIST_SIZE];
9118 
9119 /* Set this early so that early errors get offset 0. */
9120 
9121 ptr = (const pcre_uchar *)pattern;
9122 
9123 /* We can't pass back an error message if errorptr is NULL; I guess the best we
9124 can do is just return NULL, but we can set a code value if there is a code
9125 pointer. */
9126 
9127 if (errorptr == NULL)
9128   {
9129   if (errorcodeptr != NULL) *errorcodeptr = 99;
9130   return NULL;
9131   }
9132 
9133 *errorptr = NULL;
9134 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
9135 
9136 /* However, we can give a message for this error */
9137 
9138 if (erroroffset == NULL)
9139   {
9140   errorcode = ERR16;
9141   goto PCRE_EARLY_ERROR_RETURN2;
9142   }
9143 
9144 *erroroffset = 0;
9145 
9146 /* Set up pointers to the individual character tables */
9147 
9148 if (tables == NULL) tables = PRIV(default_tables);
9149 cd->lcc = tables + lcc_offset;
9150 cd->fcc = tables + fcc_offset;
9151 cd->cbits = tables + cbits_offset;
9152 cd->ctypes = tables + ctypes_offset;
9153 
9154 /* Check that all undefined public option bits are zero */
9155 
9156 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
9157   {
9158   errorcode = ERR17;
9159   goto PCRE_EARLY_ERROR_RETURN;
9160   }
9161 
9162 /* If PCRE_NEVER_UTF is set, remember it. */
9163 
9164 if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
9165 
9166 /* Check for global one-time settings at the start of the pattern, and remember
9167 the offset for later. */
9168 
9169 cd->external_flags = 0;   /* Initialize here for LIMIT_MATCH/RECURSION */
9170 
9171 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9172        ptr[skipatstart+1] == CHAR_ASTERISK)
9173   {
9174   int newnl = 0;
9175   int newbsr = 0;
9176 
9177 /* For completeness and backward compatibility, (*UTFn) is supported in the
9178 relevant libraries, but (*UTF) is generic and always supported. Note that
9179 PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
9180 
9181 #ifdef COMPILE_PCRE8
9182   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
9183     { skipatstart += 7; options |= PCRE_UTF8; continue; }
9184 #endif
9185 #ifdef COMPILE_PCRE16
9186   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
9187     { skipatstart += 8; options |= PCRE_UTF16; continue; }
9188 #endif
9189 #ifdef COMPILE_PCRE32
9190   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
9191     { skipatstart += 8; options |= PCRE_UTF32; continue; }
9192 #endif
9193 
9194   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
9195     { skipatstart += 6; options |= PCRE_UTF8; continue; }
9196   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
9197     { skipatstart += 6; options |= PCRE_UCP; continue; }
9198   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
9199     { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
9200   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
9201     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
9202 
9203   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
9204     {
9205     pcre_uint32 c = 0;
9206     int p = skipatstart + 14;
9207     while (isdigit(ptr[p]))
9208       {
9209       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow */
9210       c = c*10 + ptr[p++] - CHAR_0;
9211       }
9212     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9213     if (c < limit_match)
9214       {
9215       limit_match = c;
9216       cd->external_flags |= PCRE_MLSET;
9217       }
9218     skipatstart = p;
9219     continue;
9220     }
9221 
9222   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
9223     {
9224     pcre_uint32 c = 0;
9225     int p = skipatstart + 18;
9226     while (isdigit(ptr[p]))
9227       {
9228       if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow check */
9229       c = c*10 + ptr[p++] - CHAR_0;
9230       }
9231     if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9232     if (c < limit_recursion)
9233       {
9234       limit_recursion = c;
9235       cd->external_flags |= PCRE_RLSET;
9236       }
9237     skipatstart = p;
9238     continue;
9239     }
9240 
9241   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
9242     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
9243   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
9244     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
9245   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
9246     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9247   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
9248     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
9249   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
9250     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
9251 
9252   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
9253     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
9254   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
9255     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
9256 
9257   if (newnl != 0)
9258     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
9259   else if (newbsr != 0)
9260     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
9261   else break;
9262   }
9263 
9264 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
9265 utf = (options & PCRE_UTF8) != 0;
9266 if (utf && never_utf)
9267   {
9268   errorcode = ERR78;
9269   goto PCRE_EARLY_ERROR_RETURN2;
9270   }
9271 
9272 /* Can't support UTF unless PCRE has been compiled to include the code. The
9273 return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9274 release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9275 not used here. */
9276 
9277 #ifdef SUPPORT_UTF
9278 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
9279      (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
9280   {
9281 #if defined COMPILE_PCRE8
9282   errorcode = ERR44;
9283 #elif defined COMPILE_PCRE16
9284   errorcode = ERR74;
9285 #elif defined COMPILE_PCRE32
9286   errorcode = ERR77;
9287 #endif
9288   goto PCRE_EARLY_ERROR_RETURN2;
9289   }
9290 #else
9291 if (utf)
9292   {
9293   errorcode = ERR32;
9294   goto PCRE_EARLY_ERROR_RETURN;
9295   }
9296 #endif
9297 
9298 /* Can't support UCP unless PCRE has been compiled to include the code. */
9299 
9300 #ifndef SUPPORT_UCP
9301 if ((options & PCRE_UCP) != 0)
9302   {
9303   errorcode = ERR67;
9304   goto PCRE_EARLY_ERROR_RETURN;
9305   }
9306 #endif
9307 
9308 /* Check validity of \R options. */
9309 
9310 if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9311      (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9312   {
9313   errorcode = ERR56;
9314   goto PCRE_EARLY_ERROR_RETURN;
9315   }
9316 
9317 /* Handle different types of newline. The three bits give seven cases. The
9318 current code allows for fixed one- or two-byte sequences, plus "any" and
9319 "anycrlf". */
9320 
9321 switch (options & PCRE_NEWLINE_BITS)
9322   {
9323   case 0: newline = NEWLINE; break;   /* Build-time default */
9324   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9325   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9326   case PCRE_NEWLINE_CR+
9327        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9328   case PCRE_NEWLINE_ANY: newline = -1; break;
9329   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9330   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9331   }
9332 
9333 if (newline == -2)
9334   {
9335   cd->nltype = NLTYPE_ANYCRLF;
9336   }
9337 else if (newline < 0)
9338   {
9339   cd->nltype = NLTYPE_ANY;
9340   }
9341 else
9342   {
9343   cd->nltype = NLTYPE_FIXED;
9344   if (newline > 255)
9345     {
9346     cd->nllen = 2;
9347     cd->nl[0] = (newline >> 8) & 255;
9348     cd->nl[1] = newline & 255;
9349     }
9350   else
9351     {
9352     cd->nllen = 1;
9353     cd->nl[0] = newline;
9354     }
9355   }
9356 
9357 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9358 references to help in deciding whether (.*) can be treated as anchored or not.
9359 */
9360 
9361 cd->top_backref = 0;
9362 cd->backref_map = 0;
9363 
9364 /* Reflect pattern for debugging output */
9365 
9366 DPRINTF(("------------------------------------------------------------------\n"));
9367 #ifdef PCRE_DEBUG
9368 print_puchar(stdout, (PCRE_PUCHAR)pattern);
9369 #endif
9370 DPRINTF(("\n"));
9371 
9372 /* Pretend to compile the pattern while actually just accumulating the length
9373 of memory required. This behaviour is triggered by passing a non-NULL final
9374 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9375 to compile parts of the pattern into; the compiled code is discarded when it is
9376 no longer needed, so hopefully this workspace will never overflow, though there
9377 is a test for its doing so. */
9378 
9379 cd->bracount = cd->final_bracount = 0;
9380 cd->names_found = 0;
9381 cd->name_entry_size = 0;
9382 cd->name_table = NULL;
9383 cd->dupnames = FALSE;
9384 cd->dupgroups = FALSE;
9385 cd->namedrefcount = 0;
9386 cd->start_code = cworkspace;
9387 cd->hwm = cworkspace;
9388 cd->iscondassert = FALSE;
9389 cd->start_workspace = cworkspace;
9390 cd->workspace_size = COMPILE_WORK_SIZE;
9391 cd->named_groups = named_groups;
9392 cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9393 cd->start_pattern = (const pcre_uchar *)pattern;
9394 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9395 cd->req_varyopt = 0;
9396 cd->parens_depth = 0;
9397 cd->assert_depth = 0;
9398 cd->max_lookbehind = 0;
9399 cd->external_options = options;
9400 cd->open_caps = NULL;
9401 
9402 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9403 don't need to look at the result of the function here. The initial options have
9404 been put into the cd block so that they can be changed if an option setting is
9405 found within the regex right at the beginning. Bringing initial option settings
9406 outside can help speed up starting point checks. */
9407 
9408 ptr += skipatstart;
9409 code = cworkspace;
9410 *code = OP_BRA;
9411 
9412 (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9413   FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9414   cd, &length);
9415 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9416 
9417 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9418   (int)(cd->hwm - cworkspace)));
9419 
9420 if (length > MAX_PATTERN_SIZE)
9421   {
9422   errorcode = ERR20;
9423   goto PCRE_EARLY_ERROR_RETURN;
9424   }
9425 
9426 /* Compute the size of the data block for storing the compiled pattern. Integer
9427 overflow should no longer be possible because nowadays we limit the maximum
9428 value of cd->names_found and cd->name_entry_size. */
9429 
9430 size = sizeof(REAL_PCRE) +
9431   (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9432 
9433 /* Get the memory. */
9434 
9435 re = (REAL_PCRE *)(PUBL(malloc))(size);
9436 if (re == NULL)
9437   {
9438   errorcode = ERR21;
9439   goto PCRE_EARLY_ERROR_RETURN;
9440   }
9441 
9442 /* Put in the magic number, and save the sizes, initial options, internal
9443 flags, and character table pointer. NULL is used for the default character
9444 tables. The nullpad field is at the end; it's there to help in the case when a
9445 regex compiled on a system with 4-byte pointers is run on another with 8-byte
9446 pointers. */
9447 
9448 re->magic_number = MAGIC_NUMBER;
9449 re->size = (int)size;
9450 re->options = cd->external_options;
9451 re->flags = cd->external_flags;
9452 re->limit_match = limit_match;
9453 re->limit_recursion = limit_recursion;
9454 re->first_char = 0;
9455 re->req_char = 0;
9456 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9457 re->name_entry_size = cd->name_entry_size;
9458 re->name_count = cd->names_found;
9459 re->ref_count = 0;
9460 re->tables = (tables == PRIV(default_tables))? NULL : tables;
9461 re->nullpad = NULL;
9462 #ifdef COMPILE_PCRE32
9463 re->dummy = 0;
9464 #else
9465 re->dummy1 = re->dummy2 = re->dummy3 = 0;
9466 #endif
9467 
9468 /* The starting points of the name/number translation table and of the code are
9469 passed around in the compile data block. The start/end pattern and initial
9470 options are already set from the pre-compile phase, as is the name_entry_size
9471 field. Reset the bracket count and the names_found field. Also reset the hwm
9472 field; this time it's used for remembering forward references to subpatterns.
9473 */
9474 
9475 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9476 cd->parens_depth = 0;
9477 cd->assert_depth = 0;
9478 cd->bracount = 0;
9479 cd->max_lookbehind = 0;
9480 cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9481 codestart = cd->name_table + re->name_entry_size * re->name_count;
9482 cd->start_code = codestart;
9483 cd->hwm = (pcre_uchar *)(cd->start_workspace);
9484 cd->iscondassert = FALSE;
9485 cd->req_varyopt = 0;
9486 cd->had_accept = FALSE;
9487 cd->had_pruneorskip = FALSE;
9488 cd->check_lookbehind = FALSE;
9489 cd->open_caps = NULL;
9490 
9491 /* If any named groups were found, create the name/number table from the list
9492 created in the first pass. */
9493 
9494 if (cd->names_found > 0)
9495   {
9496   int i = cd->names_found;
9497   named_group *ng = cd->named_groups;
9498   cd->names_found = 0;
9499   for (; i > 0; i--, ng++)
9500     add_name(cd, ng->name, ng->length, ng->number);
9501   if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9502     (PUBL(free))((void *)cd->named_groups);
9503   }
9504 
9505 /* Set up a starting, non-extracting bracket, then compile the expression. On
9506 error, errorcode will be set non-zero, so we don't need to look at the result
9507 of the function here. */
9508 
9509 ptr = (const pcre_uchar *)pattern + skipatstart;
9510 code = (pcre_uchar *)codestart;
9511 *code = OP_BRA;
9512 (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9513   &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9514 re->top_bracket = cd->bracount;
9515 re->top_backref = cd->top_backref;
9516 re->max_lookbehind = cd->max_lookbehind;
9517 re->flags = cd->external_flags | PCRE_MODE;
9518 
9519 if (cd->had_accept)
9520   {
9521   reqchar = 0;              /* Must disable after (*ACCEPT) */
9522   reqcharflags = REQ_NONE;
9523   }
9524 
9525 /* If not reached end of pattern on success, there's an excess bracket. */
9526 
9527 if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
9528 
9529 /* Fill in the terminating state and check for disastrous overflow, but
9530 if debugging, leave the test till after things are printed out. */
9531 
9532 *code++ = OP_END;
9533 
9534 #ifndef PCRE_DEBUG
9535 if (code - codestart > length) errorcode = ERR23;
9536 #endif
9537 
9538 #ifdef SUPPORT_VALGRIND
9539 /* If the estimated length exceeds the really used length, mark the extra
9540 allocated memory as unaddressable, so that any out-of-bound reads can be
9541 detected. */
9542 VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9543 #endif
9544 
9545 /* Fill in any forward references that are required. There may be repeated
9546 references; optimize for them, as searching a large regex takes time. */
9547 
9548 if (cd->hwm > cd->start_workspace)
9549   {
9550   int prev_recno = -1;
9551   const pcre_uchar *groupptr = NULL;
9552   while (errorcode == 0 && cd->hwm > cd->start_workspace)
9553     {
9554     int offset, recno;
9555     cd->hwm -= LINK_SIZE;
9556     offset = GET(cd->hwm, 0);
9557 
9558     /* Check that the hwm handling hasn't gone wrong. This whole area is
9559     rewritten in PCRE2 because there are some obscure cases. */
9560 
9561     if (offset == 0 || codestart[offset-1] != OP_RECURSE)
9562       {
9563       errorcode = ERR10;
9564       break;
9565       }
9566 
9567     recno = GET(codestart, offset);
9568     if (recno != prev_recno)
9569       {
9570       groupptr = PRIV(find_bracket)(codestart, utf, recno);
9571       prev_recno = recno;
9572       }
9573     if (groupptr == NULL) errorcode = ERR53;
9574       else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9575     }
9576   }
9577 
9578 /* If the workspace had to be expanded, free the new memory. Set the pointer to
9579 NULL to indicate that forward references have been filled in. */
9580 
9581 if (cd->workspace_size > COMPILE_WORK_SIZE)
9582   (PUBL(free))((void *)cd->start_workspace);
9583 cd->start_workspace = NULL;
9584 
9585 /* Give an error if there's a back reference to a non-existent capturing
9586 subpattern. */
9587 
9588 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9589 
9590 /* Unless disabled, check whether any single character iterators can be
9591 auto-possessified. The function overwrites the appropriate opcode values, so
9592 the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9593 used in this code because at least one compiler gives a warning about loss of
9594 "const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9595 function call. */
9596 
9597 if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
9598   {
9599   pcre_uchar *temp = (pcre_uchar *)codestart;
9600   auto_possessify(temp, utf, cd);
9601   }
9602 
9603 /* If there were any lookbehind assertions that contained OP_RECURSE
9604 (recursions or subroutine calls), a flag is set for them to be checked here,
9605 because they may contain forward references. Actual recursions cannot be fixed
9606 length, but subroutine calls can. It is done like this so that those without
9607 OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
9608 exceptional ones forgo this. We scan the pattern to check that they are fixed
9609 length, and set their lengths. */
9610 
9611 if (errorcode == 0 && cd->check_lookbehind)
9612   {
9613   pcre_uchar *cc = (pcre_uchar *)codestart;
9614 
9615   /* Loop, searching for OP_REVERSE items, and process those that do not have
9616   their length set. (Actually, it will also re-process any that have a length
9617   of zero, but that is a pathological case, and it does no harm.) When we find
9618   one, we temporarily terminate the branch it is in while we scan it. */
9619 
9620   for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9621        cc != NULL;
9622        cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9623     {
9624     if (GET(cc, 1) == 0)
9625       {
9626       int fixed_length;
9627       pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9628       int end_op = *be;
9629       *be = OP_END;
9630       fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9631         cd, NULL);
9632       *be = end_op;
9633       DPRINTF(("fixed length = %d\n", fixed_length));
9634       if (fixed_length < 0)
9635         {
9636         errorcode = (fixed_length == -2)? ERR36 :
9637                     (fixed_length == -4)? ERR70 : ERR25;
9638         break;
9639         }
9640       if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9641       PUT(cc, 1, fixed_length);
9642       }
9643     cc += 1 + LINK_SIZE;
9644     }
9645   }
9646 
9647 /* Failed to compile, or error while post-processing */
9648 
9649 if (errorcode != 0)
9650   {
9651   (PUBL(free))(re);
9652   PCRE_EARLY_ERROR_RETURN:
9653   *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9654   PCRE_EARLY_ERROR_RETURN2:
9655   *errorptr = find_error_text(errorcode);
9656   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9657   return NULL;
9658   }
9659 
9660 /* If the anchored option was not passed, set the flag if we can determine that
9661 the pattern is anchored by virtue of ^ characters or \A or anything else, such
9662 as starting with non-atomic .* when DOTALL is set and there are no occurrences
9663 of *PRUNE or *SKIP.
9664 
9665 Otherwise, if we know what the first byte has to be, save it, because that
9666 speeds up unanchored matches no end. If not, see if we can set the
9667 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9668 start with ^, and also when all branches start with non-atomic .* for
9669 non-DOTALL matches when *PRUNE and *SKIP are not present. */
9670 
9671 if ((re->options & PCRE_ANCHORED) == 0)
9672   {
9673   if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9674   else
9675     {
9676     if (firstcharflags < 0)
9677       firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9678     if (firstcharflags >= 0)   /* Remove caseless flag for non-caseable chars */
9679       {
9680 #if defined COMPILE_PCRE8
9681       re->first_char = firstchar & 0xff;
9682 #elif defined COMPILE_PCRE16
9683       re->first_char = firstchar & 0xffff;
9684 #elif defined COMPILE_PCRE32
9685       re->first_char = firstchar;
9686 #endif
9687       if ((firstcharflags & REQ_CASELESS) != 0)
9688         {
9689 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9690         /* We ignore non-ASCII first chars in 8 bit mode. */
9691         if (utf)
9692           {
9693           if (re->first_char < 128)
9694             {
9695             if (cd->fcc[re->first_char] != re->first_char)
9696               re->flags |= PCRE_FCH_CASELESS;
9697             }
9698           else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9699             re->flags |= PCRE_FCH_CASELESS;
9700           }
9701         else
9702 #endif
9703         if (MAX_255(re->first_char)
9704             && cd->fcc[re->first_char] != re->first_char)
9705           re->flags |= PCRE_FCH_CASELESS;
9706         }
9707 
9708       re->flags |= PCRE_FIRSTSET;
9709       }
9710 
9711     else if (is_startline(codestart, 0, cd, 0, FALSE)) re->flags |= PCRE_STARTLINE;
9712     }
9713   }
9714 
9715 /* For an anchored pattern, we use the "required byte" only if it follows a
9716 variable length item in the regex. Remove the caseless flag for non-caseable
9717 bytes. */
9718 
9719 if (reqcharflags >= 0 &&
9720      ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9721   {
9722 #if defined COMPILE_PCRE8
9723   re->req_char = reqchar & 0xff;
9724 #elif defined COMPILE_PCRE16
9725   re->req_char = reqchar & 0xffff;
9726 #elif defined COMPILE_PCRE32
9727   re->req_char = reqchar;
9728 #endif
9729   if ((reqcharflags & REQ_CASELESS) != 0)
9730     {
9731 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9732     /* We ignore non-ASCII first chars in 8 bit mode. */
9733     if (utf)
9734       {
9735       if (re->req_char < 128)
9736         {
9737         if (cd->fcc[re->req_char] != re->req_char)
9738           re->flags |= PCRE_RCH_CASELESS;
9739         }
9740       else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9741         re->flags |= PCRE_RCH_CASELESS;
9742       }
9743     else
9744 #endif
9745     if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9746       re->flags |= PCRE_RCH_CASELESS;
9747     }
9748 
9749   re->flags |= PCRE_REQCHSET;
9750   }
9751 
9752 /* Print out the compiled data if debugging is enabled. This is never the
9753 case when building a production library. */
9754 
9755 #ifdef PCRE_DEBUG
9756 printf("Length = %d top_bracket = %d top_backref = %d\n",
9757   length, re->top_bracket, re->top_backref);
9758 
9759 printf("Options=%08x\n", re->options);
9760 
9761 if ((re->flags & PCRE_FIRSTSET) != 0)
9762   {
9763   pcre_uchar ch = re->first_char;
9764   const char *caseless =
9765     ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9766   if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9767     else printf("First char = \\x%02x%s\n", ch, caseless);
9768   }
9769 
9770 if ((re->flags & PCRE_REQCHSET) != 0)
9771   {
9772   pcre_uchar ch = re->req_char;
9773   const char *caseless =
9774     ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9775   if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9776     else printf("Req char = \\x%02x%s\n", ch, caseless);
9777   }
9778 
9779 #if defined COMPILE_PCRE8
9780 pcre_printint((pcre *)re, stdout, TRUE);
9781 #elif defined COMPILE_PCRE16
9782 pcre16_printint((pcre *)re, stdout, TRUE);
9783 #elif defined COMPILE_PCRE32
9784 pcre32_printint((pcre *)re, stdout, TRUE);
9785 #endif
9786 
9787 /* This check is done here in the debugging case so that the code that
9788 was compiled can be seen. */
9789 
9790 if (code - codestart > length)
9791   {
9792   (PUBL(free))(re);
9793   *errorptr = find_error_text(ERR23);
9794   *erroroffset = ptr - (pcre_uchar *)pattern;
9795   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9796   return NULL;
9797   }
9798 #endif   /* PCRE_DEBUG */
9799 
9800 /* Check for a pattern that can match an empty string, so that this information
9801 can be provided to applications. */
9802 
9803 do
9804   {
9805   if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9806     {
9807     re->flags |= PCRE_MATCH_EMPTY;
9808     break;
9809     }
9810   codestart += GET(codestart, 1);
9811   }
9812 while (*codestart == OP_ALT);
9813 
9814 #if defined COMPILE_PCRE8
9815 return (pcre *)re;
9816 #elif defined COMPILE_PCRE16
9817 return (pcre16 *)re;
9818 #elif defined COMPILE_PCRE32
9819 return (pcre32 *)re;
9820 #endif
9821 }
9822 
9823 /* End of pcre_compile.c */
9824