1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9 
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11 
12            Copyright (c) 1997-2003 University of Cambridge
13 
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18 
19 1. This software is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22 
23 2. The origin of this software must not be misrepresented, either by
24    explicit claim or by omission.
25 
26 3. Altered versions must be plainly marked as such, and must not be
27    misrepresented as being the original software.
28 
29 4. If PCRE is embedded in any software that is released under the GNU
30    General Purpose Licence (GPL), then the terms of that licence shall
31    supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34 
35 
36 /* Include the internals header, which itself includes Standard C headers plus
37 the external pcre header. */
38 
39 #include "internal.h"
40 
41 
42 
43 /*************************************************
44 *      Set a bit and maybe its alternate case    *
45 *************************************************/
46 
47 /* Given a character, set its bit in the table, and also the bit for the other
48 version of a letter if we are caseless.
49 
50 Arguments:
51   start_bits    points to the bit map
52   c             is the character
53   caseless      the caseless flag
54   cd            the block with char table pointers
55 
56 Returns:        nothing
57 */
58 
59 static void
set_bit(uschar * start_bits,int c,BOOL caseless,compile_data * cd)60 set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61 {
62 start_bits[c/8] |= (1 << (c&7));
63 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65 }
66 
67 
68 
69 /*************************************************
70 *          Create bitmap of starting chars       *
71 *************************************************/
72 
73 /* This function scans a compiled unanchored expression and attempts to build a
74 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
75 goes by, we may be able to get more clever at doing this.
76 
77 Arguments:
78   code         points to an expression
79   start_bits   points to a 32-byte table, initialized to 0
80   caseless     the current state of the caseless flag
81   utf8         TRUE if in UTF-8 mode
82   cd           the block with char table pointers
83 
84 Returns:       TRUE if table built, FALSE otherwise
85 */
86 
87 static BOOL
set_start_bits(const uschar * code,uschar * start_bits,BOOL caseless,BOOL utf8,compile_data * cd)88 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
89   BOOL utf8, compile_data *cd)
90 {
91 register int c;
92 
93 /* This next statement and the later reference to dummy are here in order to
94 trick the optimizer of the IBM C compiler for OS/2 into generating correct
95 code. Apparently IBM isn't going to fix the problem, and we would rather not
96 disable optimization (in this module it actually makes a big difference, and
97 the pcre module can use all the optimization it can get). */
98 
99 volatile int dummy __attribute__((unused));
100 
101 do
102   {
103   const uschar *tcode = code + 1 + LINK_SIZE;
104   BOOL try_next = TRUE;
105 
106   while (try_next)
107     {
108     /* If a branch starts with a bracket or a positive lookahead assertion,
109     recurse to set bits from within them. That's all for this branch. */
110 
111     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
112       {
113       if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
114         return FALSE;
115       try_next = FALSE;
116       }
117 
118     else switch(*tcode)
119       {
120       default:
121       return FALSE;
122 
123       /* Skip over callout */
124 
125       case OP_CALLOUT:
126       tcode += 2;
127       break;
128 
129       /* Skip over extended extraction bracket number */
130 
131       case OP_BRANUMBER:
132       tcode += 3;
133       break;
134 
135       /* Skip over lookbehind and negative lookahead assertions */
136 
137       case OP_ASSERT_NOT:
138       case OP_ASSERTBACK:
139       case OP_ASSERTBACK_NOT:
140       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
141       tcode += 1+LINK_SIZE;
142       break;
143 
144       /* Skip over an option setting, changing the caseless flag */
145 
146       case OP_OPT:
147       caseless = (tcode[1] & PCRE_CASELESS) != 0;
148       tcode += 2;
149       break;
150 
151       /* BRAZERO does the bracket, but carries on. */
152 
153       case OP_BRAZERO:
154       case OP_BRAMINZERO:
155       if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
156         return FALSE;
157       dummy = 1;
158       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
159       tcode += 1+LINK_SIZE;
160       break;
161 
162       /* Single-char * or ? sets the bit and tries the next item */
163 
164       case OP_STAR:
165       case OP_MINSTAR:
166       case OP_QUERY:
167       case OP_MINQUERY:
168       set_bit(start_bits, tcode[1], caseless, cd);
169       tcode += 2;
170 #ifdef SUPPORT_UTF8
171       if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
172 #endif
173       break;
174 
175       /* Single-char upto sets the bit and tries the next */
176 
177       case OP_UPTO:
178       case OP_MINUPTO:
179       set_bit(start_bits, tcode[3], caseless, cd);
180       tcode += 4;
181 #ifdef SUPPORT_UTF8
182       if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
183 #endif
184       break;
185 
186       /* At least one single char sets the bit and stops */
187 
188       case OP_EXACT:       /* Fall through */
189       tcode++;
190 
191       case OP_CHARS:       /* Fall through */
192       tcode++;
193 
194       case OP_PLUS:
195       case OP_MINPLUS:
196       set_bit(start_bits, tcode[1], caseless, cd);
197       try_next = FALSE;
198       break;
199 
200       /* Single character type sets the bits and stops */
201 
202       case OP_NOT_DIGIT:
203       for (c = 0; c < 32; c++)
204         start_bits[c] |= ~cd->cbits[c+cbit_digit];
205       try_next = FALSE;
206       break;
207 
208       case OP_DIGIT:
209       for (c = 0; c < 32; c++)
210         start_bits[c] |= cd->cbits[c+cbit_digit];
211       try_next = FALSE;
212       break;
213 
214       case OP_NOT_WHITESPACE:
215       for (c = 0; c < 32; c++)
216         start_bits[c] |= ~cd->cbits[c+cbit_space];
217       try_next = FALSE;
218       break;
219 
220       case OP_WHITESPACE:
221       for (c = 0; c < 32; c++)
222         start_bits[c] |= cd->cbits[c+cbit_space];
223       try_next = FALSE;
224       break;
225 
226       case OP_NOT_WORDCHAR:
227       for (c = 0; c < 32; c++)
228         start_bits[c] |= ~cd->cbits[c+cbit_word];
229       try_next = FALSE;
230       break;
231 
232       case OP_WORDCHAR:
233       for (c = 0; c < 32; c++)
234         start_bits[c] |= cd->cbits[c+cbit_word];
235       try_next = FALSE;
236       break;
237 
238       /* One or more character type fudges the pointer and restarts, knowing
239       it will hit a single character type and stop there. */
240 
241       case OP_TYPEPLUS:
242       case OP_TYPEMINPLUS:
243       tcode++;
244       break;
245 
246       case OP_TYPEEXACT:
247       tcode += 3;
248       break;
249 
250       /* Zero or more repeats of character types set the bits and then
251       try again. */
252 
253       case OP_TYPEUPTO:
254       case OP_TYPEMINUPTO:
255       tcode += 2;               /* Fall through */
256 
257       case OP_TYPESTAR:
258       case OP_TYPEMINSTAR:
259       case OP_TYPEQUERY:
260       case OP_TYPEMINQUERY:
261       switch(tcode[1])
262         {
263         case OP_ANY:
264         return FALSE;
265 
266         case OP_NOT_DIGIT:
267         for (c = 0; c < 32; c++)
268           start_bits[c] |= ~cd->cbits[c+cbit_digit];
269         break;
270 
271         case OP_DIGIT:
272         for (c = 0; c < 32; c++)
273           start_bits[c] |= cd->cbits[c+cbit_digit];
274         break;
275 
276         case OP_NOT_WHITESPACE:
277         for (c = 0; c < 32; c++)
278           start_bits[c] |= ~cd->cbits[c+cbit_space];
279         break;
280 
281         case OP_WHITESPACE:
282         for (c = 0; c < 32; c++)
283           start_bits[c] |= cd->cbits[c+cbit_space];
284         break;
285 
286         case OP_NOT_WORDCHAR:
287         for (c = 0; c < 32; c++)
288           start_bits[c] |= ~cd->cbits[c+cbit_word];
289         break;
290 
291         case OP_WORDCHAR:
292         for (c = 0; c < 32; c++)
293           start_bits[c] |= cd->cbits[c+cbit_word];
294         break;
295         }
296 
297       tcode += 2;
298       break;
299 
300       /* Character class where all the information is in a bit map: set the
301       bits and either carry on or not, according to the repeat count. If it was
302       a negative class, and we are operating with UTF-8 characters, any byte
303       with a value >= 0xc4 is a potentially valid starter because it starts a
304       character with a value > 255. */
305 
306       case OP_NCLASS:
307       if (utf8)
308         {
309         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
310         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
311         }
312       /* Fall through */
313 
314       case OP_CLASS:
315         {
316         tcode++;
317 
318         /* In UTF-8 mode, the bits in a bit map correspond to character
319         values, not to byte values. However, the bit map we are constructing is
320         for byte values. So we have to do a conversion for characters whose
321         value is > 127. In fact, there are only two possible starting bytes for
322         characters in the range 128 - 255. */
323 
324         if (utf8)
325           {
326           for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
327           for (c = 128; c < 256; c++)
328             {
329             if ((tcode[c/8] & (1 << (c&7))) != 0)
330               {
331               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
332               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
333               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
334               }
335             }
336           }
337 
338         /* In non-UTF-8 mode, the two bit maps are completely compatible. */
339 
340         else
341           {
342           for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
343           }
344 
345         /* Advance past the bit map, and act on what follows */
346 
347         tcode += 32;
348         switch (*tcode)
349           {
350           case OP_CRSTAR:
351           case OP_CRMINSTAR:
352           case OP_CRQUERY:
353           case OP_CRMINQUERY:
354           tcode++;
355           break;
356 
357           case OP_CRRANGE:
358           case OP_CRMINRANGE:
359           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
360             else try_next = FALSE;
361           break;
362 
363           default:
364           try_next = FALSE;
365           break;
366           }
367         }
368       break; /* End of bitmap class handling */
369 
370       }      /* End of switch */
371     }        /* End of try_next loop */
372 
373   code += GET(code, 1);   /* Advance to next branch */
374   }
375 while (*code == OP_ALT);
376 return TRUE;
377 }
378 
379 
380 
381 /*************************************************
382 *          Study a compiled expression           *
383 *************************************************/
384 
385 /* This function is handed a compiled expression that it must study to produce
386 information that will speed up the matching. It returns a pcre_extra block
387 which then gets handed back to pcre_exec().
388 
389 Arguments:
390   re        points to the compiled expression
391   options   contains option bits
392   errorptr  points to where to place error messages;
393             set NULL unless error
394 
395 Returns:    pointer to a pcre_extra block, with study_data filled in and the
396               appropriate flag set;
397             NULL on error or if no optimization possible
398 */
399 
400 EXPORT pcre_extra *
pcre_study(const pcre * external_re,int options,const char ** errorptr)401 pcre_study(const pcre *external_re, int options, const char **errorptr)
402 {
403 uschar start_bits[32];
404 pcre_extra *extra;
405 pcre_study_data *study;
406 const real_pcre *re = (const real_pcre *)external_re;
407 uschar *code = (uschar *)re + sizeof(real_pcre) +
408   (re->name_count * re->name_entry_size);
409 compile_data compile_block;
410 
411 *errorptr = NULL;
412 
413 if (re == NULL || re->magic_number != MAGIC_NUMBER)
414   {
415   *errorptr = "argument is not a compiled regular expression";
416   return NULL;
417   }
418 
419 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
420   {
421   *errorptr = "unknown or incorrect option bit(s) set";
422   return NULL;
423   }
424 
425 /* For an anchored pattern, or an unanchored pattern that has a first char, or
426 a multiline pattern that matches only at "line starts", no further processing
427 at present. */
428 
429 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
430   return NULL;
431 
432 /* Set the character tables in the block which is passed around */
433 
434 compile_block.lcc = re->tables + lcc_offset;
435 compile_block.fcc = re->tables + fcc_offset;
436 compile_block.cbits = re->tables + cbits_offset;
437 compile_block.ctypes = re->tables + ctypes_offset;
438 
439 /* See if we can find a fixed set of initial characters for the pattern. */
440 
441 memset(start_bits, 0, 32 * sizeof(uschar));
442 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
443   (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
444 
445 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
446 the latter, which is pointed to by the former, which may also get additional
447 data set later by the calling program. At the moment, the size of
448 pcre_study_data is fixed. We nevertheless save it in a field for returning via
449 the pcre_fullinfo() function so that if it becomes variable in the future, we
450 don't have to change that code. */
451 
452 extra = (pcre_extra *)(pcre_malloc)
453   (sizeof(pcre_extra) + sizeof(pcre_study_data));
454 
455 if (extra == NULL)
456   {
457   *errorptr = "failed to get memory";
458   return NULL;
459   }
460 
461 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
462 extra->flags = PCRE_EXTRA_STUDY_DATA;
463 extra->study_data = study;
464 
465 study->size = sizeof(pcre_study_data);
466 study->options = PCRE_STUDY_MAPPED;
467 memcpy(study->start_bits, start_bits, sizeof(start_bits));
468 
469 return extra;
470 }
471 
472 /* End of study.c */
473