1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9 
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11 
12            Copyright (c) 1997-2000 University of Cambridge
13 
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18 
19 1. This software is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22 
23 2. The origin of this software must not be misrepresented, either by
24    explicit claim or by omission.
25 
26 3. Altered versions must be plainly marked as such, and must not be
27    misrepresented as being the original software.
28 
29 4. If PCRE is embedded in any software that is released under the GNU
30    General Purpose Licence (GPL), then the terms of that licence shall
31    supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34 
35 
36 /* Include the internals header, which itself includes Standard C headers plus
37 the external pcre header. */
38 
39 #include "internal.h"
40 
41 
42 
43 /*************************************************
44 *      Set a bit and maybe its alternate case    *
45 *************************************************/
46 
47 /* Given a character, set its bit in the table, and also the bit for the other
48 version of a letter if we are caseless.
49 
50 Arguments:
51   start_bits    points to the bit map
52   c             is the character
53   caseless      the caseless flag
54   cd            the block with char table pointers
55 
56 Returns:        nothing
57 */
58 
59 static void
set_bit(uschar * start_bits,int c,BOOL caseless,compile_data * cd)60 set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61 {
62 start_bits[c/8] |= (1 << (c&7));
63 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65 }
66 
67 
68 
69 /*************************************************
70 *          Create bitmap of starting chars       *
71 *************************************************/
72 
73 /* This function scans a compiled unanchored expression and attempts to build a
74 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
75 goes by, we may be able to get more clever at doing this.
76 
77 Arguments:
78   code         points to an expression
79   start_bits   points to a 32-byte table, initialized to 0
80   caseless     the current state of the caseless flag
81   cd           the block with char table pointers
82 
83 Returns:       TRUE if table built, FALSE otherwise
84 */
85 
86 static BOOL
set_start_bits(const uschar * code,uschar * start_bits,BOOL caseless,compile_data * cd)87 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
88   compile_data *cd)
89 {
90 register int c;
91 
92 /* This next statement and the later reference to dummy are here in order to
93 trick the optimizer of the IBM C compiler for OS/2 into generating correct
94 code. Apparently IBM isn't going to fix the problem, and we would rather not
95 disable optimization (in this module it actually makes a big difference, and
96 the pcre module can use all the optimization it can get). */
97 
98 volatile int dummy;
99 
100 do
101   {
102   const uschar *tcode = code + 3;
103   BOOL try_next = TRUE;
104 
105   while (try_next)
106     {
107     try_next = FALSE;
108 
109     /* If a branch starts with a bracket or a positive lookahead assertion,
110     recurse to set bits from within them. That's all for this branch. */
111 
112     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
113       {
114       if (!set_start_bits(tcode, start_bits, caseless, cd))
115         return FALSE;
116       }
117 
118     else switch(*tcode)
119       {
120       default:
121       return FALSE;
122 
123       /* Skip over lookbehind and negative lookahead assertions */
124 
125       case OP_ASSERT_NOT:
126       case OP_ASSERTBACK:
127       case OP_ASSERTBACK_NOT:
128       try_next = TRUE;
129       do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
130       tcode += 3;
131       break;
132 
133       /* Skip over an option setting, changing the caseless flag */
134 
135       case OP_OPT:
136       caseless = (tcode[1] & PCRE_CASELESS) != 0;
137       tcode += 2;
138       try_next = TRUE;
139       break;
140 
141       /* BRAZERO does the bracket, but carries on. */
142 
143       case OP_BRAZERO:
144       case OP_BRAMINZERO:
145       if (!set_start_bits(++tcode, start_bits, caseless, cd))
146         return FALSE;
147       dummy = 1;
148       do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
149       tcode += 3;
150       try_next = TRUE;
151       break;
152 
153       /* Single-char * or ? sets the bit and tries the next item */
154 
155       case OP_STAR:
156       case OP_MINSTAR:
157       case OP_QUERY:
158       case OP_MINQUERY:
159       set_bit(start_bits, tcode[1], caseless, cd);
160       tcode += 2;
161       try_next = TRUE;
162       break;
163 
164       /* Single-char upto sets the bit and tries the next */
165 
166       case OP_UPTO:
167       case OP_MINUPTO:
168       set_bit(start_bits, tcode[3], caseless, cd);
169       tcode += 4;
170       try_next = TRUE;
171       break;
172 
173       /* At least one single char sets the bit and stops */
174 
175       case OP_EXACT:       /* Fall through */
176       tcode++;
177 
178       case OP_CHARS:       /* Fall through */
179       tcode++;
180 
181       case OP_PLUS:
182       case OP_MINPLUS:
183       set_bit(start_bits, tcode[1], caseless, cd);
184       break;
185 
186       /* Single character type sets the bits and stops */
187 
188       case OP_NOT_DIGIT:
189       for (c = 0; c < 32; c++)
190         start_bits[c] |= ~cd->cbits[c+cbit_digit];
191       break;
192 
193       case OP_DIGIT:
194       for (c = 0; c < 32; c++)
195         start_bits[c] |= cd->cbits[c+cbit_digit];
196       break;
197 
198       case OP_NOT_WHITESPACE:
199       for (c = 0; c < 32; c++)
200         start_bits[c] |= ~cd->cbits[c+cbit_space];
201       break;
202 
203       case OP_WHITESPACE:
204       for (c = 0; c < 32; c++)
205         start_bits[c] |= cd->cbits[c+cbit_space];
206       break;
207 
208       case OP_NOT_WORDCHAR:
209       for (c = 0; c < 32; c++)
210         start_bits[c] |= ~cd->cbits[c+cbit_word];
211       break;
212 
213       case OP_WORDCHAR:
214       for (c = 0; c < 32; c++)
215         start_bits[c] |= cd->cbits[c+cbit_word];
216       break;
217 
218       /* One or more character type fudges the pointer and restarts, knowing
219       it will hit a single character type and stop there. */
220 
221       case OP_TYPEPLUS:
222       case OP_TYPEMINPLUS:
223       tcode++;
224       try_next = TRUE;
225       break;
226 
227       case OP_TYPEEXACT:
228       tcode += 3;
229       try_next = TRUE;
230       break;
231 
232       /* Zero or more repeats of character types set the bits and then
233       try again. */
234 
235       case OP_TYPEUPTO:
236       case OP_TYPEMINUPTO:
237       tcode += 2;               /* Fall through */
238 
239       case OP_TYPESTAR:
240       case OP_TYPEMINSTAR:
241       case OP_TYPEQUERY:
242       case OP_TYPEMINQUERY:
243       switch(tcode[1])
244         {
245         case OP_NOT_DIGIT:
246         for (c = 0; c < 32; c++)
247           start_bits[c] |= ~cd->cbits[c+cbit_digit];
248         break;
249 
250         case OP_DIGIT:
251         for (c = 0; c < 32; c++)
252           start_bits[c] |= cd->cbits[c+cbit_digit];
253         break;
254 
255         case OP_NOT_WHITESPACE:
256         for (c = 0; c < 32; c++)
257           start_bits[c] |= ~cd->cbits[c+cbit_space];
258         break;
259 
260         case OP_WHITESPACE:
261         for (c = 0; c < 32; c++)
262           start_bits[c] |= cd->cbits[c+cbit_space];
263         break;
264 
265         case OP_NOT_WORDCHAR:
266         for (c = 0; c < 32; c++)
267           start_bits[c] |= ~cd->cbits[c+cbit_word];
268         break;
269 
270         case OP_WORDCHAR:
271         for (c = 0; c < 32; c++)
272           start_bits[c] |= cd->cbits[c+cbit_word];
273         break;
274         }
275 
276       tcode += 2;
277       try_next = TRUE;
278       break;
279 
280       /* Character class: set the bits and either carry on or not,
281       according to the repeat count. */
282 
283       case OP_CLASS:
284         {
285         tcode++;
286         for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
287         tcode += 32;
288         switch (*tcode)
289           {
290           case OP_CRSTAR:
291           case OP_CRMINSTAR:
292           case OP_CRQUERY:
293           case OP_CRMINQUERY:
294           tcode++;
295           try_next = TRUE;
296           break;
297 
298           case OP_CRRANGE:
299           case OP_CRMINRANGE:
300           if (((tcode[1] << 8) + tcode[2]) == 0)
301             {
302             tcode += 5;
303             try_next = TRUE;
304             }
305           break;
306           }
307         }
308       break; /* End of class handling */
309 
310       }      /* End of switch */
311     }        /* End of try_next loop */
312 
313   code += (code[1] << 8) + code[2];   /* Advance to next branch */
314   }
315 while (*code == OP_ALT);
316 return TRUE;
317 }
318 
319 
320 
321 /*************************************************
322 *          Study a compiled expression           *
323 *************************************************/
324 
325 /* This function is handed a compiled expression that it must study to produce
326 information that will speed up the matching. It returns a pcre_extra block
327 which then gets handed back to pcre_exec().
328 
329 Arguments:
330   re        points to the compiled expression
331   options   contains option bits
332   errorptr  points to where to place error messages;
333             set NULL unless error
334 
335 Returns:    pointer to a pcre_extra block,
336             NULL on error or if no optimization possible
337 */
338 
339 pcre_extra *
pcre_study(const pcre * external_re,int options,const char ** errorptr)340 pcre_study(const pcre *external_re, int options, const char **errorptr)
341 {
342 uschar start_bits[32];
343 real_pcre_extra *extra;
344 const real_pcre *re = (const real_pcre *)external_re;
345 compile_data compile_block;
346 
347 *errorptr = NULL;
348 
349 if (re == NULL || re->magic_number != MAGIC_NUMBER)
350   {
351   *errorptr = "argument is not a compiled regular expression";
352   return NULL;
353   }
354 
355 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
356   {
357   *errorptr = "unknown or incorrect option bit(s) set";
358   return NULL;
359   }
360 
361 /* For an anchored pattern, or an unchored pattern that has a first char, or a
362 multiline pattern that matches only at "line starts", no further processing at
363 present. */
364 
365 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
366   return NULL;
367 
368 /* Set the character tables in the block which is passed around */
369 
370 compile_block.lcc = re->tables + lcc_offset;
371 compile_block.fcc = re->tables + fcc_offset;
372 compile_block.cbits = re->tables + cbits_offset;
373 compile_block.ctypes = re->tables + ctypes_offset;
374 
375 /* See if we can find a fixed set of initial characters for the pattern. */
376 
377 memset(start_bits, 0, 32 * sizeof(uschar));
378 if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
379   &compile_block)) return NULL;
380 
381 /* Get an "extra" block and put the information therein. */
382 
383 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
384 
385 if (extra == NULL)
386   {
387   *errorptr = "failed to get memory";
388   return NULL;
389   }
390 
391 extra->options = PCRE_STUDY_MAPPED;
392 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
393 
394 return (pcre_extra *)extra;
395 }
396 
397 /* End of study.c */
398