1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9 
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11 
12            Copyright (c) 1997-2001 University of Cambridge
13 
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18 
19 1. This software is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22 
23 2. The origin of this software must not be misrepresented, either by
24    explicit claim or by omission.
25 
26 3. Altered versions must be plainly marked as such, and must not be
27    misrepresented as being the original software.
28 
29 4. If PCRE is embedded in any software that is released under the GNU
30    General Purpose Licence (GPL), then the terms of that licence shall
31    supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34 
35 
36 /* Include the internals header, which itself includes Standard C headers plus
37 the external pcre header. */
38 
39 #include "pcre_internal.h"
40 
41 
42 
43 /*************************************************
44 *      Set a bit and maybe its alternate case    *
45 *************************************************/
46 
47 /* Given a character, set its bit in the table, and also the bit for the other
48 version of a letter if we are caseless.
49 
50 Arguments:
51   start_bits    points to the bit map
52   c             is the character
53   caseless      the caseless flag
54   cd            the block with char table pointers
55 
56 Returns:        nothing
57 */
58 
59 static void
set_bit(uschar * start_bits,int c,BOOL caseless,compile_data * cd)60 set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61 {
62 start_bits[c/8] |= (1 << (c&7));
63 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65 }
66 
67 
68 
69 /*************************************************
70 *          Create bitmap of starting chars       *
71 *************************************************/
72 
73 /* This function scans a compiled unanchored expression and attempts to build a
74 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
75 goes by, we may be able to get more clever at doing this.
76 
77 Arguments:
78   code         points to an expression
79   start_bits   points to a 32-byte table, initialized to 0
80   caseless     the current state of the caseless flag
81   cd           the block with char table pointers
82 
83 Returns:       TRUE if table built, FALSE otherwise
84 */
85 
86 static BOOL
set_start_bits(const uschar * code,uschar * start_bits,BOOL caseless,compile_data * cd)87 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
88   compile_data *cd)
89 {
90 register int c;
91 
92 /* This next statement and the later reference to dummy are here in order to
93 trick the optimizer of the IBM C compiler for OS/2 into generating correct
94 code. Apparently IBM isn't going to fix the problem, and we would rather not
95 disable optimization (in this module it actually makes a big difference, and
96 the pcre module can use all the optimization it can get). */
97 
98 volatile int dummy;
99 
100 do
101   {
102   const uschar *tcode = code + 3;
103   BOOL try_next = TRUE;
104 
105   while (try_next)
106     {
107     /* If a branch starts with a bracket or a positive lookahead assertion,
108     recurse to set bits from within them. That's all for this branch. */
109 
110     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
111       {
112       if (!set_start_bits(tcode, start_bits, caseless, cd))
113         return FALSE;
114       try_next = FALSE;
115       }
116 
117     else switch(*tcode)
118       {
119       default:
120       return FALSE;
121 
122       /* Skip over extended extraction bracket number */
123 
124       case OP_BRANUMBER:
125       tcode += 3;
126       break;
127 
128       /* Skip over lookbehind and negative lookahead assertions */
129 
130       case OP_ASSERT_NOT:
131       case OP_ASSERTBACK:
132       case OP_ASSERTBACK_NOT:
133       do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
134       tcode += 3;
135       break;
136 
137       /* Skip over an option setting, changing the caseless flag */
138 
139       case OP_OPT:
140       caseless = (tcode[1] & PCRE_CASELESS) != 0;
141       tcode += 2;
142       break;
143 
144       /* BRAZERO does the bracket, but carries on. */
145 
146       case OP_BRAZERO:
147       case OP_BRAMINZERO:
148       if (!set_start_bits(++tcode, start_bits, caseless, cd))
149         return FALSE;
150       dummy = 1;
151       do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
152       tcode += 3;
153       break;
154 
155       /* Single-char * or ? sets the bit and tries the next item */
156 
157       case OP_STAR:
158       case OP_MINSTAR:
159       case OP_QUERY:
160       case OP_MINQUERY:
161       set_bit(start_bits, tcode[1], caseless, cd);
162       tcode += 2;
163       break;
164 
165       /* Single-char upto sets the bit and tries the next */
166 
167       case OP_UPTO:
168       case OP_MINUPTO:
169       set_bit(start_bits, tcode[3], caseless, cd);
170       tcode += 4;
171       break;
172 
173       /* At least one single char sets the bit and stops */
174 
175       case OP_EXACT:       /* Fall through */
176       tcode++;
177 
178       case OP_CHARS:       /* Fall through */
179       tcode++;
180 
181       case OP_PLUS:
182       case OP_MINPLUS:
183       set_bit(start_bits, tcode[1], caseless, cd);
184       try_next = FALSE;
185       break;
186 
187       /* Single character type sets the bits and stops */
188 
189       case OP_NOT_DIGIT:
190       for (c = 0; c < 32; c++)
191         start_bits[c] |= ~cd->cbits[c+cbit_digit];
192       try_next = FALSE;
193       break;
194 
195       case OP_DIGIT:
196       for (c = 0; c < 32; c++)
197         start_bits[c] |= cd->cbits[c+cbit_digit];
198       try_next = FALSE;
199       break;
200 
201       case OP_NOT_WHITESPACE:
202       for (c = 0; c < 32; c++)
203         start_bits[c] |= ~cd->cbits[c+cbit_space];
204       try_next = FALSE;
205       break;
206 
207       case OP_WHITESPACE:
208       for (c = 0; c < 32; c++)
209         start_bits[c] |= cd->cbits[c+cbit_space];
210       try_next = FALSE;
211       break;
212 
213       case OP_NOT_WORDCHAR:
214       for (c = 0; c < 32; c++)
215         start_bits[c] |= ~cd->cbits[c+cbit_word];
216       try_next = FALSE;
217       break;
218 
219       case OP_WORDCHAR:
220       for (c = 0; c < 32; c++)
221         start_bits[c] |= cd->cbits[c+cbit_word];
222       try_next = FALSE;
223       break;
224 
225       /* One or more character type fudges the pointer and restarts, knowing
226       it will hit a single character type and stop there. */
227 
228       case OP_TYPEPLUS:
229       case OP_TYPEMINPLUS:
230       tcode++;
231       break;
232 
233       case OP_TYPEEXACT:
234       tcode += 3;
235       break;
236 
237       /* Zero or more repeats of character types set the bits and then
238       try again. */
239 
240       case OP_TYPEUPTO:
241       case OP_TYPEMINUPTO:
242       tcode += 2;               /* Fall through */
243 
244       case OP_TYPESTAR:
245       case OP_TYPEMINSTAR:
246       case OP_TYPEQUERY:
247       case OP_TYPEMINQUERY:
248       switch(tcode[1])
249         {
250         case OP_NOT_DIGIT:
251         for (c = 0; c < 32; c++)
252           start_bits[c] |= ~cd->cbits[c+cbit_digit];
253         break;
254 
255         case OP_DIGIT:
256         for (c = 0; c < 32; c++)
257           start_bits[c] |= cd->cbits[c+cbit_digit];
258         break;
259 
260         case OP_NOT_WHITESPACE:
261         for (c = 0; c < 32; c++)
262           start_bits[c] |= ~cd->cbits[c+cbit_space];
263         break;
264 
265         case OP_WHITESPACE:
266         for (c = 0; c < 32; c++)
267           start_bits[c] |= cd->cbits[c+cbit_space];
268         break;
269 
270         case OP_NOT_WORDCHAR:
271         for (c = 0; c < 32; c++)
272           start_bits[c] |= ~cd->cbits[c+cbit_word];
273         break;
274 
275         case OP_WORDCHAR:
276         for (c = 0; c < 32; c++)
277           start_bits[c] |= cd->cbits[c+cbit_word];
278         break;
279         }
280 
281       tcode += 2;
282       break;
283 
284       /* Character class: set the bits and either carry on or not,
285       according to the repeat count. */
286 
287       case OP_CLASS:
288         {
289         tcode++;
290         for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
291         tcode += 32;
292         switch (*tcode)
293           {
294           case OP_CRSTAR:
295           case OP_CRMINSTAR:
296           case OP_CRQUERY:
297           case OP_CRMINQUERY:
298           tcode++;
299           break;
300 
301           case OP_CRRANGE:
302           case OP_CRMINRANGE:
303           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
304             else try_next = FALSE;
305           break;
306 
307           default:
308           try_next = FALSE;
309           break;
310           }
311         }
312       break; /* End of class handling */
313 
314       }      /* End of switch */
315     }        /* End of try_next loop */
316 
317   code += (code[1] << 8) + code[2];   /* Advance to next branch */
318   }
319 while (*code == OP_ALT);
320 return TRUE;
321 }
322 
323 
324 
325 /*************************************************
326 *          Study a compiled expression           *
327 *************************************************/
328 
329 /* This function is handed a compiled expression that it must study to produce
330 information that will speed up the matching. It returns a pcre_extra block
331 which then gets handed back to pcre_exec().
332 
333 Arguments:
334   re        points to the compiled expression
335   options   contains option bits
336   errorptr  points to where to place error messages;
337             set NULL unless error
338 
339 Returns:    pointer to a pcre_extra block,
340             NULL on error or if no optimization possible
341 */
342 
343 pcre_extra *
pcre_study(const pcre * external_re,int options,const char ** errorptr)344 pcre_study(const pcre *external_re, int options, const char **errorptr)
345 {
346 uschar start_bits[32];
347 real_pcre_extra *extra;
348 const real_pcre *re = (const real_pcre *)external_re;
349 compile_data compile_block;
350 
351 *errorptr = NULL;
352 
353 if (re == NULL || re->magic_number != MAGIC_NUMBER)
354   {
355   *errorptr = "argument is not a compiled regular expression";
356   return NULL;
357   }
358 
359 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
360   {
361   *errorptr = "unknown or incorrect option bit(s) set";
362   return NULL;
363   }
364 
365 /* For an anchored pattern, or an unchored pattern that has a first char, or a
366 multiline pattern that matches only at "line starts", no further processing at
367 present. */
368 
369 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
370   return NULL;
371 
372 /* Set the character tables in the block which is passed around */
373 
374 compile_block.lcc = re->tables + lcc_offset;
375 compile_block.fcc = re->tables + fcc_offset;
376 compile_block.cbits = re->tables + cbits_offset;
377 compile_block.ctypes = re->tables + ctypes_offset;
378 
379 /* See if we can find a fixed set of initial characters for the pattern. */
380 
381 memset(start_bits, 0, 32 * sizeof(uschar));
382 if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
383   &compile_block)) return NULL;
384 
385 /* Get an "extra" block and put the information therein. */
386 
387 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
388 
389 if (extra == NULL)
390   {
391   *errorptr = "failed to get memory";
392   return NULL;
393   }
394 
395 extra->options = PCRE_STUDY_MAPPED;
396 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
397 
398 return (pcre_extra *)extra;
399 }
400 
401 /* End of study.c */
402