1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2001 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35
36 /* Include the internals header, which itself includes Standard C headers plus
37 the external pcre header. */
38
39 #include "pcre_internal.h"
40
41
42
43 /*************************************************
44 * Set a bit and maybe its alternate case *
45 *************************************************/
46
47 /* Given a character, set its bit in the table, and also the bit for the other
48 version of a letter if we are caseless.
49
50 Arguments:
51 start_bits points to the bit map
52 c is the character
53 caseless the caseless flag
54 cd the block with char table pointers
55
56 Returns: nothing
57 */
58
59 static void
set_bit(uschar * start_bits,int c,BOOL caseless,compile_data * cd)60 set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61 {
62 start_bits[c/8] |= (1 << (c&7));
63 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64 start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65 }
66
67
68
69 /*************************************************
70 * Create bitmap of starting chars *
71 *************************************************/
72
73 /* This function scans a compiled unanchored expression and attempts to build a
74 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
75 goes by, we may be able to get more clever at doing this.
76
77 Arguments:
78 code points to an expression
79 start_bits points to a 32-byte table, initialized to 0
80 caseless the current state of the caseless flag
81 cd the block with char table pointers
82
83 Returns: TRUE if table built, FALSE otherwise
84 */
85
86 static BOOL
set_start_bits(const uschar * code,uschar * start_bits,BOOL caseless,compile_data * cd)87 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
88 compile_data *cd)
89 {
90 register int c;
91
92 /* This next statement and the later reference to dummy are here in order to
93 trick the optimizer of the IBM C compiler for OS/2 into generating correct
94 code. Apparently IBM isn't going to fix the problem, and we would rather not
95 disable optimization (in this module it actually makes a big difference, and
96 the pcre module can use all the optimization it can get). */
97
98 volatile int dummy;
99
100 do
101 {
102 const uschar *tcode = code + 3;
103 BOOL try_next = TRUE;
104
105 while (try_next)
106 {
107 /* If a branch starts with a bracket or a positive lookahead assertion,
108 recurse to set bits from within them. That's all for this branch. */
109
110 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
111 {
112 if (!set_start_bits(tcode, start_bits, caseless, cd))
113 return FALSE;
114 try_next = FALSE;
115 }
116
117 else switch(*tcode)
118 {
119 default:
120 return FALSE;
121
122 /* Skip over extended extraction bracket number */
123
124 case OP_BRANUMBER:
125 tcode += 3;
126 break;
127
128 /* Skip over lookbehind and negative lookahead assertions */
129
130 case OP_ASSERT_NOT:
131 case OP_ASSERTBACK:
132 case OP_ASSERTBACK_NOT:
133 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
134 tcode += 3;
135 break;
136
137 /* Skip over an option setting, changing the caseless flag */
138
139 case OP_OPT:
140 caseless = (tcode[1] & PCRE_CASELESS) != 0;
141 tcode += 2;
142 break;
143
144 /* BRAZERO does the bracket, but carries on. */
145
146 case OP_BRAZERO:
147 case OP_BRAMINZERO:
148 if (!set_start_bits(++tcode, start_bits, caseless, cd))
149 return FALSE;
150 dummy = 1;
151 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
152 tcode += 3;
153 break;
154
155 /* Single-char * or ? sets the bit and tries the next item */
156
157 case OP_STAR:
158 case OP_MINSTAR:
159 case OP_QUERY:
160 case OP_MINQUERY:
161 set_bit(start_bits, tcode[1], caseless, cd);
162 tcode += 2;
163 break;
164
165 /* Single-char upto sets the bit and tries the next */
166
167 case OP_UPTO:
168 case OP_MINUPTO:
169 set_bit(start_bits, tcode[3], caseless, cd);
170 tcode += 4;
171 break;
172
173 /* At least one single char sets the bit and stops */
174
175 case OP_EXACT: /* Fall through */
176 tcode++;
177
178 case OP_CHARS: /* Fall through */
179 tcode++;
180
181 case OP_PLUS:
182 case OP_MINPLUS:
183 set_bit(start_bits, tcode[1], caseless, cd);
184 try_next = FALSE;
185 break;
186
187 /* Single character type sets the bits and stops */
188
189 case OP_NOT_DIGIT:
190 for (c = 0; c < 32; c++)
191 start_bits[c] |= ~cd->cbits[c+cbit_digit];
192 try_next = FALSE;
193 break;
194
195 case OP_DIGIT:
196 for (c = 0; c < 32; c++)
197 start_bits[c] |= cd->cbits[c+cbit_digit];
198 try_next = FALSE;
199 break;
200
201 case OP_NOT_WHITESPACE:
202 for (c = 0; c < 32; c++)
203 start_bits[c] |= ~cd->cbits[c+cbit_space];
204 try_next = FALSE;
205 break;
206
207 case OP_WHITESPACE:
208 for (c = 0; c < 32; c++)
209 start_bits[c] |= cd->cbits[c+cbit_space];
210 try_next = FALSE;
211 break;
212
213 case OP_NOT_WORDCHAR:
214 for (c = 0; c < 32; c++)
215 start_bits[c] |= ~cd->cbits[c+cbit_word];
216 try_next = FALSE;
217 break;
218
219 case OP_WORDCHAR:
220 for (c = 0; c < 32; c++)
221 start_bits[c] |= cd->cbits[c+cbit_word];
222 try_next = FALSE;
223 break;
224
225 /* One or more character type fudges the pointer and restarts, knowing
226 it will hit a single character type and stop there. */
227
228 case OP_TYPEPLUS:
229 case OP_TYPEMINPLUS:
230 tcode++;
231 break;
232
233 case OP_TYPEEXACT:
234 tcode += 3;
235 break;
236
237 /* Zero or more repeats of character types set the bits and then
238 try again. */
239
240 case OP_TYPEUPTO:
241 case OP_TYPEMINUPTO:
242 tcode += 2; /* Fall through */
243
244 case OP_TYPESTAR:
245 case OP_TYPEMINSTAR:
246 case OP_TYPEQUERY:
247 case OP_TYPEMINQUERY:
248 switch(tcode[1])
249 {
250 case OP_NOT_DIGIT:
251 for (c = 0; c < 32; c++)
252 start_bits[c] |= ~cd->cbits[c+cbit_digit];
253 break;
254
255 case OP_DIGIT:
256 for (c = 0; c < 32; c++)
257 start_bits[c] |= cd->cbits[c+cbit_digit];
258 break;
259
260 case OP_NOT_WHITESPACE:
261 for (c = 0; c < 32; c++)
262 start_bits[c] |= ~cd->cbits[c+cbit_space];
263 break;
264
265 case OP_WHITESPACE:
266 for (c = 0; c < 32; c++)
267 start_bits[c] |= cd->cbits[c+cbit_space];
268 break;
269
270 case OP_NOT_WORDCHAR:
271 for (c = 0; c < 32; c++)
272 start_bits[c] |= ~cd->cbits[c+cbit_word];
273 break;
274
275 case OP_WORDCHAR:
276 for (c = 0; c < 32; c++)
277 start_bits[c] |= cd->cbits[c+cbit_word];
278 break;
279 }
280
281 tcode += 2;
282 break;
283
284 /* Character class: set the bits and either carry on or not,
285 according to the repeat count. */
286
287 case OP_CLASS:
288 {
289 tcode++;
290 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
291 tcode += 32;
292 switch (*tcode)
293 {
294 case OP_CRSTAR:
295 case OP_CRMINSTAR:
296 case OP_CRQUERY:
297 case OP_CRMINQUERY:
298 tcode++;
299 break;
300
301 case OP_CRRANGE:
302 case OP_CRMINRANGE:
303 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
304 else try_next = FALSE;
305 break;
306
307 default:
308 try_next = FALSE;
309 break;
310 }
311 }
312 break; /* End of class handling */
313
314 } /* End of switch */
315 } /* End of try_next loop */
316
317 code += (code[1] << 8) + code[2]; /* Advance to next branch */
318 }
319 while (*code == OP_ALT);
320 return TRUE;
321 }
322
323
324
325 /*************************************************
326 * Study a compiled expression *
327 *************************************************/
328
329 /* This function is handed a compiled expression that it must study to produce
330 information that will speed up the matching. It returns a pcre_extra block
331 which then gets handed back to pcre_exec().
332
333 Arguments:
334 re points to the compiled expression
335 options contains option bits
336 errorptr points to where to place error messages;
337 set NULL unless error
338
339 Returns: pointer to a pcre_extra block,
340 NULL on error or if no optimization possible
341 */
342
343 pcre_extra *
pcre_study(const pcre * external_re,int options,const char ** errorptr)344 pcre_study(const pcre *external_re, int options, const char **errorptr)
345 {
346 uschar start_bits[32];
347 real_pcre_extra *extra;
348 const real_pcre *re = (const real_pcre *)external_re;
349 compile_data compile_block;
350
351 *errorptr = NULL;
352
353 if (re == NULL || re->magic_number != MAGIC_NUMBER)
354 {
355 *errorptr = "argument is not a compiled regular expression";
356 return NULL;
357 }
358
359 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
360 {
361 *errorptr = "unknown or incorrect option bit(s) set";
362 return NULL;
363 }
364
365 /* For an anchored pattern, or an unchored pattern that has a first char, or a
366 multiline pattern that matches only at "line starts", no further processing at
367 present. */
368
369 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
370 return NULL;
371
372 /* Set the character tables in the block which is passed around */
373
374 compile_block.lcc = re->tables + lcc_offset;
375 compile_block.fcc = re->tables + fcc_offset;
376 compile_block.cbits = re->tables + cbits_offset;
377 compile_block.ctypes = re->tables + ctypes_offset;
378
379 /* See if we can find a fixed set of initial characters for the pattern. */
380
381 memset(start_bits, 0, 32 * sizeof(uschar));
382 if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
383 &compile_block)) return NULL;
384
385 /* Get an "extra" block and put the information therein. */
386
387 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
388
389 if (extra == NULL)
390 {
391 *errorptr = "failed to get memory";
392 return NULL;
393 }
394
395 extra->options = PCRE_STUDY_MAPPED;
396 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
397
398 return (pcre_extra *)extra;
399 }
400
401 /* End of study.c */
402