1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2000 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
35
36 /* Include the internals header, which itself includes Standard C headers plus
37 the external pcre header. */
38
39 #include "internal.h"
40
41
42
43 /*************************************************
44 * Set a bit and maybe its alternate case *
45 *************************************************/
46
47 /* Given a character, set its bit in the table, and also the bit for the other
48 version of a letter if we are caseless.
49
50 Arguments:
51 start_bits points to the bit map
52 c is the character
53 caseless the caseless flag
54 cd the block with char table pointers
55
56 Returns: nothing
57 */
58
59 static void
set_bit(uschar * start_bits,int c,BOOL caseless,compile_data * cd)60 set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61 {
62 start_bits[c/8] |= (1 << (c&7));
63 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64 start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65 }
66
67
68
69 /*************************************************
70 * Create bitmap of starting chars *
71 *************************************************/
72
73 /* This function scans a compiled unanchored expression and attempts to build a
74 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
75 goes by, we may be able to get more clever at doing this.
76
77 Arguments:
78 code points to an expression
79 start_bits points to a 32-byte table, initialized to 0
80 caseless the current state of the caseless flag
81 cd the block with char table pointers
82
83 Returns: TRUE if table built, FALSE otherwise
84 */
85
86 static BOOL
set_start_bits(const uschar * code,uschar * start_bits,BOOL caseless,compile_data * cd)87 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
88 compile_data *cd)
89 {
90 register int c;
91
92 /* This next statement and the later reference to dummy are here in order to
93 trick the optimizer of the IBM C compiler for OS/2 into generating correct
94 code. Apparently IBM isn't going to fix the problem, and we would rather not
95 disable optimization (in this module it actually makes a big difference, and
96 the pcre module can use all the optimization it can get). */
97
98 volatile int dummy;
99
100 do
101 {
102 const uschar *tcode = code + 3;
103 BOOL try_next = TRUE;
104
105 while (try_next)
106 {
107 try_next = FALSE;
108
109 /* If a branch starts with a bracket or a positive lookahead assertion,
110 recurse to set bits from within them. That's all for this branch. */
111
112 if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
113 {
114 if (!set_start_bits(tcode, start_bits, caseless, cd))
115 return FALSE;
116 }
117
118 else switch(*tcode)
119 {
120 default:
121 return FALSE;
122
123 /* Skip over lookbehind and negative lookahead assertions */
124
125 case OP_ASSERT_NOT:
126 case OP_ASSERTBACK:
127 case OP_ASSERTBACK_NOT:
128 try_next = TRUE;
129 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
130 tcode += 3;
131 break;
132
133 /* Skip over an option setting, changing the caseless flag */
134
135 case OP_OPT:
136 caseless = (tcode[1] & PCRE_CASELESS) != 0;
137 tcode += 2;
138 try_next = TRUE;
139 break;
140
141 /* BRAZERO does the bracket, but carries on. */
142
143 case OP_BRAZERO:
144 case OP_BRAMINZERO:
145 if (!set_start_bits(++tcode, start_bits, caseless, cd))
146 return FALSE;
147 dummy = 1;
148 do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
149 tcode += 3;
150 try_next = TRUE;
151 break;
152
153 /* Single-char * or ? sets the bit and tries the next item */
154
155 case OP_STAR:
156 case OP_MINSTAR:
157 case OP_QUERY:
158 case OP_MINQUERY:
159 set_bit(start_bits, tcode[1], caseless, cd);
160 tcode += 2;
161 try_next = TRUE;
162 break;
163
164 /* Single-char upto sets the bit and tries the next */
165
166 case OP_UPTO:
167 case OP_MINUPTO:
168 set_bit(start_bits, tcode[3], caseless, cd);
169 tcode += 4;
170 try_next = TRUE;
171 break;
172
173 /* At least one single char sets the bit and stops */
174
175 case OP_EXACT: /* Fall through */
176 tcode++;
177
178 case OP_CHARS: /* Fall through */
179 tcode++;
180
181 case OP_PLUS:
182 case OP_MINPLUS:
183 set_bit(start_bits, tcode[1], caseless, cd);
184 break;
185
186 /* Single character type sets the bits and stops */
187
188 case OP_NOT_DIGIT:
189 for (c = 0; c < 32; c++)
190 start_bits[c] |= ~cd->cbits[c+cbit_digit];
191 break;
192
193 case OP_DIGIT:
194 for (c = 0; c < 32; c++)
195 start_bits[c] |= cd->cbits[c+cbit_digit];
196 break;
197
198 case OP_NOT_WHITESPACE:
199 for (c = 0; c < 32; c++)
200 start_bits[c] |= ~cd->cbits[c+cbit_space];
201 break;
202
203 case OP_WHITESPACE:
204 for (c = 0; c < 32; c++)
205 start_bits[c] |= cd->cbits[c+cbit_space];
206 break;
207
208 case OP_NOT_WORDCHAR:
209 for (c = 0; c < 32; c++)
210 start_bits[c] |= ~cd->cbits[c+cbit_word];
211 break;
212
213 case OP_WORDCHAR:
214 for (c = 0; c < 32; c++)
215 start_bits[c] |= cd->cbits[c+cbit_word];
216 break;
217
218 /* One or more character type fudges the pointer and restarts, knowing
219 it will hit a single character type and stop there. */
220
221 case OP_TYPEPLUS:
222 case OP_TYPEMINPLUS:
223 tcode++;
224 try_next = TRUE;
225 break;
226
227 case OP_TYPEEXACT:
228 tcode += 3;
229 try_next = TRUE;
230 break;
231
232 /* Zero or more repeats of character types set the bits and then
233 try again. */
234
235 case OP_TYPEUPTO:
236 case OP_TYPEMINUPTO:
237 tcode += 2; /* Fall through */
238
239 case OP_TYPESTAR:
240 case OP_TYPEMINSTAR:
241 case OP_TYPEQUERY:
242 case OP_TYPEMINQUERY:
243 switch(tcode[1])
244 {
245 case OP_NOT_DIGIT:
246 for (c = 0; c < 32; c++)
247 start_bits[c] |= ~cd->cbits[c+cbit_digit];
248 break;
249
250 case OP_DIGIT:
251 for (c = 0; c < 32; c++)
252 start_bits[c] |= cd->cbits[c+cbit_digit];
253 break;
254
255 case OP_NOT_WHITESPACE:
256 for (c = 0; c < 32; c++)
257 start_bits[c] |= ~cd->cbits[c+cbit_space];
258 break;
259
260 case OP_WHITESPACE:
261 for (c = 0; c < 32; c++)
262 start_bits[c] |= cd->cbits[c+cbit_space];
263 break;
264
265 case OP_NOT_WORDCHAR:
266 for (c = 0; c < 32; c++)
267 start_bits[c] |= ~cd->cbits[c+cbit_word];
268 break;
269
270 case OP_WORDCHAR:
271 for (c = 0; c < 32; c++)
272 start_bits[c] |= cd->cbits[c+cbit_word];
273 break;
274 }
275
276 tcode += 2;
277 try_next = TRUE;
278 break;
279
280 /* Character class: set the bits and either carry on or not,
281 according to the repeat count. */
282
283 case OP_CLASS:
284 {
285 tcode++;
286 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
287 tcode += 32;
288 switch (*tcode)
289 {
290 case OP_CRSTAR:
291 case OP_CRMINSTAR:
292 case OP_CRQUERY:
293 case OP_CRMINQUERY:
294 tcode++;
295 try_next = TRUE;
296 break;
297
298 case OP_CRRANGE:
299 case OP_CRMINRANGE:
300 if (((tcode[1] << 8) + tcode[2]) == 0)
301 {
302 tcode += 5;
303 try_next = TRUE;
304 }
305 break;
306 }
307 }
308 break; /* End of class handling */
309
310 } /* End of switch */
311 } /* End of try_next loop */
312
313 code += (code[1] << 8) + code[2]; /* Advance to next branch */
314 }
315 while (*code == OP_ALT);
316 return TRUE;
317 }
318
319
320
321 /*************************************************
322 * Study a compiled expression *
323 *************************************************/
324
325 /* This function is handed a compiled expression that it must study to produce
326 information that will speed up the matching. It returns a pcre_extra block
327 which then gets handed back to pcre_exec().
328
329 Arguments:
330 re points to the compiled expression
331 options contains option bits
332 errorptr points to where to place error messages;
333 set NULL unless error
334
335 Returns: pointer to a pcre_extra block,
336 NULL on error or if no optimization possible
337 */
338
339 pcre_extra *
pcre_study(const pcre * external_re,int options,const char ** errorptr)340 pcre_study(const pcre *external_re, int options, const char **errorptr)
341 {
342 uschar start_bits[32];
343 real_pcre_extra *extra;
344 const real_pcre *re = (const real_pcre *)external_re;
345 compile_data compile_block;
346
347 *errorptr = NULL;
348
349 if (re == NULL || re->magic_number != MAGIC_NUMBER)
350 {
351 *errorptr = "argument is not a compiled regular expression";
352 return NULL;
353 }
354
355 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
356 {
357 *errorptr = "unknown or incorrect option bit(s) set";
358 return NULL;
359 }
360
361 /* For an anchored pattern, or an unchored pattern that has a first char, or a
362 multiline pattern that matches only at "line starts", no further processing at
363 present. */
364
365 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
366 return NULL;
367
368 /* Set the character tables in the block which is passed around */
369
370 compile_block.lcc = re->tables + lcc_offset;
371 compile_block.fcc = re->tables + fcc_offset;
372 compile_block.cbits = re->tables + cbits_offset;
373 compile_block.ctypes = re->tables + ctypes_offset;
374
375 /* See if we can find a fixed set of initial characters for the pattern. */
376
377 memset(start_bits, 0, 32 * sizeof(uschar));
378 if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
379 &compile_block)) return NULL;
380
381 /* Get an "extra" block and put the information therein. */
382
383 extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
384
385 if (extra == NULL)
386 {
387 *errorptr = "failed to get memory";
388 return NULL;
389 }
390
391 extra->options = PCRE_STUDY_MAPPED;
392 memcpy(extra->start_bits, start_bits, sizeof(start_bits));
393
394 return (pcre_extra *)extra;
395 }
396
397 /* End of study.c */
398