1 /*
2 tre-match-parallel.c - TRE parallel regex matching engine
3
4 Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>.
5
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with this library; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
20 */
21
22 /*
23 This algorithm searches for matches basically by reading characters
24 in the searched string one by one, starting at the beginning. All
25 matching paths in the TNFA are traversed in parallel. When two or
26 more paths reach the same state, exactly one is chosen according to
27 tag ordering rules; if returning submatches is not required it does
28 not matter which path is chosen.
29
30 The worst case time required for finding the leftmost and longest
31 match, or determining that there is no match, is always linearly
32 dependent on the length of the text being searched.
33
34 This algorithm cannot handle TNFAs with back referencing nodes.
35 See `tre-match-backtrack.c'.
36 */
37
38
39 #ifdef HAVE_CONFIG_H
40 #include <config.h>
41 #endif /* HAVE_CONFIG_H */
42
43 #ifdef TRE_USE_ALLOCA
44 /* AIX requires this to be the first thing in the file. */
45 #ifndef __GNUC__
46 # if HAVE_ALLOCA_H
47 # include <alloca.h>
48 # else
49 # ifdef _AIX
50 #pragma alloca
51 # else
52 # ifndef alloca /* predefined by HP cc +Olibcalls */
53 char *alloca ();
54 # endif
55 # endif
56 # endif
57 #endif
58 #endif /* TRE_USE_ALLOCA */
59
60 #include <assert.h>
61 #include <stdlib.h>
62 #include <string.h>
63 #ifdef HAVE_WCHAR_H
64 #include <wchar.h>
65 #endif /* HAVE_WCHAR_H */
66 #ifdef HAVE_WCTYPE_H
67 #include <wctype.h>
68 #endif /* HAVE_WCTYPE_H */
69 #ifndef TRE_WCHAR
70 #include <ctype.h>
71 #endif /* !TRE_WCHAR */
72 #ifdef HAVE_MALLOC_H
73 #include <malloc.h>
74 #endif /* HAVE_MALLOC_H */
75
76 #include "tre-internal.h"
77 #include "tre-match-utils.h"
78 #include "regex.h"
79 #include "xmalloc.h"
80
81
82
83 typedef struct {
84 tre_tnfa_transition_t *state;
85 int *tags;
86 } tre_tnfa_reach_t;
87
88 typedef struct {
89 int pos;
90 int **tags;
91 } tre_reach_pos_t;
92
93
94 #ifdef TRE_DEBUG
95 static void
tre_print_reach(const tre_tnfa_t * tnfa,tre_tnfa_reach_t * reach,int num_tags)96 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
97 {
98 int i;
99
100 while (reach->state != NULL)
101 {
102 DPRINT((" %p", (void *)reach->state));
103 if (num_tags > 0)
104 {
105 DPRINT(("/"));
106 for (i = 0; i < num_tags; i++)
107 {
108 DPRINT(("%d:%d", i, reach->tags[i]));
109 if (i < (num_tags-1))
110 DPRINT((","));
111 }
112 }
113 reach++;
114 }
115 DPRINT(("\n"));
116
117 }
118 #endif /* TRE_DEBUG */
119
120 reg_errcode_t
tre_tnfa_run_parallel(const tre_tnfa_t * tnfa,const void * string,int len,tre_str_type_t type,int * match_tags,int eflags,int * match_end_ofs)121 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
122 tre_str_type_t type, int *match_tags, int eflags,
123 int *match_end_ofs)
124 {
125 /* State variables required by GET_NEXT_WCHAR. */
126 tre_char_t prev_c = 0, next_c = 0;
127 const char *str_byte = string;
128 int pos = -1;
129 unsigned int pos_add_next = 1;
130 #ifdef TRE_WCHAR
131 const wchar_t *str_wide = string;
132 #ifdef TRE_MBSTATE
133 mbstate_t mbstate;
134 #endif /* TRE_MBSTATE */
135 #endif /* TRE_WCHAR */
136 int reg_notbol = eflags & REG_NOTBOL;
137 int reg_noteol = eflags & REG_NOTEOL;
138 int reg_newline = tnfa->cflags & REG_NEWLINE;
139 int str_user_end = 0;
140
141 char *buf;
142 tre_tnfa_transition_t *trans_i;
143 tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
144 tre_reach_pos_t *reach_pos;
145 int *tag_i;
146 int num_tags, i;
147
148 int match_eo = -1; /* end offset of match (-1 if no match found yet) */
149 int new_match = 0;
150 int *tmp_tags = NULL;
151 int *tmp_iptr;
152
153 #ifdef TRE_MBSTATE
154 memset(&mbstate, '\0', sizeof(mbstate));
155 #endif /* TRE_MBSTATE */
156
157 DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
158
159 if (!match_tags)
160 num_tags = 0;
161 else
162 num_tags = tnfa->num_tags;
163
164 /* Allocate memory for temporary data required for matching. This needs to
165 be done for every matching operation to be thread safe. This allocates
166 everything in a single large block from the stack frame using alloca()
167 or with malloc() if alloca is unavailable. */
168 {
169 int tbytes, rbytes, pbytes, xbytes, total_bytes;
170 char *tmp_buf;
171 /* Compute the length of the block we need. */
172 tbytes = sizeof(*tmp_tags) * num_tags;
173 rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
174 pbytes = sizeof(*reach_pos) * tnfa->num_states;
175 xbytes = sizeof(int) * num_tags;
176 total_bytes =
177 (sizeof(long) - 1) * 4 /* for alignment paddings */
178 + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
179
180 /* Allocate the memory. */
181 #ifdef TRE_USE_ALLOCA
182 buf = alloca(total_bytes);
183 #else /* !TRE_USE_ALLOCA */
184 buf = xmalloc(total_bytes);
185 #endif /* !TRE_USE_ALLOCA */
186 if (buf == NULL)
187 return REG_ESPACE;
188 memset(buf, 0, total_bytes);
189
190 /* Get the various pointers within tmp_buf (properly aligned). */
191 tmp_tags = (void *)buf;
192 tmp_buf = buf + tbytes;
193 tmp_buf += ALIGN(tmp_buf, long);
194 reach_next = (void *)tmp_buf;
195 tmp_buf += rbytes;
196 tmp_buf += ALIGN(tmp_buf, long);
197 reach = (void *)tmp_buf;
198 tmp_buf += rbytes;
199 tmp_buf += ALIGN(tmp_buf, long);
200 reach_pos = (void *)tmp_buf;
201 tmp_buf += pbytes;
202 tmp_buf += ALIGN(tmp_buf, long);
203 for (i = 0; i < tnfa->num_states; i++)
204 {
205 reach[i].tags = (void *)tmp_buf;
206 tmp_buf += xbytes;
207 reach_next[i].tags = (void *)tmp_buf;
208 tmp_buf += xbytes;
209 }
210 }
211
212 for (i = 0; i < tnfa->num_states; i++)
213 reach_pos[i].pos = -1;
214
215 /* If only one character can start a match, find it first. */
216 if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte)
217 {
218 const char *orig_str = str_byte;
219 int first = tnfa->first_char;
220
221 if (len >= 0)
222 str_byte = memchr(orig_str, first, len);
223 else
224 str_byte = strchr(orig_str, first);
225 if (str_byte == NULL)
226 {
227 #ifndef TRE_USE_ALLOCA
228 if (buf)
229 xfree(buf);
230 #endif /* !TRE_USE_ALLOCA */
231 return REG_NOMATCH;
232 }
233 DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
234 if (str_byte >= orig_str + 1)
235 prev_c = (unsigned char)*(str_byte - 1);
236 next_c = (unsigned char)*str_byte;
237 pos = str_byte - orig_str;
238 if (len < 0 || pos < len)
239 str_byte++;
240 }
241 else
242 {
243 GET_NEXT_WCHAR();
244 pos = 0;
245 }
246
247 #if 0
248 /* Skip over characters that cannot possibly be the first character
249 of a match. */
250 if (tnfa->firstpos_chars != NULL)
251 {
252 char *chars = tnfa->firstpos_chars;
253
254 if (len < 0)
255 {
256 const char *orig_str = str_byte;
257 /* XXX - use strpbrk() and wcspbrk() because they might be
258 optimized for the target architecture. Try also strcspn()
259 and wcscspn() and compare the speeds. */
260 while (next_c != L'\0' && !chars[next_c])
261 {
262 next_c = *str_byte++;
263 }
264 prev_c = *(str_byte - 2);
265 pos += str_byte - orig_str;
266 DPRINT(("skipped %d chars\n", str_byte - orig_str));
267 }
268 else
269 {
270 while (pos <= len && !chars[next_c])
271 {
272 prev_c = next_c;
273 next_c = (unsigned char)(*str_byte++);
274 pos++;
275 }
276 }
277 }
278 #endif
279
280 DPRINT(("length: %d\n", len));
281 DPRINT(("pos:chr/code | states and tags\n"));
282 DPRINT(("-------------+------------------------------------------------\n"));
283
284 reach_next_i = reach_next;
285 while (1)
286 {
287 /* If no match found yet, add the initial states to `reach_next'. */
288 if (match_eo < 0)
289 {
290 DPRINT((" init >"));
291 trans_i = tnfa->initial;
292 while (trans_i->state != NULL)
293 {
294 if (reach_pos[trans_i->state_id].pos < pos)
295 {
296 if (trans_i->assertions
297 && CHECK_ASSERTIONS(trans_i->assertions))
298 {
299 DPRINT(("assertion failed\n"));
300 trans_i++;
301 continue;
302 }
303
304 DPRINT((" %p", (void *)trans_i->state));
305 reach_next_i->state = trans_i->state;
306 for (i = 0; i < num_tags; i++)
307 reach_next_i->tags[i] = -1;
308 tag_i = trans_i->tags;
309 if (tag_i)
310 while (*tag_i >= 0)
311 {
312 if (*tag_i < num_tags)
313 reach_next_i->tags[*tag_i] = pos;
314 tag_i++;
315 }
316 if (reach_next_i->state == tnfa->final)
317 {
318 DPRINT((" found empty match\n"));
319 match_eo = pos;
320 new_match = 1;
321 for (i = 0; i < num_tags; i++)
322 match_tags[i] = reach_next_i->tags[i];
323 }
324 reach_pos[trans_i->state_id].pos = pos;
325 reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
326 reach_next_i++;
327 }
328 trans_i++;
329 }
330 DPRINT(("\n"));
331 reach_next_i->state = NULL;
332 }
333 else
334 {
335 if (num_tags == 0 || reach_next_i == reach_next)
336 /*�We have found a match. */
337 break;
338 }
339
340 /* Check for end of string. */
341 if (len < 0)
342 {
343 if (type == STR_USER)
344 {
345 if (str_user_end)
346 break;
347 }
348 else if (next_c == L'\0')
349 break;
350 }
351 else
352 {
353 if (pos >= len)
354 break;
355 }
356
357 GET_NEXT_WCHAR();
358
359 #ifdef TRE_DEBUG
360 DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
361 tre_print_reach(tnfa, reach_next, num_tags);
362 DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
363 tre_print_reach(tnfa, reach_next, num_tags);
364 #endif /* TRE_DEBUG */
365
366 /* Swap `reach' and `reach_next'. */
367 reach_i = reach;
368 reach = reach_next;
369 reach_next = reach_i;
370
371 /* For each state in `reach', weed out states that don't fulfill the
372 minimal matching conditions. */
373 if (tnfa->num_minimals && new_match)
374 {
375 new_match = 0;
376 reach_next_i = reach_next;
377 for (reach_i = reach; reach_i->state; reach_i++)
378 {
379 int i;
380 int skip = 0;
381 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
382 {
383 int end = tnfa->minimal_tags[i];
384 int start = tnfa->minimal_tags[i + 1];
385 DPRINT((" Minimal start %d, end %d\n", start, end));
386 if (end >= num_tags)
387 {
388 DPRINT((" Throwing %p out.\n", reach_i->state));
389 skip = 1;
390 break;
391 }
392 else if (reach_i->tags[start] == match_tags[start]
393 && reach_i->tags[end] < match_tags[end])
394 {
395 DPRINT((" Throwing %p out because t%d < %d\n",
396 reach_i->state, end, match_tags[end]));
397 skip = 1;
398 break;
399 }
400 }
401 if (!skip)
402 {
403 int *tmp_iptr;
404 reach_next_i->state = reach_i->state;
405 tmp_iptr = reach_next_i->tags;
406 reach_next_i->tags = reach_i->tags;
407 reach_i->tags = tmp_iptr;
408 reach_next_i++;
409 }
410 }
411 reach_next_i->state = NULL;
412
413 /* Swap `reach' and `reach_next'. */
414 reach_i = reach;
415 reach = reach_next;
416 reach_next = reach_i;
417 }
418
419 /* For each state in `reach' see if there is a transition leaving with
420 the current input symbol to a state not yet in `reach_next', and
421 add the destination states to `reach_next'. */
422 reach_next_i = reach_next;
423 for (reach_i = reach; reach_i->state; reach_i++)
424 {
425 for (trans_i = reach_i->state; trans_i->state; trans_i++)
426 {
427 /* Does this transition match the input symbol? */
428 if (trans_i->code_min <= prev_c &&
429 trans_i->code_max >= prev_c)
430 {
431 if (trans_i->assertions
432 && (CHECK_ASSERTIONS(trans_i->assertions)
433 /* Handle character class transitions. */
434 || ((trans_i->assertions & ASSERT_CHAR_CLASS)
435 && !(tnfa->cflags & REG_ICASE)
436 && !tre_isctype((tre_cint_t)prev_c,
437 trans_i->u.class))
438 || ((trans_i->assertions & ASSERT_CHAR_CLASS)
439 && (tnfa->cflags & REG_ICASE)
440 && (!tre_isctype(tre_tolower((tre_cint_t)prev_c),
441 trans_i->u.class)
442 && !tre_isctype(tre_toupper((tre_cint_t)prev_c),
443 trans_i->u.class)))
444 || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)
445 && tre_neg_char_classes_match(trans_i->neg_classes,
446 (tre_cint_t)prev_c,
447 tnfa->cflags & REG_ICASE))))
448 {
449 DPRINT(("assertion failed\n"));
450 continue;
451 }
452
453 /* Compute the tags after this transition. */
454 for (i = 0; i < num_tags; i++)
455 tmp_tags[i] = reach_i->tags[i];
456 tag_i = trans_i->tags;
457 if (tag_i != NULL)
458 while (*tag_i >= 0)
459 {
460 if (*tag_i < num_tags)
461 tmp_tags[*tag_i] = pos;
462 tag_i++;
463 }
464
465 if (reach_pos[trans_i->state_id].pos < pos)
466 {
467 /* Found an unvisited node. */
468 reach_next_i->state = trans_i->state;
469 tmp_iptr = reach_next_i->tags;
470 reach_next_i->tags = tmp_tags;
471 tmp_tags = tmp_iptr;
472 reach_pos[trans_i->state_id].pos = pos;
473 reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
474
475 if (reach_next_i->state == tnfa->final
476 && (match_eo == -1
477 || (num_tags > 0
478 && reach_next_i->tags[0] <= match_tags[0])))
479 {
480 DPRINT((" found match %p\n", trans_i->state));
481 match_eo = pos;
482 new_match = 1;
483 for (i = 0; i < num_tags; i++)
484 match_tags[i] = reach_next_i->tags[i];
485 }
486 reach_next_i++;
487
488 }
489 else
490 {
491 assert(reach_pos[trans_i->state_id].pos == pos);
492 /* Another path has also reached this state. We choose
493 the winner by examining the tag values for both
494 paths. */
495 if (tre_tag_order(num_tags, tnfa->tag_directions,
496 tmp_tags,
497 *reach_pos[trans_i->state_id].tags))
498 {
499 /* The new path wins. */
500 tmp_iptr = *reach_pos[trans_i->state_id].tags;
501 *reach_pos[trans_i->state_id].tags = tmp_tags;
502 if (trans_i->state == tnfa->final)
503 {
504 DPRINT((" found better match\n"));
505 match_eo = pos;
506 new_match = 1;
507 for (i = 0; i < num_tags; i++)
508 match_tags[i] = tmp_tags[i];
509 }
510 tmp_tags = tmp_iptr;
511 }
512 }
513 }
514 }
515 }
516 reach_next_i->state = NULL;
517 }
518
519 DPRINT(("match end offset = %d\n", match_eo));
520
521 #ifndef TRE_USE_ALLOCA
522 if (buf)
523 xfree(buf);
524 #endif /* !TRE_USE_ALLOCA */
525
526 *match_end_ofs = match_eo;
527 return match_eo >= 0 ? REG_OK : REG_NOMATCH;
528 }
529
530 /* EOF */
531