1 /*
2   tre-match-parallel.c - TRE parallel regex matching engine
3 
4   Copyright (c) 2001-2006 Ville Laurikari <vl@iki.fi>.
5 
6   This library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10 
11   This library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15 
16   You should have received a copy of the GNU Lesser General Public
17   License along with this library; if not, write to the Free Software
18   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19 
20 */
21 
22 /*
23   This algorithm searches for matches basically by reading characters
24   in the searched string one by one, starting at the beginning.	 All
25   matching paths in the TNFA are traversed in parallel.	 When two or
26   more paths reach the same state, exactly one is chosen according to
27   tag ordering rules; if returning submatches is not required it does
28   not matter which path is chosen.
29 
30   The worst case time required for finding the leftmost and longest
31   match, or determining that there is no match, is always linearly
32   dependent on the length of the text being searched.
33 
34   This algorithm cannot handle TNFAs with back referencing nodes.
35   See `tre-match-backtrack.c'.
36 */
37 
38 
39 #ifdef HAVE_CONFIG_H
40 #include <config.h>
41 #endif /* HAVE_CONFIG_H */
42 
43 #ifdef TRE_USE_ALLOCA
44 /* AIX requires this to be the first thing in the file.	 */
45 #ifndef __GNUC__
46 # if HAVE_ALLOCA_H
47 #  include <alloca.h>
48 # else
49 #  ifdef _AIX
50  #pragma alloca
51 #  else
52 #   ifndef alloca /* predefined by HP cc +Olibcalls */
53 char *alloca ();
54 #   endif
55 #  endif
56 # endif
57 #endif
58 #endif /* TRE_USE_ALLOCA */
59 
60 #include <assert.h>
61 #include <stdlib.h>
62 #include <string.h>
63 #ifdef HAVE_WCHAR_H
64 #include <wchar.h>
65 #endif /* HAVE_WCHAR_H */
66 #ifdef HAVE_WCTYPE_H
67 #include <wctype.h>
68 #endif /* HAVE_WCTYPE_H */
69 #ifndef TRE_WCHAR
70 #include <ctype.h>
71 #endif /* !TRE_WCHAR */
72 #ifdef HAVE_MALLOC_H
73 #include <malloc.h>
74 #endif /* HAVE_MALLOC_H */
75 
76 #include "tre-internal.h"
77 #include "tre-match-utils.h"
78 #include "regex.h"
79 #include "xmalloc.h"
80 
81 
82 
83 typedef struct {
84   tre_tnfa_transition_t *state;
85   int *tags;
86 } tre_tnfa_reach_t;
87 
88 typedef struct {
89   int pos;
90   int **tags;
91 } tre_reach_pos_t;
92 
93 
94 #ifdef TRE_DEBUG
95 static void
tre_print_reach(const tre_tnfa_t * tnfa,tre_tnfa_reach_t * reach,int num_tags)96 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
97 {
98   int i;
99 
100   while (reach->state != NULL)
101     {
102       DPRINT((" %p", (void *)reach->state));
103       if (num_tags > 0)
104 	{
105 	  DPRINT(("/"));
106 	  for (i = 0; i < num_tags; i++)
107 	    {
108 	      DPRINT(("%d:%d", i, reach->tags[i]));
109 	      if (i < (num_tags-1))
110 		DPRINT((","));
111 	    }
112 	}
113       reach++;
114     }
115   DPRINT(("\n"));
116 
117 }
118 #endif /* TRE_DEBUG */
119 
120 reg_errcode_t
tre_tnfa_run_parallel(const tre_tnfa_t * tnfa,const void * string,int len,tre_str_type_t type,int * match_tags,int eflags,int * match_end_ofs)121 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
122 		      tre_str_type_t type, int *match_tags, int eflags,
123 		      int *match_end_ofs)
124 {
125   /* State variables required by GET_NEXT_WCHAR. */
126   tre_char_t prev_c = 0, next_c = 0;
127   const char *str_byte = string;
128   int pos = -1;
129   unsigned int pos_add_next = 1;
130 #ifdef TRE_WCHAR
131   const wchar_t *str_wide = string;
132 #ifdef TRE_MBSTATE
133   mbstate_t mbstate;
134 #endif /* TRE_MBSTATE */
135 #endif /* TRE_WCHAR */
136   int reg_notbol = eflags & REG_NOTBOL;
137   int reg_noteol = eflags & REG_NOTEOL;
138   int reg_newline = tnfa->cflags & REG_NEWLINE;
139   int str_user_end = 0;
140 
141   char *buf;
142   tre_tnfa_transition_t *trans_i;
143   tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
144   tre_reach_pos_t *reach_pos;
145   int *tag_i;
146   int num_tags, i;
147 
148   int match_eo = -1;	   /* end offset of match (-1 if no match found yet) */
149   int new_match = 0;
150   int *tmp_tags = NULL;
151   int *tmp_iptr;
152 
153 #ifdef TRE_MBSTATE
154   memset(&mbstate, '\0', sizeof(mbstate));
155 #endif /* TRE_MBSTATE */
156 
157   DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
158 
159   if (!match_tags)
160     num_tags = 0;
161   else
162     num_tags = tnfa->num_tags;
163 
164   /* Allocate memory for temporary data required for matching.	This needs to
165      be done for every matching operation to be thread safe.  This allocates
166      everything in a single large block from the stack frame using alloca()
167      or with malloc() if alloca is unavailable. */
168   {
169     int tbytes, rbytes, pbytes, xbytes, total_bytes;
170     char *tmp_buf;
171     /* Compute the length of the block we need. */
172     tbytes = sizeof(*tmp_tags) * num_tags;
173     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
174     pbytes = sizeof(*reach_pos) * tnfa->num_states;
175     xbytes = sizeof(int) * num_tags;
176     total_bytes =
177       (sizeof(long) - 1) * 4 /* for alignment paddings */
178       + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
179 
180     /* Allocate the memory. */
181 #ifdef TRE_USE_ALLOCA
182     buf = alloca(total_bytes);
183 #else /* !TRE_USE_ALLOCA */
184     buf = xmalloc(total_bytes);
185 #endif /* !TRE_USE_ALLOCA */
186     if (buf == NULL)
187       return REG_ESPACE;
188     memset(buf, 0, total_bytes);
189 
190     /* Get the various pointers within tmp_buf (properly aligned). */
191     tmp_tags = (void *)buf;
192     tmp_buf = buf + tbytes;
193     tmp_buf += ALIGN(tmp_buf, long);
194     reach_next = (void *)tmp_buf;
195     tmp_buf += rbytes;
196     tmp_buf += ALIGN(tmp_buf, long);
197     reach = (void *)tmp_buf;
198     tmp_buf += rbytes;
199     tmp_buf += ALIGN(tmp_buf, long);
200     reach_pos = (void *)tmp_buf;
201     tmp_buf += pbytes;
202     tmp_buf += ALIGN(tmp_buf, long);
203     for (i = 0; i < tnfa->num_states; i++)
204       {
205 	reach[i].tags = (void *)tmp_buf;
206 	tmp_buf += xbytes;
207 	reach_next[i].tags = (void *)tmp_buf;
208 	tmp_buf += xbytes;
209       }
210   }
211 
212   for (i = 0; i < tnfa->num_states; i++)
213     reach_pos[i].pos = -1;
214 
215   /* If only one character can start a match, find it first. */
216   if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte)
217     {
218       const char *orig_str = str_byte;
219       int first = tnfa->first_char;
220 
221       if (len >= 0)
222 	str_byte = memchr(orig_str, first, len);
223       else
224 	str_byte = strchr(orig_str, first);
225       if (str_byte == NULL)
226 	{
227 #ifndef TRE_USE_ALLOCA
228 	  if (buf)
229 	    xfree(buf);
230 #endif /* !TRE_USE_ALLOCA */
231 	  return REG_NOMATCH;
232 	}
233       DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
234       if (str_byte >= orig_str + 1)
235 	prev_c = (unsigned char)*(str_byte - 1);
236       next_c = (unsigned char)*str_byte;
237       pos = str_byte - orig_str;
238       if (len < 0 || pos < len)
239 	str_byte++;
240     }
241   else
242     {
243       GET_NEXT_WCHAR();
244       pos = 0;
245     }
246 
247 #if 0
248   /* Skip over characters that cannot possibly be the first character
249      of a match. */
250   if (tnfa->firstpos_chars != NULL)
251     {
252       char *chars = tnfa->firstpos_chars;
253 
254       if (len < 0)
255 	{
256 	  const char *orig_str = str_byte;
257 	  /* XXX - use strpbrk() and wcspbrk() because they might be
258 	     optimized for the target architecture.  Try also strcspn()
259 	     and wcscspn() and compare the speeds. */
260 	  while (next_c != L'\0' && !chars[next_c])
261 	    {
262 	      next_c = *str_byte++;
263 	    }
264 	  prev_c = *(str_byte - 2);
265 	  pos += str_byte - orig_str;
266 	  DPRINT(("skipped %d chars\n", str_byte - orig_str));
267 	}
268       else
269 	{
270 	  while (pos <= len && !chars[next_c])
271 	    {
272 	      prev_c = next_c;
273 	      next_c = (unsigned char)(*str_byte++);
274 	      pos++;
275 	    }
276 	}
277     }
278 #endif
279 
280   DPRINT(("length: %d\n", len));
281   DPRINT(("pos:chr/code | states and tags\n"));
282   DPRINT(("-------------+------------------------------------------------\n"));
283 
284   reach_next_i = reach_next;
285   while (1)
286     {
287       /* If no match found yet, add the initial states to `reach_next'. */
288       if (match_eo < 0)
289 	{
290 	  DPRINT((" init >"));
291 	  trans_i = tnfa->initial;
292 	  while (trans_i->state != NULL)
293 	    {
294 	      if (reach_pos[trans_i->state_id].pos < pos)
295 		{
296 		  if (trans_i->assertions
297 		      && CHECK_ASSERTIONS(trans_i->assertions))
298 		    {
299 		      DPRINT(("assertion failed\n"));
300 		      trans_i++;
301 		      continue;
302 		    }
303 
304 		  DPRINT((" %p", (void *)trans_i->state));
305 		  reach_next_i->state = trans_i->state;
306 		  for (i = 0; i < num_tags; i++)
307 		    reach_next_i->tags[i] = -1;
308 		  tag_i = trans_i->tags;
309 		  if (tag_i)
310 		    while (*tag_i >= 0)
311 		      {
312 			if (*tag_i < num_tags)
313 			  reach_next_i->tags[*tag_i] = pos;
314 			tag_i++;
315 		      }
316 		  if (reach_next_i->state == tnfa->final)
317 		    {
318 		      DPRINT(("	 found empty match\n"));
319 		      match_eo = pos;
320 		      new_match = 1;
321 		      for (i = 0; i < num_tags; i++)
322 			match_tags[i] = reach_next_i->tags[i];
323 		    }
324 		  reach_pos[trans_i->state_id].pos = pos;
325 		  reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
326 		  reach_next_i++;
327 		}
328 	      trans_i++;
329 	    }
330 	  DPRINT(("\n"));
331 	  reach_next_i->state = NULL;
332 	}
333       else
334 	{
335 	  if (num_tags == 0 || reach_next_i == reach_next)
336 	    /*�We have found a match. */
337 	    break;
338 	}
339 
340       /* Check for end of string. */
341       if (len < 0)
342 	{
343 	  if (type == STR_USER)
344 	    {
345 	      if (str_user_end)
346 		break;
347 	    }
348 	  else if (next_c == L'\0')
349 	    break;
350 	}
351       else
352 	{
353 	  if (pos >= len)
354 	    break;
355 	}
356 
357       GET_NEXT_WCHAR();
358 
359 #ifdef TRE_DEBUG
360       DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
361       tre_print_reach(tnfa, reach_next, num_tags);
362       DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
363       tre_print_reach(tnfa, reach_next, num_tags);
364 #endif /* TRE_DEBUG */
365 
366       /* Swap `reach' and `reach_next'. */
367       reach_i = reach;
368       reach = reach_next;
369       reach_next = reach_i;
370 
371       /* For each state in `reach', weed out states that don't fulfill the
372 	 minimal matching conditions. */
373       if (tnfa->num_minimals && new_match)
374 	{
375 	  new_match = 0;
376 	  reach_next_i = reach_next;
377 	  for (reach_i = reach; reach_i->state; reach_i++)
378 	    {
379 	      int i;
380 	      int skip = 0;
381 	      for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
382 		{
383 		  int end = tnfa->minimal_tags[i];
384 		  int start = tnfa->minimal_tags[i + 1];
385 		  DPRINT(("  Minimal start %d, end %d\n", start, end));
386 		  if (end >= num_tags)
387 		    {
388 		      DPRINT(("	 Throwing %p out.\n", reach_i->state));
389 		      skip = 1;
390 		      break;
391 		    }
392 		  else if (reach_i->tags[start] == match_tags[start]
393 			   && reach_i->tags[end] < match_tags[end])
394 		    {
395 		      DPRINT(("	 Throwing %p out because t%d < %d\n",
396 			      reach_i->state, end, match_tags[end]));
397 		      skip = 1;
398 		      break;
399 		    }
400 		}
401 	      if (!skip)
402 		{
403 		  int *tmp_iptr;
404 		  reach_next_i->state = reach_i->state;
405 		  tmp_iptr = reach_next_i->tags;
406 		  reach_next_i->tags = reach_i->tags;
407 		  reach_i->tags = tmp_iptr;
408 		  reach_next_i++;
409 		}
410 	    }
411 	  reach_next_i->state = NULL;
412 
413 	  /* Swap `reach' and `reach_next'. */
414 	  reach_i = reach;
415 	  reach = reach_next;
416 	  reach_next = reach_i;
417 	}
418 
419       /* For each state in `reach' see if there is a transition leaving with
420 	 the current input symbol to a state not yet in `reach_next', and
421 	 add the destination states to `reach_next'. */
422       reach_next_i = reach_next;
423       for (reach_i = reach; reach_i->state; reach_i++)
424 	{
425 	  for (trans_i = reach_i->state; trans_i->state; trans_i++)
426 	    {
427 	      /* Does this transition match the input symbol? */
428 	      if (trans_i->code_min <= prev_c &&
429 		  trans_i->code_max >= prev_c)
430 		{
431 		  if (trans_i->assertions
432 		      && (CHECK_ASSERTIONS(trans_i->assertions)
433 			  /* Handle character class transitions. */
434 			  || ((trans_i->assertions & ASSERT_CHAR_CLASS)
435 			      && !(tnfa->cflags & REG_ICASE)
436 			      && !tre_isctype((tre_cint_t)prev_c,
437 					      trans_i->u.class))
438 			  || ((trans_i->assertions & ASSERT_CHAR_CLASS)
439 			      && (tnfa->cflags & REG_ICASE)
440 			      && (!tre_isctype(tre_tolower((tre_cint_t)prev_c),
441 					       trans_i->u.class)
442 				  && !tre_isctype(tre_toupper((tre_cint_t)prev_c),
443 						  trans_i->u.class)))
444 			  || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)
445 			      && tre_neg_char_classes_match(trans_i->neg_classes,
446 							    (tre_cint_t)prev_c,
447 							    tnfa->cflags & REG_ICASE))))
448 		    {
449 		      DPRINT(("assertion failed\n"));
450 		      continue;
451 		    }
452 
453 		  /* Compute the tags after this transition. */
454 		  for (i = 0; i < num_tags; i++)
455 		    tmp_tags[i] = reach_i->tags[i];
456 		  tag_i = trans_i->tags;
457 		  if (tag_i != NULL)
458 		    while (*tag_i >= 0)
459 		      {
460 			if (*tag_i < num_tags)
461 			  tmp_tags[*tag_i] = pos;
462 			tag_i++;
463 		      }
464 
465 		  if (reach_pos[trans_i->state_id].pos < pos)
466 		    {
467 		      /* Found an unvisited node. */
468 		      reach_next_i->state = trans_i->state;
469 		      tmp_iptr = reach_next_i->tags;
470 		      reach_next_i->tags = tmp_tags;
471 		      tmp_tags = tmp_iptr;
472 		      reach_pos[trans_i->state_id].pos = pos;
473 		      reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
474 
475 		      if (reach_next_i->state == tnfa->final
476 			  && (match_eo == -1
477 			      || (num_tags > 0
478 				  && reach_next_i->tags[0] <= match_tags[0])))
479 			{
480 			  DPRINT(("  found match %p\n", trans_i->state));
481 			  match_eo = pos;
482 			  new_match = 1;
483 			  for (i = 0; i < num_tags; i++)
484 			    match_tags[i] = reach_next_i->tags[i];
485 			}
486 		      reach_next_i++;
487 
488 		    }
489 		  else
490 		    {
491 		      assert(reach_pos[trans_i->state_id].pos == pos);
492 		      /* Another path has also reached this state.  We choose
493 			 the winner by examining the tag values for both
494 			 paths. */
495 		      if (tre_tag_order(num_tags, tnfa->tag_directions,
496 					tmp_tags,
497 					*reach_pos[trans_i->state_id].tags))
498 			{
499 			  /* The new path wins. */
500 			  tmp_iptr = *reach_pos[trans_i->state_id].tags;
501 			  *reach_pos[trans_i->state_id].tags = tmp_tags;
502 			  if (trans_i->state == tnfa->final)
503 			    {
504 			      DPRINT(("	 found better match\n"));
505 			      match_eo = pos;
506 			      new_match = 1;
507 			      for (i = 0; i < num_tags; i++)
508 				match_tags[i] = tmp_tags[i];
509 			    }
510 			  tmp_tags = tmp_iptr;
511 			}
512 		    }
513 		}
514 	    }
515 	}
516       reach_next_i->state = NULL;
517     }
518 
519   DPRINT(("match end offset = %d\n", match_eo));
520 
521 #ifndef TRE_USE_ALLOCA
522   if (buf)
523     xfree(buf);
524 #endif /* !TRE_USE_ALLOCA */
525 
526   *match_end_ofs = match_eo;
527   return match_eo >= 0 ? REG_OK : REG_NOMATCH;
528 }
529 
530 /* EOF */
531