1 /*
2   tre-match-parallel.c - TRE parallel regex matching engine
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 /*
10   This algorithm searches for matches basically by reading characters
11   in the searched string one by one, starting at the beginning.	 All
12   matching paths in the TNFA are traversed in parallel.	 When two or
13   more paths reach the same state, exactly one is chosen according to
14   tag ordering rules; if returning submatches is not required it does
15   not matter which path is chosen.
16 
17   The worst case time required for finding the leftmost and longest
18   match, or determining that there is no match, is always linearly
19   dependent on the length of the text being searched.
20 
21   This algorithm cannot handle TNFAs with back referencing nodes.
22   See `tre-match-backtrack.c'.
23 */
24 
25 
26 #ifdef HAVE_CONFIG_H
27 #include <config.h>
28 #endif /* HAVE_CONFIG_H */
29 
30 /* Unset TRE_USE_ALLOCA to avoid using the stack to hold all the state
31    info while running */
32 #undef TRE_USE_ALLOCA
33 
34 #ifdef TRE_USE_ALLOCA
35 /* AIX requires this to be the first thing in the file.	 */
36 #ifndef __GNUC__
37 # if HAVE_ALLOCA_H
38 #  include <alloca.h>
39 # else
40 #  ifdef _AIX
41  #pragma alloca
42 #  else
43 #   ifndef alloca /* predefined by HP cc +Olibcalls */
44 char *alloca ();
45 #   endif
46 #  endif
47 # endif
48 #endif
49 #endif /* TRE_USE_ALLOCA */
50 
51 #include <assert.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #ifdef HAVE_WCHAR_H
55 #include <wchar.h>
56 #endif /* HAVE_WCHAR_H */
57 #ifdef HAVE_WCTYPE_H
58 #include <wctype.h>
59 #endif /* HAVE_WCTYPE_H */
60 #ifndef TRE_WCHAR
61 #include <ctype.h>
62 #endif /* !TRE_WCHAR */
63 #ifdef HAVE_MALLOC_H
64 #include <malloc.h>
65 #endif /* HAVE_MALLOC_H */
66 
67 #include "tre-internal.h"
68 #include "tre-match-utils.h"
69 #include "tre.h"
70 #include "xmalloc.h"
71 
72 
73 
74 typedef struct {
75   tre_tnfa_transition_t *state;
76   tre_tag_t *tags;
77 } tre_tnfa_reach_t;
78 
79 typedef struct {
80   int pos;
81   tre_tag_t **tags;
82 } tre_reach_pos_t;
83 
84 
85 #ifdef TRE_DEBUG
86 static void
87 tre_print_reach1(tre_tnfa_transition_t *state, tre_tag_t *tags, int num_tags)
88 {
89   DPRINT((" %p", (void *)state));
90   if (num_tags > 0)
91     {
92       DPRINT(("/"));
93       tre_print_tags(tags, num_tags);
94     }
95 }
96 
97 static void
98 tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
99 {
100   while (reach->state != NULL)
101     {
102       tre_print_reach1(reach->state, reach->tags, num_tags);
103       reach++;
104     }
105   DPRINT(("\n"));
106 
107 }
108 #endif /* TRE_DEBUG */
109 
110 reg_errcode_t
111 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
112 		      tre_str_type_t type, tre_tag_t *match_tags, int eflags,
113 		      int *match_end_ofs)
114 {
115   /* State variables required by GET_NEXT_WCHAR. */
116   tre_char_t prev_c = 0, next_c = 0;
117   const char *str_byte = string;
118   int pos = -1;
119   unsigned int pos_add_next = 1;
120 #ifdef TRE_WCHAR
121   const wchar_t *str_wide = string;
122 #ifdef TRE_MBSTATE
123   mbstate_t mbstate;
124 #endif /* TRE_MBSTATE */
125 #endif /* TRE_WCHAR */
126   int reg_notbol = eflags & REG_NOTBOL;
127   int reg_noteol = eflags & REG_NOTEOL;
128   int reg_newline = tnfa->cflags & REG_NEWLINE;
129 
130   char *buf;
131   tre_tnfa_transition_t *trans_i;
132   tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
133   tre_reach_pos_t *reach_pos;
134   int *tag_i;
135   int num_tags, i;
136 
137   int match_eo = -1;	   /* end offset of match (-1 if no match found yet) */
138 #ifdef TRE_DEBUG
139   int once;
140 #endif /* TRE_DEBUG */
141   tre_tag_t *tmp_tags = NULL;
142   tre_tag_t *tmp_iptr;
143   int tbytes;
144   int touch = 1;
145 
146 #ifdef TRE_MBSTATE
147   memset(&mbstate, '\0', sizeof(mbstate));
148 #endif /* TRE_MBSTATE */
149 
150   DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
151 
152   if (!match_tags)
153     num_tags = 0;
154   else
155     num_tags = tnfa->num_tags;
156 
157   /* Allocate memory for temporary data required for matching.	This needs to
158      be done for every matching operation to be thread safe.  This allocates
159      everything in a single large block from the stack frame using alloca()
160      or with malloc() if alloca is unavailable. */
161   {
162     int rbytes, pbytes, total_bytes;
163     char *tmp_buf;
164     /* Compute the length of the block we need. */
165     tbytes = sizeof(*tmp_tags) * num_tags;
166     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
167     pbytes = sizeof(*reach_pos) * tnfa->num_states;
168     total_bytes =
169       (sizeof(long) - 1) * 4 /* for alignment paddings */
170       + (rbytes + tbytes * tnfa->num_states) * 2 + tbytes + pbytes;
171 
172     DPRINT(("tre_tnfa_run_parallel, allocate %d bytes\n", total_bytes));
173     /* Allocate the memory. */
174 #ifdef TRE_USE_ALLOCA
175     buf = alloca(total_bytes);
176 #else /* !TRE_USE_ALLOCA */
177     buf = xmalloc((unsigned)total_bytes);
178 #endif /* !TRE_USE_ALLOCA */
179     if (buf == NULL)
180       return REG_ESPACE;
181     memset(buf, 0, (size_t)total_bytes);
182 
183     /* Get the various pointers within tmp_buf (properly aligned). */
184     tmp_tags = (void *)buf;
185     tmp_buf = buf + tbytes;
186     tmp_buf += ALIGN(tmp_buf, long);
187     reach_next = (void *)tmp_buf;
188     tmp_buf += rbytes;
189     tmp_buf += ALIGN(tmp_buf, long);
190     reach = (void *)tmp_buf;
191     tmp_buf += rbytes;
192     tmp_buf += ALIGN(tmp_buf, long);
193     reach_pos = (void *)tmp_buf;
194     tmp_buf += pbytes;
195     tmp_buf += ALIGN(tmp_buf, long);
196     for (i = 0; i < tnfa->num_states; i++)
197       {
198 	reach[i].tags = (void *)tmp_buf;
199 	tmp_buf += tbytes;
200 	reach_next[i].tags = (void *)tmp_buf;
201 	tmp_buf += tbytes;
202       }
203   }
204 
205   for (i = 0; i < tnfa->num_states; i++)
206     reach_pos[i].pos = -1;
207 
208   /* If only one character can start a match, find it first. */
209   if (tnfa->first_char >= 0 && str_byte)
210     {
211       const char *orig_str = str_byte;
212       int first = tnfa->first_char;
213       int found_high_bit = 0;
214 
215 
216       if (type == STR_BYTE)
217 	{
218 	  if (len >= 0)
219 	    str_byte = memchr(orig_str, first, (size_t)len);
220 	  else
221 	    str_byte = strchr(orig_str, first);
222 	}
223       else if (type == STR_MBS)
224 	{
225 	  /*
226 	   * If the match character is ASCII, try to match the character
227 	   * directly, but if a high bit character is found, we stop there.
228 	   */
229 	  if (first < 0x80)
230 	    {
231 	      if (len >= 0)
232 		{
233 		  int i;
234 		  for (i = 0; ; str_byte++, i++)
235 		    {
236 		      if (i >= len)
237 			{
238 			  str_byte = NULL;
239 			  break;
240 			}
241 		      if (*str_byte == first)
242 			break;
243 		      if (*str_byte & 0x80)
244 			{
245 			  found_high_bit = 1;
246 			  break;
247 			}
248 		    }
249 		}
250 	      else
251 		{
252 		  for (; ; str_byte++)
253 		    {
254 		      if (!*str_byte)
255 			{
256 			  str_byte = NULL;
257 			  break;
258 			}
259 		      if (*str_byte == first)
260 			break;
261 		      if (*str_byte & 0x80)
262 			{
263 			  found_high_bit = 1;
264 			  break;
265 			}
266 		    }
267 		}
268 	    }
269 	  else
270 	    {
271 	      if (len >= 0)
272 		{
273 		  int i;
274 		  for (i = 0; ; str_byte++, i++)
275 		    {
276 		      if (i >= len)
277 			{
278 			  str_byte = NULL;
279 			  break;
280 			}
281 		      if (*str_byte & 0x80)
282 			{
283 			  found_high_bit = 1;
284 			  break;
285 			}
286 		    }
287 		}
288 	      else
289 		{
290 		  for (; ; str_byte++)
291 		    {
292 		      if (!*str_byte)
293 			{
294 			  str_byte = NULL;
295 			  break;
296 			}
297 		      if (*str_byte & 0x80)
298 			{
299 			  found_high_bit = 1;
300 			  break;
301 			}
302 		    }
303 		}
304 	    }
305 	}
306       if (str_byte == NULL)
307 	{
308 #ifndef TRE_USE_ALLOCA
309 	  if (buf)
310 	    xfree(buf);
311 #endif /* !TRE_USE_ALLOCA */
312 	  return REG_NOMATCH;
313 	}
314       DPRINT(("skipped %lu chars\n", (unsigned long)(str_byte - orig_str)));
315       if (!found_high_bit)
316 	{
317 	  if (str_byte >= orig_str + 1)
318 	    prev_c = (unsigned char)*(str_byte - 1);
319 	  next_c = (unsigned char)*str_byte;
320 	  pos = str_byte - orig_str;
321 	  if (len < 0 || pos < len)
322 	    str_byte++;
323 	}
324       else
325 	{
326 	  if (str_byte == orig_str)
327 	    goto no_first_optimization;
328 	  /*
329 	   * Back up one character, fix up the position, then call
330 	   * GET_NEXT_WCHAR() to process the multibyte character.
331 	   */
332 	  /* no need to set prev_c, since GET_NEXT_WCHAR will overwrite */
333 	  next_c = (unsigned char)*(str_byte - 1);
334 	  pos = (str_byte - 1) - orig_str;
335 	  GET_NEXT_WCHAR();
336 	}
337     }
338   else
339     {
340 no_first_optimization:
341       GET_NEXT_WCHAR();
342       pos = 0;
343     }
344 
345   DPRINT(("length: %d\n", len));
346   DPRINT(("pos:chr/code | states and tags\n"));
347   DPRINT(("-------------+------------------------------------------------\n"));
348 
349   reach_next_i = reach_next;
350   while (/*CONSTCOND*/1)
351     {
352       /* If no match found yet, add the initial states to `reach_next'. */
353       if (match_eo < 0)
354 	{
355 	  DPRINT((" init >"));
356 	  trans_i = tnfa->initial;
357 	  while (trans_i->state != NULL)
358 	    {
359 	      if (reach_pos[trans_i->state_id].pos < pos)
360 		{
361 		  if (trans_i->assertions
362 		      && CHECK_ASSERTIONS(trans_i->assertions))
363 		    {
364 		      DPRINT(("assertion failed\n"));
365 		      trans_i++;
366 		      continue;
367 		    }
368 
369 		  DPRINT((" %p", (void *)trans_i->state));
370 		  reach_next_i->state = trans_i->state;
371 		  memset(reach_next_i->tags, 0, tbytes);
372 		  tag_i = trans_i->tags;
373 		  if (tag_i)
374 		    {
375 		      while (*tag_i >= 0)
376 			{
377 			  if (*tag_i < num_tags)
378 			    tre_tag_set(reach_next_i->tags, *tag_i, pos, touch);
379 			  tag_i++;
380 			}
381 			touch++;
382 		    }
383 		  if (reach_next_i->state == tnfa->final)
384 		    {
385 		      DPRINT(("	 found empty match\n"));
386 		      match_eo = pos;
387 		      memcpy(match_tags, reach_next_i->tags, tbytes);
388 		    }
389 		  reach_pos[trans_i->state_id].pos = pos;
390 		  reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
391 		  reach_next_i++;
392 		}
393 	      trans_i++;
394 	    }
395 	  DPRINT(("\n"));
396 	  reach_next_i->state = NULL;
397 	}
398       else
399 	{
400 	  if (num_tags == 0 || reach_next_i == reach_next)
401 	    /*�We have found a match. */
402 	    break;
403 	}
404 
405       /* Check for end of string. */
406       if (len < 0)
407 	{
408 	  if (next_c == L'\0')
409 	    break;
410 	}
411       else
412 	{
413 	  if (pos >= len)
414 	    break;
415 	}
416 
417       GET_NEXT_WCHAR();
418 
419 #ifdef TRE_DEBUG
420       DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
421       tre_print_reach(tnfa, reach_next, num_tags);
422 #endif /* TRE_DEBUG */
423 
424       /* Swap `reach' and `reach_next'. */
425       reach_i = reach;
426       reach = reach_next;
427       reach_next = reach_i;
428 
429 #ifdef TRE_DEBUG
430       once = 0;
431 #endif /* TRE_DEBUG */
432 
433       /* For each state in `reach' see if there is a transition leaving with
434 	 the current input symbol to a state not yet in `reach_next', and
435 	 add the destination states to `reach_next'. */
436       reach_next_i = reach_next;
437       for (reach_i = reach; reach_i->state; reach_i++)
438 	{
439 	  for (trans_i = reach_i->state; trans_i->state; trans_i++)
440 	    {
441 	      /* Does this transition match the input symbol? */
442 	      if (trans_i->code_min <= (tre_cint_t)prev_c &&
443 		  trans_i->code_max >= (tre_cint_t)prev_c)
444 		{
445 		  if (trans_i->assertions
446 		      && (CHECK_ASSERTIONS(trans_i->assertions)
447 			  || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
448 		    {
449 		      DPRINT(("assertion failed\n"));
450 		      continue;
451 		    }
452 
453 		  /* Compute the tags after this transition. */
454 		  memcpy(tmp_tags, reach_i->tags, tbytes);
455 		  tag_i = trans_i->tags;
456 		  if (tag_i != NULL)
457 		    {
458 		      while (*tag_i >= 0)
459 			{
460 			  if (*tag_i < num_tags)
461 			    tre_tag_set(tmp_tags, *tag_i, pos, touch);
462 			  tag_i++;
463 			}
464 			touch++;
465 		    }
466 
467 		  /* For each new transition, weed out those that don't
468 		     fulfill the minimal matching conditions. */
469 		  if (tnfa->num_minimals && match_eo >= 0)
470 		    {
471 		      int skip = 0;
472 #ifdef TRE_DEBUG
473 		      if (!once)
474 			{
475 			  DPRINT(("Checking minimal conditions: match_eo=%d "
476 				  "match_tags=", match_eo));
477 			  tre_print_tags(match_tags, num_tags);
478 			  DPRINT(("\n"));
479 			  once++;
480 			}
481 #endif /* TRE_DEBUG */
482 		      for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
483 			{
484 			   int end = tnfa->minimal_tags[i];
485 			   int start = tnfa->minimal_tags[i + 1];
486 			   DPRINT(("  Minimal start %d, end %d\n", start, end));
487 			   if (tre_minimal_tag_order(start, end, match_tags,
488 						     tmp_tags) > 0)
489 			     {
490 				skip = 1;
491 				break;
492 			     }
493 			}
494 		      if (skip)
495 			{
496 #ifdef TRE_DEBUG
497 			   DPRINT(("	 Throwing out"));
498 			   tre_print_reach1(reach_i->state, tmp_tags,
499 					     num_tags);
500 			   DPRINT(("\n"));
501 #endif /* TRE_DEBUG */
502 			   continue;
503 			}
504 		    }
505 
506 		  if (reach_pos[trans_i->state_id].pos < pos)
507 		    {
508 		      /* Found an unvisited node. */
509 		      reach_next_i->state = trans_i->state;
510 		      tmp_iptr = reach_next_i->tags;
511 		      reach_next_i->tags = tmp_tags;
512 		      tmp_tags = tmp_iptr;
513 		      reach_pos[trans_i->state_id].pos = pos;
514 		      reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
515 
516 		      if (reach_next_i->state == tnfa->final
517 			  && (match_eo == -1
518 			      || (num_tags > 0
519 				  && tre_tag_get(reach_next_i->tags, 0) <=
520 				  tre_tag_get(match_tags, 0))))
521 			{
522 #ifdef TRE_DEBUG
523 			  DPRINT(("  found match"));
524 			  tre_print_reach1(trans_i->state, reach_next_i->tags, num_tags);
525 			  DPRINT(("\n"));
526 #endif /* TRE_DEBUG */
527 			  match_eo = pos;
528 			  memcpy(match_tags, reach_next_i->tags, tbytes);
529 			}
530 		      reach_next_i++;
531 
532 		    }
533 		  else
534 		    {
535 		      assert(reach_pos[trans_i->state_id].pos == pos);
536 		      /* Another path has also reached this state.  We choose
537 			 the winner by examining the tag values for both
538 			 paths. */
539 		      if (tre_tag_order(num_tags, tnfa->tag_directions,
540 					tmp_tags,
541 					*reach_pos[trans_i->state_id].tags))
542 			{
543 			  /* The new path wins. */
544 			  tmp_iptr = *reach_pos[trans_i->state_id].tags;
545 			  *reach_pos[trans_i->state_id].tags = tmp_tags;
546 			  if (trans_i->state == tnfa->final)
547 			    {
548 #ifdef TRE_DEBUG
549 			      DPRINT(("	 found better match"));
550 			      tre_print_reach1(trans_i->state, tmp_tags, num_tags);
551 			      DPRINT(("\n"));
552 #endif /* TRE_DEBUG */
553 			      match_eo = pos;
554 			      memcpy(match_tags, tmp_tags, tbytes);
555 			    }
556 			  tmp_tags = tmp_iptr;
557 			}
558 		    }
559 		}
560 	    }
561 	}
562       reach_next_i->state = NULL;
563     }
564 
565   DPRINT(("match end offset = %d\n", match_eo));
566 
567   *match_end_ofs = match_eo;
568 #ifdef TRE_DEBUG
569   if (match_tags)
570     {
571       DPRINT(("Winning tags="));
572       tre_print_tags_all(match_tags, num_tags);
573       DPRINT((" touch=%d\n", touch));
574     }
575 #endif /* TRE_DEBUG */
576 
577 #ifndef TRE_USE_ALLOCA
578   if (buf)
579     xfree(buf);
580 #endif /* !TRE_USE_ALLOCA */
581 
582   return match_eo >= 0 ? REG_OK : REG_NOMATCH;
583 }
584 
585 /* EOF */
586