1 /*
2 * Stack-less Just-In-Time compiler
3 *
4 * Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without modification, are
7 * permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this list of
10 * conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice, this list
13 * of conditions and the following disclaimer in the documentation and/or other materials
14 * provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
17 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
19 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
24 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "sljitLir.h"
28 #include "regexJIT.h"
29
30 #include <stdlib.h>
31
32 #ifdef REGEX_MATCH_VERBOSE
33 #include <stdio.h>
34 #endif
35
36 /* Extra, hidden flags:
37 {id!} where id > 0 found in the code. */
38 #define REGEX_ID_CHECK 0x100
39 /* When REGEX_NEWLINE && REGEX_MATCH_BEGIN defined, the pattern turn to a normal search,
40 which starts with [\r\n] character range. */
41 #define REGEX_FAKE_MATCH_BEGIN 0x200
42 /* When REGEX_NEWLINE && REGEX_MATCH_END defined, the pattern turn to a normal search,
43 which ends with [\r\n] character range. */
44 #define REGEX_FAKE_MATCH_END 0x400
45
46 /* Check match completition after every (FINISH_TEST + 1) steps. */
47 #define FINISH_TEST 0x7
48
49 /* --------------------------------------------------------------------- */
50 /* Structures for JIT-ed pattern matching */
51 /* --------------------------------------------------------------------- */
52
53 struct regex_machine
54 {
55 /* flags. */
56 int flags;
57 /* Number of state descriptors for one term. */
58 sljit_sw no_states;
59 /* Total size. */
60 sljit_sw size;
61
62 union {
63 void *init_match;
64 sljit_sw (SLJIT_CALL *call_init)(void *next, void* match);
65 } u;
66 #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL)
67 struct sljit_function_context context;
68 #endif
69
70 void *continue_match;
71
72 /* Variable sized array to contain the handler addresses. */
73 sljit_uw entry_addrs[1];
74 };
75
76 struct regex_match
77 {
78 /* Current and next state array. */
79 sljit_sw *current;
80 sljit_sw *next;
81 /* Starting. */
82 sljit_sw head;
83 /* String character index (ever increasing). */
84 sljit_sw index;
85 /* Best match found so far (members in priority order). */
86 sljit_sw best_begin;
87 sljit_sw best_end;
88 sljit_sw best_id;
89 /* Bool flags (encoded as word). */
90 sljit_sw fast_quit;
91 sljit_sw fast_forward;
92 /* Machine. */
93 struct regex_machine *machine;
94
95 union {
96 void *continue_match;
97 void (SLJIT_CALL *call_continue)(struct regex_match *match, const regex_char_t *input_string, int length);
98 } u;
99
100 /* Variable sized array to contain the state arrays. */
101 sljit_sw states[1];
102 };
103
104 /* State vector
105 ITEM[0] - pointer to the address inside the machine code
106 ITEM[1] - next pointer
107 ITEM[2] - string started from (optional)
108 ITEM[3] - max ID (optional) */
109
110 /* Register allocation. */
111 /* Current state array (loaded & stored: regex_match->current). */
112 #define R_CURR_STATE SLJIT_S0
113 /* Next state array (loaded & stored: regex_match->next). */
114 #define R_NEXT_STATE SLJIT_S1
115 /* Head (loaded & stored: regex_match->head). */
116 #define R_NEXT_HEAD SLJIT_S2
117 /* String fragment pointer. */
118 #define R_STRING SLJIT_S3
119 /* String fragment length. */
120 #define R_LENGTH SLJIT_S4
121 /* 'struct regex_match*' */
122 #define R_REGEX_MATCH SLJIT_R0
123 /* Current character. */
124 #define R_CURR_CHAR SLJIT_R1
125 /* Temporary register. */
126 #define R_TEMP SLJIT_R2
127 /* Caches the regex_match->best_begin. */
128 #define R_BEST_BEGIN SLJIT_R3
129 /* Current character index. */
130 #define R_CURR_INDEX SLJIT_R4
131
132 /* --------------------------------------------------------------------- */
133 /* Stack management */
134 /* --------------------------------------------------------------------- */
135
136 /* Try to allocate 2^n blocks. */
137 #define STACK_FRAGMENT_SIZE (((64 * sizeof(struct stack_item)) - (sizeof(struct stack_fragment_data))) / (sizeof(struct stack_item)))
138
139 struct stack_item {
140 int type;
141 int value;
142 };
143
144 struct stack_fragment_data {
145 struct stack_fragment *next;
146 struct stack_fragment *prev;
147 };
148
149 struct stack_fragment {
150 struct stack_fragment_data data;
151 struct stack_item items[STACK_FRAGMENT_SIZE];
152 };
153
154 struct stack {
155 struct stack_fragment *first;
156 struct stack_fragment *last;
157 int index;
158 int count;
159 };
160
161 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
162
stack_check(struct stack * stack)163 static void stack_check(struct stack *stack)
164 {
165 struct stack_fragment *curr;
166 int found;
167
168 if (!stack)
169 return;
170
171 SLJIT_ASSERT(stack->index >= 0 && stack->index < STACK_FRAGMENT_SIZE);
172
173 if (stack->first == NULL) {
174 SLJIT_ASSERT(stack->first == NULL && stack->last == NULL);
175 SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
176 return;
177 }
178
179 found = 0;
180 if (stack->last == NULL) {
181 SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
182 found = 1;
183 }
184 else
185 SLJIT_ASSERT(stack->index >= 0 && stack->count >= 0);
186
187 SLJIT_ASSERT(stack->first->data.prev == NULL);
188 curr = stack->first;
189 while (curr) {
190 if (curr == stack->last)
191 found = 1;
192 if (curr->data.next)
193 SLJIT_ASSERT(curr->data.next->data.prev == curr);
194 curr = curr->data.next;
195 }
196 SLJIT_ASSERT(found);
197 }
198
199 #endif
200
stack_init(struct stack * stack)201 static void stack_init(struct stack *stack)
202 {
203 stack->first = NULL;
204 stack->last = NULL;
205 stack->index = STACK_FRAGMENT_SIZE - 1;
206 stack->count = 0;
207 }
208
stack_destroy(struct stack * stack)209 static void stack_destroy(struct stack *stack)
210 {
211 struct stack_fragment *curr = stack->first;
212 struct stack_fragment *prev;
213
214 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
215 stack_check(stack);
216 #endif
217
218 while (curr) {
219 prev = curr;
220 curr = curr->data.next;
221 SLJIT_FREE(prev, NULL);
222 }
223 }
224
stack_top(struct stack * stack)225 static SLJIT_INLINE struct stack_item* stack_top(struct stack *stack)
226 {
227 SLJIT_ASSERT(stack->last);
228 return stack->last->items + stack->index;
229 }
230
stack_push(struct stack * stack,int type,int value)231 static int stack_push(struct stack *stack, int type, int value)
232 {
233 if (stack->last) {
234 stack->index++;
235 if (stack->index >= STACK_FRAGMENT_SIZE) {
236 stack->index = 0;
237 if (!stack->last->data.next) {
238 stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
239 if (!stack->last->data.next)
240 return 1;
241 stack->last->data.next->data.next = NULL;
242 stack->last->data.next->data.prev = stack->last;
243 }
244 stack->last = stack->last->data.next;
245 }
246 }
247 else if (!stack->first) {
248 stack->last = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
249 if (!stack->last)
250 return 1;
251 stack->last->data.prev = NULL;
252 stack->last->data.next = NULL;
253 stack->first = stack->last;
254 stack->index = 0;
255 }
256 else {
257 stack->last = stack->first;
258 stack->index = 0;
259 }
260 stack->last->items[stack->index].type = type;
261 stack->last->items[stack->index].value = value;
262 stack->count++;
263 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
264 stack_check(stack);
265 #endif
266 return 0;
267 }
268
stack_pop(struct stack * stack)269 static struct stack_item* stack_pop(struct stack *stack)
270 {
271 struct stack_item *ret = stack_top(stack);
272
273 if (stack->index > 0)
274 stack->index--;
275 else {
276 stack->last = stack->last->data.prev;
277 stack->index = STACK_FRAGMENT_SIZE - 1;
278 }
279
280 stack->count--;
281 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
282 stack_check(stack);
283 #endif
284 return ret;
285 }
286
stack_clone(struct stack * src,struct stack * dst)287 static SLJIT_INLINE void stack_clone(struct stack *src, struct stack *dst)
288 {
289 *dst = *src;
290 }
291
stack_push_copy(struct stack * stack,int items,int length)292 static int stack_push_copy(struct stack *stack, int items, int length)
293 {
294 struct stack_fragment *frag1;
295 int ind1;
296 struct stack_fragment *frag2;
297 int ind2;
298 int counter;
299
300 SLJIT_ASSERT(stack->count >= length && items <= length && items > 0);
301
302 /* Allocate the necessary elements. */
303 counter = items;
304 frag1 = stack->last;
305 ind1 = stack->index;
306 while (counter > 0) {
307 if (stack->index + counter >= STACK_FRAGMENT_SIZE) {
308 counter -= STACK_FRAGMENT_SIZE - stack->index - 1 + 1;
309 stack->index = 0;
310 if (!stack->last->data.next) {
311 stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
312 if (!stack->last->data.next)
313 return 1;
314 stack->last->data.next->data.next = NULL;
315 stack->last->data.next->data.prev = stack->last;
316 }
317 stack->last = stack->last->data.next;
318 }
319 else {
320 stack->index += counter;
321 counter = 0;
322 }
323 }
324
325 frag2 = stack->last;
326 ind2 = stack->index;
327 while (length > 0) {
328 frag2->items[ind2--] = frag1->items[ind1--];
329 if (ind1 < 0) {
330 ind1 = STACK_FRAGMENT_SIZE - 1;
331 frag1 = frag1->data.prev;
332 }
333 if (ind2 < 0) {
334 ind2 = STACK_FRAGMENT_SIZE - 1;
335 frag2 = frag2->data.prev;
336 }
337 length--;
338 }
339
340 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
341 stack_check(stack);
342 #endif
343 stack->count += items;
344 return 0;
345 }
346
347 /* --------------------------------------------------------------------- */
348 /* Parser */
349 /* --------------------------------------------------------------------- */
350
351 enum {
352 /* Common. */
353 type_begin,
354 type_end,
355 type_char,
356 type_newline,
357 type_id,
358 type_rng_start,
359 type_rng_end,
360 type_rng_char,
361 type_rng_left,
362 type_rng_right,
363
364 /* generator only. */
365 type_branch,
366 type_jump,
367
368 /* Parser only. */
369 type_open_br,
370 type_close_br,
371 type_select,
372 type_asterisk,
373 type_plus_sign,
374 type_qestion_mark
375 };
376
377 struct compiler_common {
378 /* Temporary stacks. */
379 struct stack stack;
380 struct stack depth;
381 /* REGEX_ flags. */
382 int flags;
383 /* Encoded size of the dfa representation. */
384 sljit_sw dfa_size;
385 /* Number of terms. */
386 sljit_sw terms_size;
387 /* Number of state descriptors for one term (same as machine->no_states). */
388 sljit_sw no_states;
389 /* Number of type_rng_(char|left)-s in the longest character range. */
390 sljit_sw longest_range_size;
391
392 /* DFA linear representation (size: dfa_size). */
393 struct stack_item *dfa_transitions;
394 /* Term id and search state pairs (size: dfa_size). */
395 struct stack_item *search_states;
396
397 /* sljit compiler */
398 struct sljit_compiler *compiler;
399 /* Machine data, which must be kept for later use. */
400 struct regex_machine *machine;
401 /* Temporary space for jumps (size: longest_range_size). */
402 struct sljit_jump **range_jump_list;
403 };
404
decode_number(const regex_char_t * regex_string,int length,int * result)405 static const regex_char_t* decode_number(const regex_char_t *regex_string, int length, int *result)
406 {
407 int value = 0;
408
409 SLJIT_ASSERT(length > 0);
410 if (*regex_string < '0' || *regex_string > '9') {
411 *result = -1;
412 return regex_string;
413 }
414
415 while (length > 0 && *regex_string >= '0' && *regex_string <= '9') {
416 value = value * 10 + (*regex_string - '0');
417 length--;
418 regex_string++;
419 }
420
421 *result = value;
422 return regex_string;
423 }
424
iterate(struct stack * stack,int min,int max)425 static int iterate(struct stack *stack, int min, int max)
426 {
427 struct stack it;
428 struct stack_item *item;
429 int count = -1;
430 int len = 0;
431 int depth = 0;
432
433 stack_clone(stack, &it);
434
435 /* Calculate size. */
436 while (count < 0) {
437 item = stack_pop(&it);
438 switch (item->type) {
439 case type_id:
440 case type_rng_end:
441 case type_rng_char:
442 case type_rng_left:
443 case type_rng_right:
444 case type_plus_sign:
445 case type_qestion_mark:
446 len++;
447 break;
448
449 case type_asterisk:
450 len += 2;
451 break;
452
453 case type_close_br:
454 depth++;
455 break;
456
457 case type_open_br:
458 SLJIT_ASSERT(depth > 0);
459 depth--;
460 if (depth == 0)
461 count = it.count;
462 break;
463
464 case type_select:
465 SLJIT_ASSERT(depth > 0);
466 len += 2;
467 break;
468
469 default:
470 SLJIT_ASSERT(item->type != type_begin && item->type != type_end);
471 if (depth == 0)
472 count = it.count;
473 len++;
474 break;
475 }
476 }
477
478 if (min == 0 && max == 0) {
479 /* {0,0} case, not {0,} case: delete subtree. */
480 stack_clone(&it, stack);
481 /* and put an empty bracket expression instead of it. */
482 if (stack_push(stack, type_open_br, 0))
483 return REGEX_MEMORY_ERROR;
484 if (stack_push(stack, type_close_br, 0))
485 return REGEX_MEMORY_ERROR;
486 return len;
487 }
488
489 count = stack->count - count;
490
491 /* Put an open bracket before the sequence. */
492 if (stack_push_copy(stack, 1, count))
493 return -1;
494
495 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
496 SLJIT_ASSERT(stack_push(&it, type_open_br, 0) == 0);
497 #else
498 stack_push(&it, type_open_br, 0);
499 #endif
500
501 /* Copy the data. */
502 if (max > 0) {
503 len = len * (max - 1);
504 max -= min;
505 /* Insert ? operators. */
506 len += max;
507
508 if (min > 0) {
509 min--;
510 while (min > 0) {
511 if (stack_push_copy(stack, count, count))
512 return -1;
513 min--;
514 }
515 if (max > 0) {
516 if (stack_push_copy(stack, count, count))
517 return -1;
518 if (stack_push(stack, type_qestion_mark, 0))
519 return REGEX_MEMORY_ERROR;
520 count++;
521 max--;
522 }
523 }
524 else {
525 SLJIT_ASSERT(max > 0);
526 max--;
527 count++;
528 if (stack_push(stack, type_qestion_mark, 0))
529 return REGEX_MEMORY_ERROR;
530 }
531
532 while (max > 0) {
533 if (stack_push_copy(stack, count, count))
534 return -1;
535 max--;
536 }
537 }
538 else {
539 SLJIT_ASSERT(min > 0);
540 min--;
541 /* Insert + operator. */
542 len = len * min + 1;
543 while (min > 0) {
544 if (stack_push_copy(stack, count, count))
545 return -1;
546 min--;
547 }
548
549 if (stack_push(stack, type_plus_sign, 0))
550 return REGEX_MEMORY_ERROR;
551 }
552
553 /* Close the opened bracket. */
554 if (stack_push(stack, type_close_br, 0))
555 return REGEX_MEMORY_ERROR;
556
557 return len;
558 }
559
parse_iterator(const regex_char_t * regex_string,int length,struct stack * stack,sljit_sw * dfa_size,int begin)560 static int parse_iterator(const regex_char_t *regex_string, int length, struct stack *stack, sljit_sw *dfa_size, int begin)
561 {
562 /* We only know that *regex_string == { . */
563 int val1, val2;
564 const regex_char_t *base_from = regex_string;
565 const regex_char_t *from;
566
567 length--;
568 regex_string++;
569
570 /* Decode left value. */
571 val2 = -1;
572 if (length == 0)
573 return -2;
574 if (*regex_string == ',') {
575 val1 = 0;
576 length--;
577 regex_string++;
578 }
579 else {
580 from = regex_string;
581 regex_string = decode_number(regex_string, length, &val1);
582 if (val1 < 0)
583 return -2;
584 length -= regex_string - from;
585
586 if (length == 0)
587 return -2;
588 if (*regex_string == '}') {
589 val2 = val1;
590 if (val1 == 0)
591 val1 = -1;
592 }
593 else if (length >= 2 && *regex_string == '!' && regex_string[1] == '}') {
594 /* Non posix extension. */
595 if (stack_push(stack, type_id, val1))
596 return -1;
597 (*dfa_size)++;
598 return (regex_string - base_from) + 1;
599 }
600 else {
601 if (*regex_string != ',')
602 return -2;
603 length--;
604 regex_string++;
605 }
606 }
607
608 if (begin)
609 return -2;
610
611 /* Decode right value. */
612 if (val2 == -1) {
613 if (length == 0)
614 return -2;
615 if (*regex_string == '}')
616 val2 = 0;
617 else {
618 from = regex_string;
619 regex_string = decode_number(regex_string, length, &val2);
620 length -= regex_string - from;
621 if (val2 < 0 || length == 0 || *regex_string != '}' || val2 < val1)
622 return -2;
623 if (val2 == 0) {
624 SLJIT_ASSERT(val1 == 0);
625 val1 = -1;
626 }
627 }
628 }
629
630 /* Fast cases. */
631 if (val1 > 1 || val2 > 1) {
632 val1 = iterate(stack, val1, val2);
633 if (val1 < 0)
634 return -1;
635 *dfa_size += val1;
636 }
637 else if (val1 == 0 && val2 == 0) {
638 if (stack_push(stack, type_asterisk, 0))
639 return -1;
640 *dfa_size += 2;
641 }
642 else if (val1 == 1 && val2 == 0) {
643 if (stack_push(stack, type_plus_sign, 0))
644 return -1;
645 (*dfa_size)++;
646 }
647 else if (val1 == 0 && val2 == 1) {
648 if (stack_push(stack, type_qestion_mark, 0))
649 return -1;
650 (*dfa_size)++;
651 }
652 else if (val1 == -1) {
653 val1 = iterate(stack, 0, 0);
654 if (val1 < 0)
655 return -1;
656 *dfa_size -= val1;
657 SLJIT_ASSERT(*dfa_size >= 2);
658 }
659 else {
660 /* Ignore. */
661 SLJIT_ASSERT(val1 == 1 && val2 == 1);
662 }
663 return regex_string - base_from;
664 }
665
parse_char_range(const regex_char_t * regex_string,int length,struct compiler_common * compiler_common)666 static int parse_char_range(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
667 {
668 struct stack* stack = &compiler_common->stack;
669 const regex_char_t *base_from = regex_string;
670 regex_char_t left_char, right_char, tmp_char;
671
672 length--;
673 regex_string++;
674
675 if (length == 0)
676 return -2;
677
678 if (*regex_string != '^') {
679 if (stack_push(stack, type_rng_start, 0))
680 return -1;
681 }
682 else {
683 length--;
684 regex_string++;
685
686 if (length == 0)
687 return -2;
688
689 if (stack_push(stack, type_rng_start, 1))
690 return -1;
691 }
692 /* For both the type_rng_start & type_rng_end. */
693 compiler_common->dfa_size += 2;
694
695 /* Range must be at least 1 character. */
696 if (*regex_string == ']') {
697 length--;
698 regex_string++;
699 if (stack_push(stack, type_rng_char, ']'))
700 return -1;
701 compiler_common->dfa_size++;
702 }
703
704 while (1) {
705 if (length == 0)
706 return -2;
707
708 if (*regex_string == ']')
709 break;
710
711 if (*regex_string != '\\')
712 left_char = *regex_string;
713 else {
714 regex_string++;
715 length--;
716 if (length == 0)
717 return -2;
718 left_char = *regex_string;
719 }
720 regex_string++;
721 length--;
722
723 /* Is a range here? */
724 if (length >= 3 && *regex_string == '-' && *(regex_string + 1) != ']') {
725 regex_string++;
726 length--;
727
728 if (*regex_string != '\\')
729 right_char = *regex_string;
730 else {
731 regex_string++;
732 length--;
733 if (length == 0)
734 return -2;
735 right_char = *regex_string;
736 }
737 regex_string++;
738 length--;
739
740 if (left_char > right_char) {
741 /* Swap if necessary. */
742 tmp_char = left_char;
743 left_char = right_char;
744 right_char = tmp_char;
745 }
746
747 if (stack_push(stack, type_rng_left, left_char))
748 return -1;
749 if (stack_push(stack, type_rng_right, right_char))
750 return -1;
751 compiler_common->dfa_size += 2;
752 }
753 else {
754 if (stack_push(stack, type_rng_char, left_char))
755 return -1;
756 compiler_common->dfa_size++;
757 }
758 }
759
760 if (stack_push(stack, type_rng_end, 0))
761 return -1;
762 return regex_string - base_from;
763 }
764
parse(const regex_char_t * regex_string,int length,struct compiler_common * compiler_common)765 static int parse(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
766 {
767 /* Depth of bracketed expressions. */
768 int depth = 0;
769 /* Have we already found a term? '1' if not yet. */
770 int begin = 1;
771 /* Cache stack pointer. */
772 struct stack* stack = &compiler_common->stack;
773 int tmp;
774
775 /* Type_begin and type_end. */
776 compiler_common->dfa_size = 2;
777 stack_init(stack);
778 if (stack_push(stack, type_begin, 0))
779 return REGEX_MEMORY_ERROR;
780
781 if (length > 0 && *regex_string == '^') {
782 compiler_common->flags |= REGEX_MATCH_BEGIN;
783 length--;
784 regex_string++;
785 }
786
787 if ((compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) == (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) {
788 /* Replace REGEX_MATCH_BEGIN flag to REGEX_FAKE_MATCH_BEGIN */
789 compiler_common->flags &= ~REGEX_MATCH_BEGIN;
790 compiler_common->flags |= REGEX_FAKE_MATCH_BEGIN;
791 /* and append a new-line search. */
792 if (stack_push(stack, type_newline, 0))
793 return REGEX_MEMORY_ERROR;
794 compiler_common->dfa_size++;
795 /* Begin intentionally kept as 1. */
796 }
797
798 while (length > 0) {
799 switch (*regex_string) {
800 case '\\' :
801 length--;
802 regex_string++;
803 if (length == 0)
804 return REGEX_INVALID_REGEX;
805 if (stack_push(stack, type_char, *regex_string))
806 return REGEX_MEMORY_ERROR;
807 begin = 0;
808 compiler_common->dfa_size++;
809 break;
810
811 case '.' :
812 if (stack_push(stack, type_rng_start, 1))
813 return REGEX_MEMORY_ERROR;
814 if (compiler_common->flags & REGEX_NEWLINE) {
815 if (stack_push(stack, type_rng_char, '\n'))
816 return REGEX_MEMORY_ERROR;
817 if (stack_push(stack, type_rng_char, '\r'))
818 return REGEX_MEMORY_ERROR;
819 compiler_common->dfa_size += 2;
820 }
821 if (stack_push(stack, type_rng_end, 1))
822 return REGEX_MEMORY_ERROR;
823 begin = 0;
824 compiler_common->dfa_size += 2;
825 break;
826
827 case '(' :
828 depth++;
829 if (stack_push(stack, type_open_br, 0))
830 return REGEX_MEMORY_ERROR;
831 begin = 1;
832 break;
833
834 case ')' :
835 if (depth == 0)
836 return REGEX_INVALID_REGEX;
837 depth--;
838 if (stack_push(stack, type_close_br, 0))
839 return REGEX_MEMORY_ERROR;
840 begin = 0;
841 break;
842
843 case '|' :
844 if (stack_push(stack, type_select, 0))
845 return REGEX_MEMORY_ERROR;
846 begin = 1;
847 compiler_common->dfa_size += 2;
848 break;
849
850 case '*' :
851 if (begin)
852 return REGEX_INVALID_REGEX;
853 if (stack_push(stack, type_asterisk, 0))
854 return REGEX_MEMORY_ERROR;
855 compiler_common->dfa_size += 2;
856 break;
857
858 case '?' :
859 case '+' :
860 if (begin)
861 return REGEX_INVALID_REGEX;
862 if (stack_push(stack, (*regex_string == '+') ? type_plus_sign : type_qestion_mark, 0))
863 return REGEX_MEMORY_ERROR;
864 compiler_common->dfa_size++;
865 break;
866
867 case '{' :
868 tmp = parse_iterator(regex_string, length, stack, &compiler_common->dfa_size, begin);
869
870 if (tmp >= 0) {
871 length -= tmp;
872 regex_string += tmp;
873 }
874 else if (tmp == -1)
875 return REGEX_MEMORY_ERROR;
876 else {
877 /* Not a valid range expression. */
878 SLJIT_ASSERT(tmp == -2);
879 if (stack_push(stack, type_char, '{'))
880 return REGEX_MEMORY_ERROR;
881 compiler_common->dfa_size++;
882 }
883 break;
884
885 case '[' :
886 tmp = parse_char_range(regex_string, length, compiler_common);
887 if (tmp >= 0) {
888 length -= tmp;
889 regex_string += tmp;
890 }
891 else if (tmp == -1)
892 return REGEX_MEMORY_ERROR;
893 else {
894 SLJIT_ASSERT(tmp == -2);
895 return REGEX_INVALID_REGEX;
896 }
897 begin = 0;
898 break;
899
900 default:
901 if (length == 1 && *regex_string == '$') {
902 compiler_common->flags |= REGEX_MATCH_END;
903 break;
904 }
905 if (stack_push(stack, type_char, *regex_string))
906 return REGEX_MEMORY_ERROR;
907 begin = 0;
908 compiler_common->dfa_size++;
909 break;
910 }
911 length--;
912 regex_string++;
913 }
914
915 if (depth != 0)
916 return REGEX_INVALID_REGEX;
917
918 if ((compiler_common->flags & (REGEX_MATCH_END | REGEX_NEWLINE)) == (REGEX_MATCH_END | REGEX_NEWLINE)) {
919 /* Replace REGEX_MATCH_END flag to REGEX_FAKE_MATCH_END */
920 compiler_common->flags &= ~REGEX_MATCH_END;
921 compiler_common->flags |= REGEX_FAKE_MATCH_END;
922 /* and append a new-line search. */
923 if (stack_push(stack, type_newline, 1))
924 return REGEX_MEMORY_ERROR;
925 compiler_common->dfa_size++;
926 /* Begin intentionally kept as 1. */
927 }
928
929 if (stack_push(stack, type_end, 0))
930 return REGEX_MEMORY_ERROR;
931
932 return REGEX_NO_ERROR;
933 }
934
935 /* --------------------------------------------------------------------- */
936 /* Generating machine state transitions */
937 /* --------------------------------------------------------------------- */
938
939 #define PUT_TRANSITION(typ, val) \
940 do { \
941 --transitions_ptr; \
942 transitions_ptr->type = typ; \
943 transitions_ptr->value = val; \
944 } while (0)
945
handle_iteratives(struct stack_item * transitions_ptr,struct stack_item * transitions,struct stack * depth)946 static struct stack_item* handle_iteratives(struct stack_item *transitions_ptr, struct stack_item *transitions, struct stack *depth)
947 {
948 struct stack_item *item;
949
950 while (1) {
951 item = stack_top(depth);
952
953 switch (item->type) {
954 case type_asterisk:
955 SLJIT_ASSERT(transitions[item->value].type == type_branch);
956 transitions[item->value].value = transitions_ptr - transitions;
957 PUT_TRANSITION(type_branch, item->value + 1);
958 break;
959
960 case type_plus_sign:
961 SLJIT_ASSERT(transitions[item->value].type == type_branch);
962 transitions[item->value].value = transitions_ptr - transitions;
963 break;
964
965 case type_qestion_mark:
966 PUT_TRANSITION(type_branch, item->value);
967 break;
968
969 default:
970 return transitions_ptr;
971 }
972 stack_pop(depth);
973 }
974 }
975
generate_transitions(struct compiler_common * compiler_common)976 static int generate_transitions(struct compiler_common *compiler_common)
977 {
978 struct stack *stack = &compiler_common->stack;
979 struct stack *depth = &compiler_common->depth;
980 struct stack_item *transitions_ptr;
981 struct stack_item *item;
982
983 stack_init(depth);
984 compiler_common->dfa_transitions = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL);
985 if (!compiler_common->dfa_transitions)
986 return REGEX_MEMORY_ERROR;
987
988 /* Go through the items of the stack and generate the necessary branches and jumps (edges of DFA). */
989 transitions_ptr = compiler_common->dfa_transitions + compiler_common->dfa_size;
990 while (stack->count > 0) {
991 item = stack_pop(stack);
992 switch (item->type) {
993 case type_begin:
994 case type_open_br:
995 item = stack_pop(depth);
996 if (item->type == type_select)
997 PUT_TRANSITION(type_branch, item->value + 1);
998 else
999 SLJIT_ASSERT(item->type == type_close_br);
1000 if (stack->count == 0)
1001 PUT_TRANSITION(type_begin, 0);
1002 else
1003 transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
1004 break;
1005
1006 case type_end:
1007 case type_close_br:
1008 if (item->type == type_end)
1009 *--transitions_ptr = *item;
1010 if (stack_push(depth, type_close_br, transitions_ptr - compiler_common->dfa_transitions))
1011 return REGEX_MEMORY_ERROR;
1012 break;
1013
1014 case type_select:
1015 item = stack_top(depth);
1016 if (item->type == type_select) {
1017 SLJIT_ASSERT(compiler_common->dfa_transitions[item->value].type == type_jump);
1018 PUT_TRANSITION(type_branch, item->value + 1);
1019 PUT_TRANSITION(type_jump, item->value);
1020 item->value = transitions_ptr - compiler_common->dfa_transitions;
1021 }
1022 else {
1023 SLJIT_ASSERT(item->type == type_close_br);
1024 item->type = type_select;
1025 PUT_TRANSITION(type_jump, item->value);
1026 item->value = transitions_ptr - compiler_common->dfa_transitions;
1027 }
1028 break;
1029
1030 case type_asterisk:
1031 case type_plus_sign:
1032 case type_qestion_mark:
1033 if (item->type != type_qestion_mark)
1034 PUT_TRANSITION(type_branch, 0);
1035 if (stack_push(depth, item->type, transitions_ptr - compiler_common->dfa_transitions))
1036 return REGEX_MEMORY_ERROR;
1037 break;
1038
1039 case type_char:
1040 case type_newline:
1041 case type_rng_start:
1042 /* Requires handle_iteratives. */
1043 *--transitions_ptr = *item;
1044 transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
1045 break;
1046
1047 default:
1048 *--transitions_ptr = *item;
1049 break;
1050 }
1051 }
1052
1053 SLJIT_ASSERT(compiler_common->dfa_transitions == transitions_ptr);
1054 SLJIT_ASSERT(depth->count == 0);
1055 return REGEX_NO_ERROR;
1056 }
1057
1058 #undef PUT_TRANSITION
1059
1060 #ifdef REGEX_MATCH_VERBOSE
1061
verbose_transitions(struct compiler_common * compiler_common)1062 static void verbose_transitions(struct compiler_common *compiler_common)
1063 {
1064 struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
1065 struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
1066 struct stack_item *search_states_ptr = compiler_common->search_states;
1067 int pos;
1068
1069 printf("-----------------\nTransitions\n-----------------\n");
1070 pos = 0;
1071 while (transitions_ptr < transitions_end) {
1072 printf("[%3d] ", pos++);
1073 if (search_states_ptr->type >= 0)
1074 printf("(%3d) ", search_states_ptr->type);
1075 switch (transitions_ptr->type) {
1076 case type_begin:
1077 printf("type_begin\n");
1078 break;
1079
1080 case type_end:
1081 printf("type_end\n");
1082 break;
1083
1084 case type_char:
1085 if (transitions_ptr->value >= ' ')
1086 printf("type_char '%c'\n", transitions_ptr->value);
1087 else
1088 printf("type_char 0x%x\n", transitions_ptr->value);
1089 break;
1090
1091 case type_newline:
1092 printf("type_newline %s\n", transitions_ptr->value ? "(end)" : "(begin)");
1093 break;
1094
1095 case type_id:
1096 printf("type_id %d\n", transitions_ptr->value);
1097 break;
1098
1099 case type_rng_start:
1100 printf("type_rng_start %s\n", transitions_ptr->value ? "(invert)" : "(normal)");
1101 break;
1102
1103 case type_rng_end:
1104 printf("type_rng_end\n");
1105 break;
1106
1107 case type_rng_char:
1108 if (transitions_ptr->value >= ' ')
1109 printf("type_rng_char '%c'\n", transitions_ptr->value);
1110 else
1111 printf("type_rng_char 0x%x\n", transitions_ptr->value);
1112 break;
1113
1114 case type_rng_left:
1115 if (transitions_ptr->value >= ' ')
1116 printf("type_rng_left '%c'\n", transitions_ptr->value);
1117 else
1118 printf("type_rng_left 0x%x\n", transitions_ptr->value);
1119 break;
1120
1121 case type_rng_right:
1122 if (transitions_ptr->value >= ' ')
1123 printf("type_rng_right '%c'\n", transitions_ptr->value);
1124 else
1125 printf("type_rng_right 0x%x\n", transitions_ptr->value);
1126 break;
1127
1128 case type_branch:
1129 printf("type_branch -> %d\n", transitions_ptr->value);
1130 break;
1131
1132 case type_jump:
1133 printf("type_jump -> %d\n", transitions_ptr->value);
1134 break;
1135
1136 default:
1137 printf("UNEXPECTED TYPE\n");
1138 break;
1139 }
1140 transitions_ptr++;
1141 search_states_ptr++;
1142 }
1143 printf("flags: ");
1144 if (!(compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_NEWLINE | REGEX_ID_CHECK | REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)))
1145 printf("none ");
1146 if (compiler_common->flags & REGEX_MATCH_BEGIN)
1147 printf("REGEX_MATCH_BEGIN ");
1148 if (compiler_common->flags & REGEX_MATCH_END)
1149 printf("REGEX_MATCH_END ");
1150 if (compiler_common->flags & REGEX_NEWLINE)
1151 printf("REGEX_NEWLINE ");
1152 if (compiler_common->flags & REGEX_ID_CHECK)
1153 printf("REGEX_ID_CHECK ");
1154 if (compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)
1155 printf("REGEX_FAKE_MATCH_BEGIN ");
1156 if (compiler_common->flags & REGEX_FAKE_MATCH_END)
1157 printf("REGEX_FAKE_MATCH_END ");
1158 if (compiler_common->longest_range_size > 0)
1159 printf("(longest range: %ld) ", (long)compiler_common->longest_range_size);
1160 printf("\n");
1161 }
1162
1163 #endif
1164
1165 /* --------------------------------------------------------------------- */
1166 /* Utilities */
1167 /* --------------------------------------------------------------------- */
1168
generate_search_states(struct compiler_common * compiler_common)1169 static int generate_search_states(struct compiler_common *compiler_common)
1170 {
1171 struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
1172 struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
1173 struct stack_item *search_states_ptr;
1174 struct stack_item *rng_start = NULL;
1175
1176 compiler_common->terms_size = !(compiler_common->flags & REGEX_FAKE_MATCH_END) ? 1 : 2;
1177 compiler_common->longest_range_size = 0;
1178 compiler_common->search_states = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL);
1179 if (!compiler_common->search_states)
1180 return REGEX_MEMORY_ERROR;
1181
1182 search_states_ptr = compiler_common->search_states;
1183 while (transitions_ptr < transitions_end) {
1184 switch (transitions_ptr->type) {
1185 case type_begin:
1186 case type_end:
1187 search_states_ptr->type = 0;
1188 break;
1189
1190 case type_char:
1191 search_states_ptr->type = compiler_common->terms_size++;
1192 break;
1193
1194 case type_newline:
1195 if (transitions_ptr->value)
1196 search_states_ptr->type = 1;
1197 else
1198 search_states_ptr->type = compiler_common->terms_size++;
1199 SLJIT_ASSERT(search_states_ptr->type == 1 || search_states_ptr->type == 2);
1200 break;
1201
1202 case type_id:
1203 if (transitions_ptr->value > 0)
1204 compiler_common->flags |= REGEX_ID_CHECK;
1205 search_states_ptr->type = -1;
1206 break;
1207
1208 case type_rng_start:
1209 search_states_ptr->type = compiler_common->terms_size;
1210 rng_start = search_states_ptr;
1211 break;
1212
1213 case type_rng_end:
1214 search_states_ptr->type = compiler_common->terms_size++;
1215 /* Ok, this is a blunt over estimation :) */
1216 if (compiler_common->longest_range_size < search_states_ptr - rng_start)
1217 compiler_common->longest_range_size = search_states_ptr - rng_start;
1218 break;
1219
1220 default:
1221 search_states_ptr->type = -1;
1222 break;
1223 }
1224 search_states_ptr->value = -1;
1225 search_states_ptr++;
1226 transitions_ptr++;
1227 }
1228 return REGEX_NO_ERROR;
1229 }
1230
trace_transitions(int from,struct compiler_common * compiler_common)1231 static int trace_transitions(int from, struct compiler_common *compiler_common)
1232 {
1233 int id = 0;
1234 struct stack *stack = &compiler_common->stack;
1235 struct stack *depth = &compiler_common->depth;
1236 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1237 struct stack_item *search_states = compiler_common->search_states;
1238
1239 SLJIT_ASSERT(search_states[from].type >= 0);
1240
1241 from++;
1242
1243 /* Be prepared for any paths (loops, etc). */
1244 while (1) {
1245 if (dfa_transitions[from].type == type_id)
1246 if (id < dfa_transitions[from].value)
1247 id = dfa_transitions[from].value;
1248
1249 if (search_states[from].value < id) {
1250 /* Forward step. */
1251 if (search_states[from].value == -1)
1252 if (stack_push(stack, 0, from))
1253 return REGEX_MEMORY_ERROR;
1254 search_states[from].value = id;
1255
1256 if (dfa_transitions[from].type == type_branch) {
1257 if (stack_push(depth, id, from))
1258 return REGEX_MEMORY_ERROR;
1259 from++;
1260 continue;
1261 }
1262 else if (dfa_transitions[from].type == type_jump) {
1263 from = dfa_transitions[from].value;
1264 continue;
1265 }
1266 else if (search_states[from].type < 0) {
1267 from++;
1268 continue;
1269 }
1270 }
1271
1272 /* Back tracking. */
1273 if (depth->count > 0) {
1274 id = stack_top(depth)->type;
1275 from = dfa_transitions[stack_pop(depth)->value].value;
1276 continue;
1277 }
1278 return 0;
1279 }
1280 }
1281
1282 /* --------------------------------------------------------------------- */
1283 /* Code generator */
1284 /* --------------------------------------------------------------------- */
1285
1286 #define TERM_OFFSET_OF(index, offs) (((index) * no_states + (offs)) * sizeof(sljit_sw))
1287 #define TERM_REL_OFFSET_OF(base, offs) ((base) + ((offs) * sizeof(sljit_sw)))
1288
1289 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
1290 CHECK(sljit_emit_op1(compiler, type, arg1, arg2, arg3, arg4))
1291
1292 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
1293 CHECK(sljit_emit_op2(compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
1294
1295 #define EMIT_LABEL(label) \
1296 label = sljit_emit_label(compiler); \
1297 CHECK(!label)
1298
1299 #define EMIT_JUMP(jump, type) \
1300 jump = sljit_emit_jump(compiler, type); \
1301 CHECK(!jump)
1302
1303 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
1304 jump = sljit_emit_cmp(compiler, type, arg1, arg2, arg3, arg4); \
1305 CHECK(!jump)
1306
1307 /* CHECK depends on the use case. */
1308
1309 #define CHECK(exp) \
1310 if (SLJIT_UNLIKELY(exp)) \
1311 return REGEX_MEMORY_ERROR
1312
compile_uncond_tran(struct compiler_common * compiler_common,int reg)1313 static int compile_uncond_tran(struct compiler_common *compiler_common, int reg)
1314 {
1315 struct sljit_compiler *compiler = compiler_common->compiler;
1316 struct stack *stack = &compiler_common->stack;
1317 struct stack_item *search_states = compiler_common->search_states;
1318 int flags = compiler_common->flags;
1319 sljit_sw no_states = compiler_common->no_states;
1320 sljit_uw head = 0;
1321 sljit_sw offset, value;
1322
1323 if (reg != R_CURR_STATE || !(compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)) {
1324 CHECK(trace_transitions(0, compiler_common));
1325 }
1326 else {
1327 CHECK(trace_transitions(1, compiler_common));
1328 }
1329
1330 while (stack->count > 0) {
1331 value = stack_pop(stack)->value;
1332 if (search_states[value].type >= 0) {
1333 offset = TERM_OFFSET_OF(search_states[value].type, 0);
1334 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
1335 if (offset > 0)
1336 head = offset;
1337
1338 if (!(flags & REGEX_MATCH_BEGIN)) {
1339 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), R_TEMP, 0);
1340 if (flags & REGEX_ID_CHECK) {
1341 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, search_states[value].value);
1342 }
1343 }
1344 else if (flags & REGEX_ID_CHECK) {
1345 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, search_states[value].value);
1346 }
1347 }
1348 search_states[value].value = -1;
1349 }
1350 if (reg == R_NEXT_STATE) {
1351 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
1352 }
1353 else if (flags & REGEX_FAKE_MATCH_BEGIN) {
1354 SLJIT_ASSERT(compiler_common->dfa_transitions[1].type == type_newline && !compiler_common->dfa_transitions[1].value);
1355 offset = TERM_OFFSET_OF(search_states[1].type, 0);
1356
1357 SLJIT_ASSERT(!(flags & REGEX_MATCH_BEGIN));
1358
1359 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
1360 head = offset;
1361
1362 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, 1);
1363 if (flags & REGEX_ID_CHECK) {
1364 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, 0);
1365 }
1366 }
1367 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, head);
1368 return REGEX_NO_ERROR;
1369 }
1370
compile_cond_tran(struct compiler_common * compiler_common,sljit_sw curr_index)1371 static int compile_cond_tran(struct compiler_common *compiler_common, sljit_sw curr_index)
1372 {
1373 struct sljit_compiler *compiler = compiler_common->compiler;
1374 struct stack *stack = &compiler_common->stack;
1375 struct stack_item *search_states = compiler_common->search_states;
1376 sljit_sw offset;
1377 int flags;
1378 sljit_sw no_states;
1379 sljit_sw value;
1380 struct sljit_jump *jump1;
1381 struct sljit_jump *jump2;
1382 struct sljit_jump *jump3;
1383 struct sljit_jump *jump4;
1384 struct sljit_jump *jump5;
1385 struct sljit_label *label1;
1386
1387 flags = compiler_common->flags;
1388 no_states = compiler_common->no_states;
1389
1390 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
1391 if (!(flags & (REGEX_ID_CHECK | REGEX_MATCH_BEGIN))) {
1392 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1393 }
1394
1395 while (stack->count > 0) {
1396 value = stack_pop(stack)->value;
1397 if (search_states[value].type >= 0) {
1398 #ifdef REGEX_MATCH_VERBOSE
1399 if (flags & REGEX_MATCH_VERBOSE)
1400 printf("-> (%3d:%3d) ", search_states[value].type, search_states[value].value);
1401 #endif
1402 offset = TERM_OFFSET_OF(search_states[value].type, 0);
1403
1404 if (!(flags & REGEX_ID_CHECK)) {
1405 if (!(flags & REGEX_MATCH_BEGIN)) {
1406 /* Check whether item is inserted. */
1407 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1408 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1409 if (offset > 0) {
1410 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1411 }
1412 EMIT_JUMP(jump2, SLJIT_JUMP);
1413
1414 /* Check whether old index <= index. */
1415 EMIT_LABEL(label1);
1416 sljit_set_label(jump1, label1);
1417
1418 EMIT_CMP(jump1, SLJIT_LESS_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1419
1420 EMIT_LABEL(label1);
1421 sljit_set_label(jump2, label1);
1422 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1423
1424 EMIT_LABEL(label1);
1425 sljit_set_label(jump1, label1);
1426 }
1427 else {
1428 /* Check whether item is inserted. */
1429 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1430 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1431 if (offset > 0) {
1432 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1433 }
1434 EMIT_LABEL(label1);
1435 sljit_set_label(jump1, label1);
1436 }
1437 }
1438 else {
1439 if (!(flags & REGEX_MATCH_BEGIN)) {
1440 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1441
1442 /* Check whether item is inserted. */
1443 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1444 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1445 if (offset > 0) {
1446 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1447 }
1448 EMIT_JUMP(jump2, SLJIT_JUMP);
1449
1450 /* Check whether old index != index. */
1451 EMIT_LABEL(label1);
1452 sljit_set_label(jump1, label1);
1453
1454 EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_LESS, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1455 EMIT_JUMP(jump1, SLJIT_LESS);
1456 EMIT_JUMP(jump3, SLJIT_NOT_EQUAL); /* Greater. */
1457
1458 /* Old index == index. */
1459 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
1460 if (search_states[value].value > 0) {
1461 EMIT_CMP(jump4, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1462
1463 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1464 EMIT_LABEL(label1);
1465 sljit_set_label(jump4, label1);
1466 }
1467
1468 EMIT_OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0);
1469 EMIT_JUMP(jump4, SLJIT_GREATER_EQUAL);
1470 EMIT_JUMP(jump5, SLJIT_JUMP);
1471
1472 /* Overwrite index & id. */
1473 EMIT_LABEL(label1);
1474 sljit_set_label(jump3, label1);
1475 sljit_set_label(jump2, label1);
1476 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1477
1478 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
1479 if (search_states[value].value > 0) {
1480 EMIT_CMP(jump3, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1481
1482 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1483 EMIT_LABEL(label1);
1484 sljit_set_label(jump3, label1);
1485 }
1486
1487 EMIT_LABEL(label1);
1488 sljit_set_label(jump5, label1);
1489 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0);
1490
1491 /* Exit. */
1492 EMIT_LABEL(label1);
1493 sljit_set_label(jump1, label1);
1494 sljit_set_label(jump4, label1);
1495 }
1496 else {
1497 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1498
1499 if (search_states[value].value > 0) {
1500 EMIT_CMP(jump1, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1501
1502 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1503 EMIT_LABEL(label1);
1504 sljit_set_label(jump1, label1);
1505 }
1506
1507 /* Check whether item is inserted. */
1508 EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1509 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1510 if (offset > 0) {
1511 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1512 }
1513 EMIT_JUMP(jump2, SLJIT_JUMP);
1514
1515 /* Check whether old id >= id. */
1516 EMIT_LABEL(label1);
1517 sljit_set_label(jump1, label1);
1518
1519 EMIT_CMP(jump1, SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1520
1521 EMIT_LABEL(label1);
1522 sljit_set_label(jump2, label1);
1523 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1524
1525 EMIT_LABEL(label1);
1526 sljit_set_label(jump1, label1);
1527 }
1528 }
1529 }
1530 search_states[value].value = -1;
1531 }
1532
1533 #ifdef REGEX_MATCH_VERBOSE
1534 if (flags & REGEX_MATCH_VERBOSE)
1535 printf("\n");
1536 #endif
1537 return REGEX_NO_ERROR;
1538 }
1539
compile_end_check(struct compiler_common * compiler_common,struct sljit_label * end_check_label)1540 static int compile_end_check(struct compiler_common *compiler_common, struct sljit_label *end_check_label)
1541 {
1542 struct sljit_compiler *compiler = compiler_common->compiler;
1543 struct sljit_jump *jump;
1544 struct sljit_jump *clear_states_jump;
1545 struct sljit_label *label;
1546 struct sljit_label *leave_label;
1547 struct sljit_label *begin_loop_label;
1548
1549 /* Priority order: best_begin > best_end > best_id.
1550 In other words:
1551 if (new best_begin > old test_begin) do nothing
1552 otherwise we know that new_end > old_end, since R_CURR_INDEX ever increasing
1553 therefore we must overwrite all best_* variables (new_id also contains the highest id for this turn). */
1554
1555 /* Both R_CURR_CHAR and R_BEST_BEGIN used as temporary registers. */
1556
1557 if (!(compiler_common->flags & REGEX_MATCH_BEGIN)) {
1558 EMIT_OP1(SLJIT_MOV, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
1559 EMIT_CMP(jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_LESS : SLJIT_LESS_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
1560 sljit_set_label(jump, end_check_label);
1561
1562 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
1563 if (!(compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END))) {
1564 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
1565 }
1566 else {
1567 if ((compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) == (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) {
1568 EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 2);
1569 }
1570 else {
1571 EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 1);
1572 }
1573 }
1574 if (compiler_common->flags & REGEX_ID_CHECK) {
1575 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 3));
1576 }
1577
1578 EMIT_CMP(clear_states_jump, SLJIT_LESS, R_CURR_CHAR, 0, R_BEST_BEGIN, 0);
1579
1580 EMIT_LABEL(leave_label);
1581 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, R_CURR_CHAR, 0);
1582 EMIT_JUMP(jump, SLJIT_JUMP);
1583 sljit_set_label(jump, end_check_label);
1584
1585 /* A loop to clear all states, which are > (or >=) than R_CURR_CHAR. */
1586 EMIT_LABEL(label);
1587 sljit_set_label(clear_states_jump, label);
1588
1589 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
1590 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
1591
1592 /* Begin of the loop. */
1593 EMIT_LABEL(begin_loop_label);
1594 EMIT_CMP(jump, SLJIT_EQUAL, R_TEMP, 0, SLJIT_IMM, 0);
1595 sljit_set_label(jump, leave_label);
1596
1597 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, R_CURR_STATE, 0);
1598 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw));
1599 EMIT_CMP(clear_states_jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_GREATER : SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_TEMP), 2 * sizeof(sljit_sw), R_CURR_CHAR, 0);
1600
1601 /* Case 1: keep this case. */
1602 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), R_NEXT_HEAD, 0);
1603 EMIT_OP2(SLJIT_SUB, R_NEXT_HEAD, 0, R_TEMP, 0, R_CURR_STATE, 0);
1604
1605 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
1606 EMIT_JUMP(jump, SLJIT_JUMP);
1607 sljit_set_label(jump, begin_loop_label);
1608
1609 /* Case 2: remove this case. */
1610 EMIT_LABEL(label);
1611 sljit_set_label(clear_states_jump, label);
1612
1613 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), SLJIT_IMM, -1);
1614
1615 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
1616 EMIT_JUMP(jump, SLJIT_JUMP);
1617 sljit_set_label(jump, begin_loop_label);
1618 }
1619 else {
1620 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_IMM, 0);
1621 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
1622 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
1623 if (compiler_common->flags & REGEX_ID_CHECK) {
1624 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
1625 }
1626 EMIT_JUMP(jump, SLJIT_JUMP);
1627 sljit_set_label(jump, end_check_label);
1628 }
1629 return REGEX_NO_ERROR;
1630 }
1631
compile_leave_fast_forward(struct compiler_common * compiler_common,struct sljit_label * fast_forward_label)1632 static int compile_leave_fast_forward(struct compiler_common *compiler_common, struct sljit_label *fast_forward_label)
1633 {
1634 struct sljit_compiler *compiler = compiler_common->compiler;
1635 struct stack *stack = &compiler_common->stack;
1636 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1637 struct stack_item *search_states = compiler_common->search_states;
1638 int ind;
1639 struct sljit_jump *jump;
1640 int init_range = 1, prev_value = 0;
1641
1642 while (stack->count > 0) {
1643 ind = stack_pop(stack)->value;
1644 search_states[ind].value = -1;
1645 if (search_states[ind].type >= 0) {
1646 if (dfa_transitions[ind].type == type_char) {
1647 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1648 sljit_set_label(jump, fast_forward_label);
1649 }
1650 else if (dfa_transitions[ind].type == type_rng_start) {
1651 SLJIT_ASSERT(!dfa_transitions[ind].value);
1652 ind++;
1653 while (dfa_transitions[ind].type != type_rng_end) {
1654 if (dfa_transitions[ind].type == type_rng_char) {
1655 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1656 sljit_set_label(jump, fast_forward_label);
1657 }
1658 else {
1659 SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
1660 if (init_range) {
1661 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
1662 init_range = 0;
1663 }
1664 if (dfa_transitions[ind].value != prev_value) {
1665 /* Best compatibility to all archs. */
1666 prev_value -= dfa_transitions[ind].value;
1667 if (prev_value < 0) {
1668 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
1669 }
1670 else {
1671 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
1672 }
1673 prev_value = dfa_transitions[ind].value;
1674 }
1675 EMIT_CMP(jump, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
1676 sljit_set_label(jump, fast_forward_label);
1677 ind++;
1678 }
1679 ind++;
1680 }
1681 }
1682 else {
1683 SLJIT_ASSERT(dfa_transitions[ind].type == type_newline);
1684 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
1685 sljit_set_label(jump, fast_forward_label);
1686 EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
1687 sljit_set_label(jump, fast_forward_label);
1688 }
1689 }
1690 }
1691 return REGEX_NO_ERROR;
1692 }
1693
compile_newline_check(struct compiler_common * compiler_common,sljit_sw ind)1694 static int compile_newline_check(struct compiler_common *compiler_common, sljit_sw ind)
1695 {
1696 struct sljit_compiler *compiler = compiler_common->compiler;
1697 struct sljit_jump *jump1;
1698 struct sljit_jump *jump2;
1699 struct sljit_label *label;
1700 sljit_sw no_states;
1701 sljit_sw offset;
1702
1703 /* Check whether a new-line character is found. */
1704 EMIT_CMP(jump1, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
1705 EMIT_CMP(jump2, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
1706
1707 no_states = compiler_common->no_states;
1708 offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
1709 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
1710 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
1711 CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
1712
1713 EMIT_LABEL(label);
1714 sljit_set_label(jump1, label);
1715 sljit_set_label(jump2, label);
1716 return REGEX_NO_ERROR;
1717 }
1718
1719 #undef CHECK
1720
1721 #define CHECK(exp) \
1722 if (SLJIT_UNLIKELY(exp)) \
1723 return 0
1724
range_set_label(struct sljit_jump ** range_jump_list,struct sljit_label * label)1725 static SLJIT_INLINE void range_set_label(struct sljit_jump **range_jump_list, struct sljit_label *label)
1726 {
1727 while (*range_jump_list) {
1728 sljit_set_label(*range_jump_list, label);
1729 range_jump_list++;
1730 }
1731 }
1732
compile_range_check(struct compiler_common * compiler_common,sljit_sw ind)1733 static sljit_sw compile_range_check(struct compiler_common *compiler_common, sljit_sw ind)
1734 {
1735 struct sljit_compiler *compiler = compiler_common->compiler;
1736 struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1737 struct sljit_jump **range_jump_list = compiler_common->range_jump_list;
1738 int invert = dfa_transitions[ind].value;
1739 struct sljit_label *label;
1740 sljit_sw no_states;
1741 sljit_sw offset;
1742 int init_range = 1, prev_value = 0;
1743
1744 ind++;
1745
1746 while (dfa_transitions[ind].type != type_rng_end) {
1747 if (dfa_transitions[ind].type == type_rng_char) {
1748 EMIT_CMP(*range_jump_list, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1749 range_jump_list++;
1750 }
1751 else {
1752 SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
1753 if (init_range) {
1754 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
1755 init_range = 0;
1756 }
1757 if (dfa_transitions[ind].value != prev_value) {
1758 /* Best compatibility to all archs. */
1759 prev_value -= dfa_transitions[ind].value;
1760 if (prev_value < 0) {
1761 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
1762 }
1763 else {
1764 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
1765 }
1766 prev_value = dfa_transitions[ind].value;
1767 }
1768 EMIT_CMP(*range_jump_list, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
1769 range_jump_list++;
1770 ind++;
1771 }
1772 ind++;
1773 }
1774
1775 *range_jump_list = NULL;
1776
1777 if (!invert) {
1778 no_states = compiler_common->no_states;
1779 offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
1780 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
1781 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
1782 CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
1783
1784 EMIT_LABEL(label);
1785 range_set_label(compiler_common->range_jump_list, label);
1786 /* Clears the jump list. */
1787 *compiler_common->range_jump_list = NULL;
1788 }
1789 return ind;
1790 }
1791
1792 #undef TERM_OFFSET_OF
1793 #undef EMIT_OP1
1794 #undef EMIT_OP2
1795 #undef EMIT_LABEL
1796 #undef EMIT_JUMP
1797 #undef EMIT_CMP
1798 #undef CHECK
1799
1800 /* --------------------------------------------------------------------- */
1801 /* Main compiler */
1802 /* --------------------------------------------------------------------- */
1803
1804 #define TERM_OFFSET_OF(ind, offs) (((ind) * compiler_common.no_states + (offs)) * sizeof(sljit_sw))
1805
1806 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
1807 CHECK(sljit_emit_op1(compiler_common.compiler, type, arg1, arg2, arg3, arg4))
1808
1809 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
1810 CHECK(sljit_emit_op2(compiler_common.compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
1811
1812 #define EMIT_LABEL(label) \
1813 label = sljit_emit_label(compiler_common.compiler); \
1814 CHECK(!label)
1815
1816 #define EMIT_JUMP(jump, type) \
1817 jump = sljit_emit_jump(compiler_common.compiler, type); \
1818 CHECK(!jump)
1819
1820 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
1821 jump = sljit_emit_cmp(compiler_common.compiler, type, arg1, arg2, arg3, arg4); \
1822 CHECK(!jump)
1823
1824 /* A do {} while(0) expression helps to avoid goto statements. */
1825 #define BEGIN_GUARD \
1826 do {
1827
1828 #define END_GUARD \
1829 } while(0);
1830
1831 #define CHECK(exp) \
1832 if (SLJIT_UNLIKELY(exp)) \
1833 break;
1834
regex_compile(const regex_char_t * regex_string,int length,int re_flags,int * error)1835 struct regex_machine* regex_compile(const regex_char_t *regex_string, int length, int re_flags, int *error)
1836 {
1837 struct compiler_common compiler_common;
1838 sljit_sw ind;
1839 int error_code, done, suggest_fast_forward;
1840 /* ID of an empty match (-1 if not reachable). */
1841 int empty_match_id;
1842
1843 struct sljit_jump *jump;
1844 struct sljit_jump *best_match_found_jump;
1845 struct sljit_jump *fast_forward_jump = NULL;
1846 struct sljit_jump *length_is_zero_jump;
1847 struct sljit_jump *end_check_jump = NULL;
1848 struct sljit_jump *best_match_check_jump = NULL;
1849 struct sljit_jump *non_greedy_end_jump = NULL;
1850 struct sljit_label *label;
1851 struct sljit_label *end_check_label = NULL;
1852 struct sljit_label *start_label;
1853 struct sljit_label *fast_forward_label;
1854 struct sljit_label *fast_forward_return_label;
1855
1856 if (error)
1857 *error = REGEX_NO_ERROR;
1858 #ifdef REGEX_MATCH_VERBOSE
1859 compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE | REGEX_MATCH_VERBOSE);
1860 #else
1861 compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE);
1862 #endif
1863
1864 /* Step 1: parsing (Left->Right).
1865 Syntax check and AST generator. */
1866 error_code = parse(regex_string, length, &compiler_common);
1867 if (error_code) {
1868 stack_destroy(&compiler_common.stack);
1869 if (error)
1870 *error = error_code;
1871 return NULL;
1872 }
1873
1874 /* Step 2: generating branches (Right->Left). */
1875 error_code = generate_transitions(&compiler_common);
1876 stack_destroy(&compiler_common.stack);
1877 stack_destroy(&compiler_common.depth);
1878 if (error_code) {
1879 if (compiler_common.dfa_transitions)
1880 SLJIT_FREE(compiler_common.dfa_transitions, NULL);
1881 if (error)
1882 *error = error_code;
1883 return NULL;
1884 }
1885
1886 /* Step 3: Generate necessary data for depth-first search (Left->Right). */
1887 error_code = generate_search_states(&compiler_common);
1888 if (error_code) {
1889 SLJIT_FREE(compiler_common.dfa_transitions, NULL);
1890 if (error)
1891 *error = error_code;
1892 return NULL;
1893 }
1894
1895 #ifdef REGEX_MATCH_VERBOSE
1896 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
1897 verbose_transitions(&compiler_common);
1898 #endif
1899
1900 /* Step 4: Left->Right generate code. */
1901 stack_init(&compiler_common.stack);
1902 stack_init(&compiler_common.depth);
1903 done = 0;
1904 compiler_common.machine = NULL;
1905 compiler_common.compiler = NULL;
1906 compiler_common.range_jump_list = NULL;
1907
1908 BEGIN_GUARD
1909
1910 compiler_common.machine = (struct regex_machine*)SLJIT_MALLOC(sizeof(struct regex_machine) + (compiler_common.terms_size - 1) * sizeof(sljit_uw), NULL);
1911 CHECK(!compiler_common.machine);
1912
1913 compiler_common.compiler = sljit_create_compiler(NULL);
1914 CHECK(!compiler_common.compiler);
1915
1916 if (compiler_common.longest_range_size > 0) {
1917 compiler_common.range_jump_list = (struct sljit_jump**)SLJIT_MALLOC(sizeof(struct sljit_jump*) * compiler_common.longest_range_size, NULL);
1918 CHECK(!compiler_common.range_jump_list);
1919 }
1920
1921 if ((compiler_common.flags & REGEX_ID_CHECK) && !(compiler_common.flags & REGEX_MATCH_BEGIN))
1922 compiler_common.no_states = 4;
1923 else if (!(compiler_common.flags & REGEX_ID_CHECK) && (compiler_common.flags & REGEX_MATCH_BEGIN))
1924 compiler_common.no_states = 2;
1925 else
1926 compiler_common.no_states = 3;
1927
1928 compiler_common.machine->flags = compiler_common.flags;
1929 compiler_common.machine->no_states = compiler_common.no_states;
1930 compiler_common.machine->size = compiler_common.machine->no_states * compiler_common.terms_size;
1931
1932 /* Study the regular expression. */
1933 empty_match_id = -1;
1934 suggest_fast_forward = 1;
1935 if (!(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN)) {
1936 CHECK(trace_transitions(0, &compiler_common));
1937 while (compiler_common.stack.count > 0) {
1938 ind = stack_pop(&compiler_common.stack)->value;
1939 if (compiler_common.search_states[ind].type == 0) {
1940 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
1941 suggest_fast_forward = 0;
1942 empty_match_id = compiler_common.search_states[ind].value;
1943 }
1944 else if (compiler_common.search_states[ind].type > 0) {
1945 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type != type_end);
1946 if (compiler_common.dfa_transitions[ind].type == type_rng_start && compiler_common.dfa_transitions[ind].value)
1947 suggest_fast_forward = 0;
1948 }
1949 compiler_common.search_states[ind].value = -1;
1950 }
1951 }
1952 else {
1953 SLJIT_ASSERT(compiler_common.dfa_transitions[1].type == type_newline);
1954 CHECK(trace_transitions(1, &compiler_common));
1955 while (compiler_common.stack.count > 0) {
1956 ind = stack_pop(&compiler_common.stack)->value;
1957 if (compiler_common.search_states[ind].type == 0) {
1958 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
1959 suggest_fast_forward = 0;
1960 empty_match_id = compiler_common.search_states[ind].value;
1961 }
1962 compiler_common.search_states[ind].value = -1;
1963 }
1964 }
1965
1966 /* Step 4.1: Generate entry. */
1967 CHECK(sljit_emit_enter(compiler_common.compiler, 0, 3, 5, 5, 0, 0, 0));
1968
1969 /* Copy arguments to their place. */
1970 EMIT_OP1(SLJIT_MOV, R_REGEX_MATCH, 0, SLJIT_S0, 0);
1971 EMIT_OP1(SLJIT_MOV, R_STRING, 0, SLJIT_S1, 0);
1972 EMIT_OP2(SLJIT_ADD, R_LENGTH, 0, SLJIT_S2, 0, SLJIT_IMM, 1);
1973
1974 /* Init global registers. */
1975 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
1976 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
1977 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
1978 EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin));
1979 EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index));
1980
1981 /* Check whether the best match has already found in a previous frame. */
1982 EMIT_CMP(jump, SLJIT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 0);
1983 EMIT_JUMP(best_match_found_jump, SLJIT_JUMP);
1984
1985 #ifdef REGEX_MATCH_VERBOSE
1986 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
1987 printf("\n-----------------\nTrace\n-----------------\n");
1988 #endif
1989
1990 /* Step 4.2: Generate code for state 0. */
1991 EMIT_LABEL(label);
1992 compiler_common.machine->entry_addrs[0] = (sljit_uw)label;
1993
1994 /* Swapping current and next. */
1995 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_STATE, 0);
1996 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_NEXT_STATE, 0);
1997 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_TEMP, 0);
1998
1999 /* Checking whether the best case needs to be updated. */
2000 if (!(compiler_common.flags & REGEX_MATCH_END)) {
2001 EMIT_CMP(end_check_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
2002 EMIT_LABEL(end_check_label);
2003 }
2004 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
2005 EMIT_OP2(SLJIT_ADD, R_CURR_INDEX, 0, R_CURR_INDEX, 0, SLJIT_IMM, 1);
2006
2007 /* Checking whether best case has already found. */
2008 if (!(compiler_common.flags & REGEX_MATCH_END) || (compiler_common.flags & REGEX_MATCH_BEGIN)) {
2009 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2010 /* We can bail out if no more active states remain and R_BEST_BEGIN != -1. */
2011 EMIT_CMP(best_match_check_jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
2012 }
2013 else {
2014 /* We can bail out if no more active states remain (regardless of R_BEST_BEGIN). */
2015 EMIT_CMP(best_match_check_jump, SLJIT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2016 }
2017 }
2018
2019 EMIT_LABEL(start_label);
2020 sljit_set_label(jump, start_label);
2021
2022 if (!(compiler_common.flags & REGEX_MATCH_BEGIN) && suggest_fast_forward) {
2023 EMIT_CMP(fast_forward_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
2024 }
2025
2026 /* Loading the next character. */
2027 EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_LENGTH, 0, R_LENGTH, 0, SLJIT_IMM, 1);
2028 EMIT_JUMP(length_is_zero_jump, SLJIT_EQUAL);
2029
2030 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_STRING, 0);
2031 #ifdef REGEX_USE_8BIT_CHARS
2032 EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
2033 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 1);
2034 #else
2035 EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
2036 EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 2);
2037 #endif
2038 EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_TEMP, 0);
2039
2040 #ifdef REGEX_MATCH_VERBOSE
2041 if (compiler_common.flags & REGEX_MATCH_VERBOSE) {
2042 printf("(%3d): ", 0);
2043 CHECK(trace_transitions(0, &compiler_common));
2044 while (compiler_common.stack.count > 0) {
2045 ind = stack_pop(&compiler_common.stack)->value;
2046 if (compiler_common.search_states[ind].type >= 0)
2047 printf("-> (%3d:%3d) ", compiler_common.search_states[ind].type, compiler_common.search_states[ind].value);
2048 compiler_common.search_states[ind].value = -1;
2049 }
2050 printf("\n");
2051 }
2052 #endif
2053
2054 EMIT_LABEL(fast_forward_return_label);
2055 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2056 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 1);
2057 if (!(compiler_common.flags & REGEX_MATCH_END)) {
2058 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
2059 }
2060
2061 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_INDEX, 0);
2062 CHECK(compile_uncond_tran(&compiler_common, R_NEXT_STATE));
2063 /* And branching to the first state. */
2064 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2065
2066 if (!(compiler_common.flags & REGEX_MATCH_END)) {
2067 EMIT_LABEL(label);
2068 sljit_set_label(jump, label);
2069 }
2070 }
2071 /* This is the case where we only have to reset the R_NEXT_HEAD. */
2072 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
2073 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2074 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2075
2076 /* Fast-forward loop. */
2077 if (fast_forward_jump) {
2078 /* Quit from fast-forward loop. */
2079 EMIT_LABEL(fast_forward_label);
2080 EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
2081 EMIT_OP1(SLJIT_MOV, R_LENGTH, 0, R_NEXT_STATE, 0);
2082 EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_CURR_STATE, 0);
2083 EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, R_NEXT_HEAD, 0);
2084 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
2085 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
2086 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
2087
2088 /* Update the start field of the locations. */
2089 CHECK(trace_transitions(0, &compiler_common));
2090 while (compiler_common.stack.count > 0) {
2091 ind = stack_pop(&compiler_common.stack)->value;
2092 if (compiler_common.search_states[ind].type >= 0) {
2093 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 2), R_TEMP, 0);
2094 }
2095 compiler_common.search_states[ind].value = -1;
2096 }
2097 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
2098 EMIT_JUMP(jump, SLJIT_JUMP);
2099 sljit_set_label(jump, fast_forward_return_label);
2100
2101 /* Start fast-forward. */
2102 EMIT_LABEL(label);
2103 sljit_set_label(fast_forward_jump, label);
2104
2105 /* Moving everything to registers. */
2106 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
2107 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
2108 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
2109 EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_LENGTH, 0);
2110 EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_STRING, 0);
2111 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, R_CURR_INDEX, 0);
2112
2113 /* Fast forward mainloop. */
2114 EMIT_LABEL(label);
2115 EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_NEXT_STATE, 0, R_NEXT_STATE, 0, SLJIT_IMM, 1);
2116 EMIT_JUMP(fast_forward_jump, SLJIT_EQUAL);
2117
2118 #ifdef REGEX_USE_8BIT_CHARS
2119 EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
2120 EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 1);
2121 #else
2122 EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
2123 EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 2);
2124 #endif
2125
2126 CHECK(trace_transitions(0, &compiler_common));
2127 CHECK(compile_leave_fast_forward(&compiler_common, fast_forward_label));
2128
2129 EMIT_OP2(SLJIT_ADD, R_NEXT_HEAD, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
2130 EMIT_JUMP(jump, SLJIT_JUMP);
2131 sljit_set_label(jump, label);
2132
2133 /* String is finished. */
2134 EMIT_LABEL(label);
2135 sljit_set_label(fast_forward_jump, label);
2136 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_NEXT_HEAD, 0);
2137 EMIT_JUMP(fast_forward_jump, SLJIT_JUMP);
2138 }
2139
2140 /* End check. */
2141 if (end_check_jump) {
2142 EMIT_LABEL(label);
2143 sljit_set_label(end_check_jump, label);
2144
2145 if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || !(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2146 CHECK(compile_end_check(&compiler_common, end_check_label));
2147 }
2148 else {
2149 /* Since we leave, we do not need to update the R_BEST_BEGIN. */
2150 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
2151 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
2152 if (compiler_common.flags & REGEX_ID_CHECK) {
2153 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
2154 }
2155 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
2156 EMIT_JUMP(non_greedy_end_jump, SLJIT_JUMP);
2157 }
2158 }
2159
2160 /* Finish check. */
2161 if (best_match_check_jump) {
2162 EMIT_LABEL(label);
2163 sljit_set_label(best_match_check_jump, label);
2164
2165 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2166 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2167 sljit_set_label(jump, start_label);
2168 }
2169 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
2170 }
2171
2172 /* Leaving matching and storing the necessary values. */
2173 EMIT_LABEL(label);
2174 sljit_set_label(length_is_zero_jump, label);
2175 if (non_greedy_end_jump)
2176 sljit_set_label(non_greedy_end_jump, label);
2177
2178 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_CURR_INDEX, 0);
2179 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
2180 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
2181 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
2182
2183 /* Exit from JIT. */
2184 EMIT_LABEL(label);
2185 sljit_set_label(best_match_found_jump, label);
2186 if (fast_forward_jump)
2187 sljit_set_label(fast_forward_jump, label);
2188 CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_UNUSED, 0, 0));
2189
2190 for (ind = 1; ind < compiler_common.dfa_size - 1; ind++) {
2191 if (compiler_common.search_states[ind].type >= 0) {
2192 SLJIT_ASSERT(compiler_common.search_states[ind].type < compiler_common.terms_size);
2193 EMIT_LABEL(label);
2194 compiler_common.machine->entry_addrs[compiler_common.search_states[ind].type] = (sljit_uw)label;
2195
2196 if (compiler_common.dfa_transitions[ind].type == type_char) {
2197 EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, compiler_common.dfa_transitions[ind].value);
2198 }
2199 else if (compiler_common.dfa_transitions[ind].type == type_rng_start) {
2200 ind = compile_range_check(&compiler_common, ind);
2201 CHECK(!ind);
2202 }
2203 else {
2204 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
2205 CHECK(compile_newline_check(&compiler_common, ind));
2206 }
2207
2208 CHECK(trace_transitions(ind, &compiler_common));
2209 #ifdef REGEX_MATCH_VERBOSE
2210 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
2211 printf("(%3d): ", compiler_common.search_states[ind].type);
2212 #endif
2213 CHECK(compile_cond_tran(&compiler_common, compiler_common.search_states[ind].type));
2214
2215 if (compiler_common.dfa_transitions[ind].type == type_char) {
2216 EMIT_LABEL(label);
2217 sljit_set_label(jump, label);
2218 }
2219 else if (compiler_common.dfa_transitions[ind].type == type_rng_end) {
2220 EMIT_LABEL(label);
2221 range_set_label(compiler_common.range_jump_list, label);
2222 }
2223 else {
2224 SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
2225 }
2226
2227 /* Branch to the next item in the list. */
2228 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1));
2229 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1), SLJIT_IMM, -1);
2230 CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2231 }
2232 }
2233
2234 if (ind == compiler_common.dfa_size - 1) {
2235 /* Generate an init stub function. */
2236 EMIT_LABEL(label);
2237 CHECK(sljit_emit_enter(compiler_common.compiler, 0, 2, 3, 3, 0, 0, 0));
2238
2239 if (empty_match_id == -1) {
2240 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, -1);
2241 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, 0);
2242 }
2243 else {
2244 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
2245 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, empty_match_id);
2246 }
2247
2248 EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, index), SLJIT_IMM, !(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN) ? 1 : 2);
2249
2250 if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || empty_match_id == -1) {
2251 /* The else is a really rare event, so we still generate an empty function instead of a runtime pointer check. */
2252 SLJIT_ASSERT(R_CURR_STATE == SLJIT_S0);
2253 if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2254 /* R_CURR_INDEX (put to R_TEMP) is zero. */
2255 EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, 0);
2256 }
2257 CHECK(compile_uncond_tran(&compiler_common, R_CURR_STATE));
2258 }
2259 else {
2260 EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2261 }
2262 CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_MOV, R_NEXT_HEAD, 0));
2263
2264 compiler_common.machine->continue_match = sljit_generate_code(compiler_common.compiler);
2265 #ifndef SLJIT_INDIRECT_CALL
2266 compiler_common.machine->u.init_match = (void*)(sljit_sw)sljit_get_label_addr(label);
2267 #else
2268 sljit_set_function_context(&compiler_common.machine->u.init_match, &compiler_common.machine->context, sljit_get_label_addr(label), regex_compile);
2269 #endif
2270 #ifdef REGEX_MATCH_VERBOSE
2271 if (compiler_common.flags & REGEX_MATCH_VERBOSE)
2272 printf("Continue match: %p Init match: %p\n\n", compiler_common.machine->continue_match, compiler_common.machine->u.init_match);
2273 #endif
2274 if (compiler_common.machine->continue_match) {
2275 for (ind = 0; ind < compiler_common.terms_size; ++ind)
2276 compiler_common.machine->entry_addrs[ind] = sljit_get_label_addr((struct sljit_label*)compiler_common.machine->entry_addrs[ind]);
2277 done = 1;
2278 }
2279 }
2280 END_GUARD
2281
2282 stack_destroy(&compiler_common.stack);
2283 stack_destroy(&compiler_common.depth);
2284 SLJIT_FREE(compiler_common.dfa_transitions, NULL);
2285 SLJIT_FREE(compiler_common.search_states, NULL);
2286 if (compiler_common.range_jump_list)
2287 SLJIT_FREE(compiler_common.range_jump_list, NULL);
2288 if (compiler_common.compiler)
2289 sljit_free_compiler(compiler_common.compiler);
2290 if (done)
2291 return compiler_common.machine;
2292
2293 if (compiler_common.machine) {
2294 SLJIT_FREE(compiler_common.machine, NULL);
2295 }
2296 if (error)
2297 *error = REGEX_MEMORY_ERROR;
2298 return NULL;
2299 }
2300
2301 #undef TERM_OFFSET_OF
2302 #undef EMIT_OP1
2303 #undef EMIT_OP2
2304 #undef EMIT_LABEL
2305 #undef EMIT_JUMP
2306 #undef EMIT_CMP
2307 #undef BEGIN_GUARD
2308 #undef END_GUARD
2309 #undef CHECK
2310
regex_free_machine(struct regex_machine * machine)2311 void regex_free_machine(struct regex_machine *machine)
2312 {
2313 sljit_free_code(machine->continue_match);
2314 SLJIT_FREE(machine, NULL);
2315 }
2316
regex_get_platform_name(void)2317 const char* regex_get_platform_name(void)
2318 {
2319 return sljit_get_platform_name();
2320 }
2321
2322 /* --------------------------------------------------------------------- */
2323 /* Mathching utilities */
2324 /* --------------------------------------------------------------------- */
2325
regex_begin_match(struct regex_machine * machine)2326 struct regex_match* regex_begin_match(struct regex_machine *machine)
2327 {
2328 sljit_sw *ptr1;
2329 sljit_sw *ptr2;
2330 sljit_sw *end;
2331 sljit_sw *entry_addrs;
2332
2333 struct regex_match *match = (struct regex_match*)SLJIT_MALLOC(sizeof(struct regex_match) + (machine->size * 2 - 1) * sizeof(sljit_sw), NULL);
2334 if (!match)
2335 return NULL;
2336
2337 ptr1 = match->states;
2338 ptr2 = match->states + machine->size;
2339 end = ptr2;
2340 entry_addrs = (sljit_sw*)machine->entry_addrs;
2341
2342 match->current = ptr1;
2343 match->next = ptr2;
2344 match->head = 0;
2345 match->machine = machine;
2346
2347 /* Init machine states. */
2348 switch (machine->no_states) {
2349 case 2:
2350 while (ptr1 < end) {
2351 *ptr1++ = *entry_addrs;
2352 *ptr2++ = *entry_addrs++;
2353 *ptr1++ = -1;
2354 *ptr2++ = -1;
2355 }
2356 break;
2357
2358 case 3:
2359 while (ptr1 < end) {
2360 *ptr1++ = *entry_addrs;
2361 *ptr2++ = *entry_addrs++;
2362 *ptr1++ = -1;
2363 *ptr2++ = -1;
2364 *ptr1++ = 0;
2365 *ptr2++ = 0;
2366 }
2367 break;
2368
2369 case 4:
2370 while (ptr1 < end) {
2371 *ptr1++ = *entry_addrs;
2372 *ptr2++ = *entry_addrs++;
2373 *ptr1++ = -1;
2374 *ptr2++ = -1;
2375 *ptr1++ = 0;
2376 *ptr2++ = 0;
2377 *ptr1++ = 0;
2378 *ptr2++ = 0;
2379 }
2380 break;
2381
2382 default:
2383 SLJIT_UNREACHABLE();
2384 break;
2385 }
2386
2387 SLJIT_ASSERT(ptr1 == end);
2388
2389 match->u.continue_match = machine->continue_match;
2390
2391 regex_reset_match(match);
2392 return match;
2393 }
2394
regex_reset_match(struct regex_match * match)2395 void regex_reset_match(struct regex_match *match)
2396 {
2397 struct regex_machine *machine = match->machine;
2398 sljit_sw current, ind;
2399 sljit_sw *current_ptr;
2400
2401 match->best_end = 0;
2402 match->fast_quit = 0;
2403 match->fast_forward = 0;
2404
2405 if (match->head != 0) {
2406 /* Clear the current state. */
2407 current = match->head;
2408 current_ptr = match->current;
2409 do {
2410 ind = (current / sizeof(sljit_sw)) + 1;
2411 current = current_ptr[ind];
2412 current_ptr[ind] = -1;
2413 } while (current != 0);
2414 }
2415 match->head = machine->u.call_init(match->current, match);
2416 }
2417
regex_free_match(struct regex_match * match)2418 void regex_free_match(struct regex_match *match)
2419 {
2420 SLJIT_FREE(match, NULL);
2421 }
2422
regex_continue_match(struct regex_match * match,const regex_char_t * input_string,int length)2423 void regex_continue_match(struct regex_match *match, const regex_char_t *input_string, int length)
2424 {
2425 match->u.call_continue(match, input_string, length);
2426 }
2427
regex_get_result(struct regex_match * match,int * end,int * id)2428 int regex_get_result(struct regex_match *match, int *end, int *id)
2429 {
2430 int flags = match->machine->flags;
2431 sljit_sw no_states;
2432
2433 *end = match->best_end;
2434 *id = match->best_id;
2435 if (!(flags & (REGEX_MATCH_END | REGEX_FAKE_MATCH_END)))
2436 return match->best_begin;
2437
2438 if (flags & REGEX_FAKE_MATCH_END) {
2439 SLJIT_ASSERT(!(flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END)));
2440 if (match->best_begin != -1)
2441 return match->best_begin;
2442
2443 no_states = match->machine->no_states;
2444 if (match->current[no_states + 1] == -1)
2445 return -1;
2446 if (flags & REGEX_ID_CHECK)
2447 *id = match->current[no_states + 3];
2448 if (!(flags & REGEX_FAKE_MATCH_BEGIN))
2449 *end = match->index - 1;
2450 else
2451 *end = match->index - 2;
2452 return match->current[no_states + 2];
2453 }
2454 else {
2455 /* Check the status of the last code. */
2456 if (!(flags & REGEX_MATCH_BEGIN)) {
2457 /* No shortcut in this case. */
2458 if (!(flags & REGEX_ID_CHECK)) {
2459 if (match->current[1] == -1)
2460 return -1;
2461 *end = match->index - 1;
2462 return match->current[2];
2463 }
2464
2465 if (match->current[1] == -1)
2466 return -1;
2467 *end = match->index - 1;
2468 *id = match->current[3];
2469 return match->current[2];
2470 }
2471
2472 /* Shortcut is possible in this case. */
2473 if (!(flags & REGEX_ID_CHECK)) {
2474 if (match->current[1] == -1 || match->head == -1)
2475 return -1;
2476 *end = match->index - 1;
2477 return 0;
2478 }
2479
2480 if (match->current[1] == -1 || match->head == -1)
2481 return -1;
2482 *end = match->index - 1;
2483 *id = match->current[2];
2484 return 0;
2485 }
2486 }
2487
regex_is_match_finished(struct regex_match * match)2488 int regex_is_match_finished(struct regex_match *match)
2489 {
2490 return match->fast_quit;
2491 }
2492
2493 #ifdef REGEX_MATCH_VERBOSE
regex_continue_match_debug(struct regex_match * match,const regex_char_t * input_string,int length)2494 void regex_continue_match_debug(struct regex_match *match, const regex_char_t *input_string, int length)
2495 {
2496 sljit_sw *ptr;
2497 sljit_sw *end;
2498 sljit_sw count;
2499 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2500 sljit_sw current;
2501 #endif
2502 sljit_sw no_states = match->machine->no_states;
2503 sljit_sw len = match->machine->size;
2504
2505 while (length > 0) {
2506 match->u.call_continue(match, input_string, 1);
2507
2508 if (match->fast_forward) {
2509 if (match->machine->flags & REGEX_MATCH_VERBOSE)
2510 printf("fast forward\n");
2511 }
2512
2513 /* Verbose (first). */
2514 if (match->machine->flags & REGEX_MATCH_VERBOSE) {
2515 ptr = match->current;
2516 end = ptr + len;
2517 count = 0;
2518 printf("'%c' (%3ld->%3ld [%3ld]) ", *input_string, (long)match->best_begin, (long)match->best_end, (long)match->best_id);
2519 while (ptr < end) {
2520 printf("[%3ld:", (long)count++);
2521 switch (no_states) {
2522 case 2:
2523 if (ptr[1] != -1)
2524 printf("+] ");
2525 else
2526 printf(" ] ");
2527 break;
2528
2529 case 3:
2530 if (ptr[1] != -1)
2531 printf("+,%3ld] ", (long)ptr[2]);
2532 else
2533 printf(" ,XXX] ");
2534 break;
2535
2536 case 4:
2537 if (ptr[1] != -1)
2538 printf("+,%3ld,%3ld] ", (long)ptr[2], (long)ptr[3]);
2539 else
2540 printf(" ,XXX,XXX] ");
2541 break;
2542 }
2543 ptr += no_states;
2544 }
2545 printf("\n");
2546 }
2547
2548 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2549 /* Sanity check (later). */
2550 ptr = match->next;
2551 end = ptr + len;
2552 while (ptr < end) {
2553 SLJIT_ASSERT(ptr[1] == -1);
2554 ptr += no_states;
2555 }
2556
2557 /* Check number of active elements. */
2558 ptr = match->current + no_states;
2559 end = ptr + len - no_states;
2560 count = 0;
2561 while (ptr < end) {
2562 if (ptr[1] != -1)
2563 count++;
2564 ptr += no_states;
2565 }
2566
2567 /* Check chain list. */
2568 current = match->head;
2569 ptr = match->current;
2570 while (current != 0) {
2571 SLJIT_ASSERT(current >= 0 && current < len * sizeof(sljit_sw));
2572 SLJIT_ASSERT((current % (no_states * sizeof(sljit_sw))) == 0);
2573 SLJIT_ASSERT(count > 0);
2574 current = ptr[(current / sizeof(sljit_sw)) + 1];
2575 count--;
2576 }
2577 SLJIT_ASSERT(count == 0);
2578 #endif
2579
2580 if (match->fast_quit) {
2581 /* the machine has stopped working. */
2582 if (match->machine->flags & REGEX_MATCH_VERBOSE)
2583 printf("Best match has found\n");
2584 break;
2585 }
2586
2587 input_string++;
2588 length--;
2589 }
2590 }
2591 #endif
2592