1 /*
2  *    Stack-less Just-In-Time compiler
3  *
4  *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without modification, are
7  * permitted provided that the following conditions are met:
8  *
9  *   1. Redistributions of source code must retain the above copyright notice, this list of
10  *      conditions and the following disclaimer.
11  *
12  *   2. Redistributions in binary form must reproduce the above copyright notice, this list
13  *      of conditions and the following disclaimer in the documentation and/or other materials
14  *      provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
19  * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
21  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
24  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "sljitLir.h"
28 #include "regexJIT.h"
29 
30 #include <stdlib.h>
31 
32 #ifdef REGEX_MATCH_VERBOSE
33 #include <stdio.h>
34 #endif
35 
36 /* Extra, hidden flags:
37    {id!} where id > 0 found in the code. */
38 #define REGEX_ID_CHECK		0x100
39 /* When REGEX_NEWLINE && REGEX_MATCH_BEGIN defined, the pattern turn to a normal search,
40    which starts with [\r\n] character range. */
41 #define REGEX_FAKE_MATCH_BEGIN	0x200
42 /* When REGEX_NEWLINE && REGEX_MATCH_END defined, the pattern turn to a normal search,
43    which ends with [\r\n] character range. */
44 #define REGEX_FAKE_MATCH_END	0x400
45 
46 /* Check match completition after every (FINISH_TEST + 1) steps. */
47 #define FINISH_TEST	0x7
48 
49 /* --------------------------------------------------------------------- */
50 /*  Structures for JIT-ed pattern matching                               */
51 /* --------------------------------------------------------------------- */
52 
53 struct regex_machine
54 {
55 	/* flags. */
56 	int flags;
57 	/* Number of state descriptors for one term. */
58 	sljit_sw no_states;
59 	/* Total size. */
60 	sljit_sw size;
61 
62 	union {
63 		void *init_match;
64 		sljit_sw (SLJIT_CALL *call_init)(void *next, void* match);
65 	} u;
66 #if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL)
67 	struct sljit_function_context context;
68 #endif
69 
70 	void *continue_match;
71 
72 	/* Variable sized array to contain the handler addresses. */
73 	sljit_uw entry_addrs[1];
74 };
75 
76 struct regex_match
77 {
78 	/* Current and next state array. */
79 	sljit_sw *current;
80 	sljit_sw *next;
81 	/* Starting. */
82 	sljit_sw head;
83 	/* String character index (ever increasing). */
84 	sljit_sw index;
85 	/* Best match found so far (members in priority order). */
86 	sljit_sw best_begin;
87 	sljit_sw best_end;
88 	sljit_sw best_id;
89 	/* Bool flags (encoded as word). */
90 	sljit_sw fast_quit;
91 	sljit_sw fast_forward;
92 	/* Machine. */
93 	struct regex_machine *machine;
94 
95 	union {
96 		void *continue_match;
97 		void (SLJIT_CALL *call_continue)(struct regex_match *match, const regex_char_t *input_string, int length);
98 	} u;
99 
100 	/* Variable sized array to contain the state arrays. */
101 	sljit_sw states[1];
102 };
103 
104 /* State vector
105     ITEM[0] - pointer to the address inside the machine code
106     ITEM[1] - next pointer
107     ITEM[2] - string started from (optional)
108     ITEM[3] - max ID (optional) */
109 
110 /* Register allocation. */
111 /* Current state array (loaded & stored: regex_match->current). */
112 #define R_CURR_STATE	SLJIT_S0
113 /* Next state array (loaded & stored: regex_match->next). */
114 #define R_NEXT_STATE	SLJIT_S1
115 /* Head (loaded & stored: regex_match->head). */
116 #define R_NEXT_HEAD	SLJIT_S2
117 /* String fragment pointer. */
118 #define R_STRING	SLJIT_S3
119 /* String fragment length. */
120 #define R_LENGTH	SLJIT_S4
121 /* 'struct regex_match*' */
122 #define R_REGEX_MATCH	SLJIT_R0
123 /* Current character. */
124 #define R_CURR_CHAR	SLJIT_R1
125 /* Temporary register. */
126 #define R_TEMP		SLJIT_R2
127 /* Caches the regex_match->best_begin. */
128 #define R_BEST_BEGIN	SLJIT_R3
129 /* Current character index. */
130 #define R_CURR_INDEX	SLJIT_R4
131 
132 /* --------------------------------------------------------------------- */
133 /*  Stack management                                                     */
134 /* --------------------------------------------------------------------- */
135 
136 /* Try to allocate 2^n blocks. */
137 #define STACK_FRAGMENT_SIZE (((64 * sizeof(struct stack_item)) - (sizeof(struct stack_fragment_data))) / (sizeof(struct stack_item)))
138 
139 struct stack_item {
140 	int type;
141 	int value;
142 };
143 
144 struct stack_fragment_data {
145 	struct stack_fragment *next;
146 	struct stack_fragment *prev;
147 };
148 
149 struct stack_fragment {
150 	struct stack_fragment_data data;
151 	struct stack_item items[STACK_FRAGMENT_SIZE];
152 };
153 
154 struct stack {
155 	struct stack_fragment *first;
156 	struct stack_fragment *last;
157 	int index;
158 	int count;
159 };
160 
161 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
162 
stack_check(struct stack * stack)163 static void stack_check(struct stack *stack)
164 {
165 	struct stack_fragment *curr;
166 	int found;
167 
168 	if (!stack)
169 		return;
170 
171 	SLJIT_ASSERT(stack->index >= 0 && stack->index < STACK_FRAGMENT_SIZE);
172 
173 	if (stack->first == NULL) {
174 		SLJIT_ASSERT(stack->first == NULL && stack->last == NULL);
175 		SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
176 		return;
177 	}
178 
179 	found = 0;
180 	if (stack->last == NULL) {
181 		SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
182 		found = 1;
183 	}
184 	else
185 		SLJIT_ASSERT(stack->index >= 0 && stack->count >= 0);
186 
187 	SLJIT_ASSERT(stack->first->data.prev == NULL);
188 	curr = stack->first;
189 	while (curr) {
190 		if (curr == stack->last)
191 			found = 1;
192 		if (curr->data.next)
193 			SLJIT_ASSERT(curr->data.next->data.prev == curr);
194 		curr = curr->data.next;
195 	}
196 	SLJIT_ASSERT(found);
197 }
198 
199 #endif
200 
stack_init(struct stack * stack)201 static void stack_init(struct stack *stack)
202 {
203 	stack->first = NULL;
204 	stack->last = NULL;
205 	stack->index = STACK_FRAGMENT_SIZE - 1;
206 	stack->count = 0;
207 }
208 
stack_destroy(struct stack * stack)209 static void stack_destroy(struct stack *stack)
210 {
211 	struct stack_fragment *curr = stack->first;
212 	struct stack_fragment *prev;
213 
214 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
215 	stack_check(stack);
216 #endif
217 
218 	while (curr) {
219 		prev = curr;
220 		curr = curr->data.next;
221 		SLJIT_FREE(prev, NULL);
222 	}
223 }
224 
stack_top(struct stack * stack)225 static SLJIT_INLINE struct stack_item* stack_top(struct stack *stack)
226 {
227 	SLJIT_ASSERT(stack->last);
228 	return stack->last->items + stack->index;
229 }
230 
stack_push(struct stack * stack,int type,int value)231 static int stack_push(struct stack *stack, int type, int value)
232 {
233 	if (stack->last) {
234 		stack->index++;
235 		if (stack->index >= STACK_FRAGMENT_SIZE) {
236 			stack->index = 0;
237 			if (!stack->last->data.next) {
238 				stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
239 				if (!stack->last->data.next)
240 					return 1;
241 				stack->last->data.next->data.next = NULL;
242 				stack->last->data.next->data.prev = stack->last;
243 			}
244 			stack->last = stack->last->data.next;
245 		}
246 	}
247 	else if (!stack->first) {
248 		stack->last = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
249 		if (!stack->last)
250 			return 1;
251 		stack->last->data.prev = NULL;
252 		stack->last->data.next = NULL;
253 		stack->first = stack->last;
254 		stack->index = 0;
255 	}
256 	else {
257 		stack->last = stack->first;
258 		stack->index = 0;
259 	}
260 	stack->last->items[stack->index].type = type;
261 	stack->last->items[stack->index].value = value;
262 	stack->count++;
263 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
264 	stack_check(stack);
265 #endif
266 	return 0;
267 }
268 
stack_pop(struct stack * stack)269 static struct stack_item* stack_pop(struct stack *stack)
270 {
271 	struct stack_item *ret = stack_top(stack);
272 
273 	if (stack->index > 0)
274 		stack->index--;
275 	else {
276 		stack->last = stack->last->data.prev;
277 		stack->index = STACK_FRAGMENT_SIZE - 1;
278 	}
279 
280 	stack->count--;
281 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
282 	stack_check(stack);
283 #endif
284 	return ret;
285 }
286 
stack_clone(struct stack * src,struct stack * dst)287 static SLJIT_INLINE void stack_clone(struct stack *src, struct stack *dst)
288 {
289 	*dst = *src;
290 }
291 
stack_push_copy(struct stack * stack,int items,int length)292 static int stack_push_copy(struct stack *stack, int items, int length)
293 {
294 	struct stack_fragment *frag1;
295 	int ind1;
296 	struct stack_fragment *frag2;
297 	int ind2;
298 	int counter;
299 
300 	SLJIT_ASSERT(stack->count >= length && items <= length && items > 0);
301 
302 	/* Allocate the necessary elements. */
303 	counter = items;
304 	frag1 = stack->last;
305 	ind1 = stack->index;
306 	while (counter > 0) {
307 		if (stack->index + counter >= STACK_FRAGMENT_SIZE) {
308 			counter -= STACK_FRAGMENT_SIZE - stack->index - 1 + 1;
309 			stack->index = 0;
310 			if (!stack->last->data.next) {
311 				stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
312 				if (!stack->last->data.next)
313 					return 1;
314 				stack->last->data.next->data.next = NULL;
315 				stack->last->data.next->data.prev = stack->last;
316 			}
317 			stack->last = stack->last->data.next;
318 		}
319 		else {
320 			stack->index += counter;
321 			counter = 0;
322 		}
323 	}
324 
325 	frag2 = stack->last;
326 	ind2 = stack->index;
327 	while (length > 0) {
328 		frag2->items[ind2--] = frag1->items[ind1--];
329 		if (ind1 < 0) {
330 			ind1 = STACK_FRAGMENT_SIZE - 1;
331 			frag1 = frag1->data.prev;
332 		}
333 		if (ind2 < 0) {
334 			ind2 = STACK_FRAGMENT_SIZE - 1;
335 			frag2 = frag2->data.prev;
336 		}
337 		length--;
338 	}
339 
340 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
341 	stack_check(stack);
342 #endif
343 	stack->count += items;
344 	return 0;
345 }
346 
347 /* --------------------------------------------------------------------- */
348 /*  Parser                                                               */
349 /* --------------------------------------------------------------------- */
350 
351 enum {
352 	/* Common. */
353 	type_begin,
354 	type_end,
355 	type_char,
356 	type_newline,
357 	type_id,
358 	type_rng_start,
359 	type_rng_end,
360 	type_rng_char,
361 	type_rng_left,
362 	type_rng_right,
363 
364 	/* generator only. */
365 	type_branch,
366 	type_jump,
367 
368 	/* Parser only. */
369 	type_open_br,
370 	type_close_br,
371 	type_select,
372 	type_asterisk,
373 	type_plus_sign,
374 	type_qestion_mark
375 };
376 
377 struct compiler_common {
378 	/* Temporary stacks. */
379 	struct stack stack;
380 	struct stack depth;
381 	/* REGEX_ flags. */
382 	int flags;
383 	/* Encoded size of the dfa representation. */
384 	sljit_sw dfa_size;
385 	/* Number of terms. */
386 	sljit_sw terms_size;
387 	/* Number of state descriptors for one term (same as machine->no_states). */
388 	sljit_sw no_states;
389 	/* Number of type_rng_(char|left)-s in the longest character range. */
390 	sljit_sw longest_range_size;
391 
392 	/* DFA linear representation (size: dfa_size). */
393 	struct stack_item *dfa_transitions;
394 	/* Term id and search state pairs (size: dfa_size). */
395 	struct stack_item *search_states;
396 
397 	/* sljit compiler */
398 	struct sljit_compiler *compiler;
399 	/* Machine data, which must be kept for later use. */
400 	struct regex_machine *machine;
401 	/* Temporary space for jumps (size: longest_range_size). */
402 	struct sljit_jump **range_jump_list;
403 };
404 
decode_number(const regex_char_t * regex_string,int length,int * result)405 static const regex_char_t* decode_number(const regex_char_t *regex_string, int length, int *result)
406 {
407 	int value = 0;
408 
409 	SLJIT_ASSERT(length > 0);
410 	if (*regex_string < '0' || *regex_string > '9') {
411 		*result = -1;
412 		return regex_string;
413 	}
414 
415 	while (length > 0 && *regex_string >= '0' && *regex_string <= '9') {
416 		value = value * 10 + (*regex_string - '0');
417 		length--;
418 		regex_string++;
419 	}
420 
421 	*result = value;
422 	return regex_string;
423 }
424 
iterate(struct stack * stack,int min,int max)425 static int iterate(struct stack *stack, int min, int max)
426 {
427 	struct stack it;
428 	struct stack_item *item;
429 	int count = -1;
430 	int len = 0;
431 	int depth = 0;
432 
433 	stack_clone(stack, &it);
434 
435 	/* Calculate size. */
436 	while (count < 0) {
437 		item = stack_pop(&it);
438 		switch (item->type) {
439 		case type_id:
440 		case type_rng_end:
441 		case type_rng_char:
442 		case type_rng_left:
443 		case type_rng_right:
444 		case type_plus_sign:
445 		case type_qestion_mark:
446 			len++;
447 			break;
448 
449 		case type_asterisk:
450 			len += 2;
451 			break;
452 
453 		case type_close_br:
454 			depth++;
455 			break;
456 
457 		case type_open_br:
458 			SLJIT_ASSERT(depth > 0);
459 			depth--;
460 			if (depth == 0)
461 				count = it.count;
462 			break;
463 
464 		case type_select:
465 			SLJIT_ASSERT(depth > 0);
466 			len += 2;
467 			break;
468 
469 		default:
470 			SLJIT_ASSERT(item->type != type_begin && item->type != type_end);
471 			if (depth == 0)
472 				count = it.count;
473 			len++;
474 			break;
475 		}
476 	}
477 
478 	if (min == 0 && max == 0) {
479 		/* {0,0} case, not {0,} case: delete subtree. */
480 		stack_clone(&it, stack);
481 		/* and put an empty bracket expression instead of it. */
482 		if (stack_push(stack, type_open_br, 0))
483 			return REGEX_MEMORY_ERROR;
484 		if (stack_push(stack, type_close_br, 0))
485 			return REGEX_MEMORY_ERROR;
486 		return len;
487 	}
488 
489 	count = stack->count - count;
490 
491 	/* Put an open bracket before the sequence. */
492 	if (stack_push_copy(stack, 1, count))
493 		return -1;
494 
495 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
496 	SLJIT_ASSERT(stack_push(&it, type_open_br, 0) == 0);
497 #else
498 	stack_push(&it, type_open_br, 0);
499 #endif
500 
501 	/* Copy the data. */
502 	if (max > 0) {
503 		len = len * (max - 1);
504 		max -= min;
505 		/* Insert ? operators. */
506 		len += max;
507 
508 		if (min > 0) {
509 			min--;
510 			while (min > 0) {
511 				if (stack_push_copy(stack, count, count))
512 					return -1;
513 				min--;
514 			}
515 			if (max > 0) {
516 				if (stack_push_copy(stack, count, count))
517 					return -1;
518 				if (stack_push(stack, type_qestion_mark, 0))
519 					return REGEX_MEMORY_ERROR;
520 				count++;
521 				max--;
522 			}
523 		}
524 		else {
525 			SLJIT_ASSERT(max > 0);
526 			max--;
527 			count++;
528 			if (stack_push(stack, type_qestion_mark, 0))
529 				return REGEX_MEMORY_ERROR;
530 		}
531 
532 		while (max > 0) {
533 			if (stack_push_copy(stack, count, count))
534 				return -1;
535 			max--;
536 		}
537 	}
538 	else {
539 		SLJIT_ASSERT(min > 0);
540 		min--;
541 		/* Insert + operator. */
542 		len = len * min + 1;
543 		while (min > 0) {
544 			if (stack_push_copy(stack, count, count))
545 				return -1;
546 			min--;
547 		}
548 
549 		if (stack_push(stack, type_plus_sign, 0))
550 			return REGEX_MEMORY_ERROR;
551 	}
552 
553 	/* Close the opened bracket. */
554 	if (stack_push(stack, type_close_br, 0))
555 		return REGEX_MEMORY_ERROR;
556 
557 	return len;
558 }
559 
parse_iterator(const regex_char_t * regex_string,int length,struct stack * stack,sljit_sw * dfa_size,int begin)560 static int parse_iterator(const regex_char_t *regex_string, int length, struct stack *stack, sljit_sw *dfa_size, int begin)
561 {
562 	/* We only know that *regex_string == { . */
563 	int val1, val2;
564 	const regex_char_t *base_from = regex_string;
565 	const regex_char_t *from;
566 
567 	length--;
568 	regex_string++;
569 
570 	/* Decode left value. */
571 	val2 = -1;
572 	if (length == 0)
573 		return -2;
574 	if (*regex_string == ',') {
575 		val1 = 0;
576 		length--;
577 		regex_string++;
578 	}
579 	else {
580 		from = regex_string;
581 		regex_string = decode_number(regex_string, length, &val1);
582 		if (val1 < 0)
583 			return -2;
584 		length -= regex_string - from;
585 
586 		if (length == 0)
587 			return -2;
588 		if (*regex_string == '}') {
589 			val2 = val1;
590 			if (val1 == 0)
591 				val1 = -1;
592 		}
593 		else if (length >= 2 && *regex_string == '!' && regex_string[1] == '}') {
594 			/* Non posix extension. */
595 			if (stack_push(stack, type_id, val1))
596 				return -1;
597 			(*dfa_size)++;
598 			return (regex_string - base_from) + 1;
599 		}
600 		else {
601 			if (*regex_string != ',')
602 				return -2;
603 			length--;
604 			regex_string++;
605 		}
606 	}
607 
608 	if (begin)
609 		return -2;
610 
611 	/* Decode right value. */
612 	if (val2 == -1) {
613 		if (length == 0)
614 			return -2;
615 		if (*regex_string == '}')
616 			val2 = 0;
617 		else {
618 			from = regex_string;
619 			regex_string = decode_number(regex_string, length, &val2);
620 			length -= regex_string - from;
621 			if (val2 < 0 || length == 0 || *regex_string != '}' || val2 < val1)
622 				return -2;
623 			if (val2 == 0) {
624 				SLJIT_ASSERT(val1 == 0);
625 				val1 = -1;
626 			}
627 		}
628 	}
629 
630 	/* Fast cases. */
631 	if (val1 > 1 || val2 > 1) {
632 		val1 = iterate(stack, val1, val2);
633 		if (val1 < 0)
634 			return -1;
635 		*dfa_size += val1;
636 	}
637 	else if (val1 == 0 && val2 == 0) {
638 		if (stack_push(stack, type_asterisk, 0))
639 			return -1;
640 		*dfa_size += 2;
641 	}
642 	else if (val1 == 1 && val2 == 0) {
643 		if (stack_push(stack, type_plus_sign, 0))
644 			return -1;
645 		(*dfa_size)++;
646 	}
647 	else if (val1 == 0 && val2 == 1) {
648 		if (stack_push(stack, type_qestion_mark, 0))
649 			return -1;
650 		(*dfa_size)++;
651 	}
652 	else if (val1 == -1) {
653 		val1 = iterate(stack, 0, 0);
654 		if (val1 < 0)
655 			return -1;
656 		*dfa_size -= val1;
657 		SLJIT_ASSERT(*dfa_size >= 2);
658 	}
659 	else {
660 		/* Ignore. */
661 		SLJIT_ASSERT(val1 == 1 && val2 == 1);
662 	}
663 	return regex_string - base_from;
664 }
665 
parse_char_range(const regex_char_t * regex_string,int length,struct compiler_common * compiler_common)666 static int parse_char_range(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
667 {
668 	struct stack* stack = &compiler_common->stack;
669 	const regex_char_t *base_from = regex_string;
670 	regex_char_t left_char, right_char, tmp_char;
671 
672 	length--;
673 	regex_string++;
674 
675 	if (length == 0)
676 		return -2;
677 
678 	if (*regex_string != '^') {
679 		if (stack_push(stack, type_rng_start, 0))
680 			return -1;
681 	}
682 	else {
683 		length--;
684 		regex_string++;
685 
686 		if (length == 0)
687 			return -2;
688 
689 		if (stack_push(stack, type_rng_start, 1))
690 			return -1;
691 	}
692 	/* For both the type_rng_start & type_rng_end. */
693 	compiler_common->dfa_size += 2;
694 
695 	/* Range must be at least 1 character. */
696 	if (*regex_string == ']') {
697 		length--;
698 		regex_string++;
699 		if (stack_push(stack, type_rng_char, ']'))
700 			return -1;
701 		compiler_common->dfa_size++;
702 	}
703 
704 	while (1) {
705 		if (length == 0)
706 			return -2;
707 
708 		if (*regex_string == ']')
709 			break;
710 
711 		if (*regex_string != '\\')
712 			left_char = *regex_string;
713 		else {
714 			regex_string++;
715 			length--;
716 			if (length == 0)
717 				return -2;
718 			left_char = *regex_string;
719 		}
720 		regex_string++;
721 		length--;
722 
723 		/* Is a range here? */
724 		if (length >= 3 && *regex_string == '-' && *(regex_string + 1) != ']') {
725 			regex_string++;
726 			length--;
727 
728 			if (*regex_string != '\\')
729 				right_char = *regex_string;
730 			else {
731 				regex_string++;
732 				length--;
733 				if (length == 0)
734 					return -2;
735 				right_char = *regex_string;
736 			}
737 			regex_string++;
738 			length--;
739 
740 			if (left_char > right_char) {
741 				/* Swap if necessary. */
742 				tmp_char = left_char;
743 				left_char = right_char;
744 				right_char = tmp_char;
745 			}
746 
747 			if (stack_push(stack, type_rng_left, left_char))
748 				return -1;
749 			if (stack_push(stack, type_rng_right, right_char))
750 				return -1;
751 			compiler_common->dfa_size += 2;
752 		}
753 		else {
754 			if (stack_push(stack, type_rng_char, left_char))
755 				return -1;
756 			compiler_common->dfa_size++;
757 		}
758 	}
759 
760 	if (stack_push(stack, type_rng_end, 0))
761 		return -1;
762 	return regex_string - base_from;
763 }
764 
parse(const regex_char_t * regex_string,int length,struct compiler_common * compiler_common)765 static int parse(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
766 {
767 	/* Depth of bracketed expressions. */
768 	int depth = 0;
769 	/* Have we already found a term? '1' if not yet. */
770 	int begin = 1;
771 	/* Cache stack pointer. */
772 	struct stack* stack = &compiler_common->stack;
773 	int tmp;
774 
775 	/* Type_begin and type_end. */
776 	compiler_common->dfa_size = 2;
777 	stack_init(stack);
778 	if (stack_push(stack, type_begin, 0))
779 		return REGEX_MEMORY_ERROR;
780 
781 	if (length > 0 && *regex_string == '^') {
782 		compiler_common->flags |= REGEX_MATCH_BEGIN;
783 		length--;
784 		regex_string++;
785 	}
786 
787 	if ((compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) == (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) {
788 		/* Replace REGEX_MATCH_BEGIN flag to REGEX_FAKE_MATCH_BEGIN */
789 		compiler_common->flags &= ~REGEX_MATCH_BEGIN;
790 		compiler_common->flags |= REGEX_FAKE_MATCH_BEGIN;
791 		/* and append a new-line search. */
792 		if (stack_push(stack, type_newline, 0))
793 			return REGEX_MEMORY_ERROR;
794 		compiler_common->dfa_size++;
795 		/* Begin intentionally kept as 1. */
796 	}
797 
798 	while (length > 0) {
799 		switch (*regex_string) {
800 		case '\\' :
801 			length--;
802 			regex_string++;
803 			if (length == 0)
804 				return REGEX_INVALID_REGEX;
805 			if (stack_push(stack, type_char, *regex_string))
806 				return REGEX_MEMORY_ERROR;
807 			begin = 0;
808 			compiler_common->dfa_size++;
809 			break;
810 
811 		case '.' :
812 			if (stack_push(stack, type_rng_start, 1))
813 				return REGEX_MEMORY_ERROR;
814 			if (compiler_common->flags & REGEX_NEWLINE) {
815 				if (stack_push(stack, type_rng_char, '\n'))
816 					return REGEX_MEMORY_ERROR;
817 				if (stack_push(stack, type_rng_char, '\r'))
818 					return REGEX_MEMORY_ERROR;
819 				compiler_common->dfa_size += 2;
820 			}
821 			if (stack_push(stack, type_rng_end, 1))
822 				return REGEX_MEMORY_ERROR;
823 			begin = 0;
824 			compiler_common->dfa_size += 2;
825 			break;
826 
827 		case '(' :
828 			depth++;
829 			if (stack_push(stack, type_open_br, 0))
830 				return REGEX_MEMORY_ERROR;
831 			begin = 1;
832 			break;
833 
834 		case ')' :
835 			if (depth == 0)
836 				return REGEX_INVALID_REGEX;
837 			depth--;
838 			if (stack_push(stack, type_close_br, 0))
839 				return REGEX_MEMORY_ERROR;
840 			begin = 0;
841 			break;
842 
843 		case '|' :
844 			if (stack_push(stack, type_select, 0))
845 				return REGEX_MEMORY_ERROR;
846 			begin = 1;
847 			compiler_common->dfa_size += 2;
848 			break;
849 
850 		case '*' :
851 			if (begin)
852 				return REGEX_INVALID_REGEX;
853 			if (stack_push(stack, type_asterisk, 0))
854 				return REGEX_MEMORY_ERROR;
855 			compiler_common->dfa_size += 2;
856 			break;
857 
858 		case '?' :
859 		case '+' :
860 			if (begin)
861 				return REGEX_INVALID_REGEX;
862 			if (stack_push(stack, (*regex_string == '+') ? type_plus_sign : type_qestion_mark, 0))
863 				return REGEX_MEMORY_ERROR;
864 			compiler_common->dfa_size++;
865 			break;
866 
867 		case '{' :
868 			tmp = parse_iterator(regex_string, length, stack, &compiler_common->dfa_size, begin);
869 
870 			if (tmp >= 0) {
871 				length -= tmp;
872 				regex_string += tmp;
873 			}
874 			else if (tmp == -1)
875 				return REGEX_MEMORY_ERROR;
876 			else {
877 				/* Not a valid range expression. */
878 				SLJIT_ASSERT(tmp == -2);
879 				if (stack_push(stack, type_char, '{'))
880 					return REGEX_MEMORY_ERROR;
881 				compiler_common->dfa_size++;
882 			}
883 			break;
884 
885 		case '[' :
886 			tmp = parse_char_range(regex_string, length, compiler_common);
887 			if (tmp >= 0) {
888 				length -= tmp;
889 				regex_string += tmp;
890 			}
891 			else if (tmp == -1)
892 				return REGEX_MEMORY_ERROR;
893 			else {
894 				SLJIT_ASSERT(tmp == -2);
895 				return REGEX_INVALID_REGEX;
896 			}
897 			begin = 0;
898 			break;
899 
900 		default:
901 			if (length == 1 && *regex_string == '$') {
902 				compiler_common->flags |= REGEX_MATCH_END;
903 				break;
904 			}
905 			if (stack_push(stack, type_char, *regex_string))
906 				return REGEX_MEMORY_ERROR;
907 			begin = 0;
908 			compiler_common->dfa_size++;
909 			break;
910 		}
911 		length--;
912 		regex_string++;
913 	}
914 
915 	if (depth != 0)
916 		return REGEX_INVALID_REGEX;
917 
918 	if ((compiler_common->flags & (REGEX_MATCH_END | REGEX_NEWLINE)) == (REGEX_MATCH_END | REGEX_NEWLINE)) {
919 		/* Replace REGEX_MATCH_END flag to REGEX_FAKE_MATCH_END */
920 		compiler_common->flags &= ~REGEX_MATCH_END;
921 		compiler_common->flags |= REGEX_FAKE_MATCH_END;
922 		/* and append a new-line search. */
923 		if (stack_push(stack, type_newline, 1))
924 			return REGEX_MEMORY_ERROR;
925 		compiler_common->dfa_size++;
926 		/* Begin intentionally kept as 1. */
927 	}
928 
929 	if (stack_push(stack, type_end, 0))
930 		return REGEX_MEMORY_ERROR;
931 
932 	return REGEX_NO_ERROR;
933 }
934 
935 /* --------------------------------------------------------------------- */
936 /*  Generating machine state transitions                                 */
937 /* --------------------------------------------------------------------- */
938 
939 #define PUT_TRANSITION(typ, val) \
940 	do { \
941 		--transitions_ptr; \
942 		transitions_ptr->type = typ; \
943 		transitions_ptr->value = val; \
944 	} while (0)
945 
handle_iteratives(struct stack_item * transitions_ptr,struct stack_item * transitions,struct stack * depth)946 static struct stack_item* handle_iteratives(struct stack_item *transitions_ptr, struct stack_item *transitions, struct stack *depth)
947 {
948 	struct stack_item *item;
949 
950 	while (1) {
951 		item = stack_top(depth);
952 
953 		switch (item->type) {
954 		case type_asterisk:
955 			SLJIT_ASSERT(transitions[item->value].type == type_branch);
956 			transitions[item->value].value = transitions_ptr - transitions;
957 			PUT_TRANSITION(type_branch, item->value + 1);
958 			break;
959 
960 		case type_plus_sign:
961 			SLJIT_ASSERT(transitions[item->value].type == type_branch);
962 			transitions[item->value].value = transitions_ptr - transitions;
963 			break;
964 
965 		case type_qestion_mark:
966 			PUT_TRANSITION(type_branch, item->value);
967 			break;
968 
969 		default:
970 			return transitions_ptr;
971 		}
972 		stack_pop(depth);
973 	}
974 }
975 
generate_transitions(struct compiler_common * compiler_common)976 static int generate_transitions(struct compiler_common *compiler_common)
977 {
978 	struct stack *stack = &compiler_common->stack;
979 	struct stack *depth = &compiler_common->depth;
980 	struct stack_item *transitions_ptr;
981 	struct stack_item *item;
982 
983 	stack_init(depth);
984 	compiler_common->dfa_transitions = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL);
985 	if (!compiler_common->dfa_transitions)
986 		return REGEX_MEMORY_ERROR;
987 
988 	/* Go through the items of the stack and generate the necessary branches and jumps (edges of DFA). */
989 	transitions_ptr = compiler_common->dfa_transitions + compiler_common->dfa_size;
990 	while (stack->count > 0) {
991 		item = stack_pop(stack);
992 		switch (item->type) {
993 		case type_begin:
994 		case type_open_br:
995 			item = stack_pop(depth);
996 			if (item->type == type_select)
997 				PUT_TRANSITION(type_branch, item->value + 1);
998 			else
999 				SLJIT_ASSERT(item->type == type_close_br);
1000 			if (stack->count == 0)
1001 				PUT_TRANSITION(type_begin, 0);
1002 			else
1003 				transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
1004 			break;
1005 
1006 		case type_end:
1007 		case type_close_br:
1008 			if (item->type == type_end)
1009 				*--transitions_ptr = *item;
1010 			if (stack_push(depth, type_close_br, transitions_ptr - compiler_common->dfa_transitions))
1011 				return REGEX_MEMORY_ERROR;
1012 			break;
1013 
1014 		case type_select:
1015 			item = stack_top(depth);
1016 			if (item->type == type_select) {
1017 				SLJIT_ASSERT(compiler_common->dfa_transitions[item->value].type == type_jump);
1018 				PUT_TRANSITION(type_branch, item->value + 1);
1019 				PUT_TRANSITION(type_jump, item->value);
1020 				item->value = transitions_ptr - compiler_common->dfa_transitions;
1021 			}
1022 			else {
1023 				SLJIT_ASSERT(item->type == type_close_br);
1024 				item->type = type_select;
1025 				PUT_TRANSITION(type_jump, item->value);
1026 				item->value = transitions_ptr - compiler_common->dfa_transitions;
1027 			}
1028 			break;
1029 
1030 		case type_asterisk:
1031 		case type_plus_sign:
1032 		case type_qestion_mark:
1033 			if (item->type != type_qestion_mark)
1034 				PUT_TRANSITION(type_branch, 0);
1035 			if (stack_push(depth, item->type, transitions_ptr - compiler_common->dfa_transitions))
1036 				return REGEX_MEMORY_ERROR;
1037 			break;
1038 
1039 		case type_char:
1040 		case type_newline:
1041 		case type_rng_start:
1042 			/* Requires handle_iteratives. */
1043 			*--transitions_ptr = *item;
1044 			transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
1045 			break;
1046 
1047 		default:
1048 			*--transitions_ptr = *item;
1049 			break;
1050 		}
1051 	}
1052 
1053 	SLJIT_ASSERT(compiler_common->dfa_transitions == transitions_ptr);
1054 	SLJIT_ASSERT(depth->count == 0);
1055 	return REGEX_NO_ERROR;
1056 }
1057 
1058 #undef PUT_TRANSITION
1059 
1060 #ifdef REGEX_MATCH_VERBOSE
1061 
verbose_transitions(struct compiler_common * compiler_common)1062 static void verbose_transitions(struct compiler_common *compiler_common)
1063 {
1064 	struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
1065 	struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
1066 	struct stack_item *search_states_ptr = compiler_common->search_states;
1067 	int pos;
1068 
1069 	printf("-----------------\nTransitions\n-----------------\n");
1070 	pos = 0;
1071 	while (transitions_ptr < transitions_end) {
1072 		printf("[%3d] ", pos++);
1073 		if (search_states_ptr->type >= 0)
1074 			printf("(%3d) ", search_states_ptr->type);
1075 		switch (transitions_ptr->type) {
1076 		case type_begin:
1077 			printf("type_begin\n");
1078 			break;
1079 
1080 		case type_end:
1081 			printf("type_end\n");
1082 			break;
1083 
1084 		case type_char:
1085 			if (transitions_ptr->value >= ' ')
1086 				printf("type_char '%c'\n", transitions_ptr->value);
1087 			else
1088 				printf("type_char 0x%x\n", transitions_ptr->value);
1089 			break;
1090 
1091 		case type_newline:
1092 			printf("type_newline %s\n", transitions_ptr->value ? "(end)" : "(begin)");
1093 			break;
1094 
1095 		case type_id:
1096 			printf("type_id %d\n", transitions_ptr->value);
1097 			break;
1098 
1099 		case type_rng_start:
1100 			printf("type_rng_start %s\n", transitions_ptr->value ? "(invert)" : "(normal)");
1101 			break;
1102 
1103 		case type_rng_end:
1104 			printf("type_rng_end\n");
1105 			break;
1106 
1107 		case type_rng_char:
1108 			if (transitions_ptr->value >= ' ')
1109 				printf("type_rng_char '%c'\n", transitions_ptr->value);
1110 			else
1111 				printf("type_rng_char 0x%x\n", transitions_ptr->value);
1112 			break;
1113 
1114 		case type_rng_left:
1115 			if (transitions_ptr->value >= ' ')
1116 				printf("type_rng_left '%c'\n", transitions_ptr->value);
1117 			else
1118 				printf("type_rng_left 0x%x\n", transitions_ptr->value);
1119 			break;
1120 
1121 		case type_rng_right:
1122 			if (transitions_ptr->value >= ' ')
1123 				printf("type_rng_right '%c'\n", transitions_ptr->value);
1124 			else
1125 				printf("type_rng_right 0x%x\n", transitions_ptr->value);
1126 			break;
1127 
1128 		case type_branch:
1129 			printf("type_branch -> %d\n", transitions_ptr->value);
1130 			break;
1131 
1132 		case type_jump:
1133 			printf("type_jump -> %d\n", transitions_ptr->value);
1134 			break;
1135 
1136 		default:
1137 			printf("UNEXPECTED TYPE\n");
1138 			break;
1139 		}
1140 		transitions_ptr++;
1141 		search_states_ptr++;
1142 	}
1143 	printf("flags: ");
1144 	if (!(compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_NEWLINE | REGEX_ID_CHECK | REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)))
1145 		printf("none ");
1146 	if (compiler_common->flags & REGEX_MATCH_BEGIN)
1147 		printf("REGEX_MATCH_BEGIN ");
1148 	if (compiler_common->flags & REGEX_MATCH_END)
1149 		printf("REGEX_MATCH_END ");
1150 	if (compiler_common->flags & REGEX_NEWLINE)
1151 		printf("REGEX_NEWLINE ");
1152 	if (compiler_common->flags & REGEX_ID_CHECK)
1153 		printf("REGEX_ID_CHECK ");
1154 	if (compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)
1155 		printf("REGEX_FAKE_MATCH_BEGIN ");
1156 	if (compiler_common->flags & REGEX_FAKE_MATCH_END)
1157 		printf("REGEX_FAKE_MATCH_END ");
1158 	if (compiler_common->longest_range_size > 0)
1159 		printf("(longest range: %ld) ", (long)compiler_common->longest_range_size);
1160 	printf("\n");
1161 }
1162 
1163 #endif
1164 
1165 /* --------------------------------------------------------------------- */
1166 /*  Utilities                                                            */
1167 /* --------------------------------------------------------------------- */
1168 
generate_search_states(struct compiler_common * compiler_common)1169 static int generate_search_states(struct compiler_common *compiler_common)
1170 {
1171 	struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
1172 	struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
1173 	struct stack_item *search_states_ptr;
1174 	struct stack_item *rng_start = NULL;
1175 
1176 	compiler_common->terms_size = !(compiler_common->flags & REGEX_FAKE_MATCH_END) ? 1 : 2;
1177 	compiler_common->longest_range_size = 0;
1178 	compiler_common->search_states = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL);
1179 	if (!compiler_common->search_states)
1180 		return REGEX_MEMORY_ERROR;
1181 
1182 	search_states_ptr = compiler_common->search_states;
1183 	while (transitions_ptr < transitions_end) {
1184 		switch (transitions_ptr->type) {
1185 		case type_begin:
1186 		case type_end:
1187 			search_states_ptr->type = 0;
1188 			break;
1189 
1190 		case type_char:
1191 			search_states_ptr->type = compiler_common->terms_size++;
1192 			break;
1193 
1194 		case type_newline:
1195 			if (transitions_ptr->value)
1196 				search_states_ptr->type = 1;
1197 			else
1198 				search_states_ptr->type = compiler_common->terms_size++;
1199 			SLJIT_ASSERT(search_states_ptr->type == 1 || search_states_ptr->type == 2);
1200 			break;
1201 
1202 		case type_id:
1203 			if (transitions_ptr->value > 0)
1204 				compiler_common->flags |= REGEX_ID_CHECK;
1205 			search_states_ptr->type = -1;
1206 			break;
1207 
1208 		case type_rng_start:
1209 			search_states_ptr->type = compiler_common->terms_size;
1210 			rng_start = search_states_ptr;
1211 			break;
1212 
1213 		case type_rng_end:
1214 			search_states_ptr->type = compiler_common->terms_size++;
1215 			/* Ok, this is a blunt over estimation :) */
1216 			if (compiler_common->longest_range_size < search_states_ptr - rng_start)
1217 				compiler_common->longest_range_size = search_states_ptr - rng_start;
1218 			break;
1219 
1220 		default:
1221 			search_states_ptr->type = -1;
1222 			break;
1223 		}
1224 		search_states_ptr->value = -1;
1225 		search_states_ptr++;
1226 		transitions_ptr++;
1227 	}
1228 	return REGEX_NO_ERROR;
1229 }
1230 
trace_transitions(int from,struct compiler_common * compiler_common)1231 static int trace_transitions(int from, struct compiler_common *compiler_common)
1232 {
1233 	int id = 0;
1234 	struct stack *stack = &compiler_common->stack;
1235 	struct stack *depth = &compiler_common->depth;
1236 	struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1237 	struct stack_item *search_states = compiler_common->search_states;
1238 
1239 	SLJIT_ASSERT(search_states[from].type >= 0);
1240 
1241 	from++;
1242 
1243 	/* Be prepared for any paths (loops, etc). */
1244 	while (1) {
1245 		if (dfa_transitions[from].type == type_id)
1246 			if (id < dfa_transitions[from].value)
1247 				id = dfa_transitions[from].value;
1248 
1249 		if (search_states[from].value < id) {
1250 			/* Forward step. */
1251 			if (search_states[from].value == -1)
1252 				if (stack_push(stack, 0, from))
1253 					return REGEX_MEMORY_ERROR;
1254 			search_states[from].value = id;
1255 
1256 			if (dfa_transitions[from].type == type_branch) {
1257 				if (stack_push(depth, id, from))
1258 					return REGEX_MEMORY_ERROR;
1259 				from++;
1260 				continue;
1261 			}
1262 			else if (dfa_transitions[from].type == type_jump) {
1263 				from = dfa_transitions[from].value;
1264 				continue;
1265 			}
1266 			else if (search_states[from].type < 0) {
1267 				from++;
1268 				continue;
1269 			}
1270 		}
1271 
1272 		/* Back tracking. */
1273 		if (depth->count > 0) {
1274 			id = stack_top(depth)->type;
1275 			from = dfa_transitions[stack_pop(depth)->value].value;
1276 			continue;
1277 		}
1278 		return 0;
1279 	}
1280 }
1281 
1282 /* --------------------------------------------------------------------- */
1283 /*  Code generator                                                       */
1284 /* --------------------------------------------------------------------- */
1285 
1286 #define TERM_OFFSET_OF(index, offs)	(((index) * no_states + (offs)) * sizeof(sljit_sw))
1287 #define TERM_REL_OFFSET_OF(base, offs)	((base) + ((offs) * sizeof(sljit_sw)))
1288 
1289 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
1290 	CHECK(sljit_emit_op1(compiler, type, arg1, arg2, arg3, arg4))
1291 
1292 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
1293 	CHECK(sljit_emit_op2(compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
1294 
1295 #define EMIT_LABEL(label) \
1296 	label = sljit_emit_label(compiler); \
1297 	CHECK(!label)
1298 
1299 #define EMIT_JUMP(jump, type) \
1300 	jump = sljit_emit_jump(compiler, type); \
1301 	CHECK(!jump)
1302 
1303 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
1304 	jump = sljit_emit_cmp(compiler, type, arg1, arg2, arg3, arg4); \
1305 	CHECK(!jump)
1306 
1307 /* CHECK depends on the use case. */
1308 
1309 #define CHECK(exp) \
1310 	if (SLJIT_UNLIKELY(exp)) \
1311 		return REGEX_MEMORY_ERROR
1312 
compile_uncond_tran(struct compiler_common * compiler_common,int reg)1313 static int compile_uncond_tran(struct compiler_common *compiler_common, int reg)
1314 {
1315 	struct sljit_compiler *compiler = compiler_common->compiler;
1316 	struct stack *stack = &compiler_common->stack;
1317 	struct stack_item *search_states = compiler_common->search_states;
1318 	int flags = compiler_common->flags;
1319 	sljit_sw no_states = compiler_common->no_states;
1320 	sljit_uw head = 0;
1321 	sljit_sw offset, value;
1322 
1323 	if (reg != R_CURR_STATE || !(compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)) {
1324 		CHECK(trace_transitions(0, compiler_common));
1325 	}
1326 	else {
1327 		CHECK(trace_transitions(1, compiler_common));
1328 	}
1329 
1330 	while (stack->count > 0) {
1331 		value = stack_pop(stack)->value;
1332 		if (search_states[value].type >= 0) {
1333 			offset = TERM_OFFSET_OF(search_states[value].type, 0);
1334 			EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
1335 			if (offset > 0)
1336 				head = offset;
1337 
1338 			if (!(flags & REGEX_MATCH_BEGIN)) {
1339 				EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), R_TEMP, 0);
1340 				if (flags & REGEX_ID_CHECK) {
1341 					EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, search_states[value].value);
1342 				}
1343 			}
1344 			else if (flags & REGEX_ID_CHECK) {
1345 				EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, search_states[value].value);
1346 			}
1347 		}
1348 		search_states[value].value = -1;
1349 	}
1350 	if (reg == R_NEXT_STATE) {
1351 		EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
1352 	}
1353 	else if (flags & REGEX_FAKE_MATCH_BEGIN) {
1354 		SLJIT_ASSERT(compiler_common->dfa_transitions[1].type == type_newline && !compiler_common->dfa_transitions[1].value);
1355 		offset = TERM_OFFSET_OF(search_states[1].type, 0);
1356 
1357 		SLJIT_ASSERT(!(flags & REGEX_MATCH_BEGIN));
1358 
1359 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
1360 		head = offset;
1361 
1362 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, 1);
1363 		if (flags & REGEX_ID_CHECK) {
1364 			EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, 0);
1365 		}
1366 	}
1367 	EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, head);
1368 	return REGEX_NO_ERROR;
1369 }
1370 
compile_cond_tran(struct compiler_common * compiler_common,sljit_sw curr_index)1371 static int compile_cond_tran(struct compiler_common *compiler_common, sljit_sw curr_index)
1372 {
1373 	struct sljit_compiler *compiler = compiler_common->compiler;
1374 	struct stack *stack = &compiler_common->stack;
1375 	struct stack_item *search_states = compiler_common->search_states;
1376 	sljit_sw offset;
1377 	int flags;
1378 	sljit_sw no_states;
1379 	sljit_sw value;
1380 	struct sljit_jump *jump1;
1381 	struct sljit_jump *jump2;
1382 	struct sljit_jump *jump3;
1383 	struct sljit_jump *jump4;
1384 	struct sljit_jump *jump5;
1385 	struct sljit_label *label1;
1386 
1387 	flags = compiler_common->flags;
1388 	no_states = compiler_common->no_states;
1389 
1390 	EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
1391 	if (!(flags & (REGEX_ID_CHECK | REGEX_MATCH_BEGIN))) {
1392 		EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1393 	}
1394 
1395 	while (stack->count > 0) {
1396 		value = stack_pop(stack)->value;
1397 		if (search_states[value].type >= 0) {
1398 #ifdef REGEX_MATCH_VERBOSE
1399 			if (flags & REGEX_MATCH_VERBOSE)
1400 				printf("-> (%3d:%3d) ", search_states[value].type, search_states[value].value);
1401 #endif
1402 			offset = TERM_OFFSET_OF(search_states[value].type, 0);
1403 
1404 			if (!(flags & REGEX_ID_CHECK)) {
1405 				if (!(flags & REGEX_MATCH_BEGIN)) {
1406 					/* Check whether item is inserted. */
1407 					EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1408 					EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1409 					if (offset > 0) {
1410 						EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1411 					}
1412 					EMIT_JUMP(jump2, SLJIT_JUMP);
1413 
1414 					/* Check whether old index <= index. */
1415 					EMIT_LABEL(label1);
1416 					sljit_set_label(jump1, label1);
1417 
1418 					EMIT_CMP(jump1, SLJIT_LESS_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1419 
1420 					EMIT_LABEL(label1);
1421 					sljit_set_label(jump2, label1);
1422 					EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1423 
1424 					EMIT_LABEL(label1);
1425 					sljit_set_label(jump1, label1);
1426 				}
1427 				else {
1428 					/* Check whether item is inserted. */
1429 					EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1430 					EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1431 					if (offset > 0) {
1432 						EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1433 					}
1434 					EMIT_LABEL(label1);
1435 					sljit_set_label(jump1, label1);
1436 				}
1437 			}
1438 			else {
1439 				if (!(flags & REGEX_MATCH_BEGIN)) {
1440 					EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1441 
1442 					/* Check whether item is inserted. */
1443 					EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1444 					EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1445 					if (offset > 0) {
1446 						EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1447 					}
1448 					EMIT_JUMP(jump2, SLJIT_JUMP);
1449 
1450 					/* Check whether old index != index. */
1451 					EMIT_LABEL(label1);
1452 					sljit_set_label(jump1, label1);
1453 
1454 					EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_LESS, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1455 					EMIT_JUMP(jump1, SLJIT_LESS);
1456 					EMIT_JUMP(jump3, SLJIT_NOT_EQUAL); /* Greater. */
1457 
1458 					/* Old index == index. */
1459 					EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
1460 					if (search_states[value].value > 0) {
1461 						EMIT_CMP(jump4, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1462 
1463 						EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1464 						EMIT_LABEL(label1);
1465 						sljit_set_label(jump4, label1);
1466 					}
1467 
1468 					EMIT_OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0);
1469 					EMIT_JUMP(jump4, SLJIT_GREATER_EQUAL);
1470 					EMIT_JUMP(jump5, SLJIT_JUMP);
1471 
1472 					/* Overwrite index & id. */
1473 					EMIT_LABEL(label1);
1474 					sljit_set_label(jump3, label1);
1475 					sljit_set_label(jump2, label1);
1476 					EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1477 
1478 					EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
1479 					if (search_states[value].value > 0) {
1480 						EMIT_CMP(jump3, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1481 
1482 						EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1483 						EMIT_LABEL(label1);
1484 						sljit_set_label(jump3, label1);
1485 					}
1486 
1487 					EMIT_LABEL(label1);
1488 					sljit_set_label(jump5, label1);
1489 					EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0);
1490 
1491 					/* Exit. */
1492 					EMIT_LABEL(label1);
1493 					sljit_set_label(jump1, label1);
1494 					sljit_set_label(jump4, label1);
1495 				}
1496 				else {
1497 					EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
1498 
1499 					if (search_states[value].value > 0) {
1500 						EMIT_CMP(jump1, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1501 
1502 						EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
1503 						EMIT_LABEL(label1);
1504 						sljit_set_label(jump1, label1);
1505 					}
1506 
1507 					/* Check whether item is inserted. */
1508 					EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
1509 					EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
1510 					if (offset > 0) {
1511 						EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
1512 					}
1513 					EMIT_JUMP(jump2, SLJIT_JUMP);
1514 
1515 					/* Check whether old id >= id. */
1516 					EMIT_LABEL(label1);
1517 					sljit_set_label(jump1, label1);
1518 
1519 					EMIT_CMP(jump1, SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1520 
1521 					EMIT_LABEL(label1);
1522 					sljit_set_label(jump2, label1);
1523 					EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
1524 
1525 					EMIT_LABEL(label1);
1526 					sljit_set_label(jump1, label1);
1527 				}
1528 			}
1529 		}
1530 		search_states[value].value = -1;
1531 	}
1532 
1533 #ifdef REGEX_MATCH_VERBOSE
1534 	if (flags & REGEX_MATCH_VERBOSE)
1535 		printf("\n");
1536 #endif
1537 	return REGEX_NO_ERROR;
1538 }
1539 
compile_end_check(struct compiler_common * compiler_common,struct sljit_label * end_check_label)1540 static int compile_end_check(struct compiler_common *compiler_common, struct sljit_label *end_check_label)
1541 {
1542 	struct sljit_compiler *compiler = compiler_common->compiler;
1543 	struct sljit_jump *jump;
1544 	struct sljit_jump *clear_states_jump;
1545 	struct sljit_label *label;
1546 	struct sljit_label *leave_label;
1547 	struct sljit_label *begin_loop_label;
1548 
1549 	/* Priority order: best_begin > best_end > best_id.
1550 	   In other words:
1551 	       if (new best_begin > old test_begin) do nothing
1552 	       otherwise we know that new_end > old_end, since R_CURR_INDEX ever increasing
1553 	       therefore we must overwrite all best_* variables (new_id also contains the highest id for this turn). */
1554 
1555 	/* Both R_CURR_CHAR and R_BEST_BEGIN used as temporary registers. */
1556 
1557 	if (!(compiler_common->flags & REGEX_MATCH_BEGIN)) {
1558 		EMIT_OP1(SLJIT_MOV, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
1559 		EMIT_CMP(jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_LESS : SLJIT_LESS_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
1560 		sljit_set_label(jump, end_check_label);
1561 
1562 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
1563 		if (!(compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END))) {
1564 			EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
1565 		}
1566 		else {
1567 			if ((compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) == (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) {
1568 				EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 2);
1569 			}
1570 			else {
1571 				EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 1);
1572 			}
1573 		}
1574 		if (compiler_common->flags & REGEX_ID_CHECK) {
1575 			EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 3));
1576 		}
1577 
1578 		EMIT_CMP(clear_states_jump, SLJIT_LESS, R_CURR_CHAR, 0, R_BEST_BEGIN, 0);
1579 
1580 		EMIT_LABEL(leave_label);
1581 		EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, R_CURR_CHAR, 0);
1582 		EMIT_JUMP(jump, SLJIT_JUMP);
1583 		sljit_set_label(jump, end_check_label);
1584 
1585 		/* A loop to clear all states, which are > (or >=) than R_CURR_CHAR. */
1586 		EMIT_LABEL(label);
1587 		sljit_set_label(clear_states_jump, label);
1588 
1589 		EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
1590 		EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
1591 
1592 		/* Begin of the loop. */
1593 		EMIT_LABEL(begin_loop_label);
1594 		EMIT_CMP(jump, SLJIT_EQUAL, R_TEMP, 0, SLJIT_IMM, 0);
1595 		sljit_set_label(jump, leave_label);
1596 
1597 		EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, R_CURR_STATE, 0);
1598 		EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw));
1599 		EMIT_CMP(clear_states_jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_GREATER : SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_TEMP), 2 * sizeof(sljit_sw), R_CURR_CHAR, 0);
1600 
1601 		/* Case 1: keep this case. */
1602 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), R_NEXT_HEAD, 0);
1603 		EMIT_OP2(SLJIT_SUB, R_NEXT_HEAD, 0, R_TEMP, 0, R_CURR_STATE, 0);
1604 
1605 		EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
1606 		EMIT_JUMP(jump, SLJIT_JUMP);
1607 		sljit_set_label(jump, begin_loop_label);
1608 
1609 		/* Case 2: remove this case. */
1610 		EMIT_LABEL(label);
1611 		sljit_set_label(clear_states_jump, label);
1612 
1613 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), SLJIT_IMM, -1);
1614 
1615 		EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
1616 		EMIT_JUMP(jump, SLJIT_JUMP);
1617 		sljit_set_label(jump, begin_loop_label);
1618 	}
1619 	else {
1620 		EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_IMM, 0);
1621 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
1622 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
1623 		if (compiler_common->flags & REGEX_ID_CHECK) {
1624 			EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
1625 		}
1626 		EMIT_JUMP(jump, SLJIT_JUMP);
1627 		sljit_set_label(jump, end_check_label);
1628 	}
1629 	return REGEX_NO_ERROR;
1630 }
1631 
compile_leave_fast_forward(struct compiler_common * compiler_common,struct sljit_label * fast_forward_label)1632 static int compile_leave_fast_forward(struct compiler_common *compiler_common, struct sljit_label *fast_forward_label)
1633 {
1634 	struct sljit_compiler *compiler = compiler_common->compiler;
1635 	struct stack *stack = &compiler_common->stack;
1636 	struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1637 	struct stack_item *search_states = compiler_common->search_states;
1638 	int ind;
1639 	struct sljit_jump *jump;
1640 	int init_range = 1, prev_value = 0;
1641 
1642 	while (stack->count > 0) {
1643 		ind = stack_pop(stack)->value;
1644 		search_states[ind].value = -1;
1645 		if (search_states[ind].type >= 0) {
1646 			if (dfa_transitions[ind].type == type_char) {
1647 				EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1648 				sljit_set_label(jump, fast_forward_label);
1649 			}
1650 			else if (dfa_transitions[ind].type == type_rng_start) {
1651 				SLJIT_ASSERT(!dfa_transitions[ind].value);
1652 				ind++;
1653 				while (dfa_transitions[ind].type != type_rng_end) {
1654 					if (dfa_transitions[ind].type == type_rng_char) {
1655 						EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1656 						sljit_set_label(jump, fast_forward_label);
1657 					}
1658 					else {
1659 						SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
1660 						if (init_range) {
1661 							EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
1662 							init_range = 0;
1663 						}
1664 						if (dfa_transitions[ind].value != prev_value) {
1665 							/* Best compatibility to all archs. */
1666 							prev_value -= dfa_transitions[ind].value;
1667 							if (prev_value < 0) {
1668 								EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
1669 							}
1670 							else {
1671 								EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
1672 							}
1673 							prev_value = dfa_transitions[ind].value;
1674 						}
1675 						EMIT_CMP(jump, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
1676 						sljit_set_label(jump, fast_forward_label);
1677 						ind++;
1678 					}
1679 					ind++;
1680 				}
1681 			}
1682 			else {
1683 				SLJIT_ASSERT(dfa_transitions[ind].type == type_newline);
1684 				EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
1685 				sljit_set_label(jump, fast_forward_label);
1686 				EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
1687 				sljit_set_label(jump, fast_forward_label);
1688 			}
1689 		}
1690 	}
1691 	return REGEX_NO_ERROR;
1692 }
1693 
compile_newline_check(struct compiler_common * compiler_common,sljit_sw ind)1694 static int compile_newline_check(struct compiler_common *compiler_common, sljit_sw ind)
1695 {
1696 	struct sljit_compiler *compiler = compiler_common->compiler;
1697 	struct sljit_jump *jump1;
1698 	struct sljit_jump *jump2;
1699 	struct sljit_label *label;
1700 	sljit_sw no_states;
1701 	sljit_sw offset;
1702 
1703 	/* Check whether a new-line character is found. */
1704 	EMIT_CMP(jump1, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
1705 	EMIT_CMP(jump2, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
1706 
1707 	no_states = compiler_common->no_states;
1708 	offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
1709 	EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
1710 	EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
1711 	CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
1712 
1713 	EMIT_LABEL(label);
1714 	sljit_set_label(jump1, label);
1715 	sljit_set_label(jump2, label);
1716 	return REGEX_NO_ERROR;
1717 }
1718 
1719 #undef CHECK
1720 
1721 #define CHECK(exp) \
1722 	if (SLJIT_UNLIKELY(exp)) \
1723 		return 0
1724 
range_set_label(struct sljit_jump ** range_jump_list,struct sljit_label * label)1725 static SLJIT_INLINE void range_set_label(struct sljit_jump **range_jump_list, struct sljit_label *label)
1726 {
1727 	while (*range_jump_list) {
1728 		sljit_set_label(*range_jump_list, label);
1729 		range_jump_list++;
1730 	}
1731 }
1732 
compile_range_check(struct compiler_common * compiler_common,sljit_sw ind)1733 static sljit_sw compile_range_check(struct compiler_common *compiler_common, sljit_sw ind)
1734 {
1735 	struct sljit_compiler *compiler = compiler_common->compiler;
1736 	struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
1737 	struct sljit_jump **range_jump_list = compiler_common->range_jump_list;
1738 	int invert = dfa_transitions[ind].value;
1739 	struct sljit_label *label;
1740 	sljit_sw no_states;
1741 	sljit_sw offset;
1742 	int init_range = 1, prev_value = 0;
1743 
1744 	ind++;
1745 
1746 	while (dfa_transitions[ind].type != type_rng_end) {
1747 		if (dfa_transitions[ind].type == type_rng_char) {
1748 			EMIT_CMP(*range_jump_list, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
1749 			range_jump_list++;
1750 		}
1751 		else {
1752 			SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
1753 			if (init_range) {
1754 				EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
1755 				init_range = 0;
1756 			}
1757 			if (dfa_transitions[ind].value != prev_value) {
1758 				/* Best compatibility to all archs. */
1759 				prev_value -= dfa_transitions[ind].value;
1760 				if (prev_value < 0) {
1761 					EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
1762 				}
1763 				else {
1764 					EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
1765 				}
1766 				prev_value = dfa_transitions[ind].value;
1767 			}
1768 			EMIT_CMP(*range_jump_list, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
1769 			range_jump_list++;
1770 			ind++;
1771 		}
1772 		ind++;
1773 	}
1774 
1775 	*range_jump_list = NULL;
1776 
1777 	if (!invert) {
1778 		no_states = compiler_common->no_states;
1779 		offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
1780 		EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
1781 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
1782 		CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
1783 
1784 		EMIT_LABEL(label);
1785 		range_set_label(compiler_common->range_jump_list, label);
1786 		/* Clears the jump list. */
1787 		*compiler_common->range_jump_list = NULL;
1788 	}
1789 	return ind;
1790 }
1791 
1792 #undef TERM_OFFSET_OF
1793 #undef EMIT_OP1
1794 #undef EMIT_OP2
1795 #undef EMIT_LABEL
1796 #undef EMIT_JUMP
1797 #undef EMIT_CMP
1798 #undef CHECK
1799 
1800 /* --------------------------------------------------------------------- */
1801 /*  Main compiler                                                        */
1802 /* --------------------------------------------------------------------- */
1803 
1804 #define TERM_OFFSET_OF(ind, offs) (((ind) * compiler_common.no_states + (offs)) * sizeof(sljit_sw))
1805 
1806 #define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
1807 	CHECK(sljit_emit_op1(compiler_common.compiler, type, arg1, arg2, arg3, arg4))
1808 
1809 #define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
1810 	CHECK(sljit_emit_op2(compiler_common.compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
1811 
1812 #define EMIT_LABEL(label) \
1813 	label = sljit_emit_label(compiler_common.compiler); \
1814 	CHECK(!label)
1815 
1816 #define EMIT_JUMP(jump, type) \
1817 	jump = sljit_emit_jump(compiler_common.compiler, type); \
1818 	CHECK(!jump)
1819 
1820 #define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
1821 	jump = sljit_emit_cmp(compiler_common.compiler, type, arg1, arg2, arg3, arg4); \
1822 	CHECK(!jump)
1823 
1824 /* A do {} while(0) expression helps to avoid goto statements. */
1825 #define BEGIN_GUARD \
1826 	do {
1827 
1828 #define END_GUARD \
1829 	} while(0);
1830 
1831 #define CHECK(exp) \
1832 	if (SLJIT_UNLIKELY(exp)) \
1833 		break;
1834 
regex_compile(const regex_char_t * regex_string,int length,int re_flags,int * error)1835 struct regex_machine* regex_compile(const regex_char_t *regex_string, int length, int re_flags, int *error)
1836 {
1837 	struct compiler_common compiler_common;
1838 	sljit_sw ind;
1839 	int error_code, done, suggest_fast_forward;
1840 	/* ID of an empty match (-1 if not reachable). */
1841 	int empty_match_id;
1842 
1843 	struct sljit_jump *jump;
1844 	struct sljit_jump *best_match_found_jump;
1845 	struct sljit_jump *fast_forward_jump = NULL;
1846 	struct sljit_jump *length_is_zero_jump;
1847 	struct sljit_jump *end_check_jump = NULL;
1848 	struct sljit_jump *best_match_check_jump = NULL;
1849 	struct sljit_jump *non_greedy_end_jump = NULL;
1850 	struct sljit_label *label;
1851 	struct sljit_label *end_check_label = NULL;
1852 	struct sljit_label *start_label;
1853 	struct sljit_label *fast_forward_label;
1854 	struct sljit_label *fast_forward_return_label;
1855 
1856 	if (error)
1857 		*error = REGEX_NO_ERROR;
1858 #ifdef REGEX_MATCH_VERBOSE
1859 	compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE | REGEX_MATCH_VERBOSE);
1860 #else
1861 	compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE);
1862 #endif
1863 
1864 	/* Step 1: parsing (Left->Right).
1865 	   Syntax check and AST generator. */
1866 	error_code = parse(regex_string, length, &compiler_common);
1867 	if (error_code) {
1868 		stack_destroy(&compiler_common.stack);
1869 		if (error)
1870 			*error = error_code;
1871 		return NULL;
1872 	}
1873 
1874 	/* Step 2: generating branches (Right->Left). */
1875 	error_code = generate_transitions(&compiler_common);
1876 	stack_destroy(&compiler_common.stack);
1877 	stack_destroy(&compiler_common.depth);
1878 	if (error_code) {
1879 		if (compiler_common.dfa_transitions)
1880 			SLJIT_FREE(compiler_common.dfa_transitions, NULL);
1881 		if (error)
1882 			*error = error_code;
1883 		return NULL;
1884 	}
1885 
1886 	/* Step 3: Generate necessary data for depth-first search (Left->Right). */
1887 	error_code = generate_search_states(&compiler_common);
1888 	if (error_code) {
1889 		SLJIT_FREE(compiler_common.dfa_transitions, NULL);
1890 		if (error)
1891 			*error = error_code;
1892 		return NULL;
1893 	}
1894 
1895 #ifdef REGEX_MATCH_VERBOSE
1896 	if (compiler_common.flags & REGEX_MATCH_VERBOSE)
1897 		verbose_transitions(&compiler_common);
1898 #endif
1899 
1900 	/* Step 4: Left->Right generate code. */
1901 	stack_init(&compiler_common.stack);
1902 	stack_init(&compiler_common.depth);
1903 	done = 0;
1904 	compiler_common.machine = NULL;
1905 	compiler_common.compiler = NULL;
1906 	compiler_common.range_jump_list = NULL;
1907 
1908 	BEGIN_GUARD
1909 
1910 	compiler_common.machine = (struct regex_machine*)SLJIT_MALLOC(sizeof(struct regex_machine) + (compiler_common.terms_size - 1) * sizeof(sljit_uw), NULL);
1911 	CHECK(!compiler_common.machine);
1912 
1913 	compiler_common.compiler = sljit_create_compiler(NULL);
1914 	CHECK(!compiler_common.compiler);
1915 
1916 	if (compiler_common.longest_range_size > 0) {
1917 		compiler_common.range_jump_list = (struct sljit_jump**)SLJIT_MALLOC(sizeof(struct sljit_jump*) * compiler_common.longest_range_size, NULL);
1918 		CHECK(!compiler_common.range_jump_list);
1919 	}
1920 
1921 	if ((compiler_common.flags & REGEX_ID_CHECK) && !(compiler_common.flags & REGEX_MATCH_BEGIN))
1922 		compiler_common.no_states = 4;
1923 	else if (!(compiler_common.flags & REGEX_ID_CHECK) && (compiler_common.flags & REGEX_MATCH_BEGIN))
1924 		compiler_common.no_states = 2;
1925 	else
1926 		compiler_common.no_states = 3;
1927 
1928 	compiler_common.machine->flags = compiler_common.flags;
1929 	compiler_common.machine->no_states = compiler_common.no_states;
1930 	compiler_common.machine->size = compiler_common.machine->no_states * compiler_common.terms_size;
1931 
1932 	/* Study the regular expression. */
1933 	empty_match_id = -1;
1934 	suggest_fast_forward = 1;
1935 	if (!(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN)) {
1936 		CHECK(trace_transitions(0, &compiler_common));
1937 		while (compiler_common.stack.count > 0) {
1938 			ind = stack_pop(&compiler_common.stack)->value;
1939 			if (compiler_common.search_states[ind].type == 0) {
1940 				SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
1941 				suggest_fast_forward = 0;
1942 				empty_match_id = compiler_common.search_states[ind].value;
1943 			}
1944 			else if (compiler_common.search_states[ind].type > 0) {
1945 				SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type != type_end);
1946 				if (compiler_common.dfa_transitions[ind].type == type_rng_start && compiler_common.dfa_transitions[ind].value)
1947 					suggest_fast_forward = 0;
1948 			}
1949 			compiler_common.search_states[ind].value = -1;
1950 		}
1951 	}
1952 	else {
1953 		SLJIT_ASSERT(compiler_common.dfa_transitions[1].type == type_newline);
1954 		CHECK(trace_transitions(1, &compiler_common));
1955 		while (compiler_common.stack.count > 0) {
1956 			ind = stack_pop(&compiler_common.stack)->value;
1957 			if (compiler_common.search_states[ind].type == 0) {
1958 				SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
1959 				suggest_fast_forward = 0;
1960 				empty_match_id = compiler_common.search_states[ind].value;
1961 			}
1962 			compiler_common.search_states[ind].value = -1;
1963 		}
1964 	}
1965 
1966 	/* Step 4.1: Generate entry. */
1967 	CHECK(sljit_emit_enter(compiler_common.compiler, 0, 3, 5, 5, 0, 0, 0));
1968 
1969 	/* Copy arguments to their place. */
1970 	EMIT_OP1(SLJIT_MOV, R_REGEX_MATCH, 0, SLJIT_S0, 0);
1971 	EMIT_OP1(SLJIT_MOV, R_STRING, 0, SLJIT_S1, 0);
1972 	EMIT_OP2(SLJIT_ADD, R_LENGTH, 0, SLJIT_S2, 0, SLJIT_IMM, 1);
1973 
1974 	/* Init global registers. */
1975 	EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
1976 	EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
1977 	EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
1978 	EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin));
1979 	EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index));
1980 
1981 	/* Check whether the best match has already found in a previous frame. */
1982 	EMIT_CMP(jump, SLJIT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 0);
1983 	EMIT_JUMP(best_match_found_jump, SLJIT_JUMP);
1984 
1985 #ifdef REGEX_MATCH_VERBOSE
1986 	if (compiler_common.flags & REGEX_MATCH_VERBOSE)
1987 		printf("\n-----------------\nTrace\n-----------------\n");
1988 #endif
1989 
1990 	/* Step 4.2: Generate code for state 0. */
1991 	EMIT_LABEL(label);
1992 	compiler_common.machine->entry_addrs[0] = (sljit_uw)label;
1993 
1994 	/* Swapping current and next. */
1995 	EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_STATE, 0);
1996 	EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_NEXT_STATE, 0);
1997 	EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_TEMP, 0);
1998 
1999 	/* Checking whether the best case needs to be updated. */
2000 	if (!(compiler_common.flags & REGEX_MATCH_END)) {
2001 		EMIT_CMP(end_check_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
2002 		EMIT_LABEL(end_check_label);
2003 	}
2004 	EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
2005 	EMIT_OP2(SLJIT_ADD, R_CURR_INDEX, 0, R_CURR_INDEX, 0, SLJIT_IMM, 1);
2006 
2007 	/* Checking whether best case has already found. */
2008 	if (!(compiler_common.flags & REGEX_MATCH_END) || (compiler_common.flags & REGEX_MATCH_BEGIN)) {
2009 		if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2010 			/* We can bail out if no more active states remain and R_BEST_BEGIN != -1. */
2011 			EMIT_CMP(best_match_check_jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
2012 		}
2013 		else {
2014 			/* We can bail out if no more active states remain (regardless of R_BEST_BEGIN). */
2015 			EMIT_CMP(best_match_check_jump, SLJIT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2016 		}
2017 	}
2018 
2019 	EMIT_LABEL(start_label);
2020 	sljit_set_label(jump, start_label);
2021 
2022 	if (!(compiler_common.flags & REGEX_MATCH_BEGIN) && suggest_fast_forward) {
2023 		EMIT_CMP(fast_forward_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
2024 	}
2025 
2026 	/* Loading the next character. */
2027 	EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_LENGTH, 0, R_LENGTH, 0, SLJIT_IMM, 1);
2028 	EMIT_JUMP(length_is_zero_jump, SLJIT_EQUAL);
2029 
2030 	EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_STRING, 0);
2031 #ifdef REGEX_USE_8BIT_CHARS
2032 	EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
2033 	EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 1);
2034 #else
2035 	EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
2036 	EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 2);
2037 #endif
2038 	EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_TEMP, 0);
2039 
2040 #ifdef REGEX_MATCH_VERBOSE
2041 	if (compiler_common.flags & REGEX_MATCH_VERBOSE) {
2042 		printf("(%3d): ", 0);
2043 		CHECK(trace_transitions(0, &compiler_common));
2044 		while (compiler_common.stack.count > 0) {
2045 			ind = stack_pop(&compiler_common.stack)->value;
2046 			if (compiler_common.search_states[ind].type >= 0)
2047 				printf("-> (%3d:%3d) ", compiler_common.search_states[ind].type, compiler_common.search_states[ind].value);
2048 			compiler_common.search_states[ind].value = -1;
2049 		}
2050 		printf("\n");
2051 	}
2052 #endif
2053 
2054 	EMIT_LABEL(fast_forward_return_label);
2055 	if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2056 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 1);
2057 		if (!(compiler_common.flags & REGEX_MATCH_END)) {
2058 			EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
2059 		}
2060 
2061 		EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_INDEX, 0);
2062 		CHECK(compile_uncond_tran(&compiler_common, R_NEXT_STATE));
2063 		/* And branching to the first state. */
2064 		CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2065 
2066 		if (!(compiler_common.flags & REGEX_MATCH_END)) {
2067 			EMIT_LABEL(label);
2068 			sljit_set_label(jump, label);
2069 		}
2070 	}
2071 	/* This is the case where we only have to reset the R_NEXT_HEAD. */
2072 	EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
2073 	EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2074 	CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2075 
2076 	/* Fast-forward loop. */
2077 	if (fast_forward_jump) {
2078 		/* Quit from fast-forward loop. */
2079 		EMIT_LABEL(fast_forward_label);
2080 		EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
2081 		EMIT_OP1(SLJIT_MOV, R_LENGTH, 0, R_NEXT_STATE, 0);
2082 		EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_CURR_STATE, 0);
2083 		EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, R_NEXT_HEAD, 0);
2084 		EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
2085 		EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
2086 		EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
2087 
2088 		/* Update the start field of the locations. */
2089 		CHECK(trace_transitions(0, &compiler_common));
2090 		while (compiler_common.stack.count > 0) {
2091 			ind = stack_pop(&compiler_common.stack)->value;
2092 			if (compiler_common.search_states[ind].type >= 0) {
2093 				EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 2), R_TEMP, 0);
2094 			}
2095 			compiler_common.search_states[ind].value = -1;
2096 		}
2097 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
2098 		EMIT_JUMP(jump, SLJIT_JUMP);
2099 		sljit_set_label(jump, fast_forward_return_label);
2100 
2101 		/* Start fast-forward. */
2102 		EMIT_LABEL(label);
2103 		sljit_set_label(fast_forward_jump, label);
2104 
2105 		/* Moving everything to registers. */
2106 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
2107 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
2108 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
2109 		EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_LENGTH, 0);
2110 		EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_STRING, 0);
2111 		EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, R_CURR_INDEX, 0);
2112 
2113 		/* Fast forward mainloop. */
2114 		EMIT_LABEL(label);
2115 		EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_NEXT_STATE, 0, R_NEXT_STATE, 0, SLJIT_IMM, 1);
2116 		EMIT_JUMP(fast_forward_jump, SLJIT_EQUAL);
2117 
2118 #ifdef REGEX_USE_8BIT_CHARS
2119 		EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
2120 		EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 1);
2121 #else
2122 		EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
2123 		EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 2);
2124 #endif
2125 
2126 		CHECK(trace_transitions(0, &compiler_common));
2127 		CHECK(compile_leave_fast_forward(&compiler_common, fast_forward_label));
2128 
2129 		EMIT_OP2(SLJIT_ADD, R_NEXT_HEAD, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
2130 		EMIT_JUMP(jump, SLJIT_JUMP);
2131 		sljit_set_label(jump, label);
2132 
2133 		/* String is finished. */
2134 		EMIT_LABEL(label);
2135 		sljit_set_label(fast_forward_jump, label);
2136 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_NEXT_HEAD, 0);
2137 		EMIT_JUMP(fast_forward_jump, SLJIT_JUMP);
2138 	}
2139 
2140 	/* End check. */
2141 	if (end_check_jump) {
2142 		EMIT_LABEL(label);
2143 		sljit_set_label(end_check_jump, label);
2144 
2145 		if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || !(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2146 			CHECK(compile_end_check(&compiler_common, end_check_label));
2147 		}
2148 		else {
2149 			/* Since we leave, we do not need to update the R_BEST_BEGIN. */
2150 			EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
2151 			EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
2152 			if (compiler_common.flags & REGEX_ID_CHECK) {
2153 				EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
2154 			}
2155 			EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
2156 			EMIT_JUMP(non_greedy_end_jump, SLJIT_JUMP);
2157 		}
2158 	}
2159 
2160 	/* Finish check. */
2161 	if (best_match_check_jump) {
2162 		EMIT_LABEL(label);
2163 		sljit_set_label(best_match_check_jump, label);
2164 
2165 		if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2166 			EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2167 			sljit_set_label(jump, start_label);
2168 		}
2169 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
2170 	}
2171 
2172 	/* Leaving matching and storing the necessary values. */
2173 	EMIT_LABEL(label);
2174 	sljit_set_label(length_is_zero_jump, label);
2175 	if (non_greedy_end_jump)
2176 		sljit_set_label(non_greedy_end_jump, label);
2177 
2178 	EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_CURR_INDEX, 0);
2179 	EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
2180 	EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
2181 	EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
2182 
2183 	/* Exit from JIT. */
2184 	EMIT_LABEL(label);
2185 	sljit_set_label(best_match_found_jump, label);
2186 	if (fast_forward_jump)
2187 		sljit_set_label(fast_forward_jump, label);
2188 	CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_UNUSED, 0, 0));
2189 
2190 	for (ind = 1; ind < compiler_common.dfa_size - 1; ind++) {
2191 		if (compiler_common.search_states[ind].type >= 0) {
2192 			SLJIT_ASSERT(compiler_common.search_states[ind].type < compiler_common.terms_size);
2193 			EMIT_LABEL(label);
2194 			compiler_common.machine->entry_addrs[compiler_common.search_states[ind].type] = (sljit_uw)label;
2195 
2196 			if (compiler_common.dfa_transitions[ind].type == type_char) {
2197 				EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, compiler_common.dfa_transitions[ind].value);
2198 			}
2199 			else if (compiler_common.dfa_transitions[ind].type == type_rng_start) {
2200 				ind = compile_range_check(&compiler_common, ind);
2201 				CHECK(!ind);
2202 			}
2203 			else {
2204 				SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
2205 				CHECK(compile_newline_check(&compiler_common, ind));
2206 			}
2207 
2208 			CHECK(trace_transitions(ind, &compiler_common));
2209 #ifdef REGEX_MATCH_VERBOSE
2210 			if (compiler_common.flags & REGEX_MATCH_VERBOSE)
2211 				printf("(%3d): ", compiler_common.search_states[ind].type);
2212 #endif
2213 			CHECK(compile_cond_tran(&compiler_common, compiler_common.search_states[ind].type));
2214 
2215 			if (compiler_common.dfa_transitions[ind].type == type_char) {
2216 				EMIT_LABEL(label);
2217 				sljit_set_label(jump, label);
2218 			}
2219 			else if (compiler_common.dfa_transitions[ind].type == type_rng_end) {
2220 				EMIT_LABEL(label);
2221 				range_set_label(compiler_common.range_jump_list, label);
2222 			}
2223 			else {
2224 				SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
2225 			}
2226 
2227 			/* Branch to the next item in the list. */
2228 			EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1));
2229 			EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1), SLJIT_IMM, -1);
2230 			CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
2231 		}
2232 	}
2233 
2234 	if (ind == compiler_common.dfa_size - 1) {
2235 		/* Generate an init stub function. */
2236 		EMIT_LABEL(label);
2237 		CHECK(sljit_emit_enter(compiler_common.compiler, 0, 2, 3, 3, 0, 0, 0));
2238 
2239 		if (empty_match_id == -1) {
2240 			EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, -1);
2241 			EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, 0);
2242 		}
2243 		else {
2244 			EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
2245 			EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, empty_match_id);
2246 		}
2247 
2248 		EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, index), SLJIT_IMM, !(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN) ? 1 : 2);
2249 
2250 		if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || empty_match_id == -1) {
2251 			/* The else is a really rare event, so we still generate an empty function instead of a runtime pointer check. */
2252 			SLJIT_ASSERT(R_CURR_STATE == SLJIT_S0);
2253 			if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
2254 				/* R_CURR_INDEX (put to R_TEMP) is zero. */
2255 				EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, 0);
2256 			}
2257 			CHECK(compile_uncond_tran(&compiler_common, R_CURR_STATE));
2258 		}
2259 		else {
2260 			EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
2261 		}
2262 		CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_MOV, R_NEXT_HEAD, 0));
2263 
2264 		compiler_common.machine->continue_match = sljit_generate_code(compiler_common.compiler);
2265 #ifndef SLJIT_INDIRECT_CALL
2266 		compiler_common.machine->u.init_match = (void*)(sljit_sw)sljit_get_label_addr(label);
2267 #else
2268 		sljit_set_function_context(&compiler_common.machine->u.init_match, &compiler_common.machine->context, sljit_get_label_addr(label), regex_compile);
2269 #endif
2270 #ifdef REGEX_MATCH_VERBOSE
2271 		if (compiler_common.flags & REGEX_MATCH_VERBOSE)
2272 			printf("Continue match: %p Init match: %p\n\n", compiler_common.machine->continue_match, compiler_common.machine->u.init_match);
2273 #endif
2274 		if (compiler_common.machine->continue_match) {
2275 			for (ind = 0; ind < compiler_common.terms_size; ++ind)
2276 				compiler_common.machine->entry_addrs[ind] = sljit_get_label_addr((struct sljit_label*)compiler_common.machine->entry_addrs[ind]);
2277 			done = 1;
2278 		}
2279 	}
2280 	END_GUARD
2281 
2282 	stack_destroy(&compiler_common.stack);
2283 	stack_destroy(&compiler_common.depth);
2284 	SLJIT_FREE(compiler_common.dfa_transitions, NULL);
2285 	SLJIT_FREE(compiler_common.search_states, NULL);
2286 	if (compiler_common.range_jump_list)
2287 		SLJIT_FREE(compiler_common.range_jump_list, NULL);
2288 	if (compiler_common.compiler)
2289 		sljit_free_compiler(compiler_common.compiler);
2290 	if (done)
2291 		return compiler_common.machine;
2292 
2293 	if (compiler_common.machine) {
2294 		SLJIT_FREE(compiler_common.machine, NULL);
2295 	}
2296 	if (error)
2297 		*error = REGEX_MEMORY_ERROR;
2298 	return NULL;
2299 }
2300 
2301 #undef TERM_OFFSET_OF
2302 #undef EMIT_OP1
2303 #undef EMIT_OP2
2304 #undef EMIT_LABEL
2305 #undef EMIT_JUMP
2306 #undef EMIT_CMP
2307 #undef BEGIN_GUARD
2308 #undef END_GUARD
2309 #undef CHECK
2310 
regex_free_machine(struct regex_machine * machine)2311 void regex_free_machine(struct regex_machine *machine)
2312 {
2313 	sljit_free_code(machine->continue_match);
2314 	SLJIT_FREE(machine, NULL);
2315 }
2316 
regex_get_platform_name(void)2317 const char* regex_get_platform_name(void)
2318 {
2319 	return sljit_get_platform_name();
2320 }
2321 
2322 /* --------------------------------------------------------------------- */
2323 /*  Mathching utilities                                                  */
2324 /* --------------------------------------------------------------------- */
2325 
regex_begin_match(struct regex_machine * machine)2326 struct regex_match* regex_begin_match(struct regex_machine *machine)
2327 {
2328 	sljit_sw *ptr1;
2329 	sljit_sw *ptr2;
2330 	sljit_sw *end;
2331 	sljit_sw *entry_addrs;
2332 
2333 	struct regex_match *match = (struct regex_match*)SLJIT_MALLOC(sizeof(struct regex_match) + (machine->size * 2 - 1) * sizeof(sljit_sw), NULL);
2334 	if (!match)
2335 		return NULL;
2336 
2337 	ptr1 = match->states;
2338 	ptr2 = match->states + machine->size;
2339 	end = ptr2;
2340 	entry_addrs = (sljit_sw*)machine->entry_addrs;
2341 
2342 	match->current = ptr1;
2343 	match->next = ptr2;
2344 	match->head = 0;
2345 	match->machine = machine;
2346 
2347 	/* Init machine states. */
2348 	switch (machine->no_states) {
2349 	case 2:
2350 		while (ptr1 < end) {
2351 			*ptr1++ = *entry_addrs;
2352 			*ptr2++ = *entry_addrs++;
2353 			*ptr1++ = -1;
2354 			*ptr2++ = -1;
2355 		}
2356 		break;
2357 
2358 	case 3:
2359 		while (ptr1 < end) {
2360 			*ptr1++ = *entry_addrs;
2361 			*ptr2++ = *entry_addrs++;
2362 			*ptr1++ = -1;
2363 			*ptr2++ = -1;
2364 			*ptr1++ = 0;
2365 			*ptr2++ = 0;
2366 		}
2367 		break;
2368 
2369 	case 4:
2370 		while (ptr1 < end) {
2371 			*ptr1++ = *entry_addrs;
2372 			*ptr2++ = *entry_addrs++;
2373 			*ptr1++ = -1;
2374 			*ptr2++ = -1;
2375 			*ptr1++ = 0;
2376 			*ptr2++ = 0;
2377 			*ptr1++ = 0;
2378 			*ptr2++ = 0;
2379 		}
2380 		break;
2381 
2382 	default:
2383 		SLJIT_UNREACHABLE();
2384 		break;
2385 	}
2386 
2387 	SLJIT_ASSERT(ptr1 == end);
2388 
2389 	match->u.continue_match = machine->continue_match;
2390 
2391 	regex_reset_match(match);
2392 	return match;
2393 }
2394 
regex_reset_match(struct regex_match * match)2395 void regex_reset_match(struct regex_match *match)
2396 {
2397 	struct regex_machine *machine = match->machine;
2398 	sljit_sw current, ind;
2399 	sljit_sw *current_ptr;
2400 
2401 	match->best_end = 0;
2402 	match->fast_quit = 0;
2403 	match->fast_forward = 0;
2404 
2405 	if (match->head != 0) {
2406 		/* Clear the current state. */
2407 		current = match->head;
2408 		current_ptr = match->current;
2409 		do {
2410 			ind = (current / sizeof(sljit_sw)) + 1;
2411 			current = current_ptr[ind];
2412 			current_ptr[ind] = -1;
2413 		} while (current != 0);
2414 	}
2415 	match->head = machine->u.call_init(match->current, match);
2416 }
2417 
regex_free_match(struct regex_match * match)2418 void regex_free_match(struct regex_match *match)
2419 {
2420 	SLJIT_FREE(match, NULL);
2421 }
2422 
regex_continue_match(struct regex_match * match,const regex_char_t * input_string,int length)2423 void regex_continue_match(struct regex_match *match, const regex_char_t *input_string, int length)
2424 {
2425 	match->u.call_continue(match, input_string, length);
2426 }
2427 
regex_get_result(struct regex_match * match,int * end,int * id)2428 int regex_get_result(struct regex_match *match, int *end, int *id)
2429 {
2430 	int flags = match->machine->flags;
2431 	sljit_sw no_states;
2432 
2433 	*end = match->best_end;
2434 	*id = match->best_id;
2435 	if (!(flags & (REGEX_MATCH_END | REGEX_FAKE_MATCH_END)))
2436 		return match->best_begin;
2437 
2438 	if (flags & REGEX_FAKE_MATCH_END) {
2439 		SLJIT_ASSERT(!(flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END)));
2440 		if (match->best_begin != -1)
2441 			return match->best_begin;
2442 
2443 		no_states = match->machine->no_states;
2444 		if (match->current[no_states + 1] == -1)
2445 			return -1;
2446 		if (flags & REGEX_ID_CHECK)
2447 			*id = match->current[no_states + 3];
2448 		if (!(flags & REGEX_FAKE_MATCH_BEGIN))
2449 			*end = match->index - 1;
2450 		else
2451 			*end = match->index - 2;
2452 		return match->current[no_states + 2];
2453 	}
2454 	else {
2455 		/* Check the status of the last code. */
2456 		if (!(flags & REGEX_MATCH_BEGIN)) {
2457 			/* No shortcut in this case. */
2458 			if (!(flags & REGEX_ID_CHECK)) {
2459 				if (match->current[1] == -1)
2460 					return -1;
2461 				*end = match->index - 1;
2462 				return match->current[2];
2463 			}
2464 
2465 			if (match->current[1] == -1)
2466 				return -1;
2467 			*end = match->index - 1;
2468 			*id = match->current[3];
2469 			return match->current[2];
2470 		}
2471 
2472 		/* Shortcut is possible in this case. */
2473 		if (!(flags & REGEX_ID_CHECK)) {
2474 			if (match->current[1] == -1 || match->head == -1)
2475 				return -1;
2476 			*end = match->index - 1;
2477 			return 0;
2478 		}
2479 
2480 		if (match->current[1] == -1 || match->head == -1)
2481 			return -1;
2482 		*end = match->index - 1;
2483 		*id = match->current[2];
2484 		return 0;
2485 	}
2486 }
2487 
regex_is_match_finished(struct regex_match * match)2488 int regex_is_match_finished(struct regex_match *match)
2489 {
2490 	return match->fast_quit;
2491 }
2492 
2493 #ifdef REGEX_MATCH_VERBOSE
regex_continue_match_debug(struct regex_match * match,const regex_char_t * input_string,int length)2494 void regex_continue_match_debug(struct regex_match *match, const regex_char_t *input_string, int length)
2495 {
2496 	sljit_sw *ptr;
2497 	sljit_sw *end;
2498 	sljit_sw count;
2499 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2500 	sljit_sw current;
2501 #endif
2502 	sljit_sw no_states = match->machine->no_states;
2503 	sljit_sw len = match->machine->size;
2504 
2505 	while (length > 0) {
2506 		match->u.call_continue(match, input_string, 1);
2507 
2508 		if (match->fast_forward) {
2509 			if (match->machine->flags & REGEX_MATCH_VERBOSE)
2510 				printf("fast forward\n");
2511 		}
2512 
2513 		/* Verbose (first). */
2514 		if (match->machine->flags & REGEX_MATCH_VERBOSE) {
2515 			ptr = match->current;
2516 			end = ptr + len;
2517 			count = 0;
2518 			printf("'%c' (%3ld->%3ld [%3ld]) ", *input_string, (long)match->best_begin, (long)match->best_end, (long)match->best_id);
2519 			while (ptr < end) {
2520 				printf("[%3ld:", (long)count++);
2521 				switch (no_states) {
2522 				case 2:
2523 					if (ptr[1] != -1)
2524 						printf("+] ");
2525 					else
2526 						printf(" ] ");
2527 					break;
2528 
2529 				case 3:
2530 					if (ptr[1] != -1)
2531 						printf("+,%3ld] ", (long)ptr[2]);
2532 					else
2533 						printf(" ,XXX] ");
2534 					break;
2535 
2536 				case 4:
2537 					if (ptr[1] != -1)
2538 						printf("+,%3ld,%3ld] ", (long)ptr[2], (long)ptr[3]);
2539 					else
2540 						printf(" ,XXX,XXX] ");
2541 					break;
2542 				}
2543 				ptr += no_states;
2544 			}
2545 			printf("\n");
2546 		}
2547 
2548 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2549 		/* Sanity check (later). */
2550 		ptr = match->next;
2551 		end = ptr + len;
2552 		while (ptr < end) {
2553 			SLJIT_ASSERT(ptr[1] == -1);
2554 			ptr += no_states;
2555 		}
2556 
2557 		/* Check number of active elements. */
2558 		ptr = match->current + no_states;
2559 		end = ptr + len - no_states;
2560 		count = 0;
2561 		while (ptr < end) {
2562 			if (ptr[1] != -1)
2563 				count++;
2564 			ptr += no_states;
2565 		}
2566 
2567 		/* Check chain list. */
2568 		current = match->head;
2569 		ptr = match->current;
2570 		while (current != 0) {
2571 			SLJIT_ASSERT(current >= 0 && current < len * sizeof(sljit_sw));
2572 			SLJIT_ASSERT((current % (no_states * sizeof(sljit_sw))) == 0);
2573 			SLJIT_ASSERT(count > 0);
2574 			current = ptr[(current / sizeof(sljit_sw)) + 1];
2575 			count--;
2576 		}
2577 		SLJIT_ASSERT(count == 0);
2578 #endif
2579 
2580 		if (match->fast_quit) {
2581 			/* the machine has stopped working. */
2582 			if (match->machine->flags & REGEX_MATCH_VERBOSE)
2583 				printf("Best match has found\n");
2584 			break;
2585 		}
2586 
2587 		input_string++;
2588 		length--;
2589 	}
2590 }
2591 #endif
2592