1 /*
2 Implementation of backtracking std.regex engine.
3 Contains both compile-time and run-time versions.
4 */
5 module std.regex.internal.backtracking;
6
7 package(std.regex):
8
9 import core.stdc.stdlib, std.range.primitives, std.traits, std.typecons;
10 import std.regex.internal.ir;
11
12 import core.memory : pureMalloc, pureFree;
13
14 /+
15 BacktrackingMatcher implements backtracking scheme of matching
16 regular expressions.
17 +/
18 @trusted class BacktrackingMatcher(Char, Stream = Input!Char) : Matcher!Char
19 if (is(Char : dchar))
20 {
21 alias DataIndex = Stream.DataIndex;
22 struct State
23 {//top bit in pc is set if saved along with matches
24 DataIndex index;
25 uint pc, counter, infiniteNesting;
26 }
27 static assert(State.sizeof % size_t.sizeof == 0);
28 enum stateSize = State.sizeof / size_t.sizeof;
29 enum initialStack = 1 << 11; // items in a block of segmented stack
30 alias String = const(Char)[];
31 alias RegEx = Regex!Char;
32 alias MatchFn = bool function(BacktrackingMatcher) pure;
33 const RegEx re; // regex program
34 MatchFn nativeFn; // native code for that program
35 // Stream state
36 Stream s;
37 DataIndex index;
38 dchar front;
39 bool exhausted;
40 // Backtracking machine state
41 uint pc, counter;
42 DataIndex lastState = 0; // Top of state stack
43 uint infiniteNesting;
44 size_t[] memory;
45 Trace[] merge;
46 static struct Trace
47 {
48 ulong mask;
49 size_t offset;
50
markTrace51 bool mark(size_t idx)
52 {
53 immutable d = idx - offset;
54 if (d < 64) // including overflow
55 {
56 immutable p = mask & (1UL << d);
57 mask |= 1UL << d;
58 return p != 0;
59 }
60 else
61 {
62 offset = idx;
63 mask = 1;
64 return false;
65 }
66 }
67 }
68 //local slice of matches, global for backref
69 Group!DataIndex[] matches, backrefed;
70 size_t _refCount;
71 final:
72
refCount()73 override @property ref size_t refCount() { return _refCount; }
pattern()74 override @property ref const(RegEx) pattern(){ return re; }
75
76 static if (__traits(hasMember,Stream, "search"))
77 {
78 enum kicked = true;
79 }
80 else
81 enum kicked = false;
82
initialMemory(const ref RegEx re)83 static size_t initialMemory(const ref RegEx re)
84 {
85 return stackSize(re)*size_t.sizeof + re.hotspotTableSize*Trace.sizeof;
86 }
87
stackSize(const ref RegEx re)88 static size_t stackSize(const ref RegEx re)
89 {
90 size_t itemSize = stateSize
91 + re.ngroup * (Group!DataIndex).sizeof / size_t.sizeof;
92 return initialStack * itemSize + 2;
93 }
94
atStart()95 @property bool atStart(){ return index == 0; }
96
atEnd()97 @property bool atEnd(){ return index == s.lastIndex && s.atEnd; }
98
next()99 void next()
100 {
101 if (!s.nextChar(front, index))
102 index = s.lastIndex;
103 }
104
search()105 void search()
106 {
107 static if (kicked)
108 {
109 if (!s.search(re.kickstart, front, index))
110 {
111 index = s.lastIndex;
112 }
113 }
114 else
115 next();
116 }
117
118 //
newStack()119 void newStack()
120 {
121 auto chunk = mallocArray!(size_t)(stackSize(re));
122 chunk[0] = cast(size_t)(memory.ptr);
123 chunk[1] = lastState;
124 memory = chunk[2..$];
125 lastState = 0;
126 }
127
prevStack()128 bool prevStack()
129 {
130 // pointer to previous block
131 size_t* prev = cast(size_t*) memory.ptr[-2];
132 if (!prev)
133 {
134 // The last segment is freed in RegexMatch
135 return false;
136 }
137 else
138 {
139 import core.memory : pureFree;
140 // memory used in previous block
141 size_t size = memory.ptr[-1];
142 pureFree(memory.ptr-2);
143 memory = prev[0 .. size];
144 lastState = size;
145 return true;
146 }
147 }
148
initExternalMemory(void[]memBlock)149 void initExternalMemory(void[] memBlock)
150 {
151 merge = arrayInChunk!(Trace)(re.hotspotTableSize, memBlock);
152 merge[] = Trace.init;
153 memory = cast(size_t[]) memBlock;
154 memory[0] = 0; // hidden pointer
155 memory[1] = 0; // used size
156 memory = memory[2..$];
157 }
158
initialize(ref const RegEx program,Stream stream,void[]memBlock)159 void initialize(ref const RegEx program, Stream stream, void[] memBlock)
160 {
161 s = stream;
162 exhausted = false;
163 initExternalMemory(memBlock);
164 backrefed = null;
165 }
166
167 override void dupTo(Matcher!Char m, void[] memBlock)
168 {
169 auto backtracking = cast(BacktrackingMatcher) m;
170 backtracking.s = s;
171 backtracking.front = front;
172 backtracking.index = index;
173 backtracking.exhausted = exhausted;
174 backtracking.initExternalMemory(memBlock);
175 }
176
177 override Matcher!Char rearm(in Char[] data)
178 {
179 merge[] = Trace.init;
180 exhausted = false;
181 s = Stream(data);
182 next();
183 return this;
184 }
185
this(ref const RegEx program,Stream stream,void[]memBlock,dchar ch,DataIndex idx)186 this(ref const RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx)
187 {
188 _refCount = 1;
189 re = program;
190 nativeFn = null;
191 initialize(program, stream, memBlock);
192 front = ch;
193 index = idx;
194 }
195
this(ref const RegEx program,MatchFn func,Stream stream,void[]memBlock)196 this(ref const RegEx program, MatchFn func, Stream stream, void[] memBlock)
197 {
198 _refCount = 1;
199 re = program;
200 initialize(program, stream, memBlock);
201 nativeFn = func;
202 next();
203 }
204
this(ref const RegEx program,Stream stream,void[]memBlock)205 this(ref const RegEx program, Stream stream, void[] memBlock)
206 {
207 _refCount = 1;
208 re = program;
209 nativeFn = null;
210 initialize(program, stream, memBlock);
211 next();
212 }
213
fwdMatcher(ref const RegEx re,void[]memBlock)214 auto fwdMatcher(ref const RegEx re, void[] memBlock)
215 {
216 alias BackMatcher = BacktrackingMatcher!(Char, Stream);
217 auto fwdMatcher = new BackMatcher(re, s, memBlock, front, index);
218 return fwdMatcher;
219 }
220
bwdMatcher(ref const RegEx re,void[]memBlock)221 auto bwdMatcher(ref const RegEx re, void[] memBlock)
222 {
223 alias BackMatcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index)));
224 auto fwdMatcher =
225 new BackMatcher(re, s.loopBack(index), memBlock);
226 return fwdMatcher;
227 }
228
229 //
matchFinalize()230 int matchFinalize()
231 {
232 immutable start = index;
233 immutable val = matchImpl();
234 if (val)
235 {//stream is updated here
236 matches[0].begin = start;
237 matches[0].end = index;
238 if (!(re.flags & RegexOption.global) || atEnd)
239 exhausted = true;
240 if (start == index)//empty match advances input
241 next();
242 return val;
243 }
244 else
245 return 0;
246 }
247
248 //lookup next match, fill matches with indices into input
249 override int match(Group!DataIndex[] matches)
250 {
debug(std_regex_matcher)251 debug(std_regex_matcher)
252 {
253 writeln("------------------------------------------");
254 }
255 if (exhausted) //all matches collected
256 return false;
257 this.matches = matches;
258 if (re.flags & RegexInfo.oneShot)
259 {
260 exhausted = true;
261 const DataIndex start = index;
262 immutable m = matchImpl();
263 if (m)
264 {
265 matches[0].begin = start;
266 matches[0].end = index;
267 }
268 return m;
269 }
270 static if (kicked)
271 {
272 if (!re.kickstart.empty)
273 {
274 for (;;)
275 {
276 immutable val = matchFinalize();
277 if (val)
278 return val;
279 else
280 {
281 if (atEnd)
282 break;
283 search();
284 if (atEnd)
285 {
286 exhausted = true;
287 return matchFinalize();
288 }
289 }
290 }
291 exhausted = true;
292 return 0; //early return
293 }
294 }
295 //no search available - skip a char at a time
296 for (;;)
297 {
298 immutable val = matchFinalize();
299 if (val)
300 return val;
301 else
302 {
303 if (atEnd)
304 break;
305 next();
306 if (atEnd)
307 {
308 exhausted = true;
309 return matchFinalize();
310 }
311 }
312 }
313 exhausted = true;
314 return 0;
315 }
316
317 /+
318 match subexpression against input,
319 results are stored in matches
320 +/
matchImpl()321 int matchImpl() pure
322 {
323 if (nativeFn)
324 {
325 debug(std_regex_ctr) writeln("using C-T matcher");
326 return nativeFn(this);
327 }
328 else
329 {
330 pc = 0;
331 counter = 0;
332 lastState = 0;
333 infiniteNesting = 0;
334 matches[] = Group!DataIndex.init;
335 auto start = s._index;
336 debug(std_regex_matcher)
337 writeln("Try match starting at ", s[index .. s.lastIndex]);
338 for (;;)
339 {
340 debug(std_regex_matcher)
341 writefln("PC: %s\tCNT: %s\t%s \tfront: %s src: %s",
342 pc, counter, disassemble(re.ir, pc, re.dict),
343 front, s._index);
344 switch (re.ir[pc].code)
345 {
346 case IR.OrChar://assumes IRL!(OrChar) == 1
347 if (atEnd)
348 goto L_backtrack;
349 uint len = re.ir[pc].sequence;
350 uint end = pc + len;
351 if (re.ir[pc].data != front && re.ir[pc+1].data != front)
352 {
353 for (pc = pc+2; pc < end; pc++)
354 if (re.ir[pc].data == front)
355 break;
356 if (pc == end)
357 goto L_backtrack;
358 }
359 pc = end;
360 next();
361 break;
362 case IR.Char:
363 if (atEnd || front != re.ir[pc].data)
364 goto L_backtrack;
365 pc += IRL!(IR.Char);
366 next();
367 break;
368 case IR.Any:
369 if (atEnd)
370 goto L_backtrack;
371 pc += IRL!(IR.Any);
372 next();
373 break;
374 case IR.CodepointSet:
375 if (atEnd || !re.charsets[re.ir[pc].data].scanFor(front))
376 goto L_backtrack;
377 next();
378 pc += IRL!(IR.CodepointSet);
379 break;
380 case IR.Trie:
381 if (atEnd || !re.matchers[re.ir[pc].data][front])
382 goto L_backtrack;
383 next();
384 pc += IRL!(IR.Trie);
385 break;
386 case IR.Wordboundary:
387 dchar back;
388 DataIndex bi;
389 //at start & end of input
390 if (atStart && wordMatcher[front])
391 {
392 pc += IRL!(IR.Wordboundary);
393 break;
394 }
395 else if (atEnd && s.loopBack(index).nextChar(back, bi)
396 && wordMatcher[back])
397 {
398 pc += IRL!(IR.Wordboundary);
399 break;
400 }
401 else if (s.loopBack(index).nextChar(back, bi))
402 {
403 immutable af = wordMatcher[front];
404 immutable ab = wordMatcher[back];
405 if (af ^ ab)
406 {
407 pc += IRL!(IR.Wordboundary);
408 break;
409 }
410 }
411 goto L_backtrack;
412 case IR.Notwordboundary:
413 dchar back;
414 DataIndex bi;
415 //at start & end of input
416 if (atStart && wordMatcher[front])
417 goto L_backtrack;
418 else if (atEnd && s.loopBack(index).nextChar(back, bi)
419 && wordMatcher[back])
420 goto L_backtrack;
421 else if (s.loopBack(index).nextChar(back, bi))
422 {
423 immutable af = wordMatcher[front];
424 immutable ab = wordMatcher[back];
425 if (af ^ ab)
426 goto L_backtrack;
427 }
428 pc += IRL!(IR.Wordboundary);
429 break;
430 case IR.Bof:
431 if (atStart)
432 pc += IRL!(IR.Bol);
433 else
434 goto L_backtrack;
435 break;
436 case IR.Bol:
437 dchar back;
438 DataIndex bi;
439 if (atStart)
440 pc += IRL!(IR.Bol);
441 else if (s.loopBack(index).nextChar(back,bi)
442 && endOfLine(back, front == '\n'))
443 {
444 pc += IRL!(IR.Bol);
445 }
446 else
447 goto L_backtrack;
448 break;
449 case IR.Eof:
450 if (atEnd)
451 pc += IRL!(IR.Eol);
452 else
453 goto L_backtrack;
454 break;
455 case IR.Eol:
456 dchar back;
457 DataIndex bi;
458 debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]);
459 //no matching inside \r\n
460 if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi)
461 && back == '\r')))
462 {
463 pc += IRL!(IR.Eol);
464 }
465 else
466 goto L_backtrack;
467 break;
468 case IR.InfiniteStart, IR.InfiniteQStart:
469 pc += re.ir[pc].data + IRL!(IR.InfiniteStart);
470 //now pc is at end IR.Infinite(Q)End
471 uint len = re.ir[pc].data;
472 if (re.ir[pc].code == IR.InfiniteEnd)
473 {
474 pushState(pc+IRL!(IR.InfiniteEnd), counter);
475 pc -= len;
476 }
477 else
478 {
479 pushState(pc - len, counter);
480 pc += IRL!(IR.InfiniteEnd);
481 }
482 break;
483 case IR.InfiniteBloomStart:
484 pc += re.ir[pc].data + IRL!(IR.InfiniteBloomStart);
485 //now pc is at end IR.InfiniteBloomEnd
486 immutable len = re.ir[pc].data;
487 immutable filterIdx = re.ir[pc+2].raw;
488 if (re.filters[filterIdx][front])
489 pushState(pc+IRL!(IR.InfiniteBloomEnd), counter);
490 pc -= len;
491 break;
492 case IR.RepeatStart, IR.RepeatQStart:
493 pc += re.ir[pc].data + IRL!(IR.RepeatStart);
494 break;
495 case IR.RepeatEnd:
496 case IR.RepeatQEnd:
497 if (merge[re.ir[pc + 1].raw+counter].mark(index))
498 {
499 // merged!
500 goto L_backtrack;
501 }
502 //len, step, min, max
503 immutable len = re.ir[pc].data;
504 immutable step = re.ir[pc+2].raw;
505 immutable min = re.ir[pc+3].raw;
506 immutable max = re.ir[pc+4].raw;
507 if (counter < min)
508 {
509 counter += step;
510 pc -= len;
511 }
512 else if (counter < max)
513 {
514 if (re.ir[pc].code == IR.RepeatEnd)
515 {
516 pushState(pc + IRL!(IR.RepeatEnd), counter%step);
517 counter += step;
518 pc -= len;
519 }
520 else
521 {
522 pushState(pc - len, counter + step);
523 counter = counter%step;
524 pc += IRL!(IR.RepeatEnd);
525 }
526 }
527 else
528 {
529 counter = counter%step;
530 pc += IRL!(IR.RepeatEnd);
531 }
532 break;
533 case IR.InfiniteEnd:
534 case IR.InfiniteQEnd:
535 debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting);
536 if (merge[re.ir[pc + 1].raw+counter].mark(index))
537 {
538 // merged!
539 goto L_backtrack;
540 }
541 immutable len = re.ir[pc].data;
542 if (re.ir[pc].code == IR.InfiniteEnd)
543 {
544 pushState(pc + IRL!(IR.InfiniteEnd), counter);
545 pc -= len;
546 }
547 else
548 {
549 pushState(pc-len, counter);
550 pc += IRL!(IR.InfiniteEnd);
551 }
552 break;
553 case IR.InfiniteBloomEnd:
554 debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting);
555 if (merge[re.ir[pc + 1].raw+counter].mark(index))
556 {
557 // merged!
558 goto L_backtrack;
559 }
560 immutable len = re.ir[pc].data;
561 immutable filterIdx = re.ir[pc+2].raw;
562 if (re.filters[filterIdx][front])
563 {
564 infiniteNesting--;
565 pushState(pc + IRL!(IR.InfiniteBloomEnd), counter);
566 infiniteNesting++;
567 }
568 pc -= len;
569 break;
570 case IR.OrEnd:
571 if (merge[re.ir[pc + 1].raw+counter].mark(index))
572 {
573 // merged!
574 goto L_backtrack;
575 }
576 pc += IRL!(IR.OrEnd);
577 break;
578 case IR.OrStart:
579 pc += IRL!(IR.OrStart);
580 goto case;
581 case IR.Option:
582 immutable len = re.ir[pc].data;
583 if (re.ir[pc+len].code == IR.GotoEndOr)//not a last one
584 {
585 pushState(pc + len + IRL!(IR.Option), counter); //remember 2nd branch
586 }
587 pc += IRL!(IR.Option);
588 break;
589 case IR.GotoEndOr:
590 pc = pc + re.ir[pc].data + IRL!(IR.GotoEndOr);
591 break;
592 case IR.GroupStart:
593 immutable n = re.ir[pc].data;
594 matches[n].begin = index;
595 debug(std_regex_matcher) writefln("IR group #%u starts at %u", n, index);
596 pc += IRL!(IR.GroupStart);
597 break;
598 case IR.GroupEnd:
599 immutable n = re.ir[pc].data;
600 matches[n].end = index;
601 debug(std_regex_matcher) writefln("IR group #%u ends at %u", n, index);
602 pc += IRL!(IR.GroupEnd);
603 break;
604 case IR.LookaheadStart:
605 case IR.NeglookaheadStart:
606 immutable len = re.ir[pc].data;
607 auto save = index;
608 immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw;
609 auto mem = pureMalloc(initialMemory(re))[0 .. initialMemory(re)];
610 scope(exit) pureFree(mem.ptr);
611 auto slicedRe = re.withCode(re.ir[
612 pc+IRL!(IR.LookaheadStart) .. pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd)
613 ]);
614 static if (Stream.isLoopback)
615 {
616 auto matcher = bwdMatcher(slicedRe, mem);
617 }
618 else
619 {
620 auto matcher = fwdMatcher(slicedRe, mem);
621 }
622 matcher.matches = matches[ms .. me];
623 matcher.backrefed = backrefed.empty ? matches : backrefed;
624 immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookaheadStart);
625 s.reset(save);
626 next();
627 if (!match)
628 goto L_backtrack;
629 else
630 {
631 pc += IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd);
632 }
633 break;
634 case IR.LookbehindStart:
635 case IR.NeglookbehindStart:
636 immutable len = re.ir[pc].data;
637 immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw;
638 auto mem = pureMalloc(initialMemory(re))[0 .. initialMemory(re)];
639 scope(exit) pureFree(mem.ptr);
640 auto slicedRe = re.withCode(re.ir[
641 pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd)
642 ]);
643 static if (Stream.isLoopback)
644 {
645 alias Matcher = BacktrackingMatcher!(Char, Stream);
646 auto matcher = new Matcher(slicedRe, s, mem, front, index);
647 }
648 else
649 {
650 alias Matcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index)));
651 auto matcher = new Matcher(slicedRe, s.loopBack(index), mem);
652 }
653 matcher.matches = matches[ms .. me];
654 matcher.backrefed = backrefed.empty ? matches : backrefed;
655 immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookbehindStart);
656 if (!match)
657 goto L_backtrack;
658 else
659 {
660 pc += IRL!(IR.LookbehindStart)+len+IRL!(IR.LookbehindEnd);
661 }
662 break;
663 case IR.Backref:
664 immutable n = re.ir[pc].data;
665 auto referenced = re.ir[pc].localRef
666 ? s[matches[n].begin .. matches[n].end]
667 : s[backrefed[n].begin .. backrefed[n].end];
668 while (!atEnd && !referenced.empty && front == referenced.front)
669 {
670 next();
671 referenced.popFront();
672 }
673 if (referenced.empty)
674 pc++;
675 else
676 goto L_backtrack;
677 break;
678 case IR.Nop:
679 pc += IRL!(IR.Nop);
680 break;
681 case IR.LookaheadEnd:
682 case IR.NeglookaheadEnd:
683 case IR.LookbehindEnd:
684 case IR.NeglookbehindEnd:
685 case IR.End:
686 // cleanup stale stack blocks if any
687 while (prevStack()) {}
688 return re.ir[pc].data;
689 default:
690 debug printBytecode(re.ir[0..$]);
691 assert(0);
692 L_backtrack:
693 if (!popState())
694 {
695 s.reset(start);
696 return 0;
697 }
698 }
699 }
700 }
701 assert(0);
702 }
703
stackAvail()704 @property size_t stackAvail()
705 {
706 return memory.length - lastState;
707 }
708
709 void stackPush(T)(T val)
710 if (!isDynamicArray!T)
711 {
712 *cast(T*)&memory[lastState] = val;
713 enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof;
714 lastState += delta;
715 debug(std_regex_matcher) writeln("push element SP= ", lastState);
716 }
717
stackPush(T)718 void stackPush(T)(T[] val)
719 {
720 static assert(T.sizeof % size_t.sizeof == 0);
721 (cast(T*)&memory[lastState])[0 .. val.length]
722 = val[0..$];
723 lastState += val.length*(T.sizeof/size_t.sizeof);
724 debug(std_regex_matcher) writeln("push array SP= ", lastState);
725 }
726
727 void stackPop(T)(ref T val)
728 if (!isDynamicArray!T)
729 {
730 enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof;
731 lastState -= delta;
732 val = *cast(T*)&memory[lastState];
733 debug(std_regex_matcher) writeln("pop element SP= ", lastState);
734 }
735
stackPop(T)736 void stackPop(T)(T[] val)
737 {
738 stackPop(val); // call ref version
739 }
stackPop(T)740 void stackPop(T)(ref T[] val)
741 {
742 lastState -= val.length*(T.sizeof/size_t.sizeof);
743 val[0..$] = (cast(T*)&memory[lastState])[0 .. val.length];
744 debug(std_regex_matcher) writeln("pop array SP= ", lastState);
745 }
746 //helper function, saves engine state
pushState(uint pc,uint counter)747 void pushState(uint pc, uint counter)
748 {
749 if (stateSize + 2 * matches.length > stackAvail)
750 {
751 newStack();
752 }
753 *cast(State*)&memory[lastState] =
754 State(index, pc, counter, infiniteNesting);
755 lastState += stateSize;
756 memory[lastState .. lastState + 2 * matches.length] = (cast(size_t[]) matches)[];
757 lastState += 2*matches.length;
758 debug(std_regex_matcher)
759 writefln("Saved(pc=%s) front: %s src: %s",
760 pc, front, s[index .. s.lastIndex]);
761 }
762
763 //helper function, restores engine state
popState()764 bool popState()
765 {
766 if (!lastState && !prevStack())
767 return false;
768 lastState -= 2*matches.length;
769 auto pm = cast(size_t[]) matches;
770 pm[] = memory[lastState .. lastState + 2 * matches.length];
771 lastState -= stateSize;
772 State* state = cast(State*)&memory[lastState];
773 index = state.index;
774 pc = state.pc;
775 counter = state.counter;
776 infiniteNesting = state.infiniteNesting;
777 debug(std_regex_matcher)
778 {
779 writefln("Restored matches", front, s[index .. s.lastIndex]);
780 foreach (i, m; matches)
781 writefln("Sub(%d) : %s..%s", i, m.begin, m.end);
782 }
783 s.reset(index);
784 next();
785 debug(std_regex_matcher)
786 writefln("Backtracked (pc=%s) front: %s src: %s",
787 pc, front, s[index .. s.lastIndex]);
788 return true;
789 }
790 }
791
792 //very shitty string formatter, $$ replaced with next argument converted to string
ctSub(U...)793 @trusted string ctSub( U...)(string format, U args)
794 {
795 import std.conv : to;
796 bool seenDollar;
797 foreach (i, ch; format)
798 {
799 if (ch == '$')
800 {
801 if (seenDollar)
802 {
803 static if (args.length > 0)
804 {
805 return format[0 .. i - 1] ~ to!string(args[0])
806 ~ ctSub(format[i + 1 .. $], args[1 .. $]);
807 }
808 else
809 assert(0);
810 }
811 else
812 seenDollar = true;
813 }
814 else
815 seenDollar = false;
816
817 }
818 return format;
819 }
820
821 struct CtContext
822 {
823 import std.conv : to, text;
824 //dirty flags
825 bool counter;
826 //to mark the portion of matches to save
827 int match, total_matches;
828 int reserved;
829 const(CodepointInterval)[][] charsets;
830
831
832 //state of codegenerator
833 static struct CtState
834 {
835 string code;
836 int addr;
837 }
838
thisCtContext839 this(Char)(ref const Regex!Char re)
840 {
841 match = 1;
842 reserved = 1; //first match is skipped
843 total_matches = re.ngroup;
844 foreach (ref set; re.charsets)
845 {
846 charsets ~= set.intervals;
847 }
848 }
849
lookaroundCtContext850 CtContext lookaround(uint s, uint e)
851 {
852 CtContext ct;
853 ct.total_matches = e - s;
854 ct.match = 1;
855 return ct;
856 }
857
858 //restore state having current context
restoreCodeCtContext859 string restoreCode()
860 {
861 string text;
862 //stack is checked in L_backtrack
863 text ~= counter
864 ? "
865 stackPop(counter);"
866 : "
867 counter = 0;";
868 if (match < total_matches)
869 {
870 text ~= ctSub("
871 stackPop(matches[$$..$$]);", reserved, match);
872 text ~= ctSub("
873 matches[$$..$] = typeof(matches[0]).init;", match);
874 }
875 else
876 text ~= ctSub("
877 stackPop(matches[$$..$]);", reserved);
878 return text;
879 }
880
881 //save state having current context
882 string saveCode(uint pc, string count_expr="counter")
883 {
884 string text = ctSub("
885 if (stackAvail < $$*(Group!(DataIndex)).sizeof/size_t.sizeof + $$)
886 {
887 newStack();
888 }", match - reserved, cast(int) counter + 2);
889 if (match < total_matches)
890 text ~= ctSub("
891 stackPush(matches[$$..$$]);", reserved, match);
892 else
893 text ~= ctSub("
894 stackPush(matches[$$..$]);", reserved);
895 text ~= counter ? ctSub("
896 stackPush($$);", count_expr) : "";
897 text ~= ctSub("
898 stackPush(index); stackPush($$); \n", pc);
899 return text;
900 }
901
902 //
ctGenBlockCtContext903 CtState ctGenBlock(const(Bytecode)[] ir, int addr)
904 {
905 CtState result;
906 result.addr = addr;
907 while (!ir.empty)
908 {
909 auto n = ctGenGroup(ir, result.addr);
910 result.code ~= n.code;
911 result.addr = n.addr;
912 }
913 return result;
914 }
915
916 //
ctGenGroupCtContext917 CtState ctGenGroup(ref const(Bytecode)[] ir, int addr)
918 {
919 import std.algorithm.comparison : max;
920 auto bailOut = "goto L_backtrack;";
921 auto nextInstr = ctSub("goto case $$;", addr+1);
922 CtState r;
923 assert(!ir.empty);
924 switch (ir[0].code)
925 {
926 case IR.InfiniteStart, IR.InfiniteBloomStart,IR.InfiniteQStart, IR.RepeatStart, IR.RepeatQStart:
927 immutable infLoop =
928 ir[0].code == IR.InfiniteStart || ir[0].code == IR.InfiniteQStart ||
929 ir[0].code == IR.InfiniteBloomStart;
930
931 counter = counter ||
932 ir[0].code == IR.RepeatStart || ir[0].code == IR.RepeatQStart;
933 immutable len = ir[0].data;
934 auto nir = ir[ir[0].length .. ir[0].length+len];
935 r = ctGenBlock(nir, addr+1);
936 //start/end codegen
937 //r.addr is at last test+ jump of loop, addr+1 is body of loop
938 nir = ir[ir[0].length + len .. $];
939 r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr) ~ r.code;
940 r.code ~= ctGenFixupCode(nir, r.addr, addr+1);
941 r.addr += 2; //account end instruction + restore state
942 ir = nir;
943 break;
944 case IR.OrStart:
945 immutable len = ir[0].data;
946 auto nir = ir[ir[0].length .. ir[0].length+len];
947 r = ctGenAlternation(nir, addr);
948 ir = ir[ir[0].length + len .. $];
949 assert(ir[0].code == IR.OrEnd);
950 ir = ir[ir[0].length..$];
951 break;
952 case IR.LookaheadStart:
953 case IR.NeglookaheadStart:
954 case IR.LookbehindStart:
955 case IR.NeglookbehindStart:
956 immutable len = ir[0].data;
957 immutable behind = ir[0].code == IR.LookbehindStart || ir[0].code == IR.NeglookbehindStart;
958 immutable negative = ir[0].code == IR.NeglookaheadStart || ir[0].code == IR.NeglookbehindStart;
959 string fwdType = "typeof(fwdMatcher(re, []))";
960 string bwdType = "typeof(bwdMatcher(re, []))";
961 string fwdCreate = "fwdMatcher(re, mem)";
962 string bwdCreate = "bwdMatcher(re, mem)";
963 immutable start = IRL!(IR.LookbehindStart);
964 immutable end = IRL!(IR.LookbehindStart)+len+IRL!(IR.LookaheadEnd);
965 CtContext context = lookaround(ir[1].raw, ir[2].raw); //split off new context
966 auto slice = ir[start .. end];
967 r.code ~= ctSub(`
968 case $$: //fake lookaround "atom"
969 static if (typeof(matcher.s).isLoopback)
970 alias Lookaround = $$;
971 else
972 alias Lookaround = $$;
973 static bool matcher_$$(Lookaround matcher) @trusted
974 {
975 //(neg)lookaround piece start
976 $$
977 //(neg)lookaround piece ends
978 }
979 auto save = index;
980 auto mem = pureMalloc(initialMemory(re))[0 .. initialMemory(re)];
981 scope(exit) pureFree(mem.ptr);
982 static if (typeof(matcher.s).isLoopback)
983 auto lookaround = $$;
984 else
985 auto lookaround = $$;
986 lookaround.matches = matches[$$..$$];
987 lookaround.backrefed = backrefed.empty ? matches : backrefed;
988 lookaround.nativeFn = &matcher_$$; //hookup closure's binary code
989 int match = $$;
990 s.reset(save);
991 next();
992 if (match)
993 $$
994 else
995 $$`, addr,
996 behind ? fwdType : bwdType, behind ? bwdType : fwdType,
997 addr, context.ctGenRegEx(slice),
998 behind ? fwdCreate : bwdCreate, behind ? bwdCreate : fwdCreate,
999 ir[1].raw, ir[2].raw, //start - end of matches slice
1000 addr,
1001 negative ? "!lookaround.matchImpl()" : "lookaround.matchImpl()",
1002 nextInstr, bailOut);
1003 ir = ir[end .. $];
1004 r.addr = addr + 1;
1005 break;
1006 case IR.LookaheadEnd: case IR.NeglookaheadEnd:
1007 case IR.LookbehindEnd: case IR.NeglookbehindEnd:
1008 ir = ir[IRL!(IR.LookaheadEnd) .. $];
1009 r.addr = addr;
1010 break;
1011 default:
1012 assert(ir[0].isAtom, text(ir[0].mnemonic));
1013 r = ctGenAtom(ir, addr);
1014 }
1015 return r;
1016 }
1017
1018 //generate source for bytecode contained in OrStart ... OrEnd
ctGenAlternationCtContext1019 CtState ctGenAlternation(const(Bytecode)[] ir, int addr)
1020 {
1021 CtState[] pieces;
1022 CtState r;
1023 enum optL = IRL!(IR.Option);
1024 for (;;)
1025 {
1026 assert(ir[0].code == IR.Option);
1027 auto len = ir[0].data;
1028 if (optL+len < ir.length && ir[optL+len].code == IR.Option)//not a last option
1029 {
1030 auto nir = ir[optL .. optL+len-IRL!(IR.GotoEndOr)];
1031 r = ctGenBlock(nir, addr+2);//space for Option + restore state
1032 //r.addr+1 to account GotoEndOr at end of branch
1033 r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr+1) ~ r.code;
1034 addr = r.addr+1;//leave space for GotoEndOr
1035 pieces ~= r;
1036 ir = ir[optL + len .. $];
1037 }
1038 else
1039 {
1040 pieces ~= ctGenBlock(ir[optL..$], addr);
1041 addr = pieces[$-1].addr;
1042 break;
1043 }
1044 }
1045 r = pieces[0];
1046 for (uint i = 1; i < pieces.length; i++)
1047 {
1048 r.code ~= ctSub(`
1049 case $$:
1050 goto case $$; `, pieces[i-1].addr, addr);
1051 r.code ~= pieces[i].code;
1052 }
1053 r.addr = addr;
1054 return r;
1055 }
1056
1057 // generate fixup code for instruction in ir,
1058 // fixup means it has an alternative way for control flow
ctGenFixupCodeCtContext1059 string ctGenFixupCode(const(Bytecode)[] ir, int addr, int fixup)
1060 {
1061 return ctGenFixupCode(ir, addr, fixup); // call ref Bytecode[] version
1062 }
ctGenFixupCodeCtContext1063 string ctGenFixupCode(ref const(Bytecode)[] ir, int addr, int fixup)
1064 {
1065 string r;
1066 string testCode;
1067 r = ctSub(`
1068 case $$: debug(std_regex_matcher) writeln("#$$");`,
1069 addr, addr);
1070 switch (ir[0].code)
1071 {
1072 case IR.InfiniteStart, IR.InfiniteQStart, IR.InfiniteBloomStart:
1073 r ~= ctSub( `
1074 goto case $$;`, fixup);
1075 ir = ir[ir[0].length..$];
1076 break;
1077 case IR.InfiniteEnd:
1078 testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1);
1079 r ~= ctSub( `
1080 if (merge[$$+counter].mark(index))
1081 {
1082 // merged!
1083 goto L_backtrack;
1084 }
1085
1086 $$
1087 {
1088 $$
1089 }
1090 goto case $$;
1091 case $$: //restore state and go out of loop
1092 $$
1093 goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup,
1094 addr+1, restoreCode());
1095 ir = ir[ir[0].length..$];
1096 break;
1097 case IR.InfiniteBloomEnd:
1098 //TODO: check bloom filter and skip on failure
1099 testCode = ctQuickTest(ir[IRL!(IR.InfiniteBloomEnd) .. $],addr + 1);
1100 r ~= ctSub( `
1101 if (merge[$$+counter].mark(index))
1102 {
1103 // merged!
1104 goto L_backtrack;
1105 }
1106
1107 $$
1108 {
1109 $$
1110 }
1111 goto case $$;
1112 case $$: //restore state and go out of loop
1113 $$
1114 goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup,
1115 addr+1, restoreCode());
1116 ir = ir[ir[0].length..$];
1117 break;
1118 case IR.InfiniteQEnd:
1119 testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1);
1120 auto altCode = testCode.length ? ctSub("else goto case $$;", fixup) : "";
1121 r ~= ctSub( `
1122 if (merge[$$+counter].mark(index))
1123 {
1124 // merged!
1125 goto L_backtrack;
1126 }
1127
1128 $$
1129 {
1130 $$
1131 goto case $$;
1132 }
1133 $$
1134 case $$://restore state and go inside loop
1135 $$
1136 goto case $$;`, ir[1].raw,
1137 testCode, saveCode(addr+1), addr+2, altCode,
1138 addr+1, restoreCode(), fixup);
1139 ir = ir[ir[0].length..$];
1140 break;
1141 case IR.RepeatStart, IR.RepeatQStart:
1142 r ~= ctSub( `
1143 goto case $$;`, fixup);
1144 ir = ir[ir[0].length..$];
1145 break;
1146 case IR.RepeatEnd, IR.RepeatQEnd:
1147 //len, step, min, max
1148 immutable len = ir[0].data;
1149 immutable step = ir[2].raw;
1150 immutable min = ir[3].raw;
1151 immutable max = ir[4].raw;
1152 r ~= ctSub(`
1153 if (merge[$$+counter].mark(index))
1154 {
1155 // merged!
1156 goto L_backtrack;
1157 }
1158 if (counter < $$)
1159 {
1160 debug(std_regex_matcher) writeln("RepeatEnd min case pc=", $$);
1161 counter += $$;
1162 goto case $$;
1163 }`, ir[1].raw, min, addr, step, fixup);
1164 if (ir[0].code == IR.RepeatEnd)
1165 {
1166 string counter_expr = ctSub("counter % $$", step);
1167 r ~= ctSub(`
1168 else if (counter < $$)
1169 {
1170 $$
1171 counter += $$;
1172 goto case $$;
1173 }`, max, saveCode(addr+1, counter_expr), step, fixup);
1174 }
1175 else
1176 {
1177 string counter_expr = ctSub("counter % $$", step);
1178 r ~= ctSub(`
1179 else if (counter < $$)
1180 {
1181 $$
1182 counter = counter % $$;
1183 goto case $$;
1184 }`, max, saveCode(addr+1,counter_expr), step, addr+2);
1185 }
1186 r ~= ctSub(`
1187 else
1188 {
1189 counter = counter % $$;
1190 goto case $$;
1191 }
1192 case $$: //restore state
1193 $$
1194 goto case $$;`, step, addr+2, addr+1, restoreCode(),
1195 ir[0].code == IR.RepeatEnd ? addr+2 : fixup );
1196 ir = ir[ir[0].length..$];
1197 break;
1198 case IR.Option:
1199 r ~= ctSub( `
1200 {
1201 $$
1202 }
1203 goto case $$;
1204 case $$://restore thunk to go to the next group
1205 $$
1206 goto case $$;`, saveCode(addr+1), addr+2,
1207 addr+1, restoreCode(), fixup);
1208 ir = ir[ir[0].length..$];
1209 break;
1210 default:
1211 assert(0, text(ir[0].mnemonic));
1212 }
1213 return r;
1214 }
1215
1216
ctQuickTestCtContext1217 string ctQuickTest(const(Bytecode)[] ir, int id)
1218 {
1219 uint pc = 0;
1220 while (pc < ir.length && ir[pc].isAtom)
1221 {
1222 if (ir[pc].code == IR.GroupStart || ir[pc].code == IR.GroupEnd)
1223 {
1224 pc++;
1225 }
1226 else if (ir[pc].code == IR.Backref)
1227 break;
1228 else
1229 {
1230 auto code = ctAtomCode(ir[pc..$], -1);
1231 return ctSub(`
1232 int test_$$()
1233 {
1234 $$ //$$
1235 }
1236 if (test_$$() >= 0)`, id, code.ptr ? code : "return 0;",
1237 ir[pc].mnemonic, id);
1238 }
1239 }
1240 return "";
1241 }
1242
1243 //process & generate source for simple bytecodes at front of ir using address addr
ctGenAtomCtContext1244 CtState ctGenAtom(ref const(Bytecode)[] ir, int addr)
1245 {
1246 CtState result;
1247 result.code = ctAtomCode(ir, addr);
1248 ir.popFrontN(ir[0].code == IR.OrChar ? ir[0].sequence : ir[0].length);
1249 result.addr = addr + 1;
1250 return result;
1251 }
1252
1253 //D code for atom at ir using address addr, addr < 0 means quickTest
ctAtomCodeCtContext1254 string ctAtomCode(const(Bytecode)[] ir, int addr)
1255 {
1256 string code;
1257 string bailOut, nextInstr;
1258 if (addr < 0)
1259 {
1260 bailOut = "return -1;";
1261 nextInstr = "return 0;";
1262 }
1263 else
1264 {
1265 bailOut = "goto L_backtrack;";
1266 nextInstr = ctSub("goto case $$;", addr+1);
1267 code ~= ctSub( `
1268 case $$: debug(std_regex_matcher) writeln("#$$");
1269 `, addr, addr);
1270 }
1271 switch (ir[0].code)
1272 {
1273 case IR.OrChar://assumes IRL!(OrChar) == 1
1274 code ~= ctSub(`
1275 if (atEnd)
1276 $$`, bailOut);
1277 immutable len = ir[0].sequence;
1278 for (uint i = 0; i < len; i++)
1279 {
1280 code ~= ctSub( `
1281 if (front == $$)
1282 {
1283 $$
1284 $$
1285 }`, ir[i].data, addr >= 0 ? "next();" :"", nextInstr);
1286 }
1287 code ~= ctSub( `
1288 $$`, bailOut);
1289 break;
1290 case IR.Char:
1291 code ~= ctSub( `
1292 if (atEnd || front != $$)
1293 $$
1294 $$
1295 $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
1296 break;
1297 case IR.Any:
1298 code ~= ctSub( `
1299 if (atEnd || (!(re.flags & RegexOption.singleline)
1300 && (front == '\r' || front == '\n')))
1301 $$
1302 $$
1303 $$`, bailOut, addr >= 0 ? "next();" :"",nextInstr);
1304 break;
1305 case IR.CodepointSet:
1306 if (charsets.length)
1307 {
1308 string name = `func_`~to!string(addr+1);
1309 string funcCode = CodepointSet.toSourceCode(charsets[ir[0].data], name);
1310 code ~= ctSub( `
1311 static $$
1312 if (atEnd || !$$(front))
1313 $$
1314 $$
1315 $$`, funcCode, name, bailOut, addr >= 0 ? "next();" :"", nextInstr);
1316 }
1317 else
1318 code ~= ctSub( `
1319 if (atEnd || !re.charsets[$$].scanFor(front))
1320 $$
1321 $$
1322 $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
1323 break;
1324 case IR.Trie:
1325 if (charsets.length && charsets[ir[0].data].length <= 8)
1326 goto case IR.CodepointSet;
1327 code ~= ctSub( `
1328 if (atEnd || !re.matchers[$$][front])
1329 $$
1330 $$
1331 $$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
1332 break;
1333 case IR.Wordboundary:
1334 code ~= ctSub( `
1335 dchar back;
1336 DataIndex bi;
1337 if (atStart && wordMatcher[front])
1338 {
1339 $$
1340 }
1341 else if (atEnd && s.loopBack(index).nextChar(back, bi)
1342 && wordMatcher[back])
1343 {
1344 $$
1345 }
1346 else if (s.loopBack(index).nextChar(back, bi))
1347 {
1348 bool af = wordMatcher[front];
1349 bool ab = wordMatcher[back];
1350 if (af ^ ab)
1351 {
1352 $$
1353 }
1354 }
1355 $$`, nextInstr, nextInstr, nextInstr, bailOut);
1356 break;
1357 case IR.Notwordboundary:
1358 code ~= ctSub( `
1359 dchar back;
1360 DataIndex bi;
1361 //at start & end of input
1362 if (atStart && wordMatcher[front])
1363 $$
1364 else if (atEnd && s.loopBack(index).nextChar(back, bi)
1365 && wordMatcher[back])
1366 $$
1367 else if (s.loopBack(index).nextChar(back, bi))
1368 {
1369 bool af = wordMatcher[front];
1370 bool ab = wordMatcher[back];
1371 if (af ^ ab)
1372 $$
1373 }
1374 $$`, bailOut, bailOut, bailOut, nextInstr);
1375
1376 break;
1377 case IR.Bol:
1378 code ~= ctSub(`
1379 dchar back;
1380 DataIndex bi;
1381 if (atStart || (s.loopBack(index).nextChar(back,bi)
1382 && endOfLine(back, front == '\n')))
1383 {
1384 debug(std_regex_matcher) writeln("BOL matched");
1385 $$
1386 }
1387 else
1388 $$`, nextInstr, bailOut);
1389
1390 break;
1391 case IR.Bof:
1392 code ~= ctSub(`
1393 if (atStart)
1394 {
1395 debug(std_regex_matcher) writeln("BOF matched");
1396 $$
1397 }
1398 else
1399 $$`, nextInstr, bailOut);
1400 break;
1401 case IR.Eol:
1402 code ~= ctSub(`
1403 dchar back;
1404 DataIndex bi;
1405 debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]);
1406 //no matching inside \r\n
1407 if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi)
1408 && back == '\r')))
1409 {
1410 debug(std_regex_matcher) writeln("EOL matched");
1411 $$
1412 }
1413 else
1414 $$`, nextInstr, bailOut);
1415 break;
1416 case IR.Eof:
1417 code ~= ctSub(`
1418 if (atEnd)
1419 {
1420 debug(std_regex_matcher) writeln("BOF matched");
1421 $$
1422 }
1423 else
1424 $$`, nextInstr, bailOut);
1425 break;
1426 case IR.GroupStart:
1427 code ~= ctSub(`
1428 matches[$$].begin = index;
1429 $$`, ir[0].data, nextInstr);
1430 match = ir[0].data+1;
1431 break;
1432 case IR.GroupEnd:
1433 code ~= ctSub(`
1434 matches[$$].end = index;
1435 $$`, ir[0].data, nextInstr);
1436 break;
1437 case IR.Backref:
1438 string mStr = "auto referenced = ";
1439 mStr ~= ir[0].localRef
1440 ? ctSub("s[matches[$$].begin .. matches[$$].end];",
1441 ir[0].data, ir[0].data)
1442 : ctSub("s[backrefed[$$].begin .. backrefed[$$].end];",
1443 ir[0].data, ir[0].data);
1444 code ~= ctSub( `
1445 $$
1446 while (!atEnd && !referenced.empty && front == referenced.front)
1447 {
1448 next();
1449 referenced.popFront();
1450 }
1451 if (referenced.empty)
1452 $$
1453 else
1454 $$`, mStr, nextInstr, bailOut);
1455 break;
1456 case IR.Nop:
1457 case IR.End:
1458 break;
1459 default:
1460 assert(0, text(ir[0].mnemonic, " is not supported yet"));
1461 }
1462 return code;
1463 }
1464
1465 //generate D code for the whole regex
ctGenRegExCtContext1466 public string ctGenRegEx(const(Bytecode)[] ir)
1467 {
1468 auto bdy = ctGenBlock(ir, 0);
1469 auto r = `
1470 import core.memory : pureMalloc, pureFree;
1471 with(matcher)
1472 {
1473 pc = 0;
1474 counter = 0;
1475 lastState = 0;
1476 matches[] = Group!DataIndex.init;
1477 auto start = s._index;`;
1478 r ~= `
1479 goto StartLoop;
1480 debug(std_regex_matcher) writeln("Try CT matching starting at ",s[index .. s.lastIndex]);
1481 L_backtrack:
1482 if (lastState || prevStack())
1483 {
1484 stackPop(pc);
1485 stackPop(index);
1486 s.reset(index);
1487 next();
1488 }
1489 else
1490 {
1491 s.reset(start);
1492 return false;
1493 }
1494 StartLoop:
1495 switch (pc)
1496 {
1497 `;
1498 r ~= bdy.code;
1499 r ~= ctSub(`
1500 case $$: break;`,bdy.addr);
1501 r ~= `
1502 default:
1503 assert(0);
1504 }
1505 // cleanup stale stack blocks
1506 while (prevStack()) {}
1507 return true;
1508 }
1509 `;
1510 return r;
1511 }
1512
1513 }
1514
ctGenRegExCode(Char)1515 string ctGenRegExCode(Char)(const Regex!Char re)
1516 {
1517 auto context = CtContext(re);
1518 return context.ctGenRegEx(re.ir);
1519 }
1520