1 //Written in the D programming language
2 /*
3 Regular expression pattern parser.
4 */
5 module std.regex.internal.parser;
6
7 static import std.ascii;
8 import std.range.primitives, std.uni, std.meta,
9 std.traits, std.typecons, std.exception;
10 import std.regex.internal.ir;
11
12 // package relevant info from parser into a regex object
makeRegex(S,CG)13 auto makeRegex(S, CG)(Parser!(S, CG) p)
14 {
15 Regex!(BasicElementOf!S) re;
16 auto g = p.g;
17 with(re)
18 {
19 ir = g.ir;
20 dict = g.dict;
21 ngroup = g.ngroup;
22 maxCounterDepth = g.counterDepth;
23 flags = p.re_flags;
24 charsets = g.charsets;
25 matchers = g.matchers;
26 backrefed = g.backrefed;
27 re.postprocess();
28 debug(std_regex_parser)
29 {
30 __ctfe || print();
31 }
32 //@@@BUG@@@ (not reduced)
33 //somehow just using validate _collides_ with std.utf.validate (!)
34 version (assert) re.validateRe();
35 }
36 return re;
37 }
38
39 // helper for unittest
40 auto makeRegex(S)(S arg)
41 if (isSomeString!S)
42 {
43 return makeRegex(Parser!(S, CodeGen)(arg, ""));
44 }
45
46 @system unittest
47 {
48 import std.algorithm.comparison : equal;
49 auto re = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`);
50 auto nc = re.namedCaptures;
51 static assert(isRandomAccessRange!(typeof(nc)));
52 assert(!nc.empty);
53 assert(nc.length == 2);
54 assert(nc.equal(["name", "var"]));
55 assert(nc[0] == "name");
56 assert(nc[1..$].equal(["var"]));
57
58 re = makeRegex(`(\w+) (?P<named>\w+) (\w+)`);
59 nc = re.namedCaptures;
60 assert(nc.length == 1);
61 assert(nc[0] == "named");
62 assert(nc.front == "named");
63 assert(nc.back == "named");
64
65 re = makeRegex(`(\w+) (\w+)`);
66 nc = re.namedCaptures;
67 assert(nc.empty);
68
69 re = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`);
70 nc = re.namedCaptures;
71 auto cp = nc.save;
72 assert(nc.equal(cp));
73 nc.popFront();
74 assert(nc.equal(cp[1..$]));
75 nc.popBack();
76 assert(nc.equal(cp[1 .. $ - 1]));
77 }
78
79
reverseBytecode()80 @trusted void reverseBytecode()(Bytecode[] code)
81 {
82 Bytecode[] rev = new Bytecode[code.length];
83 uint revPc = cast(uint) rev.length;
84 Stack!(Tuple!(uint, uint, uint)) stack;
85 uint start = 0;
86 uint end = cast(uint) code.length;
87 for (;;)
88 {
89 for (uint pc = start; pc < end; )
90 {
91 immutable len = code[pc].length;
92 if (code[pc].code == IR.GotoEndOr)
93 break; //pick next alternation branch
94 if (code[pc].isAtom)
95 {
96 rev[revPc - len .. revPc] = code[pc .. pc + len];
97 revPc -= len;
98 pc += len;
99 }
100 else if (code[pc].isStart || code[pc].isEnd)
101 {
102 //skip over other embedded lookbehinds they are reversed
103 if (code[pc].code == IR.LookbehindStart
104 || code[pc].code == IR.NeglookbehindStart)
105 {
106 immutable blockLen = len + code[pc].data
107 + code[pc].pairedLength;
108 rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen];
109 pc += blockLen;
110 revPc -= blockLen;
111 continue;
112 }
113 immutable second = code[pc].indexOfPair(pc);
114 immutable secLen = code[second].length;
115 rev[revPc - secLen .. revPc] = code[second .. second + secLen];
116 revPc -= secLen;
117 if (code[pc].code == IR.OrStart)
118 {
119 //we pass len bytes forward, but secLen in reverse
120 immutable revStart = revPc - (second + len - secLen - pc);
121 uint r = revStart;
122 uint i = pc + IRL!(IR.OrStart);
123 while (code[i].code == IR.Option)
124 {
125 if (code[i - 1].code != IR.OrStart)
126 {
127 assert(code[i - 1].code == IR.GotoEndOr);
128 rev[r - 1] = code[i - 1];
129 }
130 rev[r] = code[i];
131 auto newStart = i + IRL!(IR.Option);
132 auto newEnd = newStart + code[i].data;
133 auto newRpc = r + code[i].data + IRL!(IR.Option);
134 if (code[newEnd].code != IR.OrEnd)
135 {
136 newRpc--;
137 }
138 stack.push(tuple(newStart, newEnd, newRpc));
139 r += code[i].data + IRL!(IR.Option);
140 i += code[i].data + IRL!(IR.Option);
141 }
142 pc = i;
143 revPc = revStart;
144 assert(code[pc].code == IR.OrEnd);
145 }
146 else
147 pc += len;
148 }
149 }
150 if (stack.empty)
151 break;
152 start = stack.top[0];
153 end = stack.top[1];
154 revPc = stack.top[2];
155 stack.pop();
156 }
157 code[] = rev[];
158 }
159
160 //test if a given string starts with hex number of maxDigit that's a valid codepoint
161 //returns it's value and skips these maxDigit chars on success, throws on failure
parseUniHex(Char)162 dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit)
163 {
164 //std.conv.parse is both @system and bogus
165 enforce(str.length >= maxDigit,"incomplete escape sequence");
166 uint val;
167 for (int k = 0; k < maxDigit; k++)
168 {
169 immutable current = str[k];//accepts ascii only, so it's OK to index directly
170 if ('0' <= current && current <= '9')
171 val = val * 16 + current - '0';
172 else if ('a' <= current && current <= 'f')
173 val = val * 16 + current -'a' + 10;
174 else if ('A' <= current && current <= 'F')
175 val = val * 16 + current - 'A' + 10;
176 else
177 throw new Exception("invalid escape sequence");
178 }
179 enforce(val <= 0x10FFFF, "invalid codepoint");
180 str = str[maxDigit..$];
181 return val;
182 }
183
184 @system unittest //BUG canFind is system
185 {
186 import std.algorithm.searching : canFind;
187 string[] non_hex = [ "000j", "000z", "FffG", "0Z"];
188 string[] hex = [ "01", "ff", "00af", "10FFFF" ];
189 int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ];
190 foreach (v; non_hex)
191 assert(collectException(parseUniHex(v, v.length)).msg
192 .canFind("invalid escape sequence"));
193 foreach (i, v; hex)
194 assert(parseUniHex(v, v.length) == value[i]);
195 string over = "0011FFFF";
196 assert(collectException(parseUniHex(over, over.length)).msg
197 .canFind("invalid codepoint"));
198 }
199
caseEnclose(CodepointSet set)200 auto caseEnclose(CodepointSet set)
201 {
202 auto cased = set & unicode.LC;
203 foreach (dchar ch; cased.byCodepoint)
204 {
205 foreach (c; simpleCaseFoldings(ch))
206 set |= c;
207 }
208 return set;
209 }
210
211 /+
212 fetch codepoint set corresponding to a name (InBlock or binary property)
213 +/
getUnicodeSet(in char[]name,bool negated,bool casefold)214 @trusted CodepointSet getUnicodeSet(in char[] name, bool negated, bool casefold)
215 {
216 CodepointSet s = unicode(name);
217 //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
218 if (casefold)
219 s = caseEnclose(s);
220 if (negated)
221 s = s.inverted;
222 return s;
223 }
224
225 //basic stack, just in case it gets used anywhere else then Parser
Stack(T)226 @trusted struct Stack(T)
227 {
228 T[] data;
229 @property bool empty(){ return data.empty; }
230
231 @property size_t length(){ return data.length; }
232
233 void push(T val){ data ~= val; }
234
235 T pop()
236 {
237 assert(!empty);
238 auto val = data[$ - 1];
239 data = data[0 .. $ - 1];
240 if (!__ctfe)
241 cast(void) data.assumeSafeAppend();
242 return val;
243 }
244
245 @property ref T top()
246 {
247 assert(!empty);
248 return data[$ - 1];
249 }
250 }
251
252 struct CodeGen
253 {
254 Bytecode[] ir; // resulting bytecode
255 Stack!(uint) fixupStack; // stack of opened start instructions
256 NamedGroup[] dict; // maps name -> user group number
257 Stack!(uint) groupStack; // stack of current number of group
258 uint nesting = 0; // group nesting level and repetitions step
259 uint lookaroundNest = 0; // nesting of lookaround
260 uint counterDepth = 0; // current depth of nested counted repetitions
261 CodepointSet[] charsets; // sets for char classes
262 const(CharMatcher)[] matchers; // matchers for char classes
263 uint[] backrefed; // bitarray for groups refered by backref
264 uint ngroup; // final number of groups (of all patterns)
265
startCodeGen266 void start(uint length)
267 {
268 if (!__ctfe)
269 ir.reserve((length*5+2)/4);
270 fixupStack.push(0);
271 groupStack.push(1);//0 - whole match
272 }
273
274 //mark referenced groups for latter processing
markBackrefCodeGen275 void markBackref(uint n)
276 {
277 if (n/32 >= backrefed.length)
278 backrefed.length = n/32 + 1;
279 backrefed[n / 32] |= 1 << (n & 31);
280 }
281
isOpenGroupCodeGen282 bool isOpenGroup(uint n)
283 {
284 import std.algorithm.searching : canFind;
285 // walk the fixup stack and see if there are groups labeled 'n'
286 // fixup '0' is reserved for alternations
287 return fixupStack.data[1..$].
288 canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)();
289 }
290
putCodeGen291 void put(Bytecode code)
292 {
293 enforce(ir.length < maxCompiledLength,
294 "maximum compiled pattern length is exceeded");
295 ir ~= code;
296 }
297
putRawCodeGen298 void putRaw(uint number)
299 {
300 enforce(ir.length < maxCompiledLength,
301 "maximum compiled pattern length is exceeded");
302 ir ~= Bytecode.fromRaw(number);
303 }
304
305 //try to generate optimal IR code for this CodepointSet
charsetToIrCodeGen306 @trusted void charsetToIr(CodepointSet set)
307 {//@@@BUG@@@ writeln is @system
308 uint chars = cast(uint) set.length;
309 if (chars < Bytecode.maxSequence)
310 {
311 switch (chars)
312 {
313 case 1:
314 put(Bytecode(IR.Char, set.byCodepoint.front));
315 break;
316 case 0:
317 throw new RegexException("empty CodepointSet not allowed");
318 default:
319 foreach (ch; set.byCodepoint)
320 put(Bytecode(IR.OrChar, ch, chars));
321 }
322 }
323 else
324 {
325 import std.algorithm.searching : countUntil;
326 const ivals = set.byInterval;
327 immutable n = charsets.countUntil(set);
328 if (n >= 0)
329 {
330 if (ivals.length*2 > maxCharsetUsed)
331 put(Bytecode(IR.Trie, cast(uint) n));
332 else
333 put(Bytecode(IR.CodepointSet, cast(uint) n));
334 return;
335 }
336 if (ivals.length*2 > maxCharsetUsed)
337 {
338 auto t = getMatcher(set);
339 put(Bytecode(IR.Trie, cast(uint) matchers.length));
340 matchers ~= t;
341 debug(std_regex_allocation) writeln("Trie generated");
342 }
343 else
344 {
345 put(Bytecode(IR.CodepointSet, cast(uint) charsets.length));
346 matchers ~= CharMatcher.init;
347 }
348 charsets ~= set;
349 assert(charsets.length == matchers.length);
350 }
351 }
352
genLogicGroupCodeGen353 void genLogicGroup()
354 {
355 nesting++;
356 pushFixup(length);
357 put(Bytecode(IR.Nop, 0));
358 }
359
genGroupCodeGen360 void genGroup()
361 {
362 nesting++;
363 pushFixup(length);
364 immutable nglob = groupStack.top++;
365 enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded");
366 put(Bytecode(IR.GroupStart, nglob));
367 }
368
genNamedGroupCodeGen369 void genNamedGroup(string name)
370 {
371 import std.array : insertInPlace;
372 import std.range : assumeSorted;
373 nesting++;
374 pushFixup(length);
375 immutable nglob = groupStack.top++;
376 enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded");
377 auto t = NamedGroup(name, nglob);
378 auto d = assumeSorted!"a.name < b.name"(dict);
379 immutable ind = d.lowerBound(t).length;
380 insertInPlace(dict, ind, t);
381 put(Bytecode(IR.GroupStart, nglob));
382 }
383
384 //generate code for start of lookaround: (?= (?! (?<= (?<!
genLookaroundCodeGen385 void genLookaround(IR opcode)
386 {
387 nesting++;
388 pushFixup(length);
389 put(Bytecode(opcode, 0));
390 put(Bytecode.fromRaw(0));
391 put(Bytecode.fromRaw(0));
392 groupStack.push(0);
393 lookaroundNest++;
394 enforce(lookaroundNest <= maxLookaroundDepth,
395 "maximum lookaround depth is exceeded");
396 }
397
endPatternCodeGen398 void endPattern(uint num)
399 {
400 import std.algorithm.comparison : max;
401 put(Bytecode(IR.End, num));
402 ngroup = max(ngroup, groupStack.top);
403 groupStack.top = 1; // reset group counter
404 }
405
406 //fixup lookaround with start at offset fix and append a proper *-End opcode
fixLookaroundCodeGen407 void fixLookaround(uint fix)
408 {
409 lookaroundNest--;
410 ir[fix] = Bytecode(ir[fix].code,
411 cast(uint) ir.length - fix - IRL!(IR.LookaheadStart));
412 auto g = groupStack.pop();
413 assert(!groupStack.empty);
414 ir[fix+1] = Bytecode.fromRaw(groupStack.top);
415 //groups are cumulative across lookarounds
416 ir[fix+2] = Bytecode.fromRaw(groupStack.top+g);
417 groupStack.top += g;
418 if (ir[fix].code == IR.LookbehindStart || ir[fix].code == IR.NeglookbehindStart)
419 {
420 reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]);
421 }
422 put(ir[fix].paired);
423 }
424
425 // repetition of {1,1}
fixRepetitionCodeGen426 void fixRepetition(uint offset)
427 {
428 import std.algorithm.mutation : copy;
429 immutable replace = ir[offset].code == IR.Nop;
430 if (replace)
431 {
432 copy(ir[offset + 1 .. $], ir[offset .. $ - 1]);
433 ir.length -= 1;
434 }
435 }
436
437 // repetition of {x,y}
fixRepetitionCodeGen438 void fixRepetition(uint offset, uint min, uint max, bool greedy)
439 {
440 static import std.algorithm.comparison;
441 import std.algorithm.mutation : copy;
442 import std.array : insertInPlace;
443 immutable replace = ir[offset].code == IR.Nop;
444 immutable len = cast(uint) ir.length - offset - replace;
445 if (max != infinite)
446 {
447 if (min != 1 || max != 1)
448 {
449 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
450 if (replace)
451 ir[offset] = op;
452 else
453 insertInPlace(ir, offset, op);
454 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
455 put(Bytecode.init); //hotspot
456 putRaw(1);
457 putRaw(min);
458 putRaw(max);
459 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
460 }
461 }
462 else if (min) //&& max is infinite
463 {
464 if (min != 1)
465 {
466 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
467 if (replace)
468 ir[offset] = op;
469 else
470 insertInPlace(ir, offset, op);
471 offset += 1;//so it still points to the repeated block
472 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
473 put(Bytecode.init); //hotspot
474 putRaw(1);
475 putRaw(min);
476 putRaw(min);
477 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
478 }
479 else if (replace)
480 {
481 copy(ir[offset+1 .. $], ir[offset .. $-1]);
482 ir.length -= 1;
483 }
484 put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len));
485 enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded");
486 ir ~= ir[offset .. offset+len];
487 //IR.InfinteX is always a hotspot
488 put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
489 put(Bytecode.init); //merge index
490 }
491 else//vanila {0,inf}
492 {
493 Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len);
494 if (replace)
495 ir[offset] = op;
496 else
497 insertInPlace(ir, offset, op);
498 //IR.InfinteX is always a hotspot
499 put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
500 put(Bytecode.init); //merge index
501 }
502 }
503
fixAlternationCodeGen504 void fixAlternation()
505 {
506 import std.array : insertInPlace;
507 uint fix = fixupStack.top;
508 if (ir.length > fix && ir[fix].code == IR.Option)
509 {
510 ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix);
511 put(Bytecode(IR.GotoEndOr, 0));
512 fixupStack.top = cast(uint) ir.length; //replace latest fixup for Option
513 put(Bytecode(IR.Option, 0));
514 return;
515 }
516 uint len, orStart;
517 //start a new option
518 if (fixupStack.length == 1)
519 {//only root entry, effectively no fixup
520 len = cast(uint) ir.length + IRL!(IR.GotoEndOr);
521 orStart = 0;
522 }
523 else
524 {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length
525 len = cast(uint) ir.length - fix - (ir[fix].length - 1);
526 orStart = fix + ir[fix].length;
527 }
528 insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len));
529 assert(ir[orStart].code == IR.OrStart);
530 put(Bytecode(IR.GotoEndOr, 0));
531 fixupStack.push(orStart); //fixup for StartOR
532 fixupStack.push(cast(uint) ir.length); //for second Option
533 put(Bytecode(IR.Option, 0));
534 }
535
536 // finalizes IR.Option, fix points to the first option of sequence
finishAlternationCodeGen537 void finishAlternation(uint fix)
538 {
539 enforce(ir[fix].code == IR.Option, "no matching ')'");
540 ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix - IRL!(IR.OrStart));
541 fix = fixupStack.pop();
542 enforce(ir[fix].code == IR.OrStart, "no matching ')'");
543 ir[fix] = Bytecode(IR.OrStart, cast(uint) ir.length - fix - IRL!(IR.OrStart));
544 put(Bytecode(IR.OrEnd, cast(uint) ir.length - fix - IRL!(IR.OrStart)));
545 uint pc = fix + IRL!(IR.OrStart);
546 while (ir[pc].code == IR.Option)
547 {
548 pc = pc + ir[pc].data;
549 if (ir[pc].code != IR.GotoEndOr)
550 break;
551 ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd)));
552 pc += IRL!(IR.GotoEndOr);
553 }
554 put(Bytecode.fromRaw(0));
555 }
556
557 // returns: (flag - repetition possible?, fixup of the start of this "group")
558 Tuple!(bool, uint) onClose()
559 {
560 nesting--;
561 uint fix = popFixup();
562 switch (ir[fix].code)
563 {
564 case IR.GroupStart:
565 put(Bytecode(IR.GroupEnd, ir[fix].data));
566 return tuple(true, fix);
567 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
568 assert(lookaroundNest);
569 fixLookaround(fix);
570 return tuple(false, 0u);
571 case IR.Option: //| xxx )
572 //two fixups: last option + full OR
573 finishAlternation(fix);
574 fix = topFixup;
575 switch (ir[fix].code)
576 {
577 case IR.GroupStart:
578 popFixup();
579 put(Bytecode(IR.GroupEnd, ir[fix].data));
580 return tuple(true, fix);
581 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
582 assert(lookaroundNest);
583 fix = popFixup();
584 fixLookaround(fix);
585 return tuple(false, 0u);
586 default://(?:xxx)
587 popFixup();
588 return tuple(true, fix);
589 }
590 default://(?:xxx)
591 return tuple(true, fix);
592 }
593 }
594
popFixupCodeGen595 uint popFixup(){ return fixupStack.pop(); }
596
pushFixupCodeGen597 void pushFixup(uint val){ return fixupStack.push(val); }
598
topFixupCodeGen599 @property uint topFixup(){ return fixupStack.top; }
600
fixupLengthCodeGen601 @property size_t fixupLength(){ return fixupStack.data.length; }
602
lengthCodeGen603 @property uint length(){ return cast(uint) ir.length; }
604 }
605
606 // safety limits
607 enum maxGroupNumber = 2^^19;
608 enum maxLookaroundDepth = 16;
609 // *Bytecode.sizeof, i.e. 1Mb of bytecode alone
610 enum maxCompiledLength = 2^^18;
611 // amounts to up to 4 Mb of auxilary table for matching
612 enum maxCumulativeRepetitionLength = 2^^20;
613 // marker to indicate infinite repetition
614 enum infinite = ~0u;
615
616 struct Parser(R, Generator)
617 if (isForwardRange!R && is(ElementType!R : dchar))
618 {
619 dchar _current;
620 bool empty;
621 R pat, origin; //keep full pattern for pretty printing error messages
622 uint re_flags = 0; //global flags e.g. multiline + internal ones
623 Generator g;
624
625 @trusted this(S)(R pattern, S flags)
626 if (isSomeString!S)
627 {
628 pat = origin = pattern;
629 //reserve slightly more then avg as sampled from unittests
630 parseFlags(flags);
631 _current = ' ';//a safe default for freeform parsing
632 next();
633 g.start(cast(uint) pat.length);
634 try
635 {
636 parseRegex();
637 }
catch(Exception e)638 catch (Exception e)
639 {
640 error(e.msg);//also adds pattern location
641 }
642 g.endPattern(1);
643 }
644
current()645 @property dchar current(){ return _current; }
646
_next()647 bool _next()
648 {
649 if (pat.empty)
650 {
651 empty = true;
652 return false;
653 }
654 _current = pat.front;
655 pat.popFront();
656 return true;
657 }
658
skipSpace()659 void skipSpace()
660 {
661 while (isWhite(current) && _next()){ }
662 }
663
next()664 bool next()
665 {
666 if (re_flags & RegexOption.freeform)
667 {
668 immutable r = _next();
669 skipSpace();
670 return r;
671 }
672 else
673 return _next();
674 }
675
676 //parsing number with basic overflow check
parseDecimal()677 uint parseDecimal()
678 {
679 uint r = 0;
680 while (std.ascii.isDigit(current))
681 {
682 if (r >= (uint.max/10))
683 error("Overflow in decimal number");
684 r = 10*r + cast(uint)(current-'0');
685 if (!next())
686 break;
687 }
688 return r;
689 }
690
691 //parse control code of form \cXXX, c assumed to be the current symbol
parseControlCode()692 dchar parseControlCode()
693 {
694 enforce(next(), "Unfinished escape sequence");
695 enforce(('a' <= current && current <= 'z') || ('A' <= current && current <= 'Z'),
696 "Only letters are allowed after \\c");
697 return current & 0x1f;
698 }
699
700 //
parseFlags(S)701 @trusted void parseFlags(S)(S flags)
702 {//@@@BUG@@@ text is @system
703 import std.conv : text;
704 foreach (ch; flags)//flags are ASCII anyway
705 {
706 L_FlagSwitch:
707 switch (ch)
708 {
709
710 foreach (i, op; __traits(allMembers, RegexOption))
711 {
712 case RegexOptionNames[i]:
713 if (re_flags & mixin("RegexOption."~op))
714 throw new RegexException(text("redundant flag specified: ",ch));
715 re_flags |= mixin("RegexOption."~op);
716 break L_FlagSwitch;
717 }
718 default:
719 throw new RegexException(text("unknown regex flag '",ch,"'"));
720 }
721 }
722 }
723
724 //parse and store IR for regex pattern
parseRegex()725 @trusted void parseRegex()
726 {
727 uint fix;//fixup pointer
728
729 while (!empty)
730 {
731 debug(std_regex_parser)
732 __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data);
733 switch (current)
734 {
735 case '(':
736 next();
737 if (current == '?')
738 {
739 next();
740 switch (current)
741 {
742 case '#':
743 for (;;)
744 {
745 if (!next())
746 error("Unexpected end of pattern");
747 if (current == ')')
748 {
749 next();
750 break;
751 }
752 }
753 break;
754 case ':':
755 g.genLogicGroup();
756 next();
757 break;
758 case '=':
759 g.genLookaround(IR.LookaheadStart);
760 next();
761 break;
762 case '!':
763 g.genLookaround(IR.NeglookaheadStart);
764 next();
765 break;
766 case 'P':
767 next();
768 if (current != '<')
769 error("Expected '<' in named group");
770 string name;
771 if (!next() || !(isAlpha(current) || current == '_'))
772 error("Expected alpha starting a named group");
773 name ~= current;
774 while (next() && (isAlpha(current) ||
775 current == '_' || std.ascii.isDigit(current)))
776 {
777 name ~= current;
778 }
779 if (current != '>')
780 error("Expected '>' closing named group");
781 next();
782 g.genNamedGroup(name);
783 break;
784 case '<':
785 next();
786 if (current == '=')
787 g.genLookaround(IR.LookbehindStart);
788 else if (current == '!')
789 g.genLookaround(IR.NeglookbehindStart);
790 else
791 error("'!' or '=' expected after '<'");
792 next();
793 break;
794 default:
795 uint enableFlags, disableFlags;
796 bool enable = true;
797 do
798 {
799 switch (current)
800 {
801 case 's':
802 if (enable)
803 enableFlags |= RegexOption.singleline;
804 else
805 disableFlags |= RegexOption.singleline;
806 break;
807 case 'x':
808 if (enable)
809 enableFlags |= RegexOption.freeform;
810 else
811 disableFlags |= RegexOption.freeform;
812 break;
813 case 'i':
814 if (enable)
815 enableFlags |= RegexOption.casefold;
816 else
817 disableFlags |= RegexOption.casefold;
818 break;
819 case 'm':
820 if (enable)
821 enableFlags |= RegexOption.multiline;
822 else
823 disableFlags |= RegexOption.multiline;
824 break;
825 case '-':
826 if (!enable)
827 error(" unexpected second '-' in flags");
828 enable = false;
829 break;
830 default:
831 error(" 's', 'x', 'i', 'm' or '-' expected after '(?' ");
832 }
833 next();
834 }while (current != ')');
835 next();
836 re_flags |= enableFlags;
837 re_flags &= ~disableFlags;
838 }
839 }
840 else
841 {
842 g.genGroup();
843 }
844 break;
845 case ')':
846 enforce(g.nesting, "Unmatched ')'");
847 next();
848 auto pair = g.onClose();
849 if (pair[0])
850 parseQuantifier(pair[1]);
851 break;
852 case '|':
853 next();
854 g.fixAlternation();
855 break;
856 default://no groups or whatever
857 immutable start = g.length;
858 parseAtom();
859 parseQuantifier(start);
860 }
861 }
862
863 if (g.fixupLength != 1)
864 {
865 fix = g.popFixup();
866 g.finishAlternation(fix);
867 enforce(g.fixupLength == 1, "no matching ')'");
868 }
869 }
870
871
872 //parse and store IR for atom-quantifier pair
parseQuantifier(uint offset)873 @trusted void parseQuantifier(uint offset)
874 {//copy is @system
875 if (empty)
876 return g.fixRepetition(offset);
877 uint min, max;
878 switch (current)
879 {
880 case '*':
881 min = 0;
882 max = infinite;
883 break;
884 case '?':
885 min = 0;
886 max = 1;
887 break;
888 case '+':
889 min = 1;
890 max = infinite;
891 break;
892 case '{':
893 enforce(next(), "Unexpected end of regex pattern");
894 enforce(std.ascii.isDigit(current), "First number required in repetition");
895 min = parseDecimal();
896 if (current == '}')
897 max = min;
898 else if (current == ',')
899 {
900 next();
901 if (std.ascii.isDigit(current))
902 max = parseDecimal();
903 else if (current == '}')
904 max = infinite;
905 else
906 error("Unexpected symbol in regex pattern");
907 skipSpace();
908 if (current != '}')
909 error("Unmatched '{' in regex pattern");
910 }
911 else
912 error("Unexpected symbol in regex pattern");
913 if (min > max)
914 error("Illegal {n,m} quantifier");
915 break;
916 default:
917 g.fixRepetition(offset);
918 return;
919 }
920 bool greedy = true;
921 //check only if we managed to get new symbol
922 if (next() && current == '?')
923 {
924 greedy = false;
925 next();
926 }
927 g.fixRepetition(offset, min, max, greedy);
928 }
929
930 //parse and store IR for atom
parseAtom()931 void parseAtom()
932 {
933 if (empty)
934 return;
935 switch (current)
936 {
937 case '*', '?', '+', '|', '{', '}':
938 error("'*', '+', '?', '{', '}' not allowed in atom");
939 break;
940 case '.':
941 if (re_flags & RegexOption.singleline)
942 g.put(Bytecode(IR.Any, 0));
943 else
944 {
945 CodepointSet set;
946 g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted);
947 }
948 next();
949 break;
950 case '[':
951 parseCharset();
952 break;
953 case '\\':
954 enforce(_next(), "Unfinished escape sequence");
955 parseEscape();
956 break;
957 case '^':
958 if (re_flags & RegexOption.multiline)
959 g.put(Bytecode(IR.Bol, 0));
960 else
961 g.put(Bytecode(IR.Bof, 0));
962 next();
963 break;
964 case '$':
965 if (re_flags & RegexOption.multiline)
966 g.put(Bytecode(IR.Eol, 0));
967 else
968 g.put(Bytecode(IR.Eof, 0));
969 next();
970 break;
971 default:
972 //FIXME: getCommonCasing in new std uni
973 if (re_flags & RegexOption.casefold)
974 {
975 auto range = simpleCaseFoldings(current);
976 assert(range.length <= 5);
977 if (range.length == 1)
978 g.put(Bytecode(IR.Char, range.front));
979 else
980 foreach (v; range)
981 g.put(Bytecode(IR.OrChar, v, cast(uint) range.length));
982 }
983 else
984 g.put(Bytecode(IR.Char, current));
985 next();
986 }
987 }
988
989
990
991 //CodepointSet operations relatively in order of priority
992 enum Operator:uint {
993 Open = 0, Negate, Difference, SymDifference, Intersection, Union, None
994 }
995
996 //parse unit of CodepointSet spec, most notably escape sequences and char ranges
997 //also fetches next set operation
998 Tuple!(CodepointSet,Operator) parseCharTerm()
999 {
1000 enum State{ Start, Char, Escape, CharDash, CharDashEscape,
1001 PotentialTwinSymbolOperator }
1002 Operator op = Operator.None;
1003 dchar last;
1004 CodepointSet set;
1005 State state = State.Start;
1006
addWithFlags(ref CodepointSet set,uint ch,uint re_flags)1007 static void addWithFlags(ref CodepointSet set, uint ch, uint re_flags)
1008 {
1009 if (re_flags & RegexOption.casefold)
1010 {
1011 auto range = simpleCaseFoldings(ch);
1012 foreach (v; range)
1013 set |= v;
1014 }
1015 else
1016 set |= ch;
1017 }
1018
twinSymbolOperator(dchar symbol)1019 static Operator twinSymbolOperator(dchar symbol)
1020 {
1021 switch (symbol)
1022 {
1023 case '|':
1024 return Operator.Union;
1025 case '-':
1026 return Operator.Difference;
1027 case '~':
1028 return Operator.SymDifference;
1029 case '&':
1030 return Operator.Intersection;
1031 default:
1032 assert(false);
1033 }
1034 }
1035
1036 L_CharTermLoop:
1037 for (;;)
1038 {
1039 final switch (state)
1040 {
1041 case State.Start:
1042 switch (current)
1043 {
1044 case '|':
1045 case '-':
1046 case '~':
1047 case '&':
1048 state = State.PotentialTwinSymbolOperator;
1049 last = current;
1050 break;
1051 case '[':
1052 op = Operator.Union;
1053 goto case;
1054 case ']':
1055 break L_CharTermLoop;
1056 case '\\':
1057 state = State.Escape;
1058 break;
1059 default:
1060 state = State.Char;
1061 last = current;
1062 }
1063 break;
1064 case State.Char:
1065 // xxx last current xxx
1066 switch (current)
1067 {
1068 case '|':
1069 case '~':
1070 case '&':
1071 // then last is treated as normal char and added as implicit union
1072 state = State.PotentialTwinSymbolOperator;
1073 addWithFlags(set, last, re_flags);
1074 last = current;
1075 break;
1076 case '-': // still need more info
1077 state = State.CharDash;
1078 break;
1079 case '\\':
1080 set |= last;
1081 state = State.Escape;
1082 break;
1083 case '[':
1084 op = Operator.Union;
1085 goto case;
1086 case ']':
1087 addWithFlags(set, last, re_flags);
1088 break L_CharTermLoop;
1089 default:
1090 state = State.Char;
1091 addWithFlags(set, last, re_flags);
1092 last = current;
1093 }
1094 break;
1095 case State.PotentialTwinSymbolOperator:
1096 // xxx last current xxxx
1097 // where last = [|-&~]
1098 if (current == last)
1099 {
1100 op = twinSymbolOperator(last);
1101 next();//skip second twin char
1102 break L_CharTermLoop;
1103 }
1104 goto case State.Char;
1105 case State.Escape:
1106 // xxx \ current xxx
1107 switch (current)
1108 {
1109 case 'f':
1110 last = '\f';
1111 state = State.Char;
1112 break;
1113 case 'n':
1114 last = '\n';
1115 state = State.Char;
1116 break;
1117 case 'r':
1118 last = '\r';
1119 state = State.Char;
1120 break;
1121 case 't':
1122 last = '\t';
1123 state = State.Char;
1124 break;
1125 case 'v':
1126 last = '\v';
1127 state = State.Char;
1128 break;
1129 case 'c':
1130 last = parseControlCode();
1131 state = State.Char;
1132 break;
foreach(val;Escapables)1133 foreach (val; Escapables)
1134 {
1135 case val:
1136 }
1137 last = current;
1138 state = State.Char;
1139 break;
1140 case 'p':
1141 set.add(parseUnicodePropertySpec(false));
1142 state = State.Start;
1143 continue L_CharTermLoop; //next char already fetched
1144 case 'P':
1145 set.add(parseUnicodePropertySpec(true));
1146 state = State.Start;
1147 continue L_CharTermLoop; //next char already fetched
1148 case 'x':
1149 last = parseUniHex(pat, 2);
1150 state = State.Char;
1151 break;
1152 case 'u':
1153 last = parseUniHex(pat, 4);
1154 state = State.Char;
1155 break;
1156 case 'U':
1157 last = parseUniHex(pat, 8);
1158 state = State.Char;
1159 break;
1160 case 'd':
1161 set.add(unicode.Nd);
1162 state = State.Start;
1163 break;
1164 case 'D':
1165 set.add(unicode.Nd.inverted);
1166 state = State.Start;
1167 break;
1168 case 's':
1169 set.add(unicode.White_Space);
1170 state = State.Start;
1171 break;
1172 case 'S':
1173 set.add(unicode.White_Space.inverted);
1174 state = State.Start;
1175 break;
1176 case 'w':
1177 set.add(wordCharacter);
1178 state = State.Start;
1179 break;
1180 case 'W':
1181 set.add(wordCharacter.inverted);
1182 state = State.Start;
1183 break;
1184 default:
1185 if (current >= privateUseStart && current <= privateUseEnd)
1186 enforce(false, "no matching ']' found while parsing character class");
1187 enforce(false, "invalid escape sequence");
1188 }
1189 break;
1190 case State.CharDash:
1191 // xxx last - current xxx
1192 switch (current)
1193 {
1194 case '[':
1195 op = Operator.Union;
1196 goto case;
1197 case ']':
1198 //means dash is a single char not an interval specifier
1199 addWithFlags(set, last, re_flags);
1200 addWithFlags(set, '-', re_flags);
1201 break L_CharTermLoop;
1202 case '-'://set Difference again
1203 addWithFlags(set, last, re_flags);
1204 op = Operator.Difference;
1205 next();//skip '-'
1206 break L_CharTermLoop;
1207 case '\\':
1208 state = State.CharDashEscape;
1209 break;
1210 default:
1211 enforce(last <= current, "inverted range");
1212 if (re_flags & RegexOption.casefold)
1213 {
1214 for (uint ch = last; ch <= current; ch++)
1215 addWithFlags(set, ch, re_flags);
1216 }
1217 else
1218 set.add(last, current + 1);
1219 state = State.Start;
1220 }
1221 break;
1222 case State.CharDashEscape:
1223 //xxx last - \ current xxx
1224 uint end;
1225 switch (current)
1226 {
1227 case 'f':
1228 end = '\f';
1229 break;
1230 case 'n':
1231 end = '\n';
1232 break;
1233 case 'r':
1234 end = '\r';
1235 break;
1236 case 't':
1237 end = '\t';
1238 break;
1239 case 'v':
1240 end = '\v';
1241 break;
foreach(val;Escapables)1242 foreach (val; Escapables)
1243 {
1244 case val:
1245 }
1246 end = current;
1247 break;
1248 case 'c':
1249 end = parseControlCode();
1250 break;
1251 case 'x':
1252 end = parseUniHex(pat, 2);
1253 break;
1254 case 'u':
1255 end = parseUniHex(pat, 4);
1256 break;
1257 case 'U':
1258 end = parseUniHex(pat, 8);
1259 break;
1260 default:
1261 if (current >= privateUseStart && current <= privateUseEnd)
1262 enforce(false, "no matching ']' found while parsing character class");
1263 error("invalid escape sequence");
1264 }
1265 // Lookahead to check if it's a \T
1266 // where T is sub-pattern terminator in multi-pattern scheme
1267 if (end == '\\' && !pat.empty)
1268 {
1269 if (pat.front >= privateUseStart && pat.front <= privateUseEnd)
1270 enforce(false, "invalid escape sequence");
1271 }
1272 enforce(last <= end,"inverted range");
1273 set.add(last, end + 1);
1274 state = State.Start;
1275 break;
1276 }
1277 enforce(next(), "unexpected end of CodepointSet");
1278 }
1279 return tuple(set, op);
1280 }
1281
1282 alias ValStack = Stack!(CodepointSet);
1283 alias OpStack = Stack!(Operator);
1284
1285 //parse and store IR for CodepointSet
parseCharset()1286 void parseCharset()
1287 {
1288 const save = re_flags;
1289 re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did
1290 parseCharsetImpl();
1291 re_flags = save;
1292 // Last next() in parseCharsetImp is executed w/o freeform flag
1293 if (re_flags & RegexOption.freeform) skipSpace();
1294 }
1295
parseCharsetImpl()1296 void parseCharsetImpl()
1297 {
1298 ValStack vstack;
1299 OpStack opstack;
1300 import std.functional : unaryFun;
1301 //
1302 static bool apply(Operator op, ref ValStack stack)
1303 {
1304 switch (op)
1305 {
1306 case Operator.Negate:
1307 enforce(!stack.empty, "no operand for '^'");
1308 stack.top = stack.top.inverted;
1309 break;
1310 case Operator.Union:
1311 auto s = stack.pop();//2nd operand
1312 enforce(!stack.empty, "no operand for '||'");
1313 stack.top.add(s);
1314 break;
1315 case Operator.Difference:
1316 auto s = stack.pop();//2nd operand
1317 enforce(!stack.empty, "no operand for '--'");
1318 stack.top.sub(s);
1319 break;
1320 case Operator.SymDifference:
1321 auto s = stack.pop();//2nd operand
1322 enforce(!stack.empty, "no operand for '~~'");
1323 stack.top ~= s;
1324 break;
1325 case Operator.Intersection:
1326 auto s = stack.pop();//2nd operand
1327 enforce(!stack.empty, "no operand for '&&'");
1328 stack.top.intersect(s);
1329 break;
1330 default:
1331 return false;
1332 }
1333 return true;
1334 }
1335 static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
1336 {
1337 while (cond(opstack.top))
1338 {
1339 if (!apply(opstack.pop(),vstack))
1340 return false;//syntax error
1341 if (opstack.empty)
1342 return false;
1343 }
1344 return true;
1345 }
1346
1347 L_CharsetLoop:
1348 do
1349 {
1350 switch (current)
1351 {
1352 case '[':
1353 opstack.push(Operator.Open);
1354 enforce(next(), "unexpected end of character class");
1355 if (current == '^')
1356 {
1357 opstack.push(Operator.Negate);
1358 enforce(next(), "unexpected end of character class");
1359 }
1360 else if (current == ']') // []...] is special cased
1361 {
1362 enforce(next(), "wrong character set");
1363 auto pair = parseCharTerm();
1364 pair[0].add(']', ']'+1);
1365 if (pair[1] != Operator.None)
1366 {
1367 if (opstack.top == Operator.Union)
1368 unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
1369 opstack.push(pair[1]);
1370 }
1371 vstack.push(pair[0]);
1372 }
1373 break;
1374 case ']':
1375 enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
1376 "character class syntax error");
1377 enforce(!opstack.empty, "unmatched ']'");
1378 opstack.pop();
1379 if (!next() || opstack.empty)
1380 break L_CharsetLoop;
1381 auto pair = parseCharTerm();
1382 if (!pair[0].empty)//not only operator e.g. -- or ~~
1383 {
1384 vstack.top.add(pair[0]);//apply union
1385 }
1386 if (pair[1] != Operator.None)
1387 {
1388 if (opstack.top == Operator.Union)
1389 unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
1390 opstack.push(pair[1]);
1391 }
1392 break;
1393 //
1394 default://yet another pair of term(op)?
1395 auto pair = parseCharTerm();
1396 if (pair[1] != Operator.None)
1397 {
1398 if (opstack.top == Operator.Union)
1399 unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
1400 opstack.push(pair[1]);
1401 }
1402 vstack.push(pair[0]);
1403 }
1404 }while (!empty || !opstack.empty);
1405 while (!opstack.empty)
1406 {
1407 enforce(opstack.top != Operator.Open,
1408 "no matching ']' found while parsing character class");
1409 apply(opstack.pop(), vstack);
1410 }
1411 assert(vstack.length == 1);
1412 g.charsetToIr(vstack.top);
1413 }
1414
1415 //parse and generate IR for escape stand alone escape sequence
parseEscape()1416 @trusted void parseEscape()
1417 {//accesses array of appender
1418 import std.algorithm.iteration : sum;
1419 switch (current)
1420 {
1421 case 'f': next(); g.put(Bytecode(IR.Char, '\f')); break;
1422 case 'n': next(); g.put(Bytecode(IR.Char, '\n')); break;
1423 case 'r': next(); g.put(Bytecode(IR.Char, '\r')); break;
1424 case 't': next(); g.put(Bytecode(IR.Char, '\t')); break;
1425 case 'v': next(); g.put(Bytecode(IR.Char, '\v')); break;
1426
1427 case 'd':
1428 next();
1429 g.charsetToIr(unicode.Nd);
1430 break;
1431 case 'D':
1432 next();
1433 g.charsetToIr(unicode.Nd.inverted);
1434 break;
1435 case 'b': next(); g.put(Bytecode(IR.Wordboundary, 0)); break;
1436 case 'B': next(); g.put(Bytecode(IR.Notwordboundary, 0)); break;
1437 case 's':
1438 next();
1439 g.charsetToIr(unicode.White_Space);
1440 break;
1441 case 'S':
1442 next();
1443 g.charsetToIr(unicode.White_Space.inverted);
1444 break;
1445 case 'w':
1446 next();
1447 g.charsetToIr(wordCharacter);
1448 break;
1449 case 'W':
1450 next();
1451 g.charsetToIr(wordCharacter.inverted);
1452 break;
1453 case 'p': case 'P':
1454 auto CodepointSet = parseUnicodePropertySpec(current == 'P');
1455 g.charsetToIr(CodepointSet);
1456 break;
1457 case 'x':
1458 immutable code = parseUniHex(pat, 2);
1459 next();
1460 g.put(Bytecode(IR.Char,code));
1461 break;
1462 case 'u': case 'U':
1463 immutable code = parseUniHex(pat, current == 'u' ? 4 : 8);
1464 next();
1465 g.put(Bytecode(IR.Char, code));
1466 break;
1467 case 'c': //control codes
1468 Bytecode code = Bytecode(IR.Char, parseControlCode());
1469 next();
1470 g.put(code);
1471 break;
1472 case '0':
1473 next();
1474 g.put(Bytecode(IR.Char, 0));//NUL character
1475 break;
1476 case '1': .. case '9':
1477 uint nref = cast(uint) current - '0';
1478 immutable maxBackref = sum(g.groupStack.data);
1479 enforce(nref < maxBackref, "Backref to unseen group");
1480 //perl's disambiguation rule i.e.
1481 //get next digit only if there is such group number
1482 while (nref < maxBackref && next() && std.ascii.isDigit(current))
1483 {
1484 nref = nref * 10 + current - '0';
1485 }
1486 if (nref >= maxBackref)
1487 nref /= 10;
1488 enforce(!g.isOpenGroup(nref), "Backref to open group");
1489 uint localLimit = maxBackref - g.groupStack.top;
1490 if (nref >= localLimit)
1491 {
1492 g.put(Bytecode(IR.Backref, nref-localLimit));
1493 g.ir[$-1].setLocalRef();
1494 }
1495 else
1496 g.put(Bytecode(IR.Backref, nref));
1497 g.markBackref(nref);
1498 break;
1499 default:
1500 // Lookahead to check if it's a \T
1501 // where T is sub-pattern terminator in multi-pattern scheme
1502 if (current == '\\' && !pat.empty)
1503 {
1504 if (pat.front >= privateUseStart && current <= privateUseEnd)
1505 enforce(false, "invalid escape sequence");
1506 }
1507 if (current >= privateUseStart && current <= privateUseEnd)
1508 {
1509 g.endPattern(current - privateUseStart + 1);
1510 break;
1511 }
1512 auto op = Bytecode(IR.Char, current);
1513 next();
1514 g.put(op);
1515 }
1516 }
1517
1518 //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
1519 //\ - assumed to be processed, p - is current
parseUnicodePropertySpec(bool negated)1520 CodepointSet parseUnicodePropertySpec(bool negated)
1521 {
1522 enum MAX_PROPERTY = 128;
1523 char[MAX_PROPERTY] result;
1524 uint k = 0;
1525 enforce(next(), "eof parsing unicode property spec");
1526 if (current == '{')
1527 {
1528 while (k < MAX_PROPERTY && next() && current !='}' && current !=':')
1529 if (current != '-' && current != ' ' && current != '_')
1530 result[k++] = cast(char) std.ascii.toLower(current);
1531 enforce(k != MAX_PROPERTY, "invalid property name");
1532 enforce(current == '}', "} expected ");
1533 }
1534 else
1535 {//single char properties e.g.: \pL, \pN ...
1536 enforce(current < 0x80, "invalid property name");
1537 result[k++] = cast(char) current;
1538 }
1539 auto s = getUnicodeSet(result[0 .. k], negated,
1540 cast(bool)(re_flags & RegexOption.casefold));
1541 enforce(!s.empty, "unrecognized unicode property spec");
1542 next();
1543 return s;
1544 }
1545
1546 //
error(string msg)1547 @trusted void error(string msg)
1548 {
1549 import std.array : appender;
1550 import std.format : formattedWrite;
1551 auto app = appender!string();
1552 formattedWrite(app, "%s\nPattern with error: `%s` <--HERE-- `%s`",
1553 msg, origin[0..$-pat.length], pat);
1554 throw new RegexException(app.data);
1555 }
1556
1557 alias Char = BasicElementOf!R;
1558
program()1559 @property program()
1560 {
1561 return makeRegex(this);
1562 }
1563 }
1564
1565 /+
1566 Postproces the IR, then optimize.
1567 +/
postprocess(Char)1568 @trusted void postprocess(Char)(ref Regex!Char zis)
1569 {//@@@BUG@@@ write is @system
1570 with(zis)
1571 {
1572 struct FixedStack(T)
1573 {
1574 T[] arr;
1575 uint _top;
1576 //this(T[] storage){ arr = storage; _top = -1; }
1577 @property ref T top(){ assert(!empty); return arr[_top]; }
1578 void push(T x){ arr[++_top] = x; }
1579 T pop() { assert(!empty); return arr[_top--]; }
1580 @property bool empty(){ return _top == -1; }
1581 }
1582 auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1);
1583 counterRange.push(1);
1584 ulong cumRange = 0;
1585 for (uint i = 0; i < ir.length; i += ir[i].length)
1586 {
1587 if (ir[i].hotspot)
1588 {
1589 assert(i + 1 < ir.length,
1590 "unexpected end of IR while looking for hotspot");
1591 ir[i+1] = Bytecode.fromRaw(hotspotTableSize);
1592 hotspotTableSize += counterRange.top;
1593 }
1594 switch (ir[i].code)
1595 {
1596 case IR.RepeatStart, IR.RepeatQStart:
1597 uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart));
1598 assert(ir[repEnd].code == ir[i].paired.code);
1599 immutable max = ir[repEnd + 4].raw;
1600 ir[repEnd+2].raw = counterRange.top;
1601 ir[repEnd+3].raw *= counterRange.top;
1602 ir[repEnd+4].raw *= counterRange.top;
1603 ulong cntRange = cast(ulong)(max)*counterRange.top;
1604 cumRange += cntRange;
1605 enforce(cumRange < maxCumulativeRepetitionLength,
1606 "repetition length limit is exceeded");
1607 counterRange.push(cast(uint) cntRange + counterRange.top);
1608 threadCount += counterRange.top;
1609 break;
1610 case IR.RepeatEnd, IR.RepeatQEnd:
1611 threadCount += counterRange.top;
1612 counterRange.pop();
1613 break;
1614 case IR.GroupStart:
1615 if (isBackref(ir[i].data))
1616 ir[i].setBackrefence();
1617 threadCount += counterRange.top;
1618 break;
1619 case IR.GroupEnd:
1620 if (isBackref(ir[i].data))
1621 ir[i].setBackrefence();
1622 threadCount += counterRange.top;
1623 break;
1624 default:
1625 threadCount += counterRange.top;
1626 }
1627 }
1628 checkIfOneShot();
1629 if (!(flags & RegexInfo.oneShot))
1630 kickstart = Kickstart!Char(zis, new uint[](256));
1631 debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount);
1632 optimize(zis);
1633 }
1634 }
1635
fixupBytecode()1636 void fixupBytecode()(Bytecode[] ir)
1637 {
1638 Stack!uint fixups;
1639
1640 with(IR) for (uint i=0; i<ir.length; i+= ir[i].length)
1641 {
1642 if (ir[i].isStart || ir[i].code == Option)
1643 fixups.push(i);
1644 else if (ir[i].code == OrEnd)
1645 {
1646 // Alternatives need more care
1647 auto j = fixups.pop(); // last Option
1648 ir[j].data = i - j - ir[j].length;
1649 j = fixups.pop(); // OrStart
1650 ir[j].data = i - j - ir[j].length;
1651 ir[i].data = ir[j].data;
1652
1653 // fixup all GotoEndOrs
1654 j = j + IRL!(OrStart);
1655 assert(ir[j].code == Option);
1656 for (;;)
1657 {
1658 auto next = j + ir[j].data + IRL!(Option);
1659 if (ir[next].code == IR.OrEnd)
1660 break;
1661 ir[next - IRL!(GotoEndOr)].data = i - next;
1662 j = next;
1663 }
1664 }
1665 else if (ir[i].code == GotoEndOr)
1666 {
1667 auto j = fixups.pop(); // Option
1668 ir[j].data = i - j + IRL!(GotoEndOr)- IRL!(Option); // to the next option
1669 }
1670 else if (ir[i].isEnd)
1671 {
1672 auto j = fixups.pop();
1673 ir[i].data = i - j - ir[j].length;
1674 ir[j].data = ir[i].data;
1675 }
1676 }
1677 assert(fixups.empty);
1678 }
1679
optimize(Char)1680 void optimize(Char)(ref Regex!Char zis)
1681 {
1682 import std.array : insertInPlace;
1683 CodepointSet nextSet(uint idx)
1684 {
1685 CodepointSet set;
1686 with(zis) with(IR)
1687 Outer:
1688 for (uint i = idx; i < ir.length; i += ir[i].length)
1689 {
1690 switch (ir[i].code)
1691 {
1692 case Char:
1693 set.add(ir[i].data, ir[i].data+1);
1694 goto default;
1695 //TODO: OrChar
1696 case Trie, CodepointSet:
1697 set = zis.charsets[ir[i].data];
1698 goto default;
1699 case GroupStart,GroupEnd:
1700 break;
1701 default:
1702 break Outer;
1703 }
1704 }
1705 return set;
1706 }
1707
1708 with(zis) with(IR) for (uint i = 0; i < ir.length; i += ir[i].length)
1709 {
1710 if (ir[i].code == InfiniteEnd)
1711 {
1712 auto set = nextSet(i+IRL!(InfiniteEnd));
1713 if (!set.empty && set.length < 10_000)
1714 {
1715 ir[i] = Bytecode(InfiniteBloomEnd, ir[i].data);
1716 ir[i - ir[i].data - IRL!(InfiniteStart)] =
1717 Bytecode(InfiniteBloomStart, ir[i].data);
1718 ir.insertInPlace(i+IRL!(InfiniteEnd),
1719 Bytecode.fromRaw(cast(uint) zis.filters.length));
1720 zis.filters ~= BitTable(set);
1721 fixupBytecode(ir);
1722 }
1723 }
1724 }
1725 }
1726
1727 //IR code validator - proper nesting, illegal instructions, etc.
validateRe(Char)1728 @trusted void validateRe(Char)(ref Regex!Char zis)
1729 {//@@@BUG@@@ text is @system
1730 import std.conv : text;
1731 with(zis)
1732 {
1733 for (uint pc = 0; pc < ir.length; pc += ir[pc].length)
1734 {
1735 if (ir[pc].isStart || ir[pc].isEnd)
1736 {
1737 immutable dest = ir[pc].indexOfPair(pc);
1738 assert(dest < ir.length, text("Wrong length in opcode at pc=",
1739 pc, " ", dest, " vs ", ir.length));
1740 assert(ir[dest].paired == ir[pc],
1741 text("Wrong pairing of opcodes at pc=", pc, "and pc=", dest));
1742 }
1743 else if (ir[pc].isAtom)
1744 {
1745
1746 }
1747 else
1748 assert(0, text("Unknown type of instruction at pc=", pc));
1749 }
1750 }
1751 }
1752